In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd, numpy as np
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit,StratifiedKFold, RepeatedStratifiedKFold
from pathlib import Path
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

from fastai.text import *

In [3]:
def get_score(y_valid, valid_pred):
    """Print the macro-averaged F1 score and the accuracy of a set of predictions.

    Args:
        y_valid: ground-truth labels.
        valid_pred: predicted labels, aligned element-wise with ``y_valid``.

    Returns:
        None. Both metrics are written to stdout only.
    """
    macro_f1 = f1_score(y_valid, valid_pred, average='macro')
    print(f"f1_score : {macro_f1}")
    acc = accuracy_score(y_valid, valid_pred)
    print(f"accuracy: {acc}")

In [4]:
# Directory that holds train.csv, test.csv and sample_submission.csv read by later cells.
path = Path('data')
# When True, later cells also load test.csv and predict on it instead of
# scoring against a held-out split of the training data.
TEST = True

In [6]:
# Build a fastai TextList from the 'text' column of train.csv, apply no
# train/validation split, and take the labels from column index 3.
# NOTE(review): the label column is addressed by position, not by name — confirm
# that index 3 is still the sentiment column if the CSV layout changes.
reviews = (TextList.from_csv(path, 'train.csv', cols='text')
                         .split_none()
                         .label_from_df(cols=3))

#######DURING TEST#######
# Attach the 'text' column of test.csv as the test set when TEST is enabled.
if TEST:
    reviews.add_test(TextList.from_csv(path, 'test.csv', cols='text'));

In [7]:
# Load the raw training table as a DataFrame; later cells add feature columns to it.
train_df = pd.read_csv(path/'train.csv')
#######DURING TEST#######
# The test table is only loaded when TEST is enabled.
if TEST:
    test_df = pd.read_csv(path/'test.csv')

DRUGS

In [8]:
# Drop the 'unique_hash' column. The frames are rebound instead of mutated with
# inplace=True, and errors='ignore' makes the cell safe to re-run: the original
# raised KeyError on a second execution because the column was already gone.
train_df = train_df.drop(columns='unique_hash', errors='ignore')
if TEST:
    test_df = test_df.drop(columns='unique_hash', errors='ignore')

In [178]:
# Feature functions; each one is applied row-wise to the (text, drug) pair and
# its result is stored in a column named after the function.
features = [
    feature_exists,
    feature_count,
    feature_length,
    feature_others,
    feature_other_exists,
]

for feature in tqdm(features):
    # The training frame is always processed; the test frame only when TEST is enabled.
    for frame in ((train_df, test_df) if TEST else (train_df,)):
        frame[feature.__name__] = frame[['text', 'drug']].apply(feature, axis=1)

In [10]:
# Choose the fit / evaluation sets. The original always computed a random split
# and then overwrote it in TEST mode; the two cases are now mutually exclusive,
# so the discarded split is no longer computed.
if TEST:
    #######DURING TEST#######
    # Fit on the whole training set and predict on the attached test set.
    # NOTE(review): reviews.test.y presumably holds placeholder labels only, so
    # y_valid must not be used for scoring in this mode — confirm.
    train_texts, y_train = reviews.train.x, reviews.train.y.items
    valid_texts, y_valid = reviews.test.x, reviews.test.y.items
else:
    # Hold out part of the training data for validation (fixed seed for repeatability).
    train_texts, valid_texts, y_train, y_valid = \
        train_test_split(reviews.train.x, reviews.train.y.items, random_state=17)

In [11]:
# Map every document's token ids (doc.data) back to token strings through the
# vocabulary of the training texts.
docs = reviews.train.x
train_words = [[docs.vocab.itos[o] for o in doc.data] for doc in train_texts]
valid_words = [[docs.vocab.itos[o] for o in doc.data] for doc in valid_texts]

# The inputs are already lists of tokens, so both the preprocessor and the
# tokenizer are replaced by `noop`.
# NOTE(review): `noop` presumably comes from the fastai star import and returns
# its argument unchanged — confirm.
# Unigram TF-IDF: fitted on the training documents only, then applied to the
# validation/test documents.
vec = TfidfVectorizer(ngram_range=(1,1),preprocessor=noop, tokenizer=noop)
trn_term_doc = vec.fit_transform(train_words)
test_term_doc = vec.transform(valid_words)

In [12]:
# Label list of the training set.
y = reviews.train.y

# Look up the raw class values 0, 1 and 2 in y.c2i.
# NOTE(review): presumably c2i maps a raw class value to its encoded index, and
# 0/1/2 stand for positive/negative/neutral in this dataset — confirm against
# the data description.
positive = y.c2i[0]
negative = y.c2i[1]
neutral  = y.c2i[2]

In [13]:
def pr(y_i, y, x=None):
    """Smoothed per-feature mean of the rows of ``x`` whose label equals ``y_i``.

    Computes ``(column sums of x over rows where y == y_i, plus 1)`` divided by
    ``(number of such rows, plus 1)``.

    Args:
        y_i: label value that selects the rows (``True``/``False`` for a
            one-vs-rest mask).
        y: array of labels with one entry per row of ``x``.
        x: feature matrix (dense array or scipy sparse matrix). When omitted,
            the notebook-level variable ``x`` is used, which keeps the original
            two-argument call working.

    Returns:
        1-D numpy array with one value per column of ``x``.
    """
    if x is None:
        # Legacy call style. The original always read the global `x`, which gave
        # wrong ratios (or a shape error) whenever a model was fitted on any
        # other matrix, e.g. a cross-validation fold.
        x = globals()['x']
    mask = (y == y_i)
    # ravel() always yields a 1-D vector, also for a single-column matrix.
    p = np.asarray(x[mask].sum(0)).ravel()
    return (p + 1) / (mask.sum() + 1)

def get_mdl(x,y):
    """Fit a logistic regression on features scaled by the log-count ratio.

    Args:
        x: sparse term-document matrix (must support ``.multiply``).
        y: boolean mask with one entry per row of ``x`` marking the target class.

    Returns:
        Tuple ``(m, r)``: the fitted ``LogisticRegression`` and the log-count
        ratio ``r``. Any matrix passed to ``m`` for prediction must be
        multiplied by the same ``r``.
    """
    # The ratio is computed from the same matrix the model is fitted on.
    r = np.log(pr(True, y, x) / pr(False, y, x))
    m = LogisticRegression(C=1, dual=True,solver='liblinear',class_weight='balanced')
    x_nb = x.multiply(r)
    # Placeholder kept from an earlier experiment that appended extra features:
#     X_train = hstack([x_nb, X_train_drugs,train_features])
    return m.fit(x_nb, y), r

In [14]:
# Matrices used for fitting and for prediction. `x` stays a notebook-level name
# because the helper `pr` can read it as a global.
x = trn_term_doc
test_x = test_term_doc

# One binary (one-vs-rest) model per label id; column i of `preds` holds the
# probability for label_cols[i].
label_cols = [positive, negative, neutral]
preds = np.zeros((test_term_doc.shape[0], len(label_cols)))
# Allocated for training-set probabilities but not filled in this cell.
preds_trn = np.zeros((x.shape[0], len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(x,y_train == j)

    # The prediction matrix must be scaled by the same ratio used for fitting.
    x_nb_test = test_x.multiply(r)
    X_valid = hstack([x_nb_test])
    preds[:,i] = m.predict_proba(X_valid)[:,1]

# argmax yields a column position in `preds`; translate it back to the label id
# stored in label_cols so the result is comparable with y_valid even when the
# label ids are not exactly 0..n-1 in that order.
predictions = np.asarray(label_cols)[np.argmax(preds,axis=1)]
if not TEST:get_score(y_valid,predictions)

fit 0
fit 1
fit 2


In [17]:
# Persist the per-class probability matrix `preds`; np.save appends '.npy' to
# the name, so the file read back later is 'pred_NB.npy'.
np.save('pred_NB',preds)

In [19]:
# NOTE(review): 'pred_meta.npy' is not written anywhere in the visible notebook —
# presumably produced by another model or notebook; confirm its origin.
pred_meta = np.load('pred_meta.npy')
# Array stored by the np.save('pred_NB', ...) cell.
pred_NB = np.load('pred_NB.npy')

In [29]:
# Weighted blend of the two arrays (0.3 * pred_meta + 0.7 * pred_NB), then the
# index of the highest blended score along axis 1.
# NOTE(review): assumes both arrays have the same shape and column order — confirm.
predictions2 = np.argmax(0.3 * pred_meta  + 0.7 *pred_NB,axis=1)

In [267]:
################# K-FOLD ##################
# x = trn_term_doc
# test_x = test_term_doc

# skf = StratifiedKFold()

# label_cols = [positive, negative, neutral]
# preds = np.zeros((test_term_doc.shape[0], len(label_cols)))
# for u,v in skf.split(x,y_train):
#     X_trn, X_tst = x[u], x[v]
#     y_trn, y_tst = y_train[u], y_train[v]
#     preds_val = np.zeros((y_tst.shape[0], len(label_cols)))
    
#     for i, j in enumerate(label_cols):
#         print('fit', j)
#         m,r = get_mdl(X_trn,y_trn == j)

#         preds_val[:,i] = m.predict_proba(X_tst.multiply(r))[:,1]
#         preds[:,i] += m.predict_proba(test_x.multiply(r))[:,1]

#     val_preds = np.argmax(preds_val,axis=1)
#     print(f"F1_score: {f1_score(y_tst,val_preds,average='macro')}")

# predictions = np.argmax(preds,axis=1)
# print(f"Final F1_score: {f1_score(y_valid,predictions,average='macro')}")
# # get_score(y_valid,predictions)

fit 0
fit 1
fit 2
F1_score: 0.4649968826266031
fit 0
fit 1
fit 2
F1_score: 0.4846167950128346
fit 0
fit 1
fit 2
F1_score: 0.44431495601500126
Final F1_score: 0.48160290246319165


|f1_score|accuracy| Summary|
|---|---|---|
|0.4425933158406717|0.6977272727272728| baseline
|0.4427632571071596|0.7| CountVectorizer
|0.3454188866557859|0.7204545454545455| CountVectorizer-ngram(1,2)
|0.44433905378177824|0.7007575757575758| CountVectorizer + Drug
|0.3602603402216243 | 0.7265151515151516| Tf-idf + Drug
|0.47709544489951083 | 0.678 | class_weight=balanced
|0.438708402946439 | 0.6946969696969697 | feature_engineering
|0.47644661545431627 |0.6803030303030303 | feature_engineering + balanced
|0.4715854875669423|0.6734848484848485 | baseline + balanced
|0.50|0.687| tf-idf 1,1 balanced
|0.42883925131396183|0.720454545454545| tf-idf 1,2 balanced

In [30]:
#######DURING TEST#######
# Write the next numbered submission file from the sample-submission template.
# NOTE(review): `I` must already exist (it is assigned in a separate cell) and
# `predictions3` is not defined in the visible part of the notebook — confirm
# both before running this cell on a fresh kernel.
I+=1
path_sub =Path('Submissions')
# Create the output folder if needed so to_csv does not fail on a fresh checkout.
path_sub.mkdir(parents=True, exist_ok=True)
sample = pd.read_csv(path/'sample_submission.csv')
sample['sentiment'] = predictions3
sample.to_csv(path_sub/f'submission{I}.csv',index=False)

In [26]:
# Number of the last written submission; the submission cell increments it
# before building the file name.
# NOTE(review): this cell sits after the submission cell, so a top-to-bottom run
# reaches `I+=1` before `I` exists — consider moving this assignment up.
I = 16