In [2]:
import pandas as pd
import numpy as np


In [3]:
train_data = pd.read_csv('datasets/train.csv')
test_data = pd.read_csv('datasets/test.csv')

In [4]:
data_all = train_data.append(test_data, ignore_index = True, sort = False)

In [5]:
data_all.head()

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2.0
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2.0
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2.0
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2.0
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1.0


In [9]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
stops = set(stopwords.words("english"))
not_stop =["aren't", "couldn't", "didn't", "doesn't", "don't", "hadn't",
          "hasn't", "haven't", "isn't", "mightn't", "mustn't", "needn't",
          "no", "nor", "shan't", "shouldn't", "wasn't", "weren't", "wouldn't"]
for i in not_stop:
    stops.remove(i)
    
stemmer = SnowballStemmer("english")

def clean_to_text(raw_text):
    review_text = BeautifulSoup(raw_text, "html.parser").get_text()
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    words = letters_only.lower().split()
    meaningful_words = [w for w in words if not w in stops]
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    return (' '.join(stemming_words))

In [10]:
data_all["clean_text"] = data_all["text"].apply(clean_to_text)

In [11]:
data_all.head()

Unnamed: 0,unique_hash,text,drug,sentiment,clean_text
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2.0,autoimmun diseas tend come cluster gilenya fee...
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2.0,complet understand want tri result report lect...
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2.0,interest target p receptor rather like fingoli...
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2.0,interest grand merci wonder lemtrada ocrevus s...
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1.0,hi everybodi latest mri result brain cervic co...


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(data_all['clean_text'])
X_train_count.shape

(8203, 31545)

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
X_train_tfidf.shape

(8203, 31545)

In [16]:
y = data_all['sentiment']

In [21]:
train_bow = X_train_count[:5279, :]
test_bow = X_train_count[5279:, :]

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_bow, 
                                                  train_data['sentiment'], 
                                                  random_state = 42,
                                                  test_size = 0.3)

In [23]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, y_train)

In [24]:
prediction = clf.predict_proba(X_val)

In [25]:
prediction

array([[6.81917703e-01, 2.05227092e-03, 3.16030026e-01],
       [8.08813817e-06, 6.08209465e-10, 9.99991911e-01],
       [2.84229089e-04, 1.70934486e-04, 9.99544836e-01],
       ...,
       [3.39525732e-09, 2.84314992e-04, 9.99715682e-01],
       [7.05668058e-10, 4.27899612e-08, 9.99999957e-01],
       [1.20729679e-08, 4.10707864e-03, 9.95892909e-01]])

In [33]:
from sklearn.metrics import f1_score
prediction_int = prediction[:,1]>0.3
prediction_int = prediction_int.astype(np.int)
f1_score(y_val, prediction_int, average = "macro")

  'precision', 'predicted', average, warn_for)


0.1828669845707441

In [34]:
test_pred = clf.predict_proba(test_bow)
test_pred_int = test_pred[:,1]>=0.3
test_pred_int = test_pred_int.astype(np.int)
test_data['sentiment'] = test_pred_int
submission = test_data[['unique_hash', 'sentiment']]
submission.to_csv('nb_submission.csv', index = False)


In [35]:
train_tfidf = X_train_tfidf[:5279, :]
test_tfidf = X_train_tfidf[5279:, :]

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_tfidf, 
                                                  train_data['sentiment'], 
                                                  random_state = 42,
                                                  test_size = 0.3)



In [37]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier().fit(X_train, y_train)

In [38]:
sgd

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [41]:
prediction =sgd.predict(X_val)

In [42]:
prediction

array([2, 2, 2, ..., 2, 2, 2])

In [43]:
np.mean(prediction == test_tfidf)

0.0

In [44]:
test_pred = sgd.predict(test_tfidf)
test_data['sentiment'] = test_pred
submission = test_data[['unique_hash', 'sentiment']]
submission.to_csv('sgd_submission.csv', index= False)

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

text_clf = Pipeline([
    ('vect', CountVectorizer),
    ('tfidf', TfidfTransformer),
    ('svc', LinearSVC(random_state = 0, tol = 1e-5))
])

In [49]:
prediction = text_clf.fit(X_train, y_train).predict(X_val)

AttributeError: _validate_params not found

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_tfidf, 
                                                  train_data['sentiment'], 
                                                  random_state = 42,
                                                  test_size = 0.3)

In [51]:
from sklearn.svm import LinearSVC
lsvc = LinearSVC(random_state = 0, tol = 1e-5).fit(X_train, y_train)

In [52]:
prediction = lsvc.predict(X_val)

In [53]:
prediction

array([2, 2, 2, ..., 2, 2, 2])

In [54]:
test_pred = lsvc.predict(test_tfidf)
test_data['sentiment'] = test_pred
submission = test_data[['unique_hash', 'sentiment']]
submission.to_csv('lsvc_submission.csv', index= False)

In [56]:
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

clf = LGBMClassifier(n_estimators = 2000,
                    learning_rate = .9,
                    num_leaves = 30,
                    subsample = .9,
                    max_depth = 7,
                    reg_alpha = 0.1,
                    reg_lambda = 0.1,
                    min_split_weight = 0.01,
                    min_child_weight = 2,
                    silent = -1,
                    verbose = -1)
prediction = clf.fit(X_train, y_train).predict(X_val)

In [57]:
test_pred = clf.predict(test_tfidf)
test_data['sentiment'] = test_pred
submission = test_data[['unique_hash', 'sentiment']]
submission.to_csv('light_submission.csv', index= False)

In [60]:
from textblob import TextBlob
from tqdm import tqdm

texts = data_all['clean_text']
predict_sentiment = []
for text in tqdm(texts):
    blob = TextBlob(text)
    predict_sentiment += [blob.sentiment.polarity]
data_all['predict_sentiment'] = predict_sentiment

100%|██████████| 8203/8203 [01:14<00:00, 109.55it/s]


In [61]:
data_all.head()

Unnamed: 0,unique_hash,text,drug,sentiment,clean_text,predict_sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2.0,autoimmun diseas tend come cluster gilenya fee...,0.366667
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2.0,complet understand want tri result report lect...,0.167708
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2.0,interest target p receptor rather like fingoli...,0.093837
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2.0,interest grand merci wonder lemtrada ocrevus s...,0.5
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1.0,hi everybodi latest mri result brain cervic co...,0.288636


In [62]:
texts = data_all['text']
predict_sentiment = []
for text in tqdm(texts):
    blob = TextBlob(text)
    predict_sentiment += [blob.sentiment.polarity]
data_all['predict_sentiment2'] = predict_sentiment

100%|██████████| 8203/8203 [02:20<00:00, 58.50it/s]


In [63]:
data_all.head()

Unnamed: 0,unique_hash,text,drug,sentiment,clean_text,predict_sentiment,predict_sentiment2
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2.0,autoimmun diseas tend come cluster gilenya fee...,0.366667,0.229167
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2.0,complet understand want tri result report lect...,0.167708,0.287625
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2.0,interest target p receptor rather like fingoli...,0.093837,0.182391
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2.0,interest grand merci wonder lemtrada ocrevus s...,0.5,0.575
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1.0,hi everybodi latest mri result brain cervic co...,0.288636,0.228182


In [66]:
import string
data_all['count_sent'] = data_all['text'].apply(lambda x: len(re.findall("\n",str(x)))+1)
data_all['count_word'] = data_all['clean_text'].apply(lambda x: len(str(x).split()))
data_all['count_unique_word']=data_all["clean_text"].apply(lambda x: len(set(str(x).split())))
data_all['count_letters']=data_all["clean_text"].apply(lambda x: len(str(x)))
data_all["count_punctuations"] = data_all["text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
data_all["count_words_upper"] = data_all["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
data_all["count_words_title"] = data_all["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
data_all["count_stopwords"] = data_all["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in stops]))
data_all["mean_word_len"] = data_all["clean_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [67]:
data_all.head()

Unnamed: 0,unique_hash,text,drug,sentiment,clean_text,predict_sentiment,predict_sentiment2,count_sent,count_word,count_unique_word,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2.0,autoimmun diseas tend come cluster gilenya fee...,0.366667,0.229167,1,38,35,219,15,1,5,28,4.789474
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2.0,complet understand want tri result report lect...,0.167708,0.287625,1,92,77,577,24,5,11,94,5.282609
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2.0,interest target p receptor rather like fingoli...,0.093837,0.182391,1,65,46,388,57,4,11,48,4.984615
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2.0,interest grand merci wonder lemtrada ocrevus s...,0.5,0.575,1,13,13,79,4,1,3,8,5.153846
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1.0,hi everybodi latest mri result brain cervic co...,0.288636,0.228182,1,57,46,339,15,8,23,51,4.964912


In [68]:
data_train = data_all[:5279]
data_test = data_all[5279:]

In [80]:
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
y = data_train['sentiment']
#features = ["predict_sentiment", "predict_sentiment2", "mean_word_len", "count_stopwords",
#            "count_words_title", "count_words_upper", "count_punctuations", "count_letters",
#           'count_unique_word', "count_word", 'count_sent']
features = ["predict_sentiment",  "mean_word_len", "count_stopwords",
            "count_words_title", "count_words_upper", "count_punctuations", "count_letters",
           'count_unique_word', "count_word", 'count_sent']
sub_preds = np.zeros(data_test.shape[0])
X_train, X_val, y_train, y_val = train_test_split(data_train[features], y, random_state = 42, test_size = 0.2)
clf = LGBMClassifier(n_estimators = 2000,
                    learning_rate = .9,
                    num_leaves = 30,
                    subsample = .9,
                    max_depth = 7,
                    reg_alpha = 0.1,
                    reg_lambda = 0.1,
                    min_split_weight = 0.01,
                    min_child_weight = 2,
                    silent = -1,
                    verbose = -1)
prediction = clf.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_val, y_val)],
                    verbose=100, early_stopping_rounds=100)



Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.0429921	valid_1's multi_logloss: 1.20257
Early stopping, best iteration is:
[1]	training's multi_logloss: 0.704574	valid_1's multi_logloss: 0.74903


In [82]:
prediction = clf.predict(data_test[features])

In [83]:
prediction

array([2., 2., 2., ..., 2., 2., 2.])

In [78]:
from sklearn.metrics import f1_score
f1_score(y_val, prediction, average='macro')


TypeError: Expected sequence or array-like, got estimator LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.9, max_depth=7,
               min_child_samples=20, min_child_weight=2, min_split_gain=0.0,
               min_split_weight=0.01, n_estimators=2000, n_jobs=-1,
               num_leaves=30, objective=None, random_state=None, reg_alpha=0.1,
               reg_lambda=0.1, silent=-1, subsample=0.9,
               subsample_for_bin=200000, subsample_freq=0, verbose=-1)

In [84]:
test_data['sentiment'] = prediction
submission = test_data[['unique_hash', 'sentiment']]
submission.to_csv('textbloblight_submission1.csv', index= False)

In [74]:
f1_score(y_val, prediction, average = "macro")

ValueError: Found input variables with inconsistent numbers of samples: [1056, 2924]