# Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats

# Import the dataset

In [2]:
# Load the labelled training set from the working directory.
train_csv_path = 'train_F3WbcTw.csv'
drug_data = pd.read_csv(train_csv_path)

In [3]:
# Display the loaded frame (columns: unique_hash, text, drug, sentiment).
drug_data

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1
5,a043780c757966243779bf3c0d11bf6eef721971,I can’t give you advice about Lemtrada because...,cladribine,2
6,be5a13376933a7f9bbf8e801c31691092f63260a,Reply posted for JessZidek. Hi Jess Sorry to r...,humira,0
7,08c3c0c702fc97d290204b37798ac62005da5626,Well as expected my Neurologist wants me to st...,gilenya,2
8,8fd3d7ad80791c9343e5cf8a83bd1adf6577d516,Why do you think that FIngolimod was such a mi...,fingolimod,1
9,793c5af7cc8332df17eb602247d886fbd1c80f89,Thank you so much…I’m learning a lot here at G...,tagrisso,2


# Pre-processing the data

In [4]:
# Peek at the first five rows before any text cleaning is applied.
drug_data.head()

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2
3,bd22104dfa9ec80db4099523e03fae7a52735eb6,"Very interesting, grand merci. Now I wonder wh...",ocrevus,2
4,b227688381f9b25e5b65109dd00f7f895e838249,"Hi everybody, My latest MRI results for Brain ...",gilenya,1


# Sentiment analysis using scikit-learn

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# RegEx to clean up text

In [6]:
pd.set_option('display.width', 1000)

# Sequences that are deleted outright. The `(\\r)`-style groups match a literal
# backslash followed by that letter (escaped text), not real control characters.
# NOTE(review): `(\d\s)` also eats the whitespace after a digit, so neighbouring
# tokens get glued together (e.g. "x3 weeks" becomes "xweeks") - confirm intended.
rx_pat = r"(\\r)|(\\n)|(\\t)|(\\f)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(&#039;)|(\d\s)|(\d)|(\/)"
# Sequences that are collapsed into a single space.
rx_pat_wSpace = r"(\-)|(\\)|(\s{2,})"

# Assign the cleaned Series back instead of calling replace(..., inplace=True) on
# `drug_data['text']`: the in-place form mutates a temporary selection, which emits
# a chained-assignment warning on pandas 2.x and is a silent no-op under
# Copy-on-Write.
drug_data['text'] = drug_data['text'].replace(to_replace=rx_pat, value=r'', regex=True)
drug_data['text'] = drug_data['text'].replace(to_replace=rx_pat_wSpace, value=r' ', regex=True)
drug_data.text.head(5)



0    Autoimmune diseases tend to come in clusters A...
1    I can completely understand why you’d want to ...
2    Interesting that it only targets SP receptors ...
3    Very interesting grand merci Now I wonder wher...
4    Hi everybody My latest MRI results for Brain a...
Name: text, dtype: object

# Lowercase the text and inspect the cleaned set

In [7]:
# Normalise case so that "Gilenya" and "gilenya" map to the same token.
lowered_text = drug_data['text'].str.lower()
drug_data['text'] = lowered_text

drug_data['text'].head(5)

0    autoimmune diseases tend to come in clusters a...
1    i can completely understand why you’d want to ...
2    interesting that it only targets sp receptors ...
3    very interesting grand merci now i wonder wher...
4    hi everybody my latest mri results for brain a...
Name: text, dtype: object

# Feature Extraction

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records presence/absence of each token rather than its count.
# fit() returns the vectorizer itself, so learning the vocabulary can be chained.
cv = CountVectorizer(binary=True).fit(drug_data['text'])

# Feature matrix: one row per document, one column per vocabulary token.
X = cv.transform(drug_data['text'])

# Target vector: the sentiment label of each document.
y = drug_data['sentiment']

In [9]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
X

<5279x40982 sparse matrix of type '<class 'numpy.int64'>'
	with 771403 stored elements in Compressed Sparse Row format>

In [10]:
# First few target labels, to confirm they line up with the frame shown earlier.
y.head()

0    2
1    2
2    2
3    2
4    1
Name: sentiment, dtype: int64

In [11]:
# Number of rows (documents) in the feature matrix.
X.shape[0]

5279

In [12]:
# Number of labels; must equal the number of rows in X.
y.shape[0]

5279

# Build the model

In [154]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fix the seed: a bare np.random.seed() re-seeds from OS entropy, so every run
# produced a different split and the accuracies below could not be reproduced.
np.random.seed(42)

# 80/20 train/validation split, pinned with random_state so the sweep is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Sweep the inverse regularisation strength C and report validation accuracy
# for each candidate (smaller C means stronger regularisation).
for c in [0.001, 0.01, 0.05, 0.25, 0.5, 1, 1.5, 2, 2.5, 3, 5, 10]:

    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print('Accuracy for C=%s: %s'
          % (c, accuracy_score(y_val, lr.predict(X_val))))


Accuracy for C=0.001: 0.7329545454545454
Accuracy for C=0.01: 0.7348484848484849
Accuracy for C=0.05: 0.7244318181818182
Accuracy for C=0.25: 0.7111742424242424
Accuracy for C=0.5: 0.7007575757575758
Accuracy for C=1: 0.7017045454545454
Accuracy for C=1.5: 0.6979166666666666
Accuracy for C=2: 0.6931818181818182
Accuracy for C=2.5: 0.6922348484848485
Accuracy for C=3: 0.6893939393939394
Accuracy for C=5: 0.6875
Accuracy for C=10: 0.6856060606060606


# Pick a model with a C value

In [155]:
# Pin the split with random_state so the reported accuracy (and the model used for
# the submission further down) is reproducible between runs.
# NOTE(review): this split is drawn from the same X, y that the C sweep used, so it
# is not an independent hold-out set and the score below is likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# NOTE(review): C=5 is not the best value in the recorded sweep (C=0.01 scored
# 0.7348 vs 0.6875 for C=5) - confirm this choice is intentional.
final_model = LogisticRegression(C=5)
final_model.fit(X_train, y_train)
print('Final Accuracy: %s'
      % accuracy_score(y_test, final_model.predict(X_test)))

Final Accuracy: 0.6799242424242424


In [156]:
# Predicted labels for the held-out split X_test.
# NOTE(review): despite the name, these are not predictions on the training rows.
sentiment_train = final_model.predict(X_test)
sentiment_train

array([2, 2, 2, ..., 2, 2, 2])

In [157]:
# Wrap the prediction array in a single-column DataFrame (column label 0) for display.
sentiment_train_df = pd.DataFrame(sentiment_train)
sentiment_train_df

Unnamed: 0,0
0,2
1,2
2,2
3,2
4,2
5,2
6,2
7,2
8,2
9,2


# Inspect the weights of each token

In [158]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map every vocabulary token to its weight in the first coefficient row.
# coef_[0] belongs to the first entry of final_model.classes_ (presumably label 0
# here - verify), so "positive"/"negative" below mean evidence for/against it.
# Cast to plain str/float so the printed tuples stay readable on any numpy version.
feature_to_coef = {
    str(word): float(coef) for word, coef in zip(
        feature_names, final_model.coef_[0]
    )
}

# Ten tokens with the largest weights.
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:10]:
    print (best_positive)
print("\n")

# Ten tokens with the smallest (most negative) weights.
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:10]:
    print (best_negative)

('glad', 1.8708455395332027)
('switching', 1.5090472965643853)
('breakthrough', 1.4624217341663632)
('came', 1.460503731502011)
('reduced', 1.453055822845417)
('approved', 1.4269604910716491)
('grow', 1.4181149779455096)
('deteriorated', 1.396989603581911)
('problems', 1.3760056229771058)
('year', 1.3755015276951474)


('fairly', -1.7558397850925254)
('unfortunately', -1.577648702944383)
('immunotherapy', -1.5754237995116436)
('last', -1.491691993743095)
('past', -1.4905482232594198)
('ct', -1.4015824081003936)
('alimta', -1.3697946013031232)
('wishing', -1.3642072656442608)
('antibody', -1.309550340139164)
('stopped', -1.2747017994663281)


# Test the Model with Test data

In [159]:
# Load the unlabelled test set (same columns as training, minus `sentiment`).
test_csv_path = 'test_tOlRoBf.csv'
drug_test_data = pd.read_csv(test_csv_path)
drug_test_data

Unnamed: 0,unique_hash,text,drug
0,9e9a8166b84114aca147bf409f6f956635034c08,"256 (previously stable on natalizumab), with 5...",fingolimod
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,On fingolimod and have been since December 201...,fingolimod
2,50b6d851bcff4f35afe354937949e9948975adf7,Apparently it's shingles! :-/ I do have a few ...,humira
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,If the Docetaxel doing once a week x3 weeks th...,tagrisso
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,"CC, Stelara worked in a matter of days for me....",stelara
5,b1950d27d94ceff4e9bf8c7d1fd4b11b35ede4d7,"Janssen Biotech, Inc. has just received FDA ap...",stelara
6,abafc5b6c5aac6f777cf265e5c7dd80fb793e6bc,"I just had the, ” I thought things would be be...",ocrevus
7,e5550693e72a8335d723ca5fc64da91e1256fb0b,Dec.26 2018 (Basha Fowler) I was diagnosed in ...,tagrisso
8,ee8c500f6402331ff12b0b29d943b6d1699a0b8d,"Hi, I started Gilenya about 7 weeks ago and ha...",gilenya
9,d261600ba4fc022fac12748845deed56822ff195,My uncle is still going through treatment. 2 k...,keytruda


# RegEx to clean up text for Test data

In [160]:
# Apply the same two cleaning patterns used on the training text (rx_pat deletes,
# rx_pat_wSpace collapses to a space). The result is assigned back rather than
# using replace(..., inplace=True) on `drug_test_data['text']`, which mutates a
# temporary selection: it warns on pandas 2.x and is a silent no-op under
# Copy-on-Write.
drug_test_data['text'] = drug_test_data['text'].replace(to_replace=rx_pat, value=r'', regex=True)
drug_test_data['text'] = drug_test_data['text'].replace(to_replace=rx_pat_wSpace, value=r' ', regex=True)
drug_test_data.text.head(5)

0    previously stable on natalizumab with % switch...
1    On fingolimod and have been since December the...
2    Apparently its shingles   I do have a few red ...
3    If the Docetaxel doing once a week xweeks then...
4    CC Stelara worked in a matter of days for me I...
Name: text, dtype: object

# Lowercase the text and inspect the cleaned set for Test data

In [161]:
# Lowercase the test text exactly as was done for the training text, so tokens
# match the vocabulary learned by the vectorizer.
lowered_test_text = drug_test_data['text'].str.lower()
drug_test_data['text'] = lowered_test_text

drug_test_data['text'].head(5)

0    previously stable on natalizumab with % switch...
1    on fingolimod and have been since december the...
2    apparently its shingles   i do have a few red ...
3    if the docetaxel doing once a week xweeks then...
4    cc stelara worked in a matter of days for me i...
Name: text, dtype: object

# Feature Extraction for Test data

In [162]:
# Build the test feature matrix with the vectorizer already fitted on the training
# text (transform only, no re-fit), so the columns line up with the model's features.
Xtest = cv.transform(drug_test_data['text'])

In [163]:
# Show the sparse matrix summary for the contest test set.
Xtest

<2924x40982 sparse matrix of type '<class 'numpy.int64'>'
	with 455596 stored elements in Compressed Sparse Row format>

In [164]:
# Number of rows (documents) in the contest test matrix.
Xtest.shape[0]

2924

In [165]:
# Print the stored (row, column) entries of the contest test matrix.
print(Xtest)

  (0, 13077)	1
  (0, 24420)	1
  (0, 25955)	1
  (0, 28954)	1
  (0, 34655)	1
  (0, 35810)	1
  (0, 37141)	1
  (0, 40110)	1
  (1, 1542)	1
  (1, 1882)	1
  (1, 3361)	1
  (1, 3603)	1
  (1, 4820)	1
  (1, 5028)	1
  (1, 8752)	1
  (1, 9140)	1
  (1, 10381)	1
  (1, 10458)	1
  (1, 12817)	1
  (1, 13077)	1
  (1, 15276)	1
  (1, 15577)	1
  (1, 15584)	1
  (1, 18906)	1
  (1, 18992)	1
  :	:
  (2923, 30388)	1
  (2923, 31594)	1
  (2923, 32073)	1
  (2923, 32185)	1
  (2923, 32841)	1
  (2923, 34148)	1
  (2923, 35392)	1
  (2923, 35909)	1
  (2923, 36004)	1
  (2923, 36249)	1
  (2923, 36274)	1
  (2923, 36350)	1
  (2923, 36584)	1
  (2923, 36597)	1
  (2923, 36613)	1
  (2923, 36989)	1
  (2923, 37141)	1
  (2923, 37243)	1
  (2923, 38963)	1
  (2923, 39520)	1
  (2923, 39554)	1
  (2923, 39689)	1
  (2923, 39868)	1
  (2923, 40031)	1
  (2923, 40725)	1


In [166]:
# For comparison: the held-out split of the training data (not the contest test set).
X_test

<1056x40982 sparse matrix of type '<class 'numpy.int64'>'
	with 160380 stored elements in Compressed Sparse Row format>

In [167]:
# Print the stored (row, column) entries of the held-out split.
print(X_test)

  (0, 40725)	1
  (0, 40708)	1
  (0, 40251)	1
  (0, 40176)	1
  (0, 40110)	1
  (0, 40098)	1
  (0, 39903)	1
  (0, 39109)	1
  (0, 38963)	1
  (0, 38859)	1
  (0, 38721)	1
  (0, 38368)	1
  (0, 37175)	1
  (0, 37141)	1
  (0, 36989)	1
  (0, 36770)	1
  (0, 36670)	1
  (0, 36622)	1
  (0, 36613)	1
  (0, 36597)	1
  (0, 36587)	1
  (0, 36584)	1
  (0, 36121)	1
  (0, 36004)	1
  (0, 35609)	1
  :	:
  (1055, 19359)	1
  (1055, 18992)	1
  (1055, 18906)	1
  (1055, 17844)	1
  (1055, 17673)	1
  (1055, 17392)	1
  (1055, 14398)	1
  (1055, 14330)	1
  (1055, 13511)	1
  (1055, 13201)	1
  (1055, 10039)	1
  (1055, 9620)	1
  (1055, 9431)	1
  (1055, 8464)	1
  (1055, 6894)	1
  (1055, 6789)	1
  (1055, 6424)	1
  (1055, 4732)	1
  (1055, 3776)	1
  (1055, 3721)	1
  (1055, 2882)	1
  (1055, 2320)	1
  (1055, 2134)	1
  (1055, 1894)	1
  (1055, 1542)	1


In [168]:
# Predict a sentiment label for every row of the contest test matrix.
sentiment_prediction_test_data = final_model.predict(Xtest)

In [169]:
# Wrap the prediction array in a single-column DataFrame (column label 0).
sentiment_prediction_test_data_df = pd.DataFrame(sentiment_prediction_test_data)

In [170]:
# NOTE(review): the key is the string '0' but the predictions are integers, so this
# appears to match nothing - the frame displayed afterwards still holds numeric
# codes. Confirm whether text labels or numeric codes are wanted in the export.
sentiment_prediction_test_data_df = sentiment_prediction_test_data_df.replace('0','Positive')

In [171]:
# NOTE(review): the key is the string '1' but the predictions are integers, so this
# appears to match nothing - the frame displayed afterwards still holds numeric
# codes. Confirm whether text labels or numeric codes are wanted in the export.
sentiment_prediction_test_data_df = sentiment_prediction_test_data_df.replace('1','Negative')

In [172]:
# NOTE(review): the key is the string '2' but the predictions are integers, so this
# appears to match nothing - the frame displayed afterwards still holds numeric
# codes. Confirm whether text labels or numeric codes are wanted in the export.
sentiment_prediction_test_data_df = sentiment_prediction_test_data_df.replace('2','Neutral')

In [173]:
# Give the single prediction column (label 0) the name used in the exported file.
sentiment = sentiment_prediction_test_data_df.rename(columns = {0:'sentiment'})

In [174]:
# Display the renamed prediction frame.
sentiment

Unnamed: 0,sentiment
0,2
1,2
2,2
3,2
4,2
5,2
6,1
7,2
8,2
9,1


In [175]:
# NOTE(review): repeats the previous cell's display; candidate for removal.
sentiment

Unnamed: 0,sentiment
0,2
1,2
2,2
3,2
4,2
5,2
6,1
7,2
8,2
9,1


In [176]:
# Pull out the row identifiers of the test set and hold them as a one-column frame
# so they can be joined column-wise with the predictions.
unique_hash = drug_test_data['unique_hash']
unique_hash_df = unique_hash.to_frame()

In [177]:
# Display the identifier frame.
unique_hash_df

Unnamed: 0,unique_hash
0,9e9a8166b84114aca147bf409f6f956635034c08
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a
2,50b6d851bcff4f35afe354937949e9948975adf7
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae
4,8b37d169dee5bdae27060949242fb54feb6a7f7f
5,b1950d27d94ceff4e9bf8c7d1fd4b11b35ede4d7
6,abafc5b6c5aac6f777cf265e5c7dd80fb793e6bc
7,e5550693e72a8335d723ca5fc64da91e1256fb0b
8,ee8c500f6402331ff12b0b29d943b6d1699a0b8d
9,d261600ba4fc022fac12748845deed56822ff195


# Concatenate the DataFrames

In [178]:
# Join identifiers and predictions side by side (rows are aligned on the index).
# Only the 'sentiment' column is taken from the right-hand frame: `sentiment` is
# overwritten here, so without the selection re-running this cell would append a
# second copy of unique_hash/sentiment each time.
sentiment = pd.concat([unique_hash_df, sentiment[['sentiment']]], axis=1)

In [179]:
# Display the combined frame (unique_hash, sentiment) that is exported below.
sentiment

Unnamed: 0,unique_hash,sentiment
0,9e9a8166b84114aca147bf409f6f956635034c08,2
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,2
2,50b6d851bcff4f35afe354937949e9948975adf7,2
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,2
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,2
5,b1950d27d94ceff4e9bf8c7d1fd4b11b35ede4d7,2
6,abafc5b6c5aac6f777cf265e5c7dd80fb793e6bc,1
7,e5550693e72a8335d723ca5fc64da91e1256fb0b,2
8,ee8c500f6402331ff12b0b29d943b6d1699a0b8d,2
9,d261600ba4fc022fac12748845deed56822ff195,1


# Export the output to csv File

In [180]:
# Write the submission file without the positional index column.
submission_path = "innoplexus_sentiment_contest.csv"
sentiment.to_csv(submission_path, index=False)