In [1]:
import pandas as pd
import numpy as np

In [2]:
df_drug = pd.read_csv('df_drug.csv')
df_drug.head()

Unnamed: 0.1,Unnamed: 0,Id,rating,cleanReview,label
0,0,163740,10.0,ive tried antidepressants years citalopram flu...,positive
1,1,206473,8.0,my son crohns disease done well asacol he comp...,positive
2,2,159672,9.0,quick reduction symptoms,positive
3,3,39293,9.0,contrave combines drugs used alcohol smoking o...,positive
4,4,97768,9.0,i birth control one cycle after reading review...,positive


In [3]:
df_drug.shape

(215063, 5)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer #Vectorize the review column

In [5]:
tfidf = TfidfVectorizer(stop_words='english',ngram_range=(1,2))
features = tfidf.fit_transform(df_drug.cleanReview.values.astype('U'))
labels   = df_drug.label

In [6]:
features.shape

(215063, 1635780)

In [7]:
from sklearn.preprocessing import Normalizer #Train model using LinearSVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
model = LinearSVC('l2')
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size=0.25,random_state=0)
normalize = Normalizer()
x_train = normalize.fit_transform(x_train)
x_test = normalize.transform(x_test)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)

In [8]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.894877803817


In [9]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test,y_pred)
conf_mat

array([[11752,   237,  1449],
       [  908,  5034,  2005],
       [  777,   276, 31328]])

!pip install mlxtend

In [10]:
from mlxtend.plotting import plot_confusion_matrix
fig,ax = plot_confusion_matrix(conf_mat=conf_mat,colorbar=True,show_absolute=True,cmap='viridis')

In [11]:
from  sklearn.metrics import classification_report
print(classification_report(y_test,y_pred,target_names= df_drug['label'].unique()))

             precision    recall  f1-score   support

   positive       0.87      0.87      0.87     13438
   negative       0.91      0.63      0.75      7947
    neutral       0.90      0.97      0.93     32381

avg / total       0.90      0.89      0.89     53766



Model Training using Deep Learning

In [12]:
df_drug['label2']=df_drug['rating'].map(lambda x:int(2) if  x >=8 else int(1) if x<=4 else int(0) )

2 is positive, 1 is negative, 0 is neutral

In [13]:
df_drug.head()

Unnamed: 0.1,Unnamed: 0,Id,rating,cleanReview,label,label2
0,0,163740,10.0,ive tried antidepressants years citalopram flu...,positive,2
1,1,206473,8.0,my son crohns disease done well asacol he comp...,positive,2
2,2,159672,9.0,quick reduction symptoms,positive,2
3,3,39293,9.0,contrave combines drugs used alcohol smoking o...,positive,2
4,4,97768,9.0,i birth control one cycle after reading review...,positive,2


In [14]:
X,y = (df_drug['cleanReview'].values.astype('U'),df_drug['label2'].values)

In [15]:
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [16]:
tk = Tokenizer(lower=True)
tk.fit_on_texts(X)
X_seq = tk.texts_to_sequences(X)
X_pad = pad_sequences(X_seq,maxlen=100,padding='post')
X_pad

array([[   7,   68,  763, ...,    0,    0,    0],
       [  13,  569, 1941, ...,    0,    0,    0],
       [ 859, 1896,  112, ...,    0,    0,    0],
       ..., 
       [2053,  497,   15, ...,    0,    0,    0],
       [   7, 1074,   34, ...,    0,    0,    0],
       [   7,  375,  426, ...,    0,    0,    0]], dtype=int32)

In [17]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_pad,y,test_size=0.25,random_state=1)

In [18]:
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,Dropout

In [19]:
X.shape, y.shape

((215063,), (215063,))

In [20]:
vocabulary_size = len(tk.word_counts.keys())+1
vocabulary_size

77201

In [21]:
embedding_size=32
max_words = 100

In [22]:
from keras.utils import to_categorical
import numpy as np

In [23]:
y_train

array([1, 2, 2, ..., 1, 1, 2])

In [24]:
y_train = np.array(y_train)
y_train = to_categorical(y_train,num_classes=3)

In [25]:
y_test = np.array(y_test)
y_test = to_categorical(y_test,num_classes=3)

In [26]:
model = Sequential()
model.add(Embedding(vocabulary_size,embedding_size,input_length=max_words))
model.add(LSTM(200,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [27]:
from keras.callbacks import EarlyStopping
earlystopper = EarlyStopping(monitor='val_loss',patience=1)

In [28]:
History = model.fit(X_train,y_train,batch_size=64,validation_split=0.25,epochs=10,callbacks=[earlystopper])


Train on 120972 samples, validate on 40325 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
scores = model.evaluate(X_test,y_test,verbose=0)
scores[1]

0.78743071829780897