In [4]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from warnings import filterwarnings
filterwarnings('ignore')
import pandas as pd # data preprocessing
import itertools # confusion matrix
import string
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

# Importing libraries

In [120]:
df=pd.read_csv('drugsComTrain_raw.tsv', sep='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [121]:
df.condition.value_counts()

Birth Control                                              28788
Depression                                                  9069
Pain                                                        6145
Anxiety                                                     5904
Acne                                                        5588
                                                           ...  
Dissociative Identity Disorde                                  1
Hydrocephalus                                                  1
Hyperlipoproteinemia Type III, Elevated beta-VLDL   IDL        1
Q Feve                                                         1
Neutropenia                                                    1
Name: condition, Length: 884, dtype: int64

In [122]:
df_train = df[(df['condition']=='Birth Control') | (df['condition']=='Depression') | (df['condition']=='High Blood Pressure')|(df['condition']=='Diabetes, Type 2')]

In [123]:
df.shape

(161297, 7)

In [124]:
df_train.shape

(42732, 7)

In [125]:
X = df_train.drop(['Unnamed: 0','drugName','rating','date','usefulCount'],axis=1)

# EDA

In [126]:
X.condition.value_counts()

Birth Control          28788
Depression              9069
Diabetes, Type 2        2554
High Blood Pressure     2321
Name: condition, dtype: int64

In [127]:
X.head()

Unnamed: 0,condition,review
2,Birth Control,"""I used to take another oral contraceptive, wh..."
3,Birth Control,"""This is my first time using any form of birth..."
9,Birth Control,"""I had been on the pill for many years. When m..."
11,Depression,"""I have taken anti-depressants for years, with..."
14,Birth Control,"""Started Nexplanon 2 months ago because I have..."


In [128]:
X=X.reset_index()
X['index']=X.index
X.drop('index',axis=1,inplace=True)
X

Unnamed: 0,condition,review
0,Birth Control,"""I used to take another oral contraceptive, wh..."
1,Birth Control,"""This is my first time using any form of birth..."
2,Birth Control,"""I had been on the pill for many years. When m..."
3,Depression,"""I have taken anti-depressants for years, with..."
4,Birth Control,"""Started Nexplanon 2 months ago because I have..."
...,...,...
42727,Birth Control,"""I have had the Nexplanon since Dec. 27, 2016 ..."
42728,"Diabetes, Type 2","""I just got diagnosed with type 2. My doctor p..."
42729,Depression,"""This is the third med I&#039;ve tried for anx..."
42730,High Blood Pressure,"""I have only been on Tekturna for 9 days. The ..."


In [129]:
X['len_sent']=X.review.apply(lambda x:len(x))
X['len_sent'].max()

5723

In [130]:
# segregating dataframe for analyzing individual condition
X_birth=X[(X['condition']=='Birth Control')]
X_dep=X[(X['condition']=='Depression')]
X_bp=X[(X['condition']=='High Blood Pressure')]
X_diab=X[(X['condition']=='Diabetes, Type 2')]

## Converting the review documents to vectors

In [131]:
import re
import spacy
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')
corpus=[]
for i in range(len(X['review'])):
    review = re.sub('[^a-zA-Z]', ' ',X['review'][i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not nlp.vocab[word].is_stop]
    review = ' '.join(review)
    corpus.append(review)
    
    

## Onehot Representation

In [132]:
### Vocabulary size
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 


In [133]:
len(onehot_repr)

42732

## Embedding Representation

In [145]:
#sent_length=5750
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[2931 4129 2685 ...  278  698  347]
 [1160 4742  143 ... 2110 4105  766]
 [1456 1146 2018 ... 3818  111 2907]
 ...
 [   0    0 2463 ... 2400 2831 4782]
 [ 160 4189 4689 ... 4185 4189 2868]
 [3472 4556 2110 ...  757 4301 1605]]


In [146]:
embedded_docs.shape,y.shape

((42732, 20), (42732,))

In [147]:
from sklearn import preprocessing
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'species'.
y_encoded= label_encoder.fit_transform(y)
y_encoded

array([0, 0, 0, ..., 1, 3, 0])

## Creating features and Target Variable

In [148]:
X_feat=embedded_docs
y=X['condition']
X_train, X_test, y_train, y_test = train_test_split(embedded_docs, y_encoded,stratify=y_encoded,test_size=0.2, random_state=0)

In [149]:
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 20, 40)            200000    
                                                                 
 lstm_3 (LSTM)               (None, 100)               56400     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 256,501
Trainable params: 256,501
Non-trainable params: 0
_________________________________________________________________
None


## Model Training

In [150]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7c62e394b0>

## Adding Dropout

In [151]:
from tensorflow.keras.layers import Dropout
## Creating model
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

## Performance Metrics And Accuracy

In [152]:
#y_pred=model.predict_classes(X_test)
predict_x=model.predict(X_test) 
y_pred=np.argmax(predict_x,axis=1)



In [153]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6736866736866737

In [154]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [159]:
cm

array([[5758,    0,    0,    0],
       [1814,    0,    0,    0],
       [ 511,    0,    0,    0],
       [ 464,    0,    0,    0]])