#  Predicting drug rating using LSTM Network in Keras

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model                  # for one-hot encoding of the labels
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D, Embedding
from keras.models import Sequential
from keras.layers import SimpleRNN, GRU, LSTM # Import layers from Keras

Using TensorFlow backend.
  % self._get_c_name())


### Reading the data

In [2]:
raw_train = pd.read_csv("traindata.csv", index_col =0)

In [3]:
raw_train["Name"].value_counts()

lexapro                       63
prozac                        46
zoloft                        45
retin-a                       45
synthroid                     38
paxil                         38
propecia                      38
effexor                       33
cymbalta                      33
doxycycline                   33
chantix                       32
wellbutrin                    32
neurontin                     31
ambien                        30
lamictal                      29
lipitor                       29
effexor-xr                    29
accutane                      28
imitrex                       27
renova                        27
nexium                        27
adipex-p                      25
topamax                       25
flonase                       25
differin                      25
vyvanse                       24
seroquel                      23
retin-a-micro                 23
zyrtec                        23
xanax                         23
          

In [4]:
raw_test = pd.read_csv("testdata.csv", index_col =0)

In [5]:
raw_test.Name.value_counts()

paxil                   20
effexor-xr              17
accutane                16
synthroid               15
effexor                 13
differin                13
lipitor                 13
chantix                 12
wellbutrin-xl           12
citalopram              12
tazorac                 11
mobic                   11
lexapro                 11
wellbutrin              10
nexium                  10
lamictal                10
celexa                  10
topamax                 10
retin-a                 10
trazodone               10
zyrtec                   9
yasmin                   9
fosamax                  9
seroquel                 9
prinivil                 9
adderall                 9
provigil                 9
lyrica                   9
amoxil                   8
prednisone               8
                        ..
naltrexone               1
trental                  1
glyquin-xm               1
zegerid                  1
depakene                 1
capoten                  1
h

In [6]:
raw_train.head()

Unnamed: 0,Name,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview,rating
0,enalapril,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ...",low
1,ortho-tri-cyclen,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest...",low
2,ponstel,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...,high
3,prilosec,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...,low
4,lyrica,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above,low


In [7]:
raw_test.head()

Unnamed: 0,Name,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview,Rating
0,biaxin,Considerably Effective,Mild Side Effects,sinus infection,The antibiotic may have destroyed bacteria cau...,"Some back pain, some nauseau.",Took the antibiotics for 14 days. Sinus infect...,high
1,lamictal,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...,high
2,depakene,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...,low
3,sarafem,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...,high
4,accutane,Highly Effective,Mild Side Effects,nodular acne,Within one week of treatment superficial acne ...,Side effects included moderate to severe dry s...,Drug was taken in gelatin tablet at 0.5 mg per...,high


In [8]:
raw_train.shape

(3107, 8)

In [9]:
print(raw_train['rating'].value_counts())

high      1780
medium     666
low        661
Name: rating, dtype: int64


In [10]:
print(raw_test['Rating'].value_counts())

high      549
medium    246
low       241
Name: Rating, dtype: int64


In [11]:
raw_train.isnull().sum()

Name                 0
effectiveness        0
sideEffects          0
condition            1
benefitsReview       0
sideEffectsReview    2
commentsReview       8
rating               0
dtype: int64

In [12]:
raw_train = raw_train.dropna()

In [13]:
raw_train = raw_train.reset_index(drop =True)


In [14]:
raw_train.shape

(3096, 8)

In [15]:
raw_test.isnull().sum()

Name                 0
effectiveness        0
sideEffects          0
condition            0
benefitsReview       0
sideEffectsReview    0
commentsReview       0
Rating               0
dtype: int64

In [16]:
raw_train["final"] = raw_train["effectiveness"]+" "+raw_train["sideEffects"]+" "+raw_train["condition"]+" "+ raw_train["benefitsReview"]+" "+raw_train["sideEffectsReview"]+" "+raw_train["commentsReview"]+" "+ raw_train["Name"]
raw_test["final"] = raw_test["effectiveness"]+" "+raw_test["sideEffects"]+" "+raw_test["condition"]+" "+ raw_test["benefitsReview"]+" "+raw_test["sideEffectsReview"]+" "+raw_test["commentsReview"]+" "+ raw_test["Name"]

In [17]:
raw_train.head()

Unnamed: 0,Name,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview,rating,final
0,enalapril,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ...",low,Highly Effective Mild Side Effects management ...
1,ortho-tri-cyclen,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest...",low,Highly Effective Severe Side Effects birth pre...
2,ponstel,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...,high,Highly Effective No Side Effects menstrual cra...
3,prilosec,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...,low,Marginally Effective Mild Side Effects acid re...
4,lyrica,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above,low,Marginally Effective Severe Side Effects fibro...


In [18]:
raw_train.dtypes

Name                 object
effectiveness        object
sideEffects          object
condition            object
benefitsReview       object
sideEffectsReview    object
commentsReview       object
rating               object
final                object
dtype: object

In [19]:
raw_train["final"][0]

'Highly Effective Mild Side Effects management of congestive heart failure slowed the progression of left ventricular dysfunction into overt heart failure \r\r\nalone or with other agents in the managment of hypertension \r\r\nmangagement of congestive heart failur cough, hypotension , proteinuria, impotence , renal failure , angina pectoris , tachycardia , eosinophilic pneumonitis, tastes disturbances , anusease anorecia , weakness fatigue insominca weakness monitor blood pressure , weight and asses for resolution of fluid enalapril'

In [20]:
raw_test["final"][0]

'Considerably Effective Mild Side Effects sinus infection The antibiotic may have destroyed bacteria causing my sinus infection.  But it may also have been caused by a virus, so its hard to say. Some back pain, some nauseau. Took the antibiotics for 14 days. Sinus infection was gone after the 6th day. biaxin'

In [21]:
train_final = raw_train[["final","rating"]]

In [22]:
train_final.head()

Unnamed: 0,final,rating
0,Highly Effective Mild Side Effects management ...,low
1,Highly Effective Severe Side Effects birth pre...,low
2,Highly Effective No Side Effects menstrual cra...,high
3,Marginally Effective Mild Side Effects acid re...,low
4,Marginally Effective Severe Side Effects fibro...,low


In [23]:
test_final = raw_test[["final","Rating"]]

In [24]:
test_final.head()

Unnamed: 0,final,Rating
0,Considerably Effective Mild Side Effects sinus...,high
1,Highly Effective Mild Side Effects bipolar dis...,high
2,Moderately Effective Severe Side Effects bipol...,low
3,Highly Effective No Side Effects bi-polar / an...,high
4,Highly Effective Mild Side Effects nodular acn...,high


In [25]:
from sklearn.preprocessing import LabelEncoder

X1 = train_final.final
Y1 = train_final.rating
X2 = test_final.final
Y2 = test_final.Rating

le = LabelEncoder()
Y1 = le.fit_transform(Y1)
Y1 = Y1.reshape(-1,1)

Y2 = le.fit_transform(Y2)
Y2 = Y2.reshape(-1,1)

In [26]:
keys = le.classes_
values = le.transform(le.classes_)
dictionary = dict(zip(keys, values))
print(dictionary)

{'high': 0, 'medium': 2, 'low': 1}


## Splitting into train and val

In [27]:
from sklearn.model_selection import train_test_split

X = X1
y = Y1

X_test = X2
y_test = Y2

In [28]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state= 123,stratify=y)

In [29]:
print(X_train.shape)
print(y_train.shape)

(2631,)
(2631, 1)


In [30]:
X_train = X_train.reset_index(drop =True)
X_val = X_val.reset_index(drop =True)
X_test = X_test.reset_index(drop =True)

a = pd.DataFrame(y_train)
b = pd.DataFrame(X_train)
c = pd.DataFrame(y_val)
d = pd.DataFrame(X_val)
e = pd.DataFrame(y_test)
f = pd.DataFrame(X_test)


train_data = pd.concat([b,a], axis=1, join_axes=[b.index])

val_data = pd.concat([d,c], axis=1, join_axes=[d.index])

test_data = pd.concat([f,e], axis=1, join_axes=[f.index])

train_data.columns = ['review','rating']
val_data.columns = ['review','rating']
test_data.columns = ['review','rating']

In [31]:
print(train_data.head())
print(val_data.head())
print(test_data.head())

                                              review  rating
0  Considerably Effective Mild Side Effects add E...       0
1  Highly Effective Extremely Severe Side Effects...       0
2  Marginally Effective Severe Side Effects fibro...       1
3  Moderately Effective Moderate Side Effects dep...       1
4  Highly Effective No Side Effects acne Acne hea...       0
                                              review  rating
0  Moderately Effective Extremely Severe Side Eff...       2
1  Marginally Effective Moderate Side Effects sci...       2
2  Moderately Effective Extremely Severe Side Eff...       1
3  Considerably Effective Extremely Severe Side E...       2
4  Considerably Effective Extremely Severe Side E...       1
                                              review  rating
0  Considerably Effective Mild Side Effects sinus...       0
1  Highly Effective Mild Side Effects bipolar dis...       0
2  Moderately Effective Severe Side Effects bipol...       1
3  Highly Effective No S

# Preprocessing the data

In [32]:
import nltk
nltk.download('stopwords')
import re
from nltk.corpus import stopwords

def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
    #
    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 3. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 4. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))  

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/B49gpu5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
train_num_reviews = train_data["review"].size
val_num_reviews = val_data["review"].size
test_num_reviews = test_data["review"].size

clean_train = []
clean_val = []
clean_test = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
# Call our function for each one, and add the result to the list of
# clean reviews
for i in range( 0, train_num_reviews ):
    clean_train.append( review_to_words( train_data["review"][i] ) )


for i in range( 0, val_num_reviews ):
    clean_val.append( review_to_words( val_data["review"][i] ) )
    

for i in range( 0, test_num_reviews ):
    clean_test.append( review_to_words( test_data["review"][i] ) )


In [34]:
p = pd.DataFrame(clean_train)
train_data.review = p
q = pd.DataFrame(clean_val)
val_data.review = q
r = pd.DataFrame(clean_test)
test_data.review = r

In [35]:
val_data.head()

Unnamed: 0,review,rating
0,moderately effective extremely severe side eff...,2
1,marginally effective moderate side effects sci...,2
2,moderately effective extremely severe side eff...,1
3,considerably effective extremely severe side e...,2
4,considerably effective extremely severe side e...,1


In [36]:
train_data['rating'] = train_data.rating.astype('category')

In [37]:
train_data.dtypes

review      object
rating    category
dtype: object

### Converting unstructured text to structured numeric form
This includes:
1. Tokenizing
2. Converting sequence of words to sequence of word indeces
3. Converting varing length sequences to fixed length sequences through padding

In [38]:
# Numebr of unique words based on their frequency
max_words = 8000 
max_seq_len = 200         # Maximum length of words to be present in each row
embedding_size = 100

In [39]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_words) #Tokenizer is used to tokenize text
tokenizer.fit_on_texts(train_data.review)

train_seq = tokenizer.texts_to_sequences(train_data.review)
val_seq = tokenizer.texts_to_sequences(val_data.review)
test_seq = tokenizer.texts_to_sequences(test_data.review)

word_index = tokenizer.word_index               # dictionary containing words and their index
# print(tokenizer.word_index)                   # print to check
print('Found %s unique tokens.' % len(word_index)) # total words in the corpus
train_pad = pad_sequences(train_seq, maxlen=max_seq_len) # get only the top frequent words on train
val_pad = pad_sequences(val_seq, maxlen=max_seq_len) # get only the top frequent words on validation
test_pad = pad_sequences(test_seq, maxlen=max_seq_len)   # get only the top frequent words on test

print(train_pad.shape)
print(val_pad.shape)
print(test_pad.shape)

Found 12385 unique tokens.
(2631, 200)
(465, 200)
(1036, 200)


In [40]:
print(le.classes_)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_val, return_counts=True))
print(np.unique(y_test, return_counts=True))

['high' 'low' 'medium']
(array([0, 1, 2]), array([1508,  558,  565]))
(array([0, 1, 2]), array([267,  98, 100]))
(array([0, 1, 2]), array([549, 241, 246]))


In [41]:
train_labels = to_categorical(np.asarray(y_train))
val_labels = to_categorical(np.asarray(y_val))
test_labels = to_categorical(np.asarray(y_test))
print('Shape of data tensor:', train_data.shape)
print('Shape of train label tensor:', train_labels.shape)
print('Shape of validation label tensor:', val_labels.shape)
print('Shape of test label tensor:', test_labels.shape)

('Shape of data tensor:', (2631, 2))
('Shape of train label tensor:', (2631, 3))
('Shape of validation label tensor:', (465, 3))
('Shape of test label tensor:', (1036, 3))


## Building lstm model

In [40]:
model = Sequential()
model.add(Embedding(max_words,
                    embedding_size,
                    input_length=max_seq_len))
model.add(LSTM(50, return_sequences=True)) # Add an LSTM layer
model.add(LSTM(100, return_sequences=False))
#model.add(Dense(25, activation='relu'))
model.add(Dense(3, activation='softmax')) # Add an ouput layer. Since classification, 3 nodes for 3 classes.

In [42]:
from keras.optimizers import sgd
sgd = sgd(lr=0.01)

In [49]:
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          800000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200, 50)           30200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 890,903
Trainable params: 890,903
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.fit(train_pad, train_labels,
          batch_size=64,
          epochs=15,
          validation_data=(val_pad, val_labels))

Train on 2631 samples, validate on 465 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f74310>

In [70]:
model.save('lstm_text.h5')

In [51]:
scores = model.evaluate(test_pad, test_labels, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 52.99%


## Building model with conv layer

In [66]:
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(max_words,
                    embedding_size,
                    input_length=max_seq_len))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(3, activation='softmax'))
    model_conv.compile(loss='categorical_crossentropy', optimizer='RMSprop',    metrics=['accuracy'])
    return model_conv
model_conv = create_conv_model()

In [67]:
model_conv.fit(train_pad, train_labels,
          batch_size=64,
          epochs=3,
          validation_data=(val_pad, val_labels))

Train on 2631 samples, validate on 465 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x30e58b10>

In [68]:
scores1 = model_conv.evaluate(test_pad, test_labels, verbose=0)
print("Accuracy: %.2f%%" % (scores1[1]*100))

Accuracy: 72.39%


In [69]:
model_conv.save('text_cnn.h5')

In [42]:
# Numebr of unique words based on their frequency
max_words = 8000 
max_seq_len = 200         # Maximum length of words to be present in each row
embedding_size = 100

## Building a model with pre trained word embeddings

In [43]:
import os 
os.chdir('/home/B49gpu5/B492393/Cute/drug')

In [44]:
GLOVE_DIR = './glove6b100dtxt/'

print('Indexing word vectors.')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors.
Found 400000 word vectors.


In [45]:
# prepare embedding matrix
num_words = min(max_words, len(word_index))
embedding_matrix = np.zeros((num_words, embedding_size))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length=max_seq_len,
                            trainable=False)

In [46]:
print('Training model.')

# train a 1D convnet with global maxpooling using Glove vectors initialized
sequence_input = Input(shape=(max_seq_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPool1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(3, activation='softmax')(x)

from keras.optimizers import Adam
adam = Adam(lr=0.001)
model1 = Model(sequence_input, preds)
model1.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])



Training model.


In [47]:
model1_pre = model1.fit(train_pad, train_labels,
              batch_size=128,
              epochs=10,
              validation_data=(val_pad, val_labels)).history

Train on 2631 samples, validate on 465 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
scores = model1_pre.evaluate(test_p, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))