In [1]:
# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install nltk
# !pip install gensim
# !pip install python-Levenshtein

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import gensim

import nltk
nltk.download("all")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import string
from datetime import datetime
from ast import literal_eval
import re
import json

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/dipu/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /home/dipu/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/dipu/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/dipu/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /home/dipu/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/dipu/nltk_

[nltk_data]    |   Package omw-1.4 is already up-to-date!
[nltk_data]    | Downloading package opinion_lexicon to
[nltk_data]    |     /home/dipu/nltk_data...
[nltk_data]    |   Package opinion_lexicon is already up-to-date!
[nltk_data]    | Downloading package panlex_swadesh to
[nltk_data]    |     /home/dipu/nltk_data...
[nltk_data]    |   Package panlex_swadesh is already up-to-date!
[nltk_data]    | Downloading package paradigms to
[nltk_data]    |     /home/dipu/nltk_data...
[nltk_data]    |   Package paradigms is already up-to-date!
[nltk_data]    | Downloading package pe08 to /home/dipu/nltk_data...
[nltk_data]    |   Package pe08 is already up-to-date!
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     /home/dipu/nltk_data...
[nltk_data]    |   Package perluniprops is already up-to-date!
[nltk_data]    | Downloading package pil to /home/dipu/nltk_data...
[nltk_data]    |   Package pil is already up-to-date!
[nltk_data]    | Downloading package pl196x to /

In [2]:
train_path = "./data/drugsComTrain_raw.tsv"
test_path = "./data/drugsComTest_raw.tsv"

df = pd.read_csv(train_path, delimiter="\t", index_col=0)
df_test = pd.read_csv(test_path, delimiter="\t", index_col = 0)

df.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount
206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [3]:
# checking if there are null values
print(df.info())
print(df_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 161297 entries, 206461 to 215220
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   drugName     161297 non-null  object 
 1   condition    160398 non-null  object 
 2   review       161297 non-null  object 
 3   rating       161297 non-null  float64
 4   date         161297 non-null  object 
 5   usefulCount  161297 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 8.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 53766 entries, 163740 to 113712
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   drugName     53766 non-null  object 
 1   condition    53471 non-null  object 
 2   review       53766 non-null  object 
 3   rating       53766 non-null  float64
 4   date         53766 non-null  object 
 5   usefulCount  53766 non-null  int64  
dtypes: float64(1), int6

In [4]:
# drop rows having null value in "condition"
df = df.dropna()
df_test = df_test.dropna()

In [5]:
sh_train = df.shape
sh_test = df_test.shape
total = sh_train[0] + sh_test[0];

print("Train data shape:", sh_train)
print("Test data shape:", sh_test)
print("Train/Test split: {0:.2f} / {1:.2f}".format(sh_train[0]/total, sh_test[0]/total))

Train data shape: (160398, 6)
Test data shape: (53471, 6)
Train/Test split: 0.75 / 0.25


In [6]:
replacements = {
    "&#039;": "'",
    "&amp;": "and",
    "&quot;": '"'
}

emoticon = json.load(open('emoticon.json', 'r'))
stop_words = stopwords.words('english')
table = str.maketrans('', '', string.punctuation)
porter = PorterStemmer()

# Function to modify and clean unnecessary data in the review
def clean_review(review):
    review = review.lower()
    
    # replacing specific words
    for key, value in replacements.items():
        review = review.replace(key, value)
        
    # replacing emoticon with respective word
    for key, value in emoticon.items():
        review = review.replace(key, value)
    
    # splitting review into words
    tokens = word_tokenize(review)
    
    # remove punctuation from each word
    no_punc_words = [w.translate(table) for w in tokens]
    
    # removing non-alphabetical words
    words = [word for word in no_punc_words if word.isalpha()]
    
    # filter out stop words
    words = [w for w in words if not w in stop_words]
    
    # stemming of words (i.e. reducing each word to its root or base)
    stemmed = [porter.stem(word) for word in words]
    
    # removing words of length 1
    long_words = [word for word in stemmed if len(word) > 1]
    
    return long_words

In [7]:
# #Apply 'clean_review' function to the 'review' column
# df["review"] = df["review"].apply(clean_review)
# df_test["review"] = df_test["review"].apply(clean_review)

# df.head()

In [8]:
months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def date_to_num_months(date):
    date_split = date.split()
    date_split[1] = date_split[1][:-1]
    
    date_month = 1
    date_year = int(date_split[2])
    
    for i in range(len(months)):
        if(months[i] == date_split[0]):
            date_month = i+1
            break
    
    current_month = datetime.now().month
    current_year = datetime.now().year
    
    months_passed = (current_year - date_year) * 12 + (current_month - date_month)
    return months_passed

In [9]:
# # Apply 'clean_review' function to the 'date' column
# df['monthsAfterReview'] = df['date'].apply(date_to_num_months)
# df_test['monthsAfterReview'] = df_test['date'].apply(date_to_num_months)

# # dropping 'date' column as it is unnecessary from now
# df = df.drop(columns=['date'])
# df_test = df_test.drop(columns=['date'])

# df.head()

In [10]:
# df["idx"] = range(df.shape[0])
# df_test["idx"] = range(df_test.shape[0])

# df = df.set_index("idx", drop=True)
# df_test = df_test.set_index("idx", drop=True)

In [11]:
# X_train = df.drop('rating', axis=1)
# y_train = df['rating']

# X_test = df_test.drop('rating', axis=1)
# y_test = df_test['rating']

# display(X_train.head())
# display(y_train.head())

In [12]:
# X_train.to_csv("X_train.csv", sep='\t')
# X_test.to_csv("X_test.csv", sep='\t')

X_train = pd.read_csv("X_train.csv", sep='\t', index_col=0)
y_train = df['rating']
X_test = pd.read_csv("X_test.csv", sep='\t', index_col=0)
y_test = df_test['rating']

X_train["review"] = X_train["review"].apply(lambda x: literal_eval(x))
X_test["review"] = X_test["review"].apply(lambda x: literal_eval(x))

X_train.head()

Unnamed: 0_level_0,drugName,condition,review,usefulCount,monthsAfterReview
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Valsartan,Left Ventricular Dysfunction,"[side, effect, take, combin, bystol, mg, fish,...",27,121
1,Guanfacine,ADHD,"[son, halfway, fourth, week, intuniv, becam, c...",192,146
2,Lybrel,Birth Control,"[use, take, anoth, oral, contracept, pill, cyc...",17,150
3,Ortho Evra,Birth Control,"[first, time, use, form, birth, control, glad,...",10,79
4,Buprenorphine / naloxone,Opiate Dependence,"[suboxon, complet, turn, life, around, feel, h...",37,67


In [13]:
# Word2Vec
w2v_model = gensim.models.Word2Vec(
    window = 5,
    min_count = 1,
    workers = 4 #cpu thread
)

w2v_model.build_vocab(X_train.review, progress_per=1000)

print("Epochs:", w2v_model.epochs)
print("Corpus count:", w2v_model.corpus_count)

Epochs: 5
Corpus count: 160398


In [14]:
w2v_model.train(X_train.review, total_examples=w2v_model.corpus_count,
            epochs=w2v_model.epochs)

(27869579, 32327960)

In [15]:
w2v_model.save("./word2vec-drug-review-dataset.model")

In [23]:
# w2v_model.wv.most_similar("good")
w2v_model.wv.similarity(w1="great", w2="good")

0.72054905

In [17]:
w2v_model.wv.most_similar("food")

[('snack', 0.732597291469574),
 ('chocol', 0.6895112991333008),
 ('meal', 0.6783545017242432),
 ('junk', 0.6321313381195068),
 ('eat', 0.6227915287017822),
 ('sweet', 0.622411847114563),
 ('cracker', 0.6200509071350098),
 ('starv', 0.6062787175178528),
 ('salad', 0.6018261313438416),
 ('fruit', 0.5915340185165405)]

In [18]:
w2v_model.wv.doesnt_match("doctor nurse therapist pilot".split())

'pilot'

In [19]:
# https://medium.com/swlh/sentiment-classification-using-word-embeddings-word2vec-aedf28fbb8ca

# Store the vectors for train data in following file
word2vec_filename = 'train_review_word2vec.csv'
# with open(word2vec_filename, 'w+') as word2vec_file:
#     for index, row in X_train.iterrows():
#         print(index)
#         model_vector = (np.mean([w2v_model.wv[token] for token in row['review']], axis=0)).tolist()
#         if index == 0:
#             header = ",".join(str(ele) for ele in range(1000))
#             word2vec_file.write(header)
#             word2vec_file.write("\n")
#         # Check if the line exists else it is vector of zeros
#         if type(model_vector) is list:  
#             line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
#         else:
#             line1 = ",".join([str(0) for i in range(1000)])
#         word2vec_file.write(line1)
#         word2vec_file.write('\n')

In [20]:
import time
#Import the DecisionTreeeClassifier
from sklearn.tree import DecisionTreeClassifier
# Load from the filename
word2vec_df = pd.read_csv(word2vec_filename)
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

word2vec_df.fillna(0, inplace = True)

start_time = time.time()
# Fit the model
clf_decision_word2vec.fit(word2vec_df, y_train)
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 28.138476133346558


0.5580962896347046