# Importing libraries and dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atuli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atuli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading the Dataset

In [2]:
data = pd.read_csv(r'C:\Users\atuli\Downloads\webmd.csv (1)\webmd.csv')
data.head()

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I'm a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 % powder,144731,2,3,why did my PTINR go from a normal of 2.5 to ov...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 % powder,144731,2,2,FALLING AND DON'T REALISE IT,1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 % powder,144731,1,1,My grandfather was prescribed this medication ...,1,Male,,1


In [3]:
data.columns

Index(['Age', 'Condition', 'Date', 'Drug', 'DrugId', 'EaseofUse',
       'Effectiveness', 'Reviews', 'Satisfaction', 'Sex', 'Sides',
       'UsefulCount'],
      dtype='object')

In [4]:
data['Reviews']= data['Reviews'].str.lower()
data['Reviews']

0         i'm a retired physician and of all the meds i ...
1         cleared me right up even with my throat hurtin...
2         why did my ptinr go from a normal of 2.5 to ov...
3                              falling and don't realise it
4         my grandfather was prescribed this medication ...
                                ...                        
362801    i took the whole 12 weeks.i could have stopped...
362802    my dad strated taking chantix about 1 month ag...
362803    as long as i was on chantix, i didn't smoke.  ...
362804    started this medication oct 5th 2008. haven't ...
362805    one year ago, i tried chantix and had to stop ...
Name: Reviews, Length: 362806, dtype: object

In [5]:
#Checking for null values in the dataset

data.isnull().sum()

Age               0
Condition         0
Date              0
Drug              0
DrugId            0
EaseofUse         0
Effectiveness     0
Reviews          37
Satisfaction      0
Sex               0
Sides             0
UsefulCount       0
dtype: int64

# Data Preprocessing

In [6]:
data['Reviews']=data['Reviews'].apply(str)

In [7]:
#Cleaning the reviews given by the drug user
def clean_reviews(text):
    text=re.sub(r'@[A-Za-z0-9]+','',text)
    text=re.sub(r'#','',text)
    text=re.sub(r'RT[\s]+','',text)
    text=re.sub(r'https?:\/\/\S+','',text)
    text=re.sub(r'[^\w\s]','',text)
    
    return text

data['Reviews']=data['Reviews'].apply(clean_reviews)
data.head()

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,im a retired physician and of all the meds i h...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 % powder,144731,2,3,why did my ptinr go from a normal of 25 to ove...,3,Female,,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 % powder,144731,2,2,falling and dont realise it,1,Female,,0
4,35-44,Other,1/6/2009,warfarin (bulk) 100 % powder,144731,1,1,my grandfather was prescribed this medication ...,1,Male,,1


In [8]:
#Tokenizing reviews into individual sentences
review=data['Reviews'].apply(lambda x: x.split())

In [9]:
#Dropping Null values
data['Reviews'] = data['Reviews'].dropna()

In [10]:
data.isnull().sum()

Age              0
Condition        0
Date             0
Drug             0
DrugId           0
EaseofUse        0
Effectiveness    0
Reviews          0
Satisfaction     0
Sex              0
Sides            0
UsefulCount      0
dtype: int64

In [11]:
#Removing Stopwords

import spacy
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
stopwords= en.Defaults.stop_words

In [12]:
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [13]:
review= review.apply(lambda sentence: [word for word in sentence if word not in stopwords ])
review.head()

0    [im, retired, physician, meds, tried, allergie...
1    [cleared, right, throat, hurting, went, away, ...
2                             [ptinr, normal, 25, 100]
3                             [falling, dont, realise]
4    [grandfather, prescribed, medication, coumadin...
Name: Reviews, dtype: object

In [14]:
#Stemming
stemmer = PorterStemmer()
review = review.apply(lambda sentence: [stemmer.stem(word) for word in sentence])
review.head()

0    [im, retir, physician, med, tri, allergi, seas...
1    [clear, right, throat, hurt, went, away, take,...
2                             [ptinr, normal, 25, 100]
3                                 [fall, dont, realis]
4    [grandfath, prescrib, medic, coumadin, assist,...
Name: Reviews, dtype: object

In [15]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
review= review.apply(lambda sentence:[lemmatizer.lemmatize(word) for word in sentence])
review.head()

0    [im, retir, physician, med, tri, allergi, seas...
1    [clear, right, throat, hurt, went, away, take,...
2                             [ptinr, normal, 25, 100]
3                                 [fall, dont, realis]
4    [grandfath, prescrib, medic, coumadin, assist,...
Name: Reviews, dtype: object

# Encoding satisfaction ratings

In [16]:
for i in range(0,len(data)):
    if (data.at[i,'Satisfaction']>3):
        data['Satisfaction'][i] = "pos"
    elif (data.at[i,'Satisfaction']==3):
        data['Satisfaction'][i] = 'neutral'
    else:
        data['Satisfaction'][i] = 'neg'
print(data['Satisfaction'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


0             pos
1             pos
2         neutral
3             neg
4             neg
           ...   
362801        pos
362802        neg
362803        neg
362804        pos
362805        pos
Name: Satisfaction, Length: 362806, dtype: object


# Adding polarity of each review based on its satisfaction rating

In [18]:
polarity_dict={'pos':1,'neutral':0 ,'neg':2 }
data['polarity']=data.Satisfaction.map(polarity_dict)
data.head(10)

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount,polarity
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,im a retired physician and of all the meds i h...,pos,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0,1
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,pos,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1,1
2,65-74,Other,7/16/2012,warfarin (bulk) 100 % powder,144731,2,3,why did my ptinr go from a normal of 25 to ove...,neutral,Female,,0,0
3,75 or over,Other,9/23/2010,warfarin (bulk) 100 % powder,144731,2,2,falling and dont realise it,neg,Female,,0,2
4,35-44,Other,1/6/2009,warfarin (bulk) 100 % powder,144731,1,1,my grandfather was prescribed this medication ...,neg,Male,,1,2
5,55-64,Other,7/19/2008,warfarin (bulk) 100 % powder,144731,4,4,help heart condition operation well,pos,Male,,0,1
6,25-34,Birth Control,6/15/2017,wymzya fe,163180,5,5,havent gotten pregnant so it does its job i wa...,neg,Female,"Nausea , vomiting , headache , bloating , ...",0,2
7,45-54,Disease of Ovaries with Cysts,1/30/2017,wymzya fe,163180,5,5,i have take this for 5 years age 4550 to preve...,pos,Female,"Nausea , vomiting , headache , bloating , ...",0,1
8,25-34,Acne,4/27/2016,wymzya fe,163180,4,2,,neg,Female,"Nausea , vomiting , headache , bloating , ...",1,2
9,55-64,Stuffy Nose,10/29/2012,"12 hour nasal relief spray, non-aerosol",9800,4,2,the 12 hour spray only works for me for 6 hours,neg,Male,"Temporary burning, stinging, dryness in the no...",0,2


In [19]:
review_corpus = []
for i in range(len(data)):
    r = data.iloc[i,7]
    r = "".join(r)
    review_corpus.append(r)

In [20]:
review_corpus

['im a retired physician and of all the meds i have tried for my allergies seasonal and not  this one is the most effective for me  when i first began using this drug some years ago  tiredness as a problem but is not currently',
 'cleared me right up even with my throat hurting it went away after taking the medicine',
 'why did my ptinr go from a normal of 25 to over \n100',
 'falling and dont realise it',
 'my grandfather was prescribed this medication coumadin to assist in blood thinning due to a heart and thyroid condition  his primary doctor was aware that he was on an aspirin regiment and still prescribed this medicine it caused his blood to thin out to much and he ended up internally bleeding to death  if you are going to take this medicine please ask your doctors about possible side effects or drug interactions',
 'help heart condition operation well',
 'havent gotten pregnant so it does its job i was switched to this brand from another generic i get nauseous and generally feel 

# Feature extraction using TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect= TfidfVectorizer().fit(review_corpus)               #unigram
features= vect.transform(review_corpus)

In [22]:
vectorizer = TfidfVectorizer(ngram_range = (3,3))         #trigram
features1 = vectorizer.fit_transform(review_corpus)

In [23]:
labels=data['polarity']

# Performing Train-Test split

In [24]:
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state = 42,test_size=0.25)
features1_train, features1_test, labels1_train, labels1_test = train_test_split(features1, labels, random_state = 42,test_size=0.25)

In [25]:
features.shape

(362806, 182855)

In [26]:
features1.shape

(362806, 6436606)

In [27]:
labels.shape

(362806,)

# **Applying ML models to unigram model**

In [28]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
lr = LogisticRegression()
lr.fit(features_train, labels_train)
lr_score=lr.score(features_test, labels_test)
labels_pred_lr = lr.predict(features_test)
confusion_matrix(labels_test,labels_pred_lr)
lr_score

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7038874556239113

In [29]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(features_train, labels_train)
knn_score = knn.score(features_test, labels_test)
labels_pred_knn = knn.predict(features_test)
confusion_matrix(labels_test,labels_pred_knn)
knn_score

0.5118850741990254

In [30]:
# Naive bayes
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
mnb= MultinomialNB()
mnb.fit(features_train,labels_train)
mnb_score = metrics.accuracy_score(mnb.predict(features_test),labels_test)
print(mnb_score)

from sklearn.naive_bayes import BernoulliNB
bnb= BernoulliNB()
bnb.fit(features_train,labels_train)
bnb_score= metrics.accuracy_score(bnb.predict(features_test),labels_test)
print(bnb_score)

0.6701506030738021
0.6117285175630085


# Applying ML models to trigram model

In [31]:
#Logistic Regression
lr.fit(features1_train, labels_train)
lr1_score=lr.score(features1_test, labels_test)
labels_pred_lr1 = lr.predict(features1_test)
confusion_matrix(labels_test,labels_pred_lr1)
lr1_score

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7450001102511521

In [32]:
#KNN
knn.fit(features1_train, labels_train)
knn_score1 = knn.score(features1_test, labels_test)
labels_pred_knn1 = knn.predict(features1_test)
confusion_matrix(labels_test,labels_pred_knn1)
knn_score

0.5118850741990254

In [33]:
# Naive bayes

from sklearn.naive_bayes import MultinomialNB
mnb1= MultinomialNB()
mnb1.fit(features1_train,labels1_train)
mnb1_score=metrics.accuracy_score(mnb1.predict(features1_test),labels1_test)
print(mnb1_score)

from sklearn.naive_bayes import BernoulliNB
bnb1= BernoulliNB()
bnb1.fit(features1_train,labels1_train)
bnb1_score=metrics.accuracy_score(bnb1.predict(features1_test),labels1_test)
print(bnb1_score)

0.726830720381028
0.6254327357720888
