# Drug dataset - Sentiment Analysis 

In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn import metrics
from nltk.corpus import stopwords
from collections import Counter
import warnings; warnings.simplefilter('ignore')
import nltk
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from sklearn.datasets import make_classification
from sklearn.metrics import RocCurveDisplay, plot_roc_curve
from sklearn.svm import SVC
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/sabeiro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabeiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sabeiro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to /home/sabeiro/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sabeiro/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [128]:
df_train = pd.read_csv("data/drugsComTrain_raw.tsv", sep='\t', index_col=[0])
df_test = pd.read_csv("data/drugsComTest_raw.tsv", sep='\t', index_col=[0])
df = pd.concat([df_train, df_test]).reset_index(drop=True)
df['length'] = list(map(lambda x: len(str(x).split()), df['review']))
df = df.drop_duplicates(subset=["condition" ,"review", "rating"]).reset_index(drop=True)
df.isnull().any()

drugName       False
condition       True
review         False
rating         False
date           False
usefulCount    False
length         False
dtype: bool

### keeping good quality data only

In [129]:
drugC = df.drugName.value_counts()
drugC = drugC[drugC>=5]
df = df.loc[df['drugName'].isin(drugC.index),]
condC = df.condition.value_counts()
condC = condC[condC>=5]
df = df.loc[df['condition'].isin(condC.index),]
df = df.loc[df['usefulCount']>6,]

In [130]:
df.loc[:,'condition'] = df['condition'].apply(lambda x: 'unknown' if re.search("users found",x) else str(x).lower())
df.loc[:,'drugName'] = df['drugName'].apply(lambda x: str(x).lower())
df.loc[:,'review'] = df['review'].apply(lambda x: str(x).lower())
df.review = df.review.str.lower()
df["condition"].fillna("unknown", axis=0, inplace=True)

In [131]:
df["condition"].nunique()

481

## Text Cleaning

In [141]:
df["review"] = df.review.str.replace('"', "")
df["review"] = df.review.str.replace('&#039;', "")
df.review = df.review.str.replace(r'[^\x00-\x7F]+',' ')
#df.review = df.review.str.replace(r'^\s+|\s+?$','')
df.review = df.review.str.replace(r'\s+',' ')
df.review = df.review.str.replace(r'\.{2,}', '')
df.review = df.review.str.replace(r'\d+', ' ')
df.review = df.review.str.replace(r"\s*'\s*\w*", ' ')
df.review = df.review.str.replace(r'\W+', ' ')
df.review = df.review.str.replace(r'\s+', ' ')
df.review = df.review.str.replace(r'^\s+|\s+?$', '')
df

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,length
0,valsartan,left ventricular dysfunction,it has no side effect i take it in combination...,9.0,"May 20, 2012",27,17
1,guanfacine,adhd,my son is halfway through his fourth week of i...,8.0,"April 27, 2010",192,141
2,lybrel,birth control,i used to take another oral contraceptive whic...,5.0,"December 14, 2009",17,134
3,ortho evra,birth control,this is my first time using any form of birth ...,8.0,"November 3, 2015",10,89
4,buprenorphine / naloxone,opiate dependence,suboxone has completely turned my life around ...,9.0,"November 27, 2016",37,124
...,...,...,...,...,...,...,...
130279,hydroxyzine,anxiety,i have a rare disease called systemic mastocyt...,10.0,"March 26, 2015",26,44
130280,methadone,pain,have been taking it for years milligrams no si...,10.0,"August 2, 2011",16,12
130281,clomipramine,panic disorde,ive been on clomipramine years now and basical...,10.0,"February 19, 2013",28,49
130283,tamoxifen,"breast cancer, prevention",i have taken tamoxifen for years side effects ...,10.0,"September 13, 2014",43,97


In [193]:
stop_spec = ['taking','pain','effects','first','started','like','months','get','days','time','would','one','weeks','took','week','also','got','month']
stop_spec.extend(['day','years','life','went','year','hours','going','used','lbs','getting','try','use','make','say'])

In [194]:
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))
from sklearn.feature_extraction import text
stop = text.ENGLISH_STOP_WORDS
stop_words.extend(['im', 'ive', 'it', 'mg', 'quot'])
stop_words.extend(stop)
stop_words.extend(stop_spec)
stop_words = list(set(stop_words))
for i in range(len(stop_words)):
    stop_words[i] = re.sub("'","",stop_words[i])
pat = r'\b(?:{})\b'.format('|'.join(stop_words))
pat

'\\b(?:he|thereafter|doesnt|thin|a|their|or|mustn|wasn|who|except|thereby|mg|herself|thatll|anyway|needn|among|may|either|see|wouldn|never|mightnt|six|amoungst|otherwise|shouldnt|taking|seem|didn|should|each|myself|had|every|together|move|o|whereupon|wouldnt|so|wherever|thick|whether|at|through|get|doesn|twenty|however|forty|mostly|something|its|aren|last|latterly|me|few|such|yourself|nothing|anyone|its|around|now|than|by|sincere|shant|seemed|got|least|this|against|shouldve|amount|per|two|mill|someone|you|one|here|until|when|other|they|shan|nine|about|along|isnt|ltd|name|did|behind|thru|weeks|elsewhere|youve|sometime|all|out|won|become|youre|serious|must|once|none|wasnt|there|could|effects|mustnt|without|year|yet|hence|of|even|off|hereby|whereby|re|hasnt|moreover|perhaps|same|years|back|any|im|yourselves|herein|many|co|life|thus|for|take|former|below|am|use|does|always|give|ourselves|more|what|others|somewhere|three|us|ive|your|becoming|doing|and|my|somehow|are|some|anything|therein|le

In [195]:
df['review'] = df['review'].str.replace(pat, '')
df.review = df.review.str.replace(r'\W+', ' ')

## Creating the Corpus

In [205]:
reviews = []
corpus=[]
for review in df['review']:
    reviews.append(review)
    corpus.append(nltk.sent_tokenize(review))
corpus=[sent for sublist in corpus for sent in sublist]

In [187]:
len(corpus)

91815

# Tokenizing the corpus

In [206]:
wordfreq = {}
for sentence in corpus:
    words = sentence.split()
    #tokens = nltk.word_tokenize(sentence) # To get the words, it can be also done with sentence.split()
    for word in words:
        if ( word not in wordfreq.keys() ): ## first time appearnce in the sentence
            wordfreq[word] = 1 # We initialize the corresponding counter
        else: ## if the world is already existed in the dictionalry 
            wordfreq[word] += 1 # We increase the corresponding counter
wordfreq = dict(sorted(wordfreq.items(),key= lambda x:x[1],reverse=True))
print(wordfreq)



In [197]:
len(list(wordfreq.keys()))

41650

### Reducing the corpus

#### Stopwords

In [207]:
# Keeping 30 most preq words
corpus_freq = [(wordfreq[key],key) for key in list(wordfreq.keys())]
corpus_freq = [(word[1],word[0]) for word in corpus_freq[:60]] 
#corpus_freq = corpus_freq[1:]
corpus_freq

[('feel', 21883),
 ('medication', 18186),
 ('anxiety', 16990),
 ('doctor', 16612),
 ('medicine', 14415),
 ('weight', 14005),
 ('better', 13636),
 ('sleep', 13500),
 ('work', 13343),
 ('really', 13180),
 ('bad', 12801),
 ('night', 12672),
 ('pill', 12589),
 ('felt', 11930),
 ('good', 11503),
 ('great', 11161),
 ('drug', 10854),
 ('tried', 10794),
 ('prescribed', 10616),
 ('works', 10580),
 ('ago', 10489),
 ('dose', 10463),
 ('depression', 10385),
 ('worked', 9441),
 ('severe', 8822),
 ('little', 8822),
 ('period', 8729),
 ('help', 8721),
 ('feeling', 8602),
 ('times', 7572),
 ('helped', 7486),
 ('amp', 7383),
 ('control', 7264),
 ('lost', 7135),
 ('away', 7026),
 ('far', 7002),
 ('symptoms', 6944),
 ('effect', 6821),
 ('lot', 6705),
 ('acne', 6678),
 ('nausea', 6652),
 ('morning', 6632),
 ('think', 6564),
 ('know', 6541),
 ('stopped', 6456),
 ('using', 6427),
 ('long', 6426),
 ('old', 6362),
 ('normal', 6309),
 ('skin', 6124),
 ('taken', 6038),
 ('able', 6003),
 ('experience', 5780),
 (

In [64]:
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
lem = WordNetLemmatizer()

corpus_freq = [(lem.lemmatize(word[0]),word[1]) for word in corpus_freq]
corpus_freq

[nltk_data] Downloading package omw-1.4 to /home/sabeiro/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


[('side', 42947),
 ('taking', 42381),
 ('pain', 38340),
 ('year', 37515),
 ('take', 36678),
 ('effect', 36298),
 ('first', 35215),
 ('started', 34109),
 ('like', 33936),
 ('month', 33009),
 ('get', 32924),
 ('day', 32834),
 ('time', 31037),
 ('feel', 29010),
 ('would', 26471),
 ('back', 25441),
 ('one', 24313),
 ('week', 23491),
 ('pill', 23420),
 ('medication', 23411),
 ('took', 22815),
 ('week', 22548),
 ('also', 22533),
 ('doctor', 22515),
 ('weight', 21075),
 ('got', 20976),
 ('life', 20669),
 ('anxiety', 20369),
 ('dont', 20327),
 ('month', 20113)]

#### LIST 1 [FULL]

In [65]:
cols = {word[0]: [] for word in corpus_freq}
reviews = pd.DataFrame(cols)

reviews.columns

Index(['side', 'taking', 'pain', 'year', 'take', 'effect', 'first', 'started',
       'like', 'month', 'get', 'day', 'time', 'feel', 'would', 'back', 'one',
       'week', 'pill', 'medication', 'took', 'also', 'doctor', 'weight', 'got',
       'life', 'anxiety', 'dont'],
      dtype='object')

In [66]:
def review_inpector(sentence, stop_words, words):

    import re

    # Decompose the review in words -> tokens
    tokens = nltk.word_tokenize(sentence)
    # Cleanup the tokenp
    for i in range(len(tokens)):
        tokens[i] = tokens[i].lower()
        tokens[i] = re.sub(r'\W',' ',tokens[i]) # Replace everything non-alpahnumeric by ' '
        tokens[i] = re.sub(r'\s+','',tokens[i]) # Replace one or more whitespaces by  ' '
        tokens[i] = re.sub(r'\d+','',tokens[i]) # Replace one or more digits by  ' '
        tokens[i] = lem.lemmatize(tokens[i])
        
    # Dropping tokens which are "stopwords" or empty
    tokens = [ token for token in tokens if (token not in stop_words and token != '')]

    # Initializing an empty dictionary of word frequencies for the corresponding review
    col_freq = {col:0 for col in words}
    
    # Filling the dictionary with word frequencies in the review
    for token in tokens:
        if token in words:
            col_freq[token] += 1

    return col_freq

In [67]:
my_list = list(map(review_inpector, df['review'], 
                    [stop_words]*df.shape[0], [list(cols.keys())]*df.shape[0] ) )

my_list[:2]

[{'side': 1,
  'taking': 0,
  'pain': 0,
  'year': 0,
  'take': 1,
  'effect': 1,
  'first': 0,
  'started': 0,
  'like': 0,
  'month': 0,
  'get': 0,
  'day': 0,
  'time': 0,
  'feel': 0,
  'would': 0,
  'back': 0,
  'one': 0,
  'week': 0,
  'pill': 0,
  'medication': 0,
  'took': 0,
  'also': 0,
  'doctor': 0,
  'weight': 0,
  'got': 0,
  'life': 0,
  'anxiety': 0,
  'dont': 0},
 {'side': 0,
  'taking': 1,
  'pain': 0,
  'year': 0,
  'take': 0,
  'effect': 0,
  'first': 0,
  'started': 1,
  'like': 0,
  'month': 0,
  'get': 1,
  'day': 3,
  'time': 0,
  'feel': 0,
  'would': 0,
  'back': 0,
  'one': 0,
  'week': 2,
  'pill': 0,
  'medication': 1,
  'took': 0,
  'also': 0,
  'doctor': 1,
  'weight': 0,
  'got': 0,
  'life': 0,
  'anxiety': 0,
  'dont': 0}]

In [68]:
reviews = pd.DataFrame(my_list)

In [69]:
reviews

Unnamed: 0,side,taking,pain,year,take,effect,first,started,like,month,...,pill,medication,took,also,doctor,weight,got,life,anxiety,dont
0,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,3,1,0,0,1,2,1,1,0,0,...,2,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,3,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130280,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130281,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
130282,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130283,1,1,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [70]:
reviews['rating'] = df['rating'].reset_index(drop=True)
reviews

Unnamed: 0,side,taking,pain,year,take,effect,first,started,like,month,...,medication,took,also,doctor,weight,got,life,anxiety,dont,rating
0,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
2,3,1,0,0,1,2,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,3,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130280,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
130281,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
130282,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
130283,1,1,0,1,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1


#### LIST 3: stopwords incl. negation

In [71]:
stop_words = set(nltk.corpus.stopwords.words('english'))
exclude_words = set(("aren't","couldn't","didn't","doesn't","don't","hadn't","hasn't","haven't","isn't","mightn't","mustn't","needn't","no","nor","not","shan't","shouldn't","wasn't","weren't","wouldn't"))
new_stop_words = stop_words.difference(exclude_words)

In [72]:
new_stop_words = list(new_stop_words)

In [73]:
new_stop_words.extend(['im', 'ive', 'it', 'mg', 'quot'])
new_stop_words

['mustn',
 'shan',
 'shouldn',
 'did',
 'those',
 'hasn',
 'themselves',
 'an',
 'doing',
 's',
 'my',
 'it',
 'at',
 'd',
 'yourself',
 'if',
 'yours',
 'you',
 'ma',
 't',
 'against',
 'now',
 'is',
 'the',
 'aren',
 'ourselves',
 'i',
 'there',
 'should',
 'or',
 'by',
 'who',
 'down',
 'so',
 'm',
 'she',
 'before',
 'his',
 'and',
 'up',
 'o',
 'few',
 'once',
 'has',
 'their',
 'myself',
 'whom',
 'during',
 'in',
 'that',
 'because',
 'other',
 'when',
 'wasn',
 'but',
 "you'll",
 'why',
 'after',
 'where',
 'out',
 'ain',
 'her',
 'between',
 'of',
 'herself',
 'from',
 'its',
 'have',
 'we',
 'yourselves',
 'was',
 'into',
 'are',
 'for',
 "it's",
 'with',
 'again',
 'only',
 'didn',
 'very',
 'needn',
 "should've",
 'some',
 'just',
 "won't",
 'these',
 'below',
 'having',
 'itself',
 'all',
 'this',
 'hadn',
 'mightn',
 "she's",
 'haven',
 'been',
 'a',
 're',
 'over',
 'further',
 'y',
 'couldn',
 'll',
 'here',
 "you'd",
 'off',
 'am',
 'our',
 'about',
 'him',
 'above',
 

In [74]:
for i in range(len(new_stop_words)):
    new_stop_words[i] = re.sub(r"\s*'\s*\w*","",new_stop_words[i])
new_stop_words

['mustn',
 'shan',
 'shouldn',
 'did',
 'those',
 'hasn',
 'themselves',
 'an',
 'doing',
 's',
 'my',
 'it',
 'at',
 'd',
 'yourself',
 'if',
 'yours',
 'you',
 'ma',
 't',
 'against',
 'now',
 'is',
 'the',
 'aren',
 'ourselves',
 'i',
 'there',
 'should',
 'or',
 'by',
 'who',
 'down',
 'so',
 'm',
 'she',
 'before',
 'his',
 'and',
 'up',
 'o',
 'few',
 'once',
 'has',
 'their',
 'myself',
 'whom',
 'during',
 'in',
 'that',
 'because',
 'other',
 'when',
 'wasn',
 'but',
 'you',
 'why',
 'after',
 'where',
 'out',
 'ain',
 'her',
 'between',
 'of',
 'herself',
 'from',
 'its',
 'have',
 'we',
 'yourselves',
 'was',
 'into',
 'are',
 'for',
 'it',
 'with',
 'again',
 'only',
 'didn',
 'very',
 'needn',
 'should',
 'some',
 'just',
 'won',
 'these',
 'below',
 'having',
 'itself',
 'all',
 'this',
 'hadn',
 'mightn',
 'she',
 'haven',
 'been',
 'a',
 're',
 'over',
 'further',
 'y',
 'couldn',
 'll',
 'here',
 'you',
 'off',
 'am',
 'our',
 'about',
 'him',
 'above',
 'doesn',
 'tha

In [75]:
corpus4 = [(wordfreq[key],key) for key in list(wordfreq.keys()) if key not in new_stop_words]
corpus4

[(47360, 'no'),
 (42947, 'side'),
 (8976, 'effect'),
 (36678, 'take'),
 (1357, 'combination'),
 (176, 'bystolic'),
 (135, 'fish'),
 (571, 'oil'),
 (1746, 'son'),
 (143, 'halfway'),
 (481, 'fourth'),
 (22548, 'week'),
 (141, 'intuniv'),
 (3483, 'became'),
 (751, 'concerned'),
 (4084, 'began'),
 (12762, 'last'),
 (34109, 'started'),
 (42381, 'taking'),
 (283, 'highest'),
 (13028, 'dose'),
 (17073, 'two'),
 (32834, 'days'),
 (12010, 'could'),
 (1075, 'hardly'),
 (32924, 'get'),
 (5269, 'bed'),
 (133, 'cranky'),
 (1583, 'slept'),
 (1916, 'nearly'),
 (13517, 'hours'),
 (5193, 'drive'),
 (2647, 'home'),
 (2398, 'school'),
 (225, 'vacation'),
 (344, 'unusual'),
 (1947, 'called'),
 (22515, 'doctor'),
 (647, 'monday'),
 (8694, 'morning'),
 (6409, 'said'),
 (1424, 'stick'),
 (8244, 'see'),
 (11434, 'getting'),
 (5441, 'problem'),
 (3785, 'free'),
 (18943, 'much'),
 (7, 'agreeable'),
 (10175, 'ever'),
 (6626, 'less'),
 (2164, 'emotional'),
 (15996, 'good'),
 (7954, 'thing'),
 (295, 'remembering')

In [76]:
corpus4.sort(reverse = True)
corpus_freq4 = [(word[1],word[0]) for word in corpus4[:31]] 
corpus_freq4 = corpus_freq4[1:]
corpus_freq4

[('day', 48525),
 ('no', 47360),
 ('side', 42947),
 ('taking', 42381),
 ('pain', 38340),
 ('years', 37515),
 ('take', 36678),
 ('effects', 36298),
 ('first', 35215),
 ('started', 34109),
 ('like', 33936),
 ('months', 33009),
 ('get', 32924),
 ('days', 32834),
 ('time', 31037),
 ('feel', 29010),
 ('would', 26471),
 ('back', 25441),
 ('one', 24313),
 ('weeks', 23491),
 ('pill', 23420),
 ('medication', 23411),
 ('took', 22815),
 ('week', 22548),
 ('also', 22533),
 ('doctor', 22515),
 ('weight', 21075),
 ('got', 20976),
 ('life', 20669),
 ('anxiety', 20369)]

In [77]:
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
lem = WordNetLemmatizer()

corpus_freq = [(lem.lemmatize(word[0]),word[1]) for word in corpus_freq]
corpus_freq

[nltk_data] Downloading package omw-1.4 to /home/sabeiro/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[('side', 42947),
 ('taking', 42381),
 ('pain', 38340),
 ('year', 37515),
 ('take', 36678),
 ('effect', 36298),
 ('first', 35215),
 ('started', 34109),
 ('like', 33936),
 ('month', 33009),
 ('get', 32924),
 ('day', 32834),
 ('time', 31037),
 ('feel', 29010),
 ('would', 26471),
 ('back', 25441),
 ('one', 24313),
 ('week', 23491),
 ('pill', 23420),
 ('medication', 23411),
 ('took', 22815),
 ('week', 22548),
 ('also', 22533),
 ('doctor', 22515),
 ('weight', 21075),
 ('got', 20976),
 ('life', 20669),
 ('anxiety', 20369),
 ('dont', 20327),
 ('month', 20113)]