# __UCI ML Drug Review Analysis__

## __Importing Libraries__

In [31]:
## for data
import pandas as pd
import numpy as np
import collections
import json
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for text processing
import re
import nltk
## for vectorizer
from sklearn import feature_extraction, manifold


In [32]:
pip install textblob




In [33]:
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger

In [34]:
import warnings
warnings.filterwarnings("ignore")

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
df = pd.read_csv('drugsComTrain_raw.csv')

In [37]:
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [38]:
df.shape

(161297, 7)

## __Dropping Missing Values__

In [39]:
df.dropna(inplace = True)

In [40]:
df.shape

(160398, 7)

### The rating column is scaled from 1 to 10, We then add a new column called “Positivity”, where any score above 5 is encoded as a 1, indicating it was positively rated. Otherwise, it’ll be encoded as a 0, indicating it was negatively rated.

In [41]:
df.dropna(inplace=True)

df['Positivity'] = np.where(df['rating'] > 5, 1, 0)
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,Positivity
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,1
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,0
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1


In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['Positivity'], random_state = 0)

## __Bag OF Words__

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer().fit(X_train)
vect

CountVectorizer()

In [44]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<120298x45581 sparse matrix of type '<class 'numpy.int64'>'
	with 7071793 stored elements in Compressed Sparse Row format>

## __Tf–idf term weighting__

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df = 5).fit(X_train)
len(vect.get_feature_names())

16958

## __n-grams__

In [48]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

194387

## __Logistic Regression__

In [49]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression()

In [50]:
from sklearn.metrics import roc_auc_score
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.8757678112306158


## __Testing The Model__

In [51]:
print(model.predict(vect.transform(['The medicine is of poor quality, I will never buy them again','The medicine  is not bad, I will buy them again'])))

[0 1]


## __Model correctly identifies them as negative and positive reviews respectively__