# __UCI ML Drug Review Analysis__

## __Importing Libraries__

In [2]:
## for data
import pandas as pd
import numpy as np
import collections
import json
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for text processing
import re
import nltk
## for vectorizer
from sklearn import feature_extraction, manifold


In [3]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [4]:
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
df = pd.read_csv('drugsComTrain_raw.csv')

In [8]:
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [9]:
df.shape

(161297, 7)

## __Dropping Missing Values__

In [10]:
df.dropna(inplace = True)

In [11]:
df.shape

(160398, 7)

### The rating column is scaled from 1 to 10, We then add a new column called “Positivity”, where any score above 5 is encoded as a 1, indicating it was positively rated. Otherwise, it’ll be encoded as a 0, indicating it was negatively rated.

In [12]:
df.dropna(inplace=True)

df['Positivity'] = np.where(df['rating'] > 5, 1, 0)
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,Positivity
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,1
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,0
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10,1
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,1


## __Text Preprocessing : Text Cleaning And Transformation__

## __Tokenization, Stemming, Lemmatization__

In [13]:
'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_stemm: bool - whether stemming is to be applied
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [15]:
df["Processed_Review"] = df["review"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True))

## __Splitting Train And Test Data__

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Processed_Review'], df['Positivity'], random_state = 0)

## __Bag OF Words__

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer().fit(X_train)
vect

CountVectorizer()

In [18]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<120298x69586 sparse matrix of type '<class 'numpy.int64'>'
	with 6933994 stored elements in Compressed Sparse Row format>

## __Tf–idf term weighting__

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df = 5).fit(X_train)
len(vect.get_feature_names())

16895

## __n-grams__

In [20]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

188090

## __Logistic Regression__

In [21]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression()

In [22]:
from sklearn.metrics import roc_auc_score
predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.8678172411633344


## __Testing The Model__

In [23]:
print(model.predict(vect.transform(['The medicine is of poor quality, I will never buy them again','The medicine  is not bad, I will buy them again'])))

[0 1]


## __Model correctly identifies them as negative and positive reviews respectively__