In [1]:
# Standard-library regex module plus NLTK; the downloads below fetch the NLTK
# resources (tokenizer models, stop-word list, WordNet) used by the cleaning step.
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Numerical / tabular tooling
import numpy as np
import pandas as pd
# HTML stripping and text normalisation for the reviews
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Feature extraction, cross-validation, scoring and the classifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
# Model persistence
import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\desig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\desig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\desig\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Lift the pandas display limits so full review text and every column are shown
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None

# Load the medicine reviews into a dataframe (file is decoded as ISO-8859-1)
df = pd.read_csv("MedReviews.csv", encoding="ISO-8859-1")

In [3]:
# Preview the loaded reviews (the rendered output elides the middle rows)
df

Unnamed: 0,Medicine,Condition,Review,Rating
0,Liraglutide,Obesity,"""I have been taking Saxenda since July 2016. I had severe nausea for about a month once I got up to the 2.6 dosage. It has since subsided and the only side effect I notice now is the dry mouth. I make sure to drink 2.5 litres of water a day (about 10 glasses). This helps with the weight loss as well as the constipation. I have been reducing my dose to find a comfortable spot where I am still losing weight but don&#039;t feel like I am over medicating. For me, 1.8 is working very well. I also feel wearing a Fitbit has really helped. I can track my food, water, exercise and steps - it keeps me moving more. When this started I could barely walk the length of myself without getting winded - I have lost 58 lbs so far.""",High
1,Trazodone,Insomnia,"""I have insomnia, it&#039;s horrible. My story begins with my PCP prescribing me Prozac to help with intestinal issues, because I was desperate I tried it, I was on it for 3 weeks. Stopped because of insomnia. Then I was prescribed Ativan, it out me out, but was very addicting. I had rebound insomnia. Then after about 14 days I hardly any sleep l tried the doctor one more time. I asked him about Trazadone. He told me that was a good medication for insomnia. He put me on 25 mgs, but stated I may have to figure out what dosage is best for me. I am currently taking 100 mgs, which is on the low range of what is prescribed, 400 mgs being at the high end for insomnia. I have the dry mouth and nasel congestion. I can live with that, I sleep now, yeah...""",High
2,Contrave,Obesity,"""I am just finishing my second week taking Contrave and have lost 10 lbs. It has been an interesting experience because the drug is definitely not an appetite suppressant, yet it does help you control the urge to eat. I have had mild side effects - some stomach discomfort and slight headaches along with constipation, but it has all been easily manageable. If you are considering taking this medication please keep in mind that you still need to do the work by exercising and eating right, but the medication really does help. It has helped me get my motivation back to exercise, and it definitely helps with cravings to eat making it easy to just eat what I need to be healthy. Use the coupon they give you, it drops cost to $70""",High
3,Dulaglutide,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my last post as I wanted to give it a few months to see how this was going to work. So, I have been on Trulicity for six months now with Metformin. When I hit the five month period the diarrhea, gas, sulphur belching finally subsided. I now longer have any of those side effects. However, I still haven&#039;t lost any weight at all, but I think that was because when I first started Trulicity I was taking it with Glimepiride and that one has a side effect of weight gain so I think the two meds were fighting each other lol. I have been back on the Metformin with Trulicity for about a week now, so we will see what this does. I was diagnosed as stage 3 chronic kidney disease (CKD 3) so I am watching diet closely.""",High
4,Bupropion,Smoking Cessation,"""Love this, no mouth sores, or ulcers like Wellbutrin gave me. I COMPLETELY QUIT SMOKING...this works.""",High
...,...,...,...,...
23300,Phentermine,Weight Loss,"""Started Adipex 2 weeks ago. Lost 20 lbs so far! I even skip the pill 1 to 2 times a week. The energy loss is aweful those days though.""",High
23301,Zolpidem,Insomnia,"""Zolpidem does work fast. However, I have a right arm that goes to sleep on me easily. So if I take zolpidem, I make sure that I fall asleep in a position that will not cut the circulation in my arm.""",Low
23302,Zolpidem,Insomnia,"""I started taking this medication 10 years ago. My doctor told me it was non-narcotic and non-habit forming. Up to that time I had never had any addictions to drugs or alcohol, but unfortunately I am dependent on this drug for sleep. It does however work very well with few side effects and I wake up feeling refreshed. """,High
23303,Invokana,"Diabetes, Type 2","""I just got diagnosed with type 2. My doctor prescribed Invokana and metformin from the beginning. My sugars went down to normal by the second week. I am losing so much weight. No side effects yet. Miracle medicine for me""",High


In [4]:
# Converting structured categorical features to numerical features
rating_codes = {'High': 1, 'Low': 0}
encoded_rating = df['Rating'].map(rating_codes)
if encoded_rating.isna().any():
    # Series.map yields NaN for any value missing from the mapping. That happens for
    # unexpected labels and also when this cell is re-run on an already-encoded column,
    # so fail loudly here instead of passing NaN targets on to the classifier.
    raise ValueError(
        "Rating contains values other than 'High'/'Low' "
        "(was this cell already run on this dataframe?)"
    )
df['Rating'] = encoded_rating

# One-hot encode the categorical columns; each distinct value becomes its own indicator column
categorical_features = ['Medicine', 'Condition']
df = pd.get_dummies(df, columns = categorical_features)
# DataFrame.info() prints its own report and returns None, so it must not be wrapped in print()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23305 entries, 0 to 23304
Columns: 579 entries, Review to Condition_Weight Loss
dtypes: int64(1), object(1), uint8(577)
memory usage: 13.2+ MB
None


In [5]:
# Converting unstructured 'Review' column to a TF-IDF matrix

# Built once instead of on every call: cleaner() runs once per review, and
# re-reading the stop-word corpus / re-creating the lemmatizer each time is wasted work.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def cleaner(review):
    """Clean a single review and return its lemmatized tokens.

    Steps: strip HTML markup/entities, blank out @mentions / URLs / ``\\x``
    escape residue, replace every run of non-alphabetic characters with a
    space, tokenize, lower-case, drop English stop words, and lemmatize.

    For more info on regular expressions visit -
    https://docs.python.org/3/howto/regex.html

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (e.g. a NaN from a missing
        cell) is treated as an empty review.

    Returns
    -------
    list of str
        The lemmatized tokens; empty when nothing survives the cleaning.
    """
    if not isinstance(review, str):
        # Missing cells come through as float NaN; there is no text to clean.
        return []

    # lxml is the HTML parser and must be installed separately ('pip install lxml')
    souped = BeautifulSoup(review, 'lxml').get_text()
    # substituting @mentions, urls, etc with whitespace
    without_links = re.sub(r"(@|http://|https://|www|\\x)\S*", " ", souped)
    # substituting any run of one or more non-alphabetic characters with whitespace
    letters_only = re.sub("[^A-Za-z]+", " ", without_links)

    lower_case = [token.lower() for token in nltk.word_tokenize(letters_only)]
    kept = [token for token in lower_case if token not in _STOP_WORDS]
    return [_LEMMATIZER.lemmatize(token) for token in kept]


In [6]:
# Clean every review into a list of lemmatized tokens
df['cleaned_review'] = df.Review.apply(cleaner)
# Removing rows with cleaned reviews of length 0. The index is reset so that row labels
# stay equal to row positions: later cells concatenate this frame column-wise with a
# freshly built TF-IDF frame and index the target with positional fold indices, both of
# which would silently misalign if dropped rows left gaps in the index.
df = df[df['cleaned_review'].map(len) > 0].reset_index(drop=True)
print("Printing top 5 rows of dataframe showing original and cleaned reviews....")
print(df[['Review','cleaned_review']].head())

df.to_csv('cleaned_Med_data.csv', index=False) # Saving cleaned reviews to csv (to inspect how well the reviews have been cleaned!)

Printing top 5 rows of dataframe showing original and cleaned reviews....
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Review  \
0                                    "I have been taking Saxenda since July 2016.  I had severe nausea for about a month once I got up to the 2.6 dosage

In [7]:
# Joining tokens to create strings: TfidfVectorizer expects one string per document.
# Entries that are already strings are left untouched, so re-running this cell does not
# split previously joined text into single characters. assign() builds a new frame,
# which avoids writing into a filtered slice of the original dataframe.
df = df.assign(
    cleaned_review=df['cleaned_review'].map(
        lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
    )
)
data = df['cleaned_review']
Y = df['Rating'] # target column
# A float min_df is a proportion of documents: 23305 * 0.00086 is about 20.04, so an ngram
# (unigram, bigram or trigram) is kept only if it occurs in more than roughly 20 reviews.
tfidf = TfidfVectorizer(min_df=.00086, ngram_range=(1,3))
# Learn the vocabulary and compute the tf-idf values in a single pass over the data
data_tfidf = tfidf.fit_transform(data)
# NOTE(review): the vocabulary/IDF weights are learned from every review, including rows
# that are later held out during cross-validation - confirm this is acceptable.

In [10]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever one the installed version provides.
feature_names = tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()
# Saving vocabulary to csv as "term,column index" rows
pd.DataFrame.from_dict(data={word: i for i, word in enumerate(feature_names)}, orient='index').to_csv('vocabulary.csv', header=False)
print("Shape of tfidf matrix: ", data_tfidf.shape)
print(type(data_tfidf))

Shape of tfidf matrix:  (23305, 8103)
<class 'scipy.sparse.csr.csr_matrix'>


In [15]:
# Concatenating TF-IDF matrix with the rest of the independent variables to create a final feature set
df1 = df.drop(['Review', 'cleaned_review', 'Rating'], axis=1)
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever one the installed version provides.
feature_names = tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()
# Converting tfidf matrix into a dataframe. It reuses df1's index because concat(axis=1)
# aligns on index labels: a default 0..n-1 index would misalign against df1 (and introduce
# NaN rows) whenever empty reviews were filtered out earlier.
df2 = pd.DataFrame(data_tfidf.toarray(), index=df1.index, columns=feature_names)
X = pd.concat([df1, df2], axis = 1)

print(df1.shape)
print(df2.shape)
print(X.shape)
# DataFrame.info() prints its own report and returns None, so it must not be wrapped in print()
X.info()


(23305, 577)
(23305, 8103)
(23305, 8680)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 23305 entries, 0 to 23304
Columns: 8680 entries, Medicine_Absorica to zyrtec
dtypes: float64(8103), uint8(577)
memory usage: 1.4 GB
None


In [19]:
# Implementing Support Vector Classifier (linear SVM with default hyper-parameters)
model = LinearSVC()

# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores = []
for iteration, (train_index, test_index) in enumerate(kf.split(X, Y), start=1):
    print("Iteration ", iteration)
    # split() yields positional indices, so both X and Y are sliced with .iloc;
    # plain Y[...] would look the numbers up as index labels instead.
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]
    model.fit(X_train, Y_train) # Fitting SVC on the training folds
    Y_pred = model.predict(X_test)
    score = metrics.precision_score(Y_test, Y_pred) # Calculating precision on the held-out fold
    print("Cross-validation precision: ", score)
    scores.append(score) # appending cross-validation precision for each iteration
mean_precision = np.mean(scores)
print("Mean cross-validation precision: ", mean_precision)

# Creating SVC on entire data and saving it
# NOTE(review): only the classifier is persisted here; scoring new reviews would also need
# the fitted `tfidf` vectorizer and the same column layout as X - confirm they are saved elsewhere.
clf = LinearSVC().fit(X, Y)
joblib.dump(clf, 'svc.sav')


Iteration  1
Cross-validation precision:  0.9057815845824411
Iteration  2
Cross-validation precision:  0.9230769230769231
Iteration  3
Cross-validation precision:  0.9191096634093376
Iteration  4
Cross-validation precision:  0.9269085002707093
Iteration  5
Cross-validation precision:  0.9129032258064517
Iteration  6
Cross-validation precision:  0.9114806866952789
Iteration  7
Cross-validation precision:  0.9197002141327623
Iteration  8
Cross-validation precision:  0.9172043010752688
Iteration  9
Cross-validation precision:  0.9269085002707093
Iteration  10
Cross-validation precision:  0.9174361759913091
Mean cross-validation precision:  0.918050977531119


['svc.sav']