In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import html
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import os

In [2]:
# Directory holding the review CSV files (relative to the notebook's working directory).
path_dataset = './datasets/'

# os.listdir() returns entries in arbitrary, platform-dependent order, so sort
# them to make the positional picks below reproducible across machines.
# NOTE(review): assumes the directory contains only the test and train files and
# that the test file sorts first — confirm against the actual file names.
datasets = sorted(os.listdir(path_dataset))

test = pd.read_csv(os.path.join(path_dataset, datasets[0]))
train = pd.read_csv(os.path.join(path_dataset, datasets[1]))

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the two frames the same way and ignore_index=True renumbers rows 0..n-1.
drugs_pd = pd.concat([test, train], ignore_index=True)
drugs_pd

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4
...,...,...,...,...,...,...,...
215058,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10,31-May-15,125
215059,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1,1-Nov-11,34
215060,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2,15-Mar-14,35
215061,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10,19-Sep-15,79


In [3]:
# Column dtypes and non-null counts for the combined frame.
drugs_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215063 entries, 0 to 215062
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uniqueID     215063 non-null  int64 
 1   drugName     215063 non-null  object
 2   condition    213869 non-null  object
 3   review       215063 non-null  object
 4   rating       215063 non-null  int64 
 5   date         215063 non-null  object
 6   usefulCount  215063 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 11.5+ MB


In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
drugs_pd.describe()

Unnamed: 0,uniqueID,rating,usefulCount
count,215063.0,215063.0,215063.0
mean,116039.364814,6.990008,28.001004
std,67007.913366,3.275554,36.346069
min,0.0,1.0,0.0
25%,58115.5,5.0,6.0
50%,115867.0,8.0,16.0
75%,173963.5,10.0,36.0
max,232291.0,10.0,1291.0


In [5]:
# Number of missing values in each column.
drugs_pd.isna().sum()

uniqueID          0
drugName          0
condition      1194
review            0
rating            0
date              0
usefulCount       0
dtype: int64

In [6]:
# Rows whose `condition` is missing, as an independent copy.
# A direct boolean mask replaces the earlier "drop every non-null row by index
# label" round-trip: it selects the same rows here and stays correct even if
# index labels were ever duplicated (label-based drop removes every row that
# shares a label).
no_condition = drugs_pd[drugs_pd['condition'].isna()].copy()

no_condition

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
34,77164,Lorcaserin,,"""Have had back and leg pain and sometimes arm ...",5,27-May-14,21
146,192374,Drospirenone / ethinyl estradiol,,"""I hate this birth control. I was originally t...",4,13-Apr-15,8
156,165767,Levonorgestrel,,"""I&#039;ve never had kids and I am 32 fyi. The...",7,3-Apr-17,0
779,157589,Guaifenesin,,"""You&#039;ll almost always notice a medicinal ...",9,23-Feb-10,10
1017,96918,Conjugated estrogens / medroxyprogesterone,,"""Has not worked one bit still have day and nig...",1,6-Jul-15,5
...,...,...,...,...,...,...,...
214234,117817,Multivitamin with minerals,,"""Severe hives itching after taking for 6 months""",5,15-Nov-15,0
214266,657,Medroxyprogesterone,,"""I am 18 and I have been using the shot for 8 ...",6,20-Nov-11,2
214455,104844,Ethinyl estradiol / levonorgestrel,,"""I&#039;ve been on Loseasonique for about 2 we...",10,13-Apr-10,3
214518,41252,Acetaminophen / oxycodone,,"""This is my third day using this pain medicine...",10,13-Dec-10,4


In [18]:
# Some `condition` values are not conditions at all but leaked markup such as
# "12</span> users found this comment helpful."; this pattern matches values
# that start with optional word characters followed by a closing span tag.
pattern = r'\w*</span>\w*'

check = list(drugs_pd['condition'].unique())

# Only strings can be regex-matched. isinstance() skips NaN (a float) and any
# other non-string value instead of letting re.match raise TypeError on it.
non_conditions = [
    cond for cond in check
    if isinstance(cond, str) and re.match(pattern, cond)
]

# Independent copy of every row carrying one of the bogus condition values.
non_condition = drugs_pd[drugs_pd['condition'].isin(non_conditions)].copy()
non_condition

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
136,33560,Microgestin Fe 1 / 20,12</span> users found this comment helpful.,I was really worried at first because of all t...,9,24-Dec-12,12
211,133232,Tri-Sprintec,0</span> users found this comment helpful.,I have been off of birth control for 2 years. ...,5,20-Dec-11,0
348,60892,Kombiglyze XR,6</span> users found this comment helpful.,"This medicine is great, our patients no longer...",10,4-Oct-12,6
368,66762,Seroquel,24</span> users found this comment helpful.,I've been taking this since January 2013 for d...,9,19-Sep-13,24
572,156099,Cialis,9</span> users found this comment helpful.,I took the pill Friday night and went to bed. ...,10,3-May-11,9
...,...,...,...,...,...,...,...
214088,173464,Alli,99</span> users found this comment helpful.,Great pill... You can definitely see it workin...,9,2-Apr-15,99
214557,150010,Soma,62</span> users found this comment helpful.,I am sharing a painful experience actually. Th...,5,11-Mar-10,62
214665,209239,Geodon,7</span> users found this comment helpful.,I've been taking Geodon for about a month or s...,7,4-Feb-11,7
214687,96527,Neurontin,92</span> users found this comment helpful.,At 300mg 4x daily my pain was almost gone.,9,8-Jul-13,92


In [8]:
# Raw review text of each special-case subset, as plain Python lists.
noc_reviews = no_condition['review'].tolist()
nonc_reviews = non_condition['review'].tolist()

def clean_review(reviews):
    """Normalise raw review strings.

    Each review has the literal ``%u2019`` escape replaced by an apostrophe,
    HTML entities (e.g. ``&#039;``) decoded, and any double quotes wrapping
    the text stripped from both ends.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.

    Returns
    -------
    list of str
        Cleaned reviews, in the same order as the input.
    """
    clean = []

    for rev in reviews:
        # Apply both fixes unconditionally. The previous either/or branch left
        # HTML entities encoded whenever "%u2019" was present, and its
        # `.*%u2019.*` match missed "%u2019" after a line break because `.`
        # does not match newlines.
        cleaned = html.unescape(rev.replace('%u2019', "'"))
        clean.append(cleaned.strip('"'))

    return clean

# Overwrite each subset's `review` column with its cleaned text; the lists are
# in row order, so values land back on the rows they came from.
no_condition['review'] = clean_review(noc_reviews)
non_condition['review'] = clean_review(nonc_reviews)

In [9]:
# Write the cleaned review text of both special-case subsets back into the main
# frame. DataFrame.update aligns on the index and modifies drugs_pd in place.
# NOTE(review): because drugs_pd is mutated here, re-running this cell re-cleans
# text that has already been cleaned.
drugs_pd.update(no_condition['review'])
drugs_pd.update(non_condition['review'])

# Remaining rows: those that belong to neither special-case subset.
conditions = drugs_pd[
    ~(drugs_pd.index.isin(non_condition.index) | drugs_pd.index.isin(no_condition.index))
].copy()
conditon_reviews = conditions['review'].tolist()

# Clean those reviews too and merge them back the same way.
conditions['review'] = clean_review(conditon_reviews)
drugs_pd.update(conditions['review'])

# Independent snapshot of the fully cleaned frame for downstream work.
cleaned_drugs = drugs_pd.copy()

In [10]:
# Working dataset from here on: a copy of the combined frame taken after the
# review text was cleaned.
cleaned_drugs

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,I've tried a few antidepressants over the year...,10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance",My son has Crohn's disease and has done very w...,8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,Quick reduction of symptoms,9,29-Sep-17,3
3,39293,Contrave,Weight Loss,Contrave combines drugs that were used for alc...,9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,I have been on this birth control for one cycl...,9,22-Oct-15,4
...,...,...,...,...,...,...,...
215058,191035,Campral,Alcohol Dependence,I wrote my first report in Mid-October of 2014...,10,31-May-15,125
215059,127085,Metoclopramide,Nausea/Vomiting,I was given this in IV before surgey. I immedi...,1,1-Nov-11,34
215060,187382,Orencia,Rheumatoid Arthritis,"Limited improvement after 4 months, developed ...",2,15-Mar-14,35
215061,47128,Thyroid desiccated,Underactive Thyroid,"I've been on thyroid medication 49 years, I sp...",10,19-Sep-15,79


In [15]:
def deconstruct(phrase):
    """Expand common English contractions in ``phrase``.

    "won't" and "can't" are handled first because their stems are irregular;
    the capitalisation of their first letter is preserved ("Won't" becomes
    "Will not"). The remaining suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm)
    are expanded only when they end a word, so an opening single quote such as
    the one in ``'really'`` is left untouched.

    Known limitation: ``'s`` is always expanded to " is", so possessives are
    rewritten as well ("Crohn's disease" becomes "Crohn is disease").

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        The text with contractions expanded.
    """
    # Irregular stems; the captured first letter keeps sentence-initial
    # capitals instead of degrading to "Wo not" / "Ca not".
    phrase = re.sub(r"\b([Ww])on't\b", r"\1ill not", phrase)
    phrase = re.sub(r"\b([Cc])an't\b", r"\1an not", phrase)

    # Regular suffixes, applied in this order. Each must follow a word
    # character and sit at a word boundary, so quoted words ('really',
    # 'so-so') and names like O'Reilly are not mangled.
    suffix_expansions = (
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    )
    for suffix, expansion in suffix_expansions:
        phrase = re.sub(r"(?<=\w)" + suffix + r"\b", expansion, phrase)
    return phrase

# Expand contractions in every review, then show the resulting frame.
cleaned_drugs['review'] = cleaned_drugs['review'].map(deconstruct)
cleaned_drugs

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,I have tried a few antidepressants over the ye...,10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance",My son has Crohn is disease and has done very ...,8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,Quick reduction of symptoms,9,29-Sep-17,3
3,39293,Contrave,Weight Loss,Contrave combines drugs that were used for alc...,9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,I have been on this birth control for one cycl...,9,22-Oct-15,4
...,...,...,...,...,...,...,...
215058,191035,Campral,Alcohol Dependence,I wrote my first report in Mid-October of 2014...,10,31-May-15,125
215059,127085,Metoclopramide,Nausea/Vomiting,I was given this in IV before surgey. I immedi...,1,1-Nov-11,34
215060,187382,Orencia,Rheumatoid Arthritis,"Limited improvement after 4 months, developed ...",2,15-Mar-14,35
215061,47128,Thyroid desiccated,Underactive Thyroid,"I have been on thyroid medication 49 years, I ...",10,19-Sep-15,79


### Tasks:

* Exploratory data analysis: plot and summarize the dataset (e.g. count the number of distinct drugs).


* NLP sentiment analysis: use VADER (see its documentation for details). In short, it returns four values (`neg`, `neu`, `pos`, `compound`) that score the sentiment of a given sentence.


* Derive a column that scores the actual rating based on `rating`, the sentiment score, and `usefulCount`.


* TODO: figure out how to apply cosine and Jaccard similarity here (approach not yet decided).