# Baseline Model

## 資料前處理
文字部份取`review`欄位。
`rating`原範圍1-10, 定義1-4分為負評(標記：-1), 5-7分為中性評論(標記：0), 8-10分為好評(標記：1)。新增`polarity`欄位紀錄好評、中性評論及負評標記。
建立 drugSet 紀錄所有`drugName`及`condition`出現過的藥名與治療目的，文字全部轉為小寫，作為額外的stopwords。
`review`部份全部過濾標點符號、停用字，並轉為小寫，但保留數字。

## Feature & Target
利用 sklearn 的 CountVectorizer 建立詞頻紀錄，作為輸入的 feature，feature 大小為原始資料詞庫大小：63011。target 則是`polarity`欄位。

## Model
- Logistic Regression

training accuracy:
0.8425451186320886

testing accuracy:
0.7738719636945282

- SVC

training accuracy:
0.604456375506054

testing accuracy:
0.6018115537700406

- MultinomialNB

training accuracy:
0.7533246123610483

testing accuracy:
0.7220734293047651


In [15]:
# Read data

import pandas as pd
import numpy as np

# Locations of the tab-separated train/test splits of the drug-review data.
train_data_path, test_data_path = (
    './drugsComments/train.tsv',
    './drugsComments/test.tsv',
)

# Load both splits; the files are tab-separated, hence sep='\t'.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_data_path, test_data_path)
)

# Sanity check: dimensions and first rows of the training split.
print(train_df.shape)
print(train_df.head())

(161297, 7)
   Unnamed: 0                  drugName                     condition  \
0      206461                 Valsartan  Left Ventricular Dysfunction   
1       95260                Guanfacine                          ADHD   
2       92703                    Lybrel                 Birth Control   
3      138000                Ortho Evra                 Birth Control   
4       35696  Buprenorphine / naloxone             Opiate Dependence   

                                              review  rating  \
0  "It has no side effect, I take it in combinati...     9.0   
1  "My son is halfway through his fourth week of ...     8.0   
2  "I used to take another oral contraceptive, wh...     5.0   
3  "This is my first time using any form of birth...     8.0   
4  "Suboxone has completely turned my life around...     9.0   

                date  usefulCount  
0       May 20, 2012           27  
1     April 27, 2010          192  
2  December 14, 2009           17  
3   November 3, 2015

Drop the columns `Unnamed: 0` and `date`, which are not needed. Five columns remain:

In [16]:
# Drop unnecessary columns: Unnamed & date

# Remove the two unused columns from each split, modifying the frames in place.
for frame in (train_df, test_df):
    frame.drop(columns=['Unnamed: 0', 'date'], inplace=True)

# Confirm the new shape and show the remaining columns.
print(train_df.shape)
print(train_df.head())

(161297, 5)
                   drugName                     condition  \
0                 Valsartan  Left Ventricular Dysfunction   
1                Guanfacine                          ADHD   
2                    Lybrel                 Birth Control   
3                Ortho Evra                 Birth Control   
4  Buprenorphine / naloxone             Opiate Dependence   

                                              review  rating  usefulCount  
0  "It has no side effect, I take it in combinati...     9.0           27  
1  "My son is halfway through his fourth week of ...     8.0          192  
2  "I used to take another oral contraceptive, wh...     5.0           17  
3  "This is my first time using any form of birth...     8.0           10  
4  "Suboxone has completely turned my life around...     9.0           37  


In [17]:
# Keep a set of all drug and condition names that occur in the dataset.

import re

# Distinct raw values of the drug-name and condition columns, per split.
drugNameSet_train = set(train_df['drugName'])
conditionNameSet_train = set(train_df['condition'])
drugNameSet_test = set(test_df['drugName'])
conditionNameSet_test = set(test_df['condition'])

oldDrugSet = drugNameSet_train | conditionNameSet_train | drugNameSet_test | conditionNameSet_test

# Punctuation that is deleted from each name. Written as a raw string so the
# `\[` / `\]` escapes reach the regex engine instead of being treated as
# (invalid) string-literal escapes.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")

# Lower-case every name, skip entries containing "helpful." (presumably
# scraped page fragments rather than real condition names -- TODO confirm),
# and strip punctuation from the rest.
# NOTE(review): names are stored whole, so multi-word entries keep their
# spaces; filter_stopwords compares single tokens against this set --
# confirm that only single-word names are meant to be filtered.
drugSet = {
    REPLACE_NO_SPACE.sub("", name)
    for name in (str(drug).lower() for drug in oldDrugSet)
    if "helpful." not in name
}

# Missing values stringify to 'nan'; drop that entry if it is present.
# discard() (unlike remove()) does not raise when no value was missing.
drugSet.discard('nan')


# Print `drugSet`

print('size of drugSet:', len(drugSet), sep='\n')
print('\nFirst 20 drug names:')
showNum = 20
countNum = 0
# Counting from 1 makes countNum the number of names printed so far.
for countNum, drug in enumerate(drugSet, start=1):
    print(drug)
    if countNum == showNum:
        break

size of drugSet:
4507

First 20 drug names:
temozolomide
amnesteem
lithium
naloxone / pentazocine
salivart
proventil
belbuca
nasonex
triavil
nortriptyline
peg3350
ciclopirox
maxzide
mephobarbital
estazolam
promethazine dm
erosive esophagitis
adderall xr
azo urinary pain relief
pexeva


In [18]:
# Define `polarity`

# Bucket edges and the label assigned to each bucket:
# up to 4 -> -1 (negative), above 4 up to 7 -> 0 (neutral), above 7 up to 10 -> 1 (positive).
rating_bins = [-1, 4, 7, 10]
polarity_labels = [-1, 0, 1]

# Add the same derived column to both splits.
for frame in (train_df, test_df):
    frame['polarity'] = pd.cut(frame['rating'], bins=rating_bins, labels=polarity_labels)

# Spot-check the mapping on the first 30 training rows.
print(train_df[['rating', 'polarity']].head(30))

    rating polarity
0      9.0        1
1      8.0        1
2      5.0        0
3      8.0        1
4      9.0        1
5      2.0       -1
6      1.0       -1
7     10.0        1
8      1.0       -1
9      8.0        1
10     9.0        1
11    10.0        1
12     4.0       -1
13     4.0       -1
14     3.0       -1
15     9.0        1
16     9.0        1
17     9.0        1
18    10.0        1
19    10.0        1
20     8.0        1
21    10.0        1
22     9.0        1
23    10.0        1
24     1.0       -1
25     7.0        0
26    10.0        1
27    10.0        1
28     6.0        0
29     8.0        1


In [19]:
# Text cleaning

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

tqdm.pandas()

# Punctuation that is deleted outright from the text. Raw strings are used so
# that regex escapes such as `\[` and `\s` are not interpreted (and warned
# about) as invalid string-literal escapes.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# A doubled HTML line break, a hyphen, or a slash is replaced by a space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)")

stopWords = set(stopwords.words('english'))


def filter_stopwords(sentence):
    """Normalize one review and drop stop words and drug/condition names.

    The text has the HTML entity ``&#039;`` decoded to an apostrophe, is
    lower-cased, has every REPLACE_NO_SPACE match deleted and every
    REPLACE_WITH_SPACE match replaced by a space, and is then split into
    tokens with ``word_tokenize``. Tokens found in ``stopWords`` or in
    ``drugSet`` are removed.

    Args:
        sentence: Raw review text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    cleaned = sentence.replace("&#039;", "'").lower()
    cleaned = REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", cleaned))

    # NOTE(review): apostrophes are deleted before this lookup, so any
    # stop-list entry that contains an apostrophe can never match (the sample
    # output still shows tokens such as "didnt" and "im") -- confirm whether
    # that is intended.
    kept = [
        token
        for token in word_tokenize(cleaned)
        if not (token in stopWords or token in drugSet)
    ]
    return " ".join(kept)


# Raw reviews before cleaning, for comparison with the output below.
print(train_df['review'].head().values)

# Clean both splits with the same function; progress_apply is the
# tqdm-enabled variant of apply and shows a progress bar per split.
for frame in (train_df, test_df):
    frame['review'] = frame['review'].progress_apply(filter_stopwords)

print('\n')
# The same rows after cleaning.
print(train_df['review'].head().values)

  0%|          | 114/161297 [00:00<02:21, 1138.79it/s]

['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"'
 '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."'
 '"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone

100%|██████████| 161297/161297 [02:01<00:00, 1325.08it/s]
100%|██████████| 53766/53766 [00:40<00:00, 1338.65it/s]



['side effect take combination 5 mg fish oil'
 'son halfway fourth week became concerned began last week started taking highest dose two days could hardly get bed cranky slept nearly 8 hours drive home school vacation unusual called doctor monday morning said stick days see school getting morning last two days problem free much agreeable ever less emotional good thing less cranky remembering things overall behavior better tried many different medications far effective'
 'used take another oral contraceptive 21 pill cycle happy light periods max 5 days side effects contained hormone gestodene available us switched ingredients similar pills ended started immediately first day period instructions said period lasted two weeks taking second pack two weeks third pack things got even worse third period lasted two weeks end third week still daily brown discharge positive side didnt side effects idea period free tempting alas'
 'first time using form birth control im glad went patch 8 months 




In [20]:
# Word counts

from sklearn.feature_extraction.text import CountVectorizer
import datetime

# Timestamp marking the start of this cell.
print(datetime.datetime.now())
print('\n')

print(train_df.head(15))

# The vocabulary is learned from the training reviews only; both splits are
# then mapped onto that same vocabulary.
vectorizer = CountVectorizer().fit(train_df['review'])
data_train_x_count, data_test_x_count = (
    vectorizer.transform(frame['review']) for frame in (train_df, test_df)
)

# Shape, container type and first row of the training count matrix.
for summary in (data_train_x_count.shape, type(data_train_x_count), data_train_x_count[0]):
    print(summary)

# Timestamp marking the end of this cell.
print('\n')
print(datetime.datetime.now())

2019-05-16 03:09:39.621416


                              drugName                     condition  \
0                            Valsartan  Left Ventricular Dysfunction   
1                           Guanfacine                          ADHD   
2                               Lybrel                 Birth Control   
3                           Ortho Evra                 Birth Control   
4             Buprenorphine / naloxone             Opiate Dependence   
5                               Cialis  Benign Prostatic Hyperplasia   
6                       Levonorgestrel       Emergency Contraception   
7                         Aripiprazole               Bipolar Disorde   
8                               Keppra                      Epilepsy   
9   Ethinyl estradiol / levonorgestrel                 Birth Control   
10                          Topiramate           Migraine Prevention   
11                      L-methylfolate                    Depression   
12                             Pent

In [21]:
# Show first 100 words in the corpus

# Vocabulary terms ordered by their column index in the count matrix.
# They are read from the fitted `vocabulary_` mapping (term -> column index)
# instead of `get_feature_names()`, which current scikit-learn releases no
# longer provide; this form works on old and new releases alike.
words = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
print(len(words))

# Print only the first `showNum` terms.
showNum = 100
for word in words[:showNum]:
    print(word)

63011
00
000
0000
00007
0001
0002
0003
0009
000vl
001
0010782485900484
0015
001mg
001mgs
002
0020
0025
0025mg
0025now
002s
003
0030
00375
003mg
004
005
0050
00550mcgfor
005mg
006
0075
008
01
010
0100
012
0125
0125mg
0133
0137mcg
0145
015
0150mcg
016
017
01mg
02
0200
02092016
0210
0216
02252016
0230am
0235
024
025
025mcgs
025mg
025mgs
025ml
027
0270
02g
02mg
03
032
0325
034
035
035mg
0375
0375mg
03mg
03milligrams
04
0400
042
0433am
045
0450
045mg
04mg
05
050
0500
050mcgs
050mg
0530
054
055lbs
059
05g
05l
05lpm
05mg
05mg1hr
05mgs
05ml
05s
05x2


In [22]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Timestamp marking the start of training.
print(datetime.datetime.now())
print('\n')

# Fit on the training word counts with `polarity` as the target.
lr_model = LogisticRegression(max_iter=100).fit(data_train_x_count, train_df['polarity'])

# Accuracy on the same data the model was fitted on.
lr_predict_train_y = lr_model.predict(data_train_x_count)
print('training accuracy:', accuracy_score(train_df['polarity'], lr_predict_train_y), sep='\n')

# Accuracy on the test split.
lr_predict_test_y = lr_model.predict(data_test_x_count)
print('\ntesting accuracy:', accuracy_score(test_df['polarity'], lr_predict_test_y), sep='\n')

# Timestamp marking the end of the cell.
print('\n')
print(datetime.datetime.now())

2019-05-16 03:10:05.180108


training accuracy:
0.8425451186320886

testing accuracy:
0.7738719636945282


2019-05-16 03:12:11.116520


In [23]:
# SVC

from sklearn.svm import SVC

# Timestamp marking the start of training.
print(datetime.datetime.now())
print('\n')

from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# The scaler is fitted on the training counts only and reused for the test
# counts. with_mean=False turns centering off (presumably because the count
# matrix is sparse -- TODO confirm).
scaler = StandardScaler(with_mean=False).fit(data_train_x_count)
train_normalize, test_normalize = (
    scaler.transform(counts) for counts in (data_train_x_count, data_test_x_count)
)

# NOTE(review): max_iter=50 is a hard cap on solver iterations; the scores
# reported for this model were obtained under that cap -- check whether the
# solver had actually converged before comparing it with the other models.
svc_model = SVC(kernel='linear', max_iter=50).fit(train_normalize, train_df['polarity'])

# Accuracy on the same data the model was fitted on.
svc_predict_train_y = svc_model.predict(train_normalize)
print('training accuracy:', accuracy_score(train_df['polarity'], svc_predict_train_y), sep='\n')

# Accuracy on the test split.
svc_predict_test_y = svc_model.predict(test_normalize)
print('\ntesting accuracy:', accuracy_score(test_df['polarity'], svc_predict_test_y), sep='\n')

# Timestamp marking the end of the cell.
print('\n')
print(datetime.datetime.now())

2019-05-16 03:12:11.129487


training accuracy:
0.604456375506054

testing accuracy:
0.6018115537700406


2019-05-16 03:13:08.263783


In [24]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# Timestamp marking the start of training.
print(datetime.datetime.now())
print('\n')

# Multinomial naive Bayes fitted directly on the raw word counts.
nb_model = MultinomialNB().fit(data_train_x_count, train_df['polarity'])

# Accuracy on the same data the model was fitted on.
nb_predict_train_y = nb_model.predict(data_train_x_count)
print('training accuracy:', accuracy_score(train_df['polarity'], nb_predict_train_y), sep='\n')

# Accuracy on the test split.
nb_predict_test_y = nb_model.predict(data_test_x_count)
print('\ntesting accuracy:', accuracy_score(test_df['polarity'], nb_predict_test_y), sep='\n')

# Timestamp marking the end of the cell.
print('\n')
print(datetime.datetime.now())

2019-05-16 03:13:08.274140


training accuracy:
0.7533246123610483

testing accuracy:
0.7220734293047651


2019-05-16 03:13:08.580623
