In [1]:
import os
from time import time
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

##### Lecture des données :

In [2]:
PATH_TO_DATA = 'data'

In [3]:
# Reading files
train_set = pd.read_table(os.path.join(PATH_TO_DATA, 'drugsComTrain_raw.tsv'))
test_set = pd.read_table(os.path.join(PATH_TO_DATA, 'drugsComTest_raw.tsv'))
df = pd.concat([train_set,test_set])

In [4]:
df.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)
df.head()

Unnamed: 0,ID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


Sélection des colonnes utiles :

In [15]:
df1 = df[['ID','review','rating']].copy() 

On définit les fonctions de preprocessing et de transformation de la variable de sortie :

In [16]:
import re
import string
stemmer = nltk.stem.SnowballStemmer('english')
# lower case everything
def textLower(x):
    return x.lower()

#delete numbers and replace punctuation by space 
def keepletters(input_str):
    input_str = re.sub(r'\d+', '', input_str)
    return input_str.translate(str.maketrans(string.punctuation,' '*32))

#remove last space and first space
def stripfc(input_str):
    return input_str.strip()

def tokenize_stpords(input_str):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(input_str)
    return [i for i in tokens if not i in stop_words]

def preprocess(x):
    x = textLower(x)
    x = keepletters(x)
    x = stripfc(x)
    tokens = tokenize_stpords(x)
    tokens_stem = ' '.join([stemmer.stem(token) for token in tokens])
    return tokens_stem

Preprocessing des données :

In [17]:
start = time()
df1["review"] = df1["review"].apply(preprocess)
end = time()
print('Le preprocessing prend {}'.format(end-start))

Le preprocessing prend 341.49270391464233


##### Vectorisation : Calcul des vecteurs TF-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df1.review)
end = time()
print('Calcul du terme TF-IDF prend {}'.format(end-start))
print(len(vectorizer.get_feature_names()))

Calcul du terme TF-IDF prend 348.8041625022888
34996


##### Transformation de la variable de sortie :

In [19]:
df1['rating'] = df1['rating'].apply(lambda x: 0 if x<=5 else 1)
end = time()
print('Catégorisation des ratings {}'.format(end-start))
print(df1.head())

Catégorisation des ratings 348.9258360862732
       ID                                             review  rating
0  206461         side effect take combin bystol mg fish oil       1
1   95260  son halfway fourth week intuniv becam concern ...       1
2   92703  use take anoth oral contracept pill cycl happi...       0
3  138000  first time use form birth control glad went pa...       1
4   35696  suboxon complet turn life around feel healthie...       1


### Phase d'apprentissage :

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, df1['rating'], test_size=0.3, random_state=42)

##### Régression logistique :

In [21]:
from sklearn.linear_model import LogisticRegression
start = time()
model = LogisticRegression(C=1)
model.fit(X_train, y_train)
end = time()
print('Le training prend {}'.format(end-start))
preds = model.predict(X_test)
accuracy = (preds == y_test).mean()
print(accuracy)



Le training prend 3.179501533508301
0.8315844944899952


##### Naive Bayes :

In [22]:
from sklearn.naive_bayes import MultinomialNB
start = time()
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)
end = time()
print('Le training prend {}'.format(end-start))
preds_nb = model_nb.predict(X_test)
accuracy_nb = (preds_nb == y_test).mean()
print(accuracy_nb)

Le training prend 0.06582260131835938
0.7558548644585316


##### SVM :

In [23]:
from sklearn.svm import LinearSVC
start = time()
model_svm = LinearSVC()
model_svm.fit(X_train, y_train)
end = time()
print('Le training prend {}'.format(end-start))
preds_svm = model_svm.predict(X_test)
accuracy_svm = (preds_svm == y_test).mean()
print(accuracy_svm)

Le training prend 2.3646788597106934
0.8382336986004123


##### Random forests :

In [24]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
preds_rf = model_rf.predict(X_test)
accuracy_rf = (preds_rf == y_test).mean()
print(accuracy_rf)



0.8900788914893287
