In [131]:
#Étape 1. Importez les bibliothèques nécessaires, nous allons utiliser plus loin
import urllib.request
import os
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [161]:
df = pd.read_table('datasets.txt', sep=';')
df.head(3)

Unnamed: 0,name,review,sentiment
0,Leachco Snoogle Total Body Pillow,I am really enjoying the snoogle. It has help...,positive
1,"Baby Trend Flex Loc Infant Car Seat, Elixer",For our second child we bought this car seat (...,negative
2,Sunshine Kids Mighty Tite Seat Belt Tightener,If you sort these reviews by the star rating y...,positive


In [162]:
#Étape 3. Nettoyer les données textuelles
df.isnull().sum()


name         0
review       0
sentiment    0
dtype: int64

In [163]:
#supprimer les valeurs null
df = df.dropna()
df.isnull().sum()
len(df)

14167

In [140]:
df.to_csv(r'datasets.txt', header=None, index=None, sep=';', mode='a')

In [164]:
#installer nltk 
import nltk
nltk.download('brown')
nltk.download('stopwords')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [165]:
#tester nltk 
from nltk.corpus import brown
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [166]:
# init Objects
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords=set(stopwords.words('english'))
ps=PorterStemmer()
def getStemmedReview(review):
    review=review.lower()
    review=review.replace("<br /><br />"," ")
    #Tokenize
    tokens=tokenizer.tokenize(review)
    new_tokens=[token for token in tokens if token not in  en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in new_tokens]
    clean_review=' '.join(stemmed_tokens)
    return clean_review

In [178]:
len(df.loc[10000:, 'review'].values)

4167

In [181]:
#Étape 4. Nettoyer toutes les revues et diviser nos données pour la formation et les tests.
df['review'].apply(getStemmedReview)
X_train = df.loc[:10000, 'review'].values
y_train = df.loc[:10000, 'sentiment'].values
X_test = df.loc[10001:, 'review'].values
y_test = df.loc[10001:, 'sentiment'].values

In [None]:
#installer : pip3 install -U scikit-learn scipy matplotlib ou pip install -U scikit-learn scipy matplotlib

In [182]:
#Étape 5. Transformer des mots en vecteurs de caractéristiques
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8',
 decode_error='ignore')
vectorizer.fit(X_train)
X_train=vectorizer.transform(X_train)
X_test=vectorizer.transform(X_test)

In [183]:
#Étape 6. Création du modèle et vérification du score sur les données d'entraînement et de test
from sklearn.linear_model import LogisticRegression

model=LogisticRegression(solver='liblinear')
model.fit(X_train,y_train)

print("Score on training data is: "+str(model.score(X_train,y_train)))
print("Score on testing data is: "+str(model.score(X_test,y_test)))

Score on training data is: 0.9292070792920708
Score on testing data is: 0.8900624099855977


In [185]:
# tester sur une nouvelle valeur 
val = ["While the diaper pins are attractive, the metal in the pins I received are flimsy and did not hold up to the thick fabric I used them on. Fortunately there was no baby involved"]

In [186]:
val =vectorizer.transform(val)

In [187]:
model.predict(val)

array(['negative'], dtype=object)

In [188]:
val = ["I not miss this book"]
val = vectorizer.transform(val)
model.predict(val)

array(['negative'], dtype=object)

In [189]:
model.predict_proba(val)

array([[0.65704591, 0.34295409]])

**Deuxièmes modèles**


In [190]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [191]:
x=df['review']
y=df['sentiment']
X_train,X_test,y_train,y_test=train_test_split(x,y)

In [192]:
vect= CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [193]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)

LogisticRegression(max_iter=1000)

In [194]:
accuracy_score(model.predict(X_test),y_test)

0.8780350084697911

In [195]:
nb = MultinomialNB()
nb.fit(X_train,y_train)

MultinomialNB()

In [196]:
accuracy_score(nb.predict(X_test),y_test)

0.8537549407114624