In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score

import numpy as np
import utils as ut
import pandas as pd

In [2]:
import warnings

# Ignorer tous les avertissements
warnings.filterwarnings("ignore")

# Ignorer les avertissements spécifiques par leur catégorie
warnings.filterwarnings("ignore", category=UserWarning)


# Etape 0 : Chargement de données movies

Dans cette partie nous allons rechargé les données de movies afin d'effectuer l'analyse de sentiments

In [3]:
path = "../datasets/movies/movies1000/"
alltxts,alllabs = ut.load_movies(path)

Création du DataFrame

In [4]:
movies_df = pd.DataFrame()
movies_df['text'] = alltxts
movies_df['label'] = alllabs

movies_df

Unnamed: 0,text,label
0,supposedly based on a true story in which the ...,0
1,one might expect a cathartic viewing experienc...,0
2,the camera zooms in incredibly close . \nit fo...,0
3,1990s would remembered as the era of binary mo...,0
4,""" the world on land -- it's just too big for ...",0
...,...,...
1995,one of the more unusual and suggestively viole...,1
1996,i find most of television so intensely boring ...,1
1997,the event of events is upon us . \npeople have...,1
1998,the sheer horrific audacity of the nazi plan t...,1


# Grid search pour TF-IDF Vectorizer

In [5]:
f = lambda doc: ut.suppression_balises_html( ut.remove_numbers(ut.remove_ponctuation(doc)) ) 

## Test 1

In [26]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f , stop_words='english')),
    ('reg', LogisticRegression())
])

parameters = {
    'tfidf__max_df': [0.2 ,0.5],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [(1, 1), (2, 2)],
    'tfidf__binary': [False, True],
    'tfidf__lowercase': [False, True],
    #'tfidf__use_idf': [False, True],
    #'tfidf__sublinear_tf': [False, True],
    'tfidf__max_features': [ 10000],

    'reg__C': [10],
    'reg__penalty': [ 'l2' , 'l1']
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

[X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)


grid_search.fit(X_all_train, Y_train)


print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

# print("Scores:")
# print("F1 Score:", grid_search.cv_results_['mean_test_f1_score'])
# print("AUC:", grid_search.cv_results_['mean_test_roc_auc'])
# print("Accuracy:", grid_search.cv_results_['mean_test_accuracy'])


Fitting 5 folds for each of 32 candidates, totalling 160 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__penalty': 'l2', 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.5, 'tfidf__max_features': 10000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 1)}
Scores :
Accuracy: 0.8475
F1 Score: 0.8486352357320099
AUC: 0.8482176360225141


## Test 2

In [27]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f , stop_words='english')),
    ('reg', LogisticRegression())
])

parameters = {
    'tfidf__max_df': [0.5 , 0.6 ],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__binary': [False, True],
    'tfidf__lowercase': [False, True],
    #'tfidf__use_idf': [False, True],
    #'tfidf__sublinear_tf': [False, True],
    'tfidf__max_features': [ 10000 , 20000],

    'reg__C': [10, 100],
    'reg__penalty': [ 'l2' , 'l1']
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

[X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)


grid_search.fit(X_all_train, Y_train)


print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
Meilleurs paramètres trouvés:
{'reg__C': 100, 'reg__penalty': 'l2', 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.5, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2)}
Scores :
Accuracy: 0.8575
F1 Score: 0.8571428571428571
AUC: 0.8579737335834896


## Test 3

In [28]:
f2 = lambda doc: ut.lemmatization(ut.suppression_balises_html( ut.remove_numbers(ut.remove_ponctuation(doc)) ) )

In [29]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f2 , stop_words='english')),
    ('reg', LogisticRegression())
])

parameters = {
    'tfidf__max_df': [0.4 , 0.5 ],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__binary': [False, True],
    'tfidf__lowercase': [False, True],
    #'tfidf__use_idf': [False, True],
    #'tfidf__sublinear_tf': [False, True],
    'tfidf__max_features': [20000 , 30000],

    'reg__C': [100],
    'reg__penalty': [ 'l2' ]
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

[X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)


grid_search.fit(X_all_train, Y_train)


print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 32 candidates, totalling 160 fits




Meilleurs paramètres trouvés:
{'reg__C': 100, 'reg__penalty': 'l2', 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.5, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 1)}
Scores :
Accuracy: 0.8625
F1 Score: 0.8635235732009926
AUC: 0.8632270168855534


## Test 4

In [31]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f2 )),
    ('reg', LogisticRegression())
])

parameters = {
    'tfidf__max_df': [0.3 , 0.5 ],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [ (1, 2) , (1,3)],
    'tfidf__binary': [False, True],
    'tfidf__lowercase': [False, True],
    'tfidf__use_idf': [ True],
    'tfidf__sublinear_tf': [ True],
    'tfidf__max_features': [20000 , 40000],

    'reg__C': [100],
    'reg__penalty': [ 'l2' ]
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

[X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)


grid_search.fit(X_all_train, Y_train)


print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Meilleurs paramètres trouvés:
{'reg__C': 100, 'reg__penalty': 'l2', 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.3, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
Scores :
Accuracy: 0.89
F1 Score: 0.8883248730964467
AUC: 0.8901813633520951


## Test 5

In [35]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f2 )),
    ('reg', LogisticRegression())
])

parameters = {
    'tfidf__max_df': [0.3 , 0.4 , 0.5],
    'tfidf__min_df': [5 , 10],
    'tfidf__ngram_range': [ (1,3)],
    'tfidf__binary': [ True],
    'tfidf__lowercase': [ True],
    'tfidf__use_idf': [ True],
    'tfidf__sublinear_tf': [ True],
    'tfidf__max_features': [ 20000, 40000],

    'reg__C': [1 , 100],
    'reg__penalty': [ 'l2']
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

[X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)


grid_search.fit(X_all_train, Y_train)


print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Meilleurs paramètres trouvés:
{'reg__C': 100, 'reg__penalty': 'l2', 'tfidf__binary': True, 'tfidf__lowercase': True, 'tfidf__max_df': 0.4, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 3), 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
Scores :
Accuracy: 0.8775
F1 Score: 0.8740359897172236
AUC: 0.8773608505315822


## Test 6

In [36]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f2 )),
    ('reg', LogisticRegression())
])

parameters = {
    'tfidf__max_df': [ 0.4 , 0.5],
    'tfidf__min_df': [5 ],
    'tfidf__ngram_range': [ (1.2) ,(1,3)],
    'tfidf__binary': [ True],
    'tfidf__lowercase': [ False ,True],
    'tfidf__use_idf': [ False ,True],
    'tfidf__sublinear_tf': [ False ,True],
    'tfidf__max_features': [ 20000, 40000],

    'reg__C': [100],
    'reg__penalty': [ 'l2']
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

[X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)


grid_search.fit(X_all_train, Y_train)


print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Meilleurs paramètres trouvés:
{'reg__C': 100, 'reg__penalty': 'l2', 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.4, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 3), 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True}
Scores :
Accuracy: 0.8775
F1 Score: 0.8740359897172236
AUC: 0.8773608505315822


## Test 7

In [38]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f2 )),
    ('reg', LogisticRegression())
])

parameters = {
    'tfidf__max_df': [0.2 , 0.3 , 0.4 ],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [ (1, 2) , (1,3)],
    'tfidf__binary': [True],
    'tfidf__lowercase': [False],
    'tfidf__use_idf': [ True],
    'tfidf__sublinear_tf': [ True , False],
    'tfidf__max_features': [20000 ,30000, 40000],

    'reg__C': [100],
    'reg__penalty': [ 'l2' ]
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

[X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)


grid_search.fit(X_all_train, Y_train)


print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /users/Etu7/21200397/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Do

Meilleurs paramètres trouvés:
{'reg__C': 100, 'reg__penalty': 'l2', 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.3, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'tfidf__sublinear_tf': True, 'tfidf__use_idf': True}
Scores :
Accuracy: 0.89
F1 Score: 0.8883248730964467
AUC: 0.8901813633520951
