In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, recall_score

import numpy as np
import utils as ut
import pandas as pd

[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/anyes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anyes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import warnings

# Ignorer tous les avertissements
warnings.filterwarnings("ignore")

# Ignorer les avertissements spécifiques par leur catégorie
warnings.filterwarnings("ignore", category=UserWarning)

# Etape 0 : Chargement de données movies

Dans cette partie nous allons rechargé les données de movies afin d'effectuer l'analyse de sentiments

In [7]:
path = "../datasets/movies/movies1000/"
alltxts,alllabs = ut.load_movies(path)

Création du DataFrame

In [8]:
movies_df = pd.DataFrame()
movies_df['text'] = alltxts
movies_df['label'] = alllabs

movies_df

Unnamed: 0,text,label
0,the kids in the hall are an acquired taste . \...,0
1,capsule : a science fiction allegory . \nat th...,0
2,there is a rule when it comes to movies . \na ...,0
3,it's amazing how a comedian can have the some ...,0
4,"absolute power , the new film produced and dir...",0
...,...,...
1995,"as i walked out of crouching tiger , hidden dr...",1
1996,"when andy leaves for cowboy camp , his mother ...",1
1997,plot : a bunch of bad guys dressed up as elvis...,1
1998,trees lounge is the directoral debut from one ...,1


# Grid search pour TF-IDF Vectorizer

In [9]:
f = lambda doc: ut.suppression_balises_html( ut.remove_numbers(ut.remove_ponctuation(doc)) ) 

## Test 1

In [14]:

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f)),
    ('xgb', XGBClassifier())
])

# Définition des paramètres à optimiser
parameters = {
    'tfidf__max_df': [0.3 , 0.5 ],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [ True],
    'tfidf__lowercase': [False],
    'tfidf__max_features': [20000],
    'xgb__learning_rate': [0.1, 0.01],
    'xgb__n_estimators': [100],
    'xgb__max_depth': [3],
}

# Définition des métriques de scoring
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# Séparation des données en ensembles d'entraînement et de test
[X_all_train, X_all_test, Y_train, y_test] = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

# Application de la Grid Search
grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_all_train, Y_train)

# Affichage des meilleurs paramètres trouvés
print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

# Évaluation du meilleur modèle sur l'ensemble de test
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

# Calcul et affichage des scores
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/anyes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anyes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nl

Meilleurs paramètres trouvés:
{'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.3, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
Scores :
Accuracy: 0.7625
F1 Score: 0.7732696897374701
AUC: 0.7641651031894935


# Test 2

In [16]:
f2 = lambda doc: ut.lemmatization(ut.suppression_balises_html( ut.remove_numbers(ut.remove_ponctuation(doc)) ) )

In [17]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f2)),
    ('xgb', XGBClassifier())
])

# Définition des paramètres à optimiser
parameters = {
    'tfidf__max_df': [0.3  ],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [ True],
    'tfidf__lowercase': [False],
    'tfidf__max_features': [20000],
    'xgb__learning_rate': [0.1, 0.01],
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 5],
}

# Définition des métriques de scoring
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# Séparation des données en ensembles d'entraînement et de test
[X_all_train, X_all_test, Y_train, y_test] = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

# Application de la Grid Search
grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_all_train, Y_train)

# Affichage des meilleurs paramètres trouvés
print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

# Évaluation du meilleur modèle sur l'ensemble de test
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

# Calcul et affichage des scores
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/anyes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anyes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/anyes/nltk_data...
[nltk_data] Downloading package punkt to /home/anyes/nltk_data..

Meilleurs paramètres trouvés:
{'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.3, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'xgb__learning_rate': 0.1, 'xgb__max_depth': 5, 'xgb__n_estimators': 200}
Scores :
Accuracy: 0.815
F1 Score: 0.8177339901477833
AUC: 0.8158849280800501


# test 3

In [18]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=f2)),
    ('xgb', XGBClassifier())
])

# Définition des paramètres à optimiser
parameters = {
    'tfidf__max_df': [0.3  ],
    'tfidf__min_df': [5],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [ True],
    'tfidf__lowercase': [False],
    'tfidf__max_features': [20000],
    'xgb__learning_rate': [0.1, 0.01],
    'xgb__n_estimators': [1000],
    'xgb__max_depth': [6],
}

# Définition des métriques de scoring
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# Séparation des données en ensembles d'entraînement et de test
[X_all_train, X_all_test, Y_train, y_test] = train_test_split(alltxts, alllabs, test_size=0.2, random_state=10, shuffle=True)

# Application de la Grid Search
grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit='accuracy', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_all_train, Y_train)

# Affichage des meilleurs paramètres trouvés
print("Meilleurs paramètres trouvés:")
print(grid_search.best_params_)

# Évaluation du meilleur modèle sur l'ensemble de test
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_all_test)

# Calcul et affichage des scores
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Scores :")
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("AUC:", roc_auc)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data] Downloading package punkt to /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/anyes/nltk_data...

[nltk_data]   Package punkt is already up-to-date!
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anyes/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[n

Meilleurs paramètres trouvés:
{'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.3, 'tfidf__max_features': 20000, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 1000}
Scores :
Accuracy: 0.8275
F1 Score: 0.8296296296296297
AUC: 0.8283302063789869
