## Test avec différents paramétres : 




In [43]:
# pip install pandas


In [44]:
# pip install nltk

## Lecture des données :

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import common as cmn
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score


from scipy.ndimage import gaussian_filter


import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.model_selection._validation")

In [2]:
fname = "../datasets/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts,alllabs = cmn.load_pres(fname)


## Fonction grid_search

In [35]:

def grid_search (parameters , scoring,preprocessors ,score_to_maximize):
    pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocessors)),
    ('reg', LogisticRegression())
    ])

    [X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)
    
    
    grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit=score_to_maximize, cv=5, n_jobs=-1, verbose=1)
    
    
    grid_search.fit(X_all_train, Y_train)
    
    
    print("Meilleurs paramètres trouvés:")
    print(grid_search.best_params_)
    #_____________
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_all_test)
    
    smoothed_pred = gaussian_filter(y_pred,sigma=0.1)
    
    # Calcul des métriques de performance après le lissage

    f1 = f1_score(y_test, smoothed_pred,  pos_label=-1)  # or 'macro' or 'weighted'

    roc_auc = roc_auc_score(y_test, smoothed_pred)
    accuracy = accuracy_score(y_test, (smoothed_pred > 0.5).astype(int))
    
    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)
    
    
    
    
    #_____________
    # print("Scores:")
    # print("F1 Score:", grid_search.cv_results_['mean_test_f1_score'])
    # print("AUC:", grid_search.cv_results_['mean_test_roc_auc'])
    # print("Accuracy:", grid_search.cv_results_['mean_test_accuracy'])


# Premiers test en modifiant les poids de la fonction de coût pour pallier au déséquilibre : 
## Test 1 :
On essaye de penaliser notre classe minoritaire avec diffèrentes vaaleurs de pénaisation 

In [48]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

### Test 2 :
Maximiser le f1 score de la classe minoritaire

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [40000,100000],
    'reg__class_weight': ['balanced',class_weights],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1.0, -1: 5.0}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.9040320473743795
AUC: 0.782878231512926
Accuracy: 0.8230427588609248


### Test 3 :
Maximiser roc auc score

In [36]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [40000,100000],
    'reg__class_weight': ['balanced',class_weights],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'roc_auc')


Fitting 5 folds for each of 12 candidates, totalling 60 fits


KeyboardInterrupt: 

### Test 4:
essayer de maximiser des deux

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.9050743149094286
AUC: 0.7827068919817048
Accuracy: 0.8226892707849512


## Test 5:

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.9050743149094286
AUC: 0.7827068919817048
Accuracy: 0.8226892707849512


In [None]:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: 5} ],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6344735077129444
AUC: 0.7827068919817048
Accuracy: 0.8226892707849512


In [None]:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6349419124218052
AUC: 0.7831072171504867
Accuracy: 0.82263121226196


In [None]:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],  
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6361607142857143
AUC: 0.7839748961092721
Accuracy: 0.82263121226196


In [34]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],  
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6361607142857143
AUC: 0.7839748961092721
Accuracy: 0.82263121226196


In [None]:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],  
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (cmn.lemmatization(cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6296296296296295
AUC: 0.7804017907837535
Accuracy: 0.8217022758941013


### Stop words :  
ici nous avons essayé de creer notre propre liste de stop words

In [38]:

preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

vectorizer_bigram = CountVectorizer(ngram_range=(2, 2),preprocessor=preprocessors)
corpus_dicours = alltxts
corpus_dicours_sparse_mat_bigram = vectorizer_bigram.fit_transform(corpus_dicours) # Output is a sparse matrix
print("Taille initiale du vocabulaire avec des bigrammes :",len(vectorizer_bigram.get_feature_names_out()))
frequence = np.array(corpus_dicours_sparse_mat_bigram.sum(axis=0))[0]
indices_tries = np.argsort(-frequence, kind='quicksort')

# Trier les sommes des colonnes en utilisant les indices triés
somme_colonnes_triees = [frequence[i] for i in indices_tries]
somme_colonnes_triees

bigrammes = vectorizer_bigram.get_feature_names_out()
bigrammes_100_plus_frequents = [bigrammes[i] for i in indices_tries[:100] ]
list(somme_colonnes_triees[:100])

In [39]:

vectorizer_bigram = CountVectorizer(ngram_range=(1, 1),preprocessor=preprocessors)
corpus_dicours = alltxts
corpus_dicours_sparse_mat_bigram = vectorizer_bigram.fit_transform(corpus_dicours) # Output is a sparse matrix
print("Taille initiale du vocabulaire avec des bigrammes :",len(vectorizer_bigram.get_feature_names_out()))
frequence = np.array(corpus_dicours_sparse_mat_bigram.sum(axis=0))[0]
indices_tries = np.argsort(-frequence, kind='quicksort')

# Trier les sommes des colonnes en utilisant les indices triés
somme_colonnes_triees = [frequence[i] for i in indices_tries]
somme_colonnes_triees

bigrammes = vectorizer_bigram.get_feature_names_out()
unigrammes_100_plus_frequents = [bigrammes[i] for i in indices_tries[:100] ]
print((unigrammes_100_plus_frequents))
print((somme_colonnes_triees[:100]))

Taille initiale du vocabulaire avec des bigrammes : 31213
['de', 'la', 'et', 'le', 'les', 'des', 'est', 'que', 'qui', 'en', 'un', 'une', 'pour', 'dans', 'du', 'vous', 'nous', 'au', 'plus', 'ce', 'il', 'pas', 'qu', 'je', 'par', 'notre', 'France', 'ne', 'sur', 'nos', 'avec', 'se', 'pays', 'sont', 'aussi', 'Je', 'aux', 'cette', 'ont', 'être', 'leur', 'Il', 'tout', 'tous', 'votre', 'son', 'La', 'même', 'elle', 'ou', 'ses', 'bien', 'entre', 'Europe', 'mais', 'hui', 'comme', 'on', 'ces', 'sa', 'doit', 'monde', 'faire', 'ai', 'aujourd', 'Et', 'faut', 'été', 'sans', 'fait', 'Mais', 'si', 'Nous', 'Le', 'où', 'leurs', 'avez', 'dire', 'Les', 'ils', 'très', 'deux', 'dont', 'peut', 'développement', 'ensemble', 'autres', 'Elle', 'Français', 'politique', 'vos', 'vie', 'encore', 'avons', 'toutes', 'Monsieur', 'ceux', 'cela', 'Vous', 'Président']
[68623, 39750, 35180, 24366, 24362, 21495, 17247, 16081, 15568, 15473, 12257, 12012, 11033, 10962, 10639, 8131, 7735, 7571, 7266, 6585, 6393, 6312, 6241, 6128

In [40]:


parameters = {
    # 'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'tfidf__stop_words':[bigrammes_100_plus_frequents[:10]+unigrammes_100_plus_frequents[:5]] ,
    'reg__class_weight': [{1: 1, -1: 5} ],  
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits




Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': ['de la', 'et de', 'la France', 'aujourd hui', 'et la', 'dans le', 'de notre', 'qu il', 'de nos', 'que nous', 'de', 'la', 'et', 'le', 'les']}
Performance après lissage:
F1 Score: 0.6383259911894273
AUC: 0.7879744546800008
Accuracy: 0.8205411054342777


In [81]:


parameters = {
    # 'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],  
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6348214285714285
AUC: 0.7832235939583514
Accuracy: 0.8224570366929865


### Fonction de grid search tenant compte du deséquilibre des classes et utilisation de l'oversampling :


In [7]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from scipy.ndimage import gaussian_filter

from imblearn.over_sampling import RandomOverSampler

def grid_search_ov(parameters, scoring, preprocessors, score_to_maximize):
    X_all_train, X_all_test, Y_train, y_test = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)
    
    tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessors)
    
    pipeline = Pipeline([
        ('tfidf', tfidf_vectorizer),
        ('oversample', RandomOverSampler(random_state=42)),  # Suréchantillonnage RandomOverSampler
        ('reg', LogisticRegression())
    ])
    
    grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit=score_to_maximize, cv=5, n_jobs=-1, verbose=1)
    
    grid_search.fit(X_all_train, Y_train)
    
    print("Meilleurs paramètres trouvés:")
    print(grid_search.best_params_)
    
    best_model = grid_search.best_estimator_
    
    y_pred = best_model.predict(X_all_test)
    
    smoothed_pred = gaussian_filter(y_pred, sigma=0.1)
    
    f1 = f1_score(y_test, smoothed_pred, pos_label=-1)
    roc_auc = roc_auc_score(y_test, smoothed_pred)
    accuracy = accuracy_score(y_test, (smoothed_pred > 0.5).astype(int))
    
    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)
    return tfidf_vectorizer


In [27]:

parameters = {
    'tfidf__max_df': [ 0.45],

    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6303874619068349
AUC: 0.7859142478567198
Accuracy: 0.8173478866697631


In [28]:

parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6303874619068349
AUC: 0.7859142478567198
Accuracy: 0.8173478866697631


In [21]:

parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6321041214750542
AUC: 0.7876319253397123
Accuracy: 0.8169414770088249


In [34]:

parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__lowercase': False, 'tfidf__max_df': 0.5, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.633434915106661
AUC: 0.7876672862088685
Accuracy: 0.8177542963307014


### Best solution :

In [10]:

parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6348077350522339
AUC: 0.7840894263585909
Accuracy: 0.8217022758941013


Test avec suppression de quelque mots trop frequents :

In [41]:

parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10],
    'tfidf__stop_words':[bigrammes_100_plus_frequents[:10]+unigrammes_100_plus_frequents[:5]] ,


}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')



Fitting 5 folds for each of 1 candidates, totalling 5 fits




Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': ['de la', 'et de', 'la France', 'aujourd hui', 'et la', 'dans le', 'de notre', 'qu il', 'de nos', 'que nous', 'de', 'la', 'et', 'le', 'les']}
Performance après lissage:
F1 Score: 0.6306149149585696
AUC: 0.7856814942409911
Accuracy: 0.8176962378077102


# Autres idées essayées mais sans amélioration : 

## Mauvaise idée d'utiliser les stop words (la liste predéfinie):

In [12]:

french_stop_words = [
    "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et", "eux", "il", "je", "la", "le", "leur", "lui", "ma", "mais", "me", "même", "mes", "moi", "mon", "ne", "nos", "notre", "nous", "on", "ou", "par", "pas", "pour", "qu", "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", "c", "d", "j", "l", "à", "m", "n", "s", "t", "y", "été", "étée", "étées", "étés", "étant", "étante", "étants", "étantes", "suis", "es", "est", "sommes", "êtes", "sont", "serai", "seras", "sera", "serons", "serez", "seront", "serais", "serait", "serions", "seriez", "seraient", "étais", "était", "étions", "étiez", "étaient", "fus", "fut", "fûmes", "fûtes", "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "fût", "fussions", "fussiez", "fussent", "ayant", "ayante", "ayantes", "ayants", "eu", "eue", "eues", "eus", "ai", "as", "avons", "avez", "ont", "aurai", "auras", "aura", "aurons", "aurez", "auront", "aurais", "aurait", "aurions", "auriez", "auraient", "avais", "avait", "avions", "aviez", "avaient", "eut", "eûmes", "eûtes", "eurent", "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "eût", "eussions", "eussiez", "eussent"
]
parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__stop_words': [french_stop_words],  # Utiliser les stopwords en français
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': ['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent'

## Avec les odds_ratio :
ici nous avons essayé de recuprer les mots les plus descriminants selon des OddsRatio mais on s'est rendu compte qu'on a pas eu d'amelioration


In [27]:
import pandas as pd

def calculate_odds_ratios(mitterrand_counts, chirac_counts):
    odds_ratios = {}
    for word in mitterrand_counts.keys():

        count_mitterrand = mitterrand_counts[word] + 0.5 # Avec une correction de Yates pour eviter les div par zerro
        count_chirac = chirac_counts.get(word, 0)+ 0.5
        
        total_mitterrand = sum(mitterrand_counts.values())
        total_chirac = sum(chirac_counts.values())

        odds_ratio = (count_mitterrand * (total_chirac - count_chirac)) / ((count_chirac * (total_mitterrand - count_mitterrand)))
        odds_ratios[word] = odds_ratio
    
    return odds_ratios

data_frame_discours = pd.DataFrame()
data_frame_discours['text'] = [ ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((text))))))) for text in alltxts]
data_frame_discours['label'] = alllabs
data_frame_discours
mitterrand_counts =  Counter(' '.join(data_frame_discours[data_frame_discours['label']==-1].text).split())
chirac_counts = Counter(' '.join(data_frame_discours[data_frame_discours['label']==1].text).split())
odds_ratios = calculate_odds_ratios(mitterrand_counts, chirac_counts)

sorted_odds_ratios = sorted(odds_ratios.items(), key=lambda x: x[1], reverse=True)
top_100_odds_ratios = dict(sorted_odds_ratios[:100000])

top_100_odds_ratios.keys()

dict_keys(['monsieur', '320', 'convenait', 'mesdames', 'Nièvre', 'Eurêka', 'madame', 'Morvan', 'estampes', 'CEE', 'Nevers', 'cognac', 'laitiers', 'électrification', 'Montluçon', '340', 'vocabulaire', 'dédain', 'définitions', 'Moi', 'disposés', 'étonné', '500000', 'Magistrature', 'Figeac', 'Cantal', '100000', '10000', 'connait', 'banc', 'imprudent', 'plaindre', 'bougent', 'RN', 'Poulenc', 'Buffon', 'reproche', 'CSCE', 'légumes', 'dominent', 'surarmement', 'carrément', 'pur', 'écu', '125000', 'quotas', 'Bayonne', 'aie', 'Bourges', 'Songez', 'fabrique', 'colonialisme', 'Turquoise', 'alignés', 'IXème', 'protestations', 'fameuses', 'Yonne', 'Vesoul', 'provincial', 'étonnera', 'robotique', 'décentraliser', 'philosopher', 'çà', 'descendre', 'Hollande', 'primes', 'fractions', 'IDS', 'chagrins', 'Neuve', 'tarifaires', 'Albion', 'Rappelez', 'romanes', 'multipartisme', 'artillerie', 'Toronto', 'guadeloupéen', 'tracteurs', 'Italiens', 'dûs', 'Dunkerque', 'Valmy', 'Kellermann', 'Vallées', 'messieur

In [30]:


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__binary': [True],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10],
    'tfidf__vocabulary':[list(top_100_odds_ratios.keys())],

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits




Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__vocabulary': ['monsieur', '320', 'convenait', 'mesdames', 'Nièvre', 'Eurêka', 'madame', 'Morvan', 'estampes', 'CEE', 'Nevers', 'cognac', 'laitiers', 'électrification', 'Montluçon', '340', 'vocabulaire', 'dédain', 'définitions', 'Moi', 'disposés', 'étonné', '500000', 'Magistrature', 'Figeac', 'Cantal', '100000', '10000', 'connait', 'banc', 'imprudent', 'plaindre', 'bougent', 'RN', 'Poulenc', 'Buffon', 'reproche', 'CSCE', 'légumes', 'dominent', 'surarmement', 'carrément', 'pur', 'écu', '125000', 'quotas', 'Bayonne', 'aie', 'Bourges', 'Songez', 'fabrique', 'colonialisme', 'Turquoise', 'alignés', 'IXème', 'protestations', 'fameuses', 'Yonne', 'Vesoul', 'provincial', 'étonnera', 'robotique', 'décentraliser', 'philosopher', 'çà', 'descendre', 'Hollande', 'primes', 'fractions', 'IDS', 'chagrins', 'Neuve', 'tarifaires', 'Albion', 'Ra