## Test avec différents paramétres : 




In [1]:
# pip install pandas


In [2]:
# pip install nltk

## Lecture des données :

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import common as cmn
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score


from scipy.ndimage import gaussian_filter


import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.model_selection._validation")

In [4]:
fname = "./datasets/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts,alllabs = cmn.load_pres(fname)


### Test ici : 

In [22]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

def grid_search_2(parameters, scoring, preprocessors, score_to_maximize):
    pipeline = ImbPipeline([
        ('tfidf', TfidfVectorizer(preprocessor=preprocessors)),
        ('undersampler', RandomUnderSampler(sampling_strategy=0.5, random_state=42)),  # Exemple de paramètres personnalisés
        ('reg', LogisticRegression())
    ])

    [X_all_train, X_all_test, Y_train, y_test] = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)

    grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit=score_to_maximize, cv=5, n_jobs=-1, verbose=1)

    grid_search.fit(X_all_train, Y_train)

    print("Meilleurs paramètres trouvés:")
    print(grid_search.best_params_)
    #_____________

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_all_test)

    smoothed_pred = gaussian_filter(y_pred, sigma=0.05)

    # Calcul des métriques de performance après le lissage
    f1 = f1_score(y_test, smoothed_pred,  pos_label=-1)  # or 'macro' or 'weighted'
    roc_auc = roc_auc_score(y_test, smoothed_pred)
    accuracy = accuracy_score(y_test, (smoothed_pred > 0.5).astype(int))

    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)


In [7]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 1}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.8861472364143056
AUC: 0.7895715533439505
Accuracy: 0.7981305155596842


In [8]:
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 1}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.8861472364143056
AUC: 0.7895715533439505
Accuracy: 0.7981305155596842


In [10]:


parameters = {
    'tfidf__max_df': [0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2),(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.8880631676730144
AUC: 0.7950792558921576
Accuracy: 0.7986530422666047


In [11]:

#sigma = 0.1 jusqu'ici
parameters = {
    'tfidf__max_df': [0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2),(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.8880631676730144
AUC: 0.7950792558921576
Accuracy: 0.7986530422666047


In [13]:

#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000,300000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.8880631676730144
AUC: 0.7950792558921576
Accuracy: 0.7986530422666047


In [14]:

#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [(1,2),(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.stemming(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.8809219693450998
AUC: 0.7898565631125358
Accuracy: 0.7918601950766373


In [15]:

#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.1,0.5],
    'tfidf__ngram_range': [(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [1, 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.1, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.889921040408732
AUC: 0.796518524689869
Accuracy: 0.8003947979563399


In [16]:
#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.2],
    'tfidf__ngram_range': [(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [100]
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 100, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.2, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.8802833255921969
AUC: 0.8003088260118741
Accuracy: 0.787796098467255


In [17]:
#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.2],
    'tfidf__ngram_range': [(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [10]
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.2, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.8900952159777056
AUC: 0.7975360947671293
Accuracy: 0.8002786809103576


In [23]:
#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.2],

    'tfidf__ngram_range': [(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [10]
}

scoring = {
    'f1_score': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.2, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.6204130739923801
AUC: 0.7975360947671293
Accuracy: 0.8002786809103576


In [26]:
def lowercase(text):
    return text.lower()
#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.2],

    'tfidf__ngram_range': [(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [10]
}


def f1_score_class_minus_one(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label=-1)

scoring = {
    'f1_score_class_minus_one': make_scorer(f1_score_class_minus_one),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

preprocessors = lambda x: ((lowercase(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score_class_minus_one')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.2, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.5988857938718662
AUC: 0.785710806540423
Accuracy: 0.7955759405480725


In [27]:
def lowercase(text):
    return text.lower()
#sigma = 0.05 jusqu'ici
parameters = {
    'tfidf__max_df': [0.2],

    'tfidf__ngram_range': [(1,3)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [10]
}


def f1_score_class_minus_one(y_true, y_pred):
    return f1_score(y_true, y_pred, pos_label=-1)

scoring = {
    'f1_score_class_minus_one': make_scorer(f1_score_class_minus_one),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}

preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.suppression_ponctuation((cmn.suppression_balises_html((x)))))))

grid_search_2 (parameters , scoring,preprocessors ,'f1_score_class_minus_one')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.2, 'tfidf__max_features': 100000, 'tfidf__ngram_range': (1, 3)}
Performance après lissage:
F1 Score: 0.6178764247150571
AUC: 0.7965660263179667
Accuracy: 0.7993497445424989


## Fonction grid_search

In [8]:

def grid_search (parameters , scoring,preprocessors ,score_to_maximize):
    pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocessors)),
    ('reg', LogisticRegression())
    ])

    [X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)
    
    
    grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit=score_to_maximize, cv=5, n_jobs=-1, verbose=1)
    
    
    grid_search.fit(X_all_train, Y_train)
    
    
    print("Meilleurs paramètres trouvés:")
    print(grid_search.best_params_)
    #_____________
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_all_test)
    
    smoothed_pred = gaussian_filter(y_pred,sigma=0.1)
    
    # Calcul des métriques de performance après le lissage
    f1 = f1_score(y_test, smoothed_pred, average='micro')  # or 'macro' or 'weighted'
    roc_auc = roc_auc_score(y_test, smoothed_pred)
    accuracy = accuracy_score(y_test, (smoothed_pred > 0.5).astype(int))
    
    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)
    
    
    
    
    #_____________
    # print("Scores:")
    # print("F1 Score:", grid_search.cv_results_['mean_test_f1_score'])
    # print("AUC:", grid_search.cv_results_['mean_test_roc_auc'])
    # print("Accuracy:", grid_search.cv_results_['mean_test_accuracy'])


## Test 1 :
On essaye de penaliser notre classe minoritaire avec diffèrentes vaaleurs de pénaisation 

In [9]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

### Test 2 :
Maximiser le f1 score de la classe minoritaire

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [40000,100000],
    'reg__class_weight': ['balanced',class_weights],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1.0, -1: 5.0}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.9040320473743795
AUC: 0.782878231512926
Accuracy: 0.8230427588609248


### Test 3 :
Maximiser roc auc score

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [40000,100000],
    'reg__class_weight': ['balanced',class_weights],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'roc_auc')


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Meilleurs paramètres trouvés:
{'reg__C': 1, 'reg__class_weight': 'balanced', 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 40000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.8680658364538884
AUC: 0.8090931030390787
Accuracy: 0.7726203953670644


### Test 4:
essayer de maximiser des deux

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.9050743149094286
AUC: 0.7827068919817048
Accuracy: 0.8226892707849512


## POur tester sur mon pc :

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],  
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.9050743149094286
AUC: 0.7827068919817048
Accuracy: 0.8226892707849512


In [None]:
# from xgboost import XGBClassifier
# f2 = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(preprocessor=f2)),
#     ('xgb', XGBClassifier())
# ])

# class_weights = {
#     1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
#     -1: 5.0  # Class -1, the minority class, gets weight 5.0
# }


# parameters = {
#     'tfidf__max_df': [ 0.2,0.5],
#     'tfidf__min_df': [2],
#     'tfidf__ngram_range': [ (1, 2)],
#     'tfidf__binary': [True],
#     'tfidf__max_features': [100000],
#     'xgb__learning_rate': [0.1],
#     'xgb__n_estimators': [1000],
#     'xgb__max_depth': [6],
# }

# # Définition des métriques de performance
# scoring = {
#     'f1_score': make_scorer(f1_score),
#     'roc_auc': make_scorer(roc_auc_score),
#     'accuracy': make_scorer(accuracy_score)
# }

# # Séparation des données en ensembles d'entraînement et de test
# X_train, X_test, y_train, y_test = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)

# # Entraînement du modèle avec GridSearchCV
# grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit='roc_auc', cv=5, n_jobs=-1, verbose=1)

# # Transformation des étiquettes de classe -1 et 1 à 0 et 1
# y_train_mapped = list(np.where(np.array(y_train) == -1, 0, 1))
# y_test = list(np.where(np.array(y_test) == -1, 0, 1))
# # Entraînement du modèle avec les nouvelles étiquettes de classe
# grid_search.fit(X_train, y_train_mapped)

# # grid_search.fit(X_train, y_train)

# print("Meilleurs paramètres trouvés:")
# print(grid_search.best_params_)

# # Extraction du meilleur modèle
# best_model = grid_search.best_estimator_

# # Prédiction sur l'ensemble de test
# y_pred = best_model.predict(X_test)

# # Lissage des prédictions
# smoothed_pred = gaussian_filter(y_pred, sigma=0.1)

# # Calcul des métriques de performance après le lissage
# f1 = f1_score(y_test, smoothed_pred, average='micro')  # or 'macro' or 'weighted'
# roc_auc = roc_auc_score(y_test, smoothed_pred)
# accuracy = accuracy_score(y_test, (smoothed_pred > 0.5).astype(int))

# print("Performance après lissage:")
# print("F1 Score:", f1)
# print("AUC:", roc_auc)
    

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Meilleurs paramètres trouvés:
{'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2), 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 1000}
Performance après lissage:
F1 Score: 0.8977008824895495
AUC: 0.6544685044722339


In [None]:
# print(y_train)
# y_train_mapped = np.where(np.array(y_train) == -1, 0, 1)
# print(list(y_train_mapped))

[1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

### SVM :

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
def grid_searchsvm (parameters , scoring,preprocessors ,score_to_maximize):
    pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocessors)),
    ('svm', SVC())
    ])

    [X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)
    
    
    grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit=score_to_maximize, cv=5, n_jobs=-1, verbose=1)
    
    
    grid_search.fit(X_all_train, Y_train)
    
    
    print("Meilleurs paramètres trouvés:")
    print(grid_search.best_params_)
    #_____________
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_all_test)
    
    smoothed_pred = gaussian_filter(y_pred,sigma=0.1)
    
    # Calcul des métriques de performance après le lissage
    f1 = f1_score(y_test, smoothed_pred, average='micro')  # or 'macro' or 'weighted'
    roc_auc = roc_auc_score(y_test, smoothed_pred)
    accuracy = accuracy_score(y_test, (smoothed_pred > 0.5).astype(int))
    
    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)
    
    
    
    
    #_____________
    # print("Scores:")
    # print("F1 Score:", grid_search.cv_results_['mean_test_f1_score'])
    # print("AUC:", grid_search.cv_results_['mean_test_roc_auc'])
    # print("Accuracy:", grid_search.cv_results_['mean_test_accuracy'])


In [12]:

class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf'],
    'svm__class_weight': ['balanced', {1: 2}, {1: 5}, None],
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_searchsvm (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 48 candidates, totalling 240 fits
