## Lecture des données :

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import common as cmn
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score


from scipy.ndimage import gaussian_filter


import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.model_selection._validation")

In [2]:
fname = "../datasets/AFDpresidentutf8/corpus.tache1.learn.utf8"
alltxts,alllabs = cmn.load_pres(fname)


## Fonction grid_search

In [3]:

def grid_search (parameters , scoring,preprocessors ,score_to_maximize):
    pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocessors)),
    ('reg', LogisticRegression())
    ])

    [X_all_train, X_all_test, Y_train, y_test]  = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)


    grid_search = GridSearchCV(pipeline, parameters,scoring=scoring, refit=score_to_maximize, cv=5, n_jobs=-1, verbose=1)


    grid_search.fit(X_all_train, Y_train)


    print("Meilleurs paramètres trouvés:")
    print(grid_search.best_params_)
    #_____________

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict_proba(X_all_test)

    smoothed_pred = gaussian_filter(y_pred,sigma=0.1)
    smoothed_pred_labels = (2 * (smoothed_pred[:,0] >= 0.5) - 1).astype(int)

    f1 = f1_score(y_test, smoothed_pred_labels,  pos_label=-1)  

    roc_auc = roc_auc_score(y_test, smoothed_pred_labels)
    accuracy = accuracy_score(y_test, (smoothed_pred_labels > 0.5).astype(int))

    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)



## Test 1 :
On essaye de penaliser notre classe minoritaire avec diffèrentes vaaleurs de pénaisation

In [13]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.2,0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [100000],
    'reg__class_weight': [{1: 1, -1: w} for w in [1, 5, 10, 20]],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6344735077129444
AUC: 0.7827068919817048
Accuracy: 0.8226892707849512


### Test 2 :
Maximiser le f1 score de la classe minoritaire

In [14]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [40000,100000],
    'reg__class_weight': ['balanced',class_weights],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1.0, -1: 5.0}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6368715083798883
AUC: 0.7841424676623252
Accuracy: 0.822921504876916


### Test 3 :
Maximiser roc auc score

In [15]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.5],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [40000,100000],
    'reg__class_weight': ['balanced',class_weights],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 1,10,100]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((cmn.majuscules_en_marqueurs((cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'roc_auc')


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Meilleurs paramètres trouvés:
{'reg__C': 1, 'reg__class_weight': 'balanced', 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__max_features': 100000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.5928350609201672
AUC: 0.8011627939099627
Accuracy: 0.7753715745471436


### Test 4:
essayer de maximiser des deux

In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((cmn.suppression_chiffres(cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6349419124218052
AUC: 0.7831072171504867
Accuracy: 0.82263121226196


In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6361607142857143
AUC: 0.7839748961092721
Accuracy: 0.82263121226196


In [None]:
class_weights = {
    1: 1.0,  # Class 1, the majority class, gets weight 1.0 (default weight)
    -1: 5.0  # Class -1, the minority class, gets weight 5.0
}


parameters = {
    'tfidf__max_df': [ 0.45],
    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__class_weight': [{1: 1, -1: 5} ],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]
}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (cmn.lemmatization(cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__class_weight': {1: 1, -1: 5}, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6296296296296295
AUC: 0.7804017907837535
Accuracy: 0.8217022758941013


### test ppti : avec over sample


In [4]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from scipy.ndimage import gaussian_filter

def grid_search_ov(parameters, scoring, preprocessors, score_to_maximize):
    X_all_train, X_all_test, Y_train, y_test = train_test_split(alltxts, alllabs, test_size=0.3, random_state=10, shuffle=True)

    tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessors)

    pipeline = Pipeline([
        ('tfidf', tfidf_vectorizer),
        ('reg', LogisticRegression())
    ])

    grid_search = GridSearchCV(pipeline, parameters, scoring=scoring, refit=score_to_maximize, cv=5, n_jobs=-1, verbose=1)

    grid_search.fit(X_all_train, Y_train)

    print("Meilleurs paramètres trouvés:")
    print(grid_search.best_params_)

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict_proba(X_all_test)

    smoothed_pred = gaussian_filter(y_pred, sigma=0.1)
    smoothed_pred_labels = (2 * (smoothed_pred[:,0] >= 0.5) - 1).astype(int)

    f1 = f1_score(y_test, smoothed_pred_labels, pos_label=-1)
    roc_auc = roc_auc_score(y_test, smoothed_pred_labels)
    accuracy = accuracy_score(y_test, (smoothed_pred_labels > 0.5).astype(int))

    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)
    return tfidf_vectorizer


In [None]:

parameters = {
    'tfidf__max_df': [ 0.45],

    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6303874619068349
AUC: 0.7859142478567198
Accuracy: 0.8173478866697631


In [None]:

parameters = {
    'tfidf__max_df': [1204],

    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    # 'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (((((cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6349760139555167
AUC: 0.7881858347440606
Accuracy: 0.818276823037622


In [None]:

parameters = {
    'tfidf__max_df': [ 0.45],

    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: (cmn.suppression_chiffres(cmn.majuscules_en_marqueurs((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits


Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.45, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6321041214750542
AUC: 0.7876319253397123
Accuracy: 0.8169414770088249


In [38]:
#The best pour le moment avec ficheir 9 28 2 2 = > j'ai fait mieux la best n'est plsu celle -ci

parameters = {
    'tfidf__max_df': [ 0.5],

    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    # 'tfidf__lowercase':[False],
    # 'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10]

}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6335728282168517
AUC: 0.7877008005194791
Accuracy: 0.8178123548536925


In [59]:
# THe best
parameters = {
    'tfidf__max_df': [ 0.5],

    # 'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2)],
    'tfidf__binary': [True],
    # 'tfidf__lowercase':[False],
    # 'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10],


}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}
Performance après lissage:
F1 Score: 0.6348077350522339
AUC: 0.7840894263585909
Accuracy: 0.8217022758941013


In [69]:
# THe best
parameters = {
    'tfidf__max_df': [ 0.6],

    'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (2, 2)],
    'tfidf__binary': [True],
    # 'tfidf__lowercase':[False],
    'tfidf__max_features': [90000],
    'reg__max_iter': [1000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10],


}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 1000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': True, 'tfidf__max_df': 0.6, 'tfidf__max_features': 90000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (2, 2)}
Performance après lissage:
F1 Score: 0.5703405017921148
AUC: 0.7464451494323869
Accuracy: 0.8147352531351603


In [6]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
parameters = {
    'tfidf__max_df': [ 0.5],

    # 'tfidf__min_df': [2],
    'tfidf__ngram_range': [ (1, 2),(1,1),(1,3),(2,2),(2,3)],
    'tfidf__binary': [True,False],
    'tfidf__sublinear_tf': [True,False],
    'tfidf__use_idf': [False,True],
    'tfidf__stop_words': [stopwords.words('french')],
    'tfidf__max_features':[None, 5000, 20000, 50000,100000],
    'reg__max_iter': [10000],
    'reg__tol': [1e-4],
    'reg__penalty': [ 'l2' ],
    'reg__C': [ 10],


}

scoring = {
    'f1_score': make_scorer(f1_score,pos_label=-1),
    'roc_auc': make_scorer(roc_auc_score),
    'accuracy': make_scorer(accuracy_score)
}
preprocessors = lambda x: ((((cmn.suppression_ponctuation(cmn.suppression_balises_html((x)))))))

grid_search_ov (parameters , scoring,preprocessors ,'f1_score')


Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Meilleurs paramètres trouvés:
{'reg__C': 10, 'reg__max_iter': 10000, 'reg__penalty': 'l2', 'reg__tol': 0.0001, 'tfidf__binary': False, 'tfidf__max_df': 0.5, 'tfidf__max_features': None, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': ['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soi

In [17]:
def fonction_eval(vectorizer,alltexts,alllabs):

    [X_all_train, X_all_test, y_train, y_test]  = train_test_split(alltexts,alllabs, test_size=0.2, random_state=10, shuffle=True)
    X_train = vectorizer.fit_transform(X_all_train)
    X_test = vectorizer.transform(X_all_test)


    #Logistic Regression
    t = 1e-8
    C=10
    lr_clf = LogisticRegression(random_state=0, solver='liblinear',max_iter=1000, tol=t, C=C,class_weight='balanced')
    lr_clf.fit(X_train, y_train)
    pred_lr = lr_clf.predict_proba(X_test)
    probabilites_metterand = pred_lr[:,0]
    smoothed_pred = gaussian_filter(probabilites_metterand,sigma=2)

    smoothed_pred_labels = (2 * (smoothed_pred <= 0.5) - 1).astype(int)

    f1 = f1_score(y_test, smoothed_pred_labels,  pos_label=-1)  

    roc_auc = roc_auc_score(y_test, smoothed_pred_labels)
    accuracy = accuracy_score(y_test, (smoothed_pred_labels > 0.5).astype(int))

    print("Performance après lissage:")
    print("F1 Score:", f1)
    print("AUC:", roc_auc)
    print("Accuracy:", accuracy)


In [21]:
from sklearn.metrics import precision_recall_fscore_support,classification_report
from scipy.ndimage import gaussian_filter

def fonction_eval(vectorizer,state,alltexts,alllabs):

    [X_all_train, X_all_test, y_train, y_test]  = train_test_split(alltexts, alllabs, test_size=0.2, random_state=state, shuffle=True)
    X_train = vectorizer.fit_transform(X_all_train)
    X_test = vectorizer.transform(X_all_test)
    #Logistic Regression
    t = 1e-8
    C=10
    lr_clf = LogisticRegression(random_state=0, solver='liblinear',max_iter=1000, tol=t, C=C,class_weight='balanced')
    lr_clf.fit(X_train, y_train)
    pred_lr = lr_clf.predict_proba(X_test)

    smoothed_pred = gaussian_filter(pred_lr[:,0],sigma=0.3)

    threshold = 0.5 
    binary_predictions = np.where(smoothed_pred >= threshold, -1, 1)

    # Calculate F1-score
    f1 = f1_score(y_test, binary_predictions, average='weighted')

    print("F1 Score:", f1)
    print(classification_report(y_test,binary_predictions ))

In [22]:
from imblearn.over_sampling import RandomOverSampler

tf_idf_vectorizer =TfidfVectorizer(min_df=5, ngram_range=(1, 3),binary=True)

def preprocessors_fct (x):
    x_1 = cmn.lowercase(cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html(x)))))
    return x_1
alltxts_cleand = [preprocessors_fct (x) for x in alltxts]
fonction_eval(tf_idf_vectorizer,42,  alltxts_cleand,alllabs)

F1 Score: 0.8952444623126089
              precision    recall  f1-score   support

          -1       0.57      0.66      0.61      1481
           1       0.95      0.93      0.94     10002

    accuracy                           0.89     11483
   macro avg       0.76      0.79      0.77     11483
weighted avg       0.90      0.89      0.90     11483



In [25]:
tf_idf_vectorizer =TfidfVectorizer(min_df=5, ngram_range=(1, 2),binary=True)

def preprocessors_fct (x):
    x_1 = cmn.remove_stopwords(cmn.lowercase(cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html(x))))))
    return x_1
alltxts_cleand = [preprocessors_fct (x) for x in alltxts]
fonction_eval(tf_idf_vectorizer,42,  alltxts_cleand,alllabs)

F1 Score: 0.8702808340858814
              precision    recall  f1-score   support

          -1       0.47      0.66      0.55      1481
           1       0.95      0.89      0.92     10002

    accuracy                           0.86     11483
   macro avg       0.71      0.78      0.73     11483
weighted avg       0.89      0.86      0.87     11483



In [23]:
from imblearn.over_sampling import RandomOverSampler

tf_idf_vectorizer =TfidfVectorizer(min_df=5, ngram_range=(1, 3),binary=True,max_df=0.9)

def preprocessors_fct (x):
    x_1 = (cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html(x)))))
    x_2 =(cmn.lemmatization(cmn.remove_stopwords(cmn.suppression_chiffres(x_1))))

    return x_2
alltxts_cleand = [preprocessors_fct (x) for x in alltxts]
fonction_eval(tf_idf_vectorizer,42,  alltxts_cleand,alllabs)

F1 Score: 0.871634726229245
              precision    recall  f1-score   support

          -1       0.48      0.66      0.55      1481
           1       0.95      0.89      0.92     10002

    accuracy                           0.86     11483
   macro avg       0.71      0.78      0.74     11483
weighted avg       0.89      0.86      0.87     11483



In [27]:
tf_idf_vectorizer =TfidfVectorizer(min_df=5, ngram_range=(1, 3),binary=True,max_df=0.5)

def preprocessors_fct (x):
    x_1 = (cmn.majuscules_en_marqueurs(cmn.suppression_ponctuation((cmn.suppression_balises_html(x)))))
    x_2 =(cmn.lemmatization(cmn.suppression_chiffres(x_1)))

    return x_2
alltxts_cleand = [preprocessors_fct (x) for x in alltxts]
fonction_eval(tf_idf_vectorizer,42,  alltxts_cleand,alllabs)

F1 Score: 0.895940670090053
              precision    recall  f1-score   support

          -1       0.57      0.66      0.61      1481
           1       0.95      0.93      0.94     10002

    accuracy                           0.89     11483
   macro avg       0.76      0.79      0.78     11483
weighted avg       0.90      0.89      0.90     11483

