In [2]:
# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold
from transformers import AutoTokenizer, AutoModel
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import torch
from tqdm import tqdm
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
import pickle
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
import torch


# Sauvegarde des modèles
import joblib

# Système et utilitaires
import os
from datetime import datetime
from pathlib import Path

# API et web
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List

# Implicite 
import streamlit
import requests
import uvicorn

import pandas as pd
import numpy as np
import json
import pickle

  from .autonotebook import tqdm as notebook_tqdm


# 5. VECTORISATION ET MODÉLISATION

In [3]:
# Importation des données processed
df_processed = pd.read_csv('data/processed/tweets_processed.csv')
df_processed

Unnamed: 0,id,keyword,location,text,target,combined_text,cleaned_text,text_length,processed_text
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1,"Communal violence in Bhainsa, Telangana. ""Ston...",communal violenc bhainsa telangana stone pelt ...,125,communal violenc in bhainsa telangana stone we...
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1,Telangana: Section 144 has been imposed in Bha...,telangana section impos bhainsa januari clash ...,131,telangana section has been impos in bhainsa fr...
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1,Arsonist sets cars ablaze at dealership https:...,arsonist set car ablaz dealership ablaz,63,arsonist set car ablaz at dealership
3,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0,"""Lord Jesus, your love brings freedom and pard...",lord jesus love bring freedom pardon fill holi...,140,lord jesus your love bring freedom and pardon ...
4,5,ablaze,OC,"If this child was Chinese, this tweet would ha...",0,"If this child was Chinese, this tweet would ha...",child chines tweet would gone viral social med...,122,if this child was chines this tweet would have...
...,...,...,...,...,...,...,...,...,...
10798,11364,wrecked,,Had these guys last game n fcked them. Talked ...,0,Had these guys last game n fcked them. Talked ...,guy last game n fcked talk non stop shit n sti...,139,had these guy last game n fcked them talk non ...
10799,11365,wrecked,Blue State in a red sea,Media should have warned us well in advance. T...,0,Media should have warned us well in advance. T...,media warn us well advanc wreck whole night re...,92,media should have warn us well in advanc this ...
10800,11366,wrecked,arohaonces,i feel directly attacked 💀 i consider moonbin ...,0,i feel directly attacked 💀 i consider moonbin ...,feel direct attack consid moonbin amp jinjin b...,115,i feel direct attack i consid moonbin amp jinj...
10801,11368,wrecked,auroraborealis,"ok who remember ""outcast"" nd the ""dora"" au?? T...",0,"ok who remember ""outcast"" nd the ""dora"" au?? T...",ok rememb outcast nd dora au au wreck nerv nd ...,105,ok who rememb outcast nd the dora au those au ...


In [4]:
# Séparation des données en features et target
X = df_processed['processed_text']
y = df_processed['target']
# Séparation des données en train et test
X_train, X_test = train_test_split(df_processed, test_size=0.2, random_state=42, stratify=y)

In [5]:
class BERTEmbedder(BaseEstimator, TransformerMixin):
    """
    Classe pour transformer des textes en embeddings BERT
    Compatible avec les pipelines sklearn
    """
    def __init__(self, model_name='bert-base-multilingual-cased', max_length=128, embedding_strategy="cls"):
        """
        Initialise la classe BERTEmbedder
        
        Args:
            model_name (str): Nom du modèle à utiliser
            max_length (int): Longueur maximale des séquences après tokenization
            embedding_strategy (str): Stratégie d'embedding ("cls" ou "mean")
        """
        self.model_name = model_name
        self.max_length = max_length
        self.embedding_strategy = embedding_strategy
        self.tokenizer = None
        self.model = None
        
        valid_emb_strategy = ("cls", "mean")
        if self.embedding_strategy not in valid_emb_strategy:
            raise ValueError(f"embedding_strategy doit être l'un des suivants: {valid_emb_strategy}")

    def fit(self, X, y=None):
        """ Charge le modèle et le tokenizer
        Args:
            X: Les textes d'entrée
            y: Les labels
            
        Returns:
            self: Retourne l'instance pour le chaînage
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model.eval()
        return self

    def transform(self, X):
        """
        Transforme les textes en embeddings
        
        Args:
            X: Liste de textes à transformer
            
        Returns:
            np.array: Matrice des embeddings
        """
        # Convertir en liste si nécessaire
        if isinstance(X, pd.Series) or isinstance(X, np.ndarray):
            X = X.tolist()
        elif not isinstance(X, list):
            raise ValueError("Les données d'entrée doivent être une liste ou convertible en liste de chaînes de caractères.")
        
        # Nettoyage et conversion en string
        X_cleaned = [str(text) if not pd.isna(text) else "" for text in X]
        
        embeddings = []
        
        for text in tqdm(X_cleaned, desc=f"Embeddings {self.model_name}"):
            inputs = self.tokenizer(text, return_tensors='pt', truncation=True, 
                                    padding=True, max_length=self.max_length)
            
            with torch.no_grad():
                outputs = self.model(**inputs)
            
            last_hidden_states = outputs.last_hidden_state
            
            if self.embedding_strategy == 'cls':
                embedding = last_hidden_states[:, 0, :].squeeze().numpy()
            else:  # mean
                embedding = last_hidden_states.mean(dim=1).squeeze().numpy()
            
            embeddings.append(embedding)
        
        return np.vstack(embeddings)

def test_multiple_classifiers(model_name, twenty_train, twenty_test, embedding_strategy="cls"):
    """
    Crée des pipelines sklearn avec BERTEmbedder et différents classifieurs,
    les entraîne et les évalue
    
    Args:
        model_name (str): Nom du modèle BERT à utiliser
        twenty_train: Données d'entraînement
        twenty_test: Données de test
        embedding_strategy (str): Stratégie d'embedding ("cls" ou "mean")
        
    Returns:
        tuple: (meilleur_classifieur, scores)
    """
    print(f"\n=== Test du modèle {model_name} (embedding par: {embedding_strategy}) ===")
    
    # Assurer que processed_text est une liste de strings
    X_train = twenty_train.processed_text.tolist() if hasattr(twenty_train, 'processed_text') else twenty_train.tolist()
    X_test = twenty_test.processed_text.tolist() if hasattr(twenty_test, 'processed_text') else twenty_test.tolist()
    
    y_train = twenty_train.target if hasattr(twenty_train, 'target') else twenty_train
    y_test = twenty_test.target if hasattr(twenty_test, 'target') else twenty_test
    
    # Créer l'embedder BERT
    embedder = BERTEmbedder(model_name=model_name, max_length=128, embedding_strategy=embedding_strategy)
    
    # Transformer les données une seule fois pour éviter de répéter cette opération coûteuse
    print("Création des embeddings (peut prendre du temps)...")
    X_train_embedded = embedder.fit_transform(X_train)
    X_test_embedded = embedder.transform(X_test)
    
    # Définir les classifieurs à tester
    classifiers = {
        'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
        'SVM': SVC(class_weight='balanced', random_state=42, probability=True),
        'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        'LightGBM': lgb.LGBMClassifier(random_state=42, class_weight='balanced')
    }
    
    results = {}
    
    # Tester chaque classifieur
    for name, clf in classifiers.items():
        print(f"\nEntraînement du modèle {name}...")
        clf.fit(X_train_embedded, y_train)
        y_pred = clf.predict(X_test_embedded)
        
        # Évaluer les performances avec focus sur la classe 1
        accuracy = (y_pred == y_test).mean()
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        recall_class1 = recall_score(y_test, y_pred, average=None)[1] if 1 in np.unique(y_test) else 0
        f1_class1 = f1_score(y_test, y_pred, average=None)[1] if 1 in np.unique(y_test) else 0
        
        results[name] = {
            'accuracy': accuracy,
            'recall': recall,
            'f1': f1,
            'recall_class1': recall_class1,
            'f1_class1': f1_class1,
            'clf': clf
        }
        
        print(f"Résultats pour {name}:")
        print(f"Exactitude: {accuracy:.4f}")
        print(f"Recall (général): {recall:.4f}")
        print(f"F1-score (général): {f1:.4f}")
        print(f"Recall (classe 1): {recall_class1:.4f}")
        print(f"F1-score (classe 1): {f1_class1:.4f}")
        print(classification_report(y_test, y_pred))
    
    # Déterminer le meilleur classifieur en fonction du F1-score et recall pour classe 1
    best_score = 0
    best_classifier = None
    
    for name, scores in results.items():
        # Score combiné donnant un poids égal au recall et F1 pour la classe 1
        score = (scores['f1_class1'])
        if score > best_score:
            best_score = score
            best_classifier = name
    
    print(f"\nMeilleur classifieur: {best_classifier} avec un score de {best_score:.4f}")
    
    # Créer une pipeline complète avec le meilleur classifieur
    best_pipeline = Pipeline([
        ('embedder', embedder),
        ('classifier', results[best_classifier]['clf'])
    ])
    
    return best_classifier, results, best_pipeline, embedder

def optimize_best_classifier(best_classifier, X_train_embedded, y_train, X_test_embedded, y_test):
    """
    Optimise les hyperparamètres du meilleur classifieur
    
    Args:
        best_classifier (str): Nom du meilleur classifieur
        X_train_embedded: Données d'entraînement embedées
        y_train: Labels d'entraînement
        X_test_embedded: Données de test embedées
        y_test: Labels de test
        
    Returns:
        model: Modèle optimisé
    """
    print(f"\n=== Optimisation du classifieur {best_classifier} ===")
    
    # Définir les grilles de paramètres pour chaque classifieur
    param_grids = {
        'LogisticRegression': {
            'C': [0.01, 0.1, 1, 10, 100],
            'solver': ['liblinear', 'lbfgs'],
            'penalty': ['l1', 'l2']
        },
        'SVM': {
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto', 0.1, 0.01],
            'kernel': ['rbf', 'linear']
        },
        'XGBoost': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.3]
        },
        'LightGBM': {
            'n_estimators': [50, 100, 200],
            'num_leaves': [31, 50, 100],
            'learning_rate': [0.01, 0.1, 0.3]
        }
    }
    
    # Sélectionner le classifieur et la grille de paramètres appropriés
    if best_classifier == 'LogisticRegression':
        clf = LogisticRegression(class_weight='balanced', random_state=42, max_iter=2000)
    elif best_classifier == 'SVM':
        clf = SVC(class_weight='balanced', random_state=42, probability=True)
    elif best_classifier == 'XGBoost':
        clf = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    elif best_classifier == 'LightGBM':
        clf = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
    else:
        raise ValueError(f"Classifieur {best_classifier} non reconnu")
    
    param_grid = param_grids[best_classifier]
    
    # Optimisation par validation croisée
    grid_search = GridSearchCV(
        clf, param_grid,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,
        verbose=2
    )
    
    grid_search.fit(X_train_embedded, y_train)
    
    # Récupérer le meilleur modèle
    best_model = grid_search.best_estimator_
    print(f"Meilleurs paramètres: {grid_search.best_params_}")
    
    # Évaluer le modèle optimisé
    y_pred = best_model.predict(X_test_embedded)
    
    print("\nRésultats après optimisation:")
    print(f"Exactitude: {(y_pred == y_test).mean():.4f}")
    print(f"Recall (classe 1): {recall_score(y_test, y_pred, average=None)[1] if 1 in np.unique(y_test) else 0:.4f}")
    print(f"F1-score (classe 1): {f1_score(y_test, y_pred, average=None)[1] if 1 in np.unique(y_test) else 0:.4f}")
    print(classification_report(y_test, y_pred))
    
    return best_model

def run_bert_experiments(twenty_train, twenty_test):
    """
    Exécute des expériences avec différents modèles BERT, stratégies d'embedding
    et différents classifieurs
    
    Args:
        twenty_train: Données d'entraînement
        twenty_test: Données de test
        
    Returns:
        tuple: (meilleur_modèle, meilleur_embedder)
    """
    models = [
        'bert-base-multilingual-cased',  # BERT multilingue (pour plusieurs langues)
        'distilbert-base-uncased',       # DistilBERT (version plus légère)
    ]
    
    strategies = ['mean', 'cls']  # Stratégies d'embedding: 'mean' ou 'cls'
    
    results = {}
    best_pipelines = {}
    
    for model in models:
        for strategy in strategies:
            model_key = f"{model}_{strategy}"
            best_clf, clf_results, pipeline, embedder = test_multiple_classifiers(
                model, twenty_train, twenty_test, embedding_strategy=strategy
            )
            
            # Stocker les résultats
            results[model_key] = {
                'best_classifier': best_clf,
                'classifier_results': clf_results
            }
            
            best_pipelines[model_key] = pipeline
    
    # Déterminer la meilleure combinaison (modèle BERT + stratégie + classifieur)
    best_score = 0
    best_config = None
    
    print("\n=== Résumé des résultats ===")
    summary_data = []
    
    for model_key, result in results.items():
        model, strategy = model_key.split('_')
        best_clf = result['best_classifier']
        clf_scores = result['classifier_results'][best_clf]
        
        # Score combiné (recall + f1 pour classe 1)
        score = (clf_scores['f1_class1'])
        
        summary_data.append({
            'Modèle BERT': model,
            'Stratégie': strategy,
            'Classifieur': best_clf,
            'Recall (classe 1)': clf_scores['recall_class1'],
            'F1 (classe 1)': clf_scores['f1_class1'],
            'Score': score
        })
        
        if score > best_score:
            best_score = score
            best_config = model_key
    
    # Afficher un résumé des résultats
    summary_df = pd.DataFrame(summary_data)
    print(summary_df.sort_values('Score', ascending=False))
    
    if best_config:
        print(f"\nMeilleure configuration: {best_config} avec {results[best_config]['best_classifier']}")
        best_model_key = best_config
        best_clf_name = results[best_config]['best_classifier']
        
        # Préparer les données pour l'optimisation
        model_name, embedding_strategy = best_model_key.split('_')
        embedder = BERTEmbedder(model_name=model_name, max_length=128, embedding_strategy=embedding_strategy)
        
        # Transformer les données
        X_train = twenty_train.processed_text.tolist() if hasattr(twenty_train, 'processed_text') else twenty_train.tolist()
        X_test = twenty_test.processed_text.tolist() if hasattr(twenty_test, 'processed_text') else twenty_test.tolist()
        y_train = twenty_train.target if hasattr(twenty_train, 'target') else twenty_train
        y_test = twenty_test.target if hasattr(twenty_test, 'target') else twenty_test
        
        print("\nCréation des embeddings pour l'optimisation...")
        X_train_embedded = embedder.fit_transform(X_train)
        X_test_embedded = embedder.transform(X_test)
        
        # Optimiser le meilleur classifieur
        best_model = optimize_best_classifier(
            best_clf_name, X_train_embedded, y_train, X_test_embedded, y_test
        )
        
        # Créer la pipeline finale
        final_pipeline = Pipeline([
            ('embedder', embedder),
            ('classifier', best_model)
        ])

        # Exporter le modèle
        print("\nExportation du modèle au format pkl...")
        with open(f'bert_model.pkl', 'wb') as f:
            pickle.dump(final_pipeline, f)
        print(f"Modèle exporté: bert_model.pkl")
        
        return final_pipeline, embedder
    
    return None, None

# Pour exécuter les expériences:
final_pipeline, embedder = run_bert_experiments(X_train, X_test)


=== Test du modèle bert-base-multilingual-cased (embedding par: mean) ===
Création des embeddings (peut prendre du temps)...


Embeddings bert-base-multilingual-cased: 100%|██████████| 8642/8642 [13:31<00:00, 10.65it/s]
Embeddings bert-base-multilingual-cased: 100%|██████████| 2161/2161 [02:52<00:00, 12.53it/s]



Entraînement du modèle LogisticRegression...
Résultats pour LogisticRegression:
Exactitude: 0.8112
Recall (général): 0.8112
F1-score (général): 0.8260
Recall (classe 1): 0.7507
F1-score (classe 1): 0.5837
              precision    recall  f1-score   support

           0       0.94      0.82      0.88      1780
           1       0.48      0.75      0.58       381

    accuracy                           0.81      2161
   macro avg       0.71      0.79      0.73      2161
weighted avg       0.86      0.81      0.83      2161


Entraînement du modèle SVM...
Résultats pour SVM:
Exactitude: 0.8316
Recall (général): 0.8316
F1-score (général): 0.8442
Recall (classe 1): 0.7900
F1-score (classe 1): 0.6232
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      1780
           1       0.51      0.79      0.62       381

    accuracy                           0.83      2161
   macro avg       0.73      0.82      0.76      2161
weighted avg       

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Résultats pour XGBoost:
Exactitude: 0.8760
Recall (général): 0.8760
F1-score (général): 0.8615
Recall (classe 1): 0.4304
F1-score (classe 1): 0.5503
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      1780
           1       0.76      0.43      0.55       381

    accuracy                           0.88      2161
   macro avg       0.83      0.70      0.74      2161
weighted avg       0.87      0.88      0.86      2161


Entraînement du modèle LightGBM...
[LightGBM] [Info] Number of positive: 1524, number of negative: 7118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041575 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 8642, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.0



Résultats pour LightGBM:
Exactitude: 0.8561
Recall (général): 0.8561
F1-score (général): 0.8562
Recall (classe 1): 0.5932
F1-score (classe 1): 0.5924
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1780
           1       0.59      0.59      0.59       381

    accuracy                           0.86      2161
   macro avg       0.75      0.75      0.75      2161
weighted avg       0.86      0.86      0.86      2161


Meilleur classifieur: SVM avec un score de 0.6232

=== Test du modèle bert-base-multilingual-cased (embedding par: cls) ===
Création des embeddings (peut prendre du temps)...


Embeddings bert-base-multilingual-cased: 100%|██████████| 8642/8642 [11:54<00:00, 12.09it/s]
Embeddings bert-base-multilingual-cased: 100%|██████████| 2161/2161 [02:51<00:00, 12.62it/s]



Entraînement du modèle LogisticRegression...
Résultats pour LogisticRegression:
Exactitude: 0.7885
Recall (général): 0.7885
F1-score (général): 0.8057
Recall (classe 1): 0.7008
F1-score (classe 1): 0.5388
              precision    recall  f1-score   support

           0       0.93      0.81      0.86      1780
           1       0.44      0.70      0.54       381

    accuracy                           0.79      2161
   macro avg       0.68      0.75      0.70      2161
weighted avg       0.84      0.79      0.81      2161


Entraînement du modèle SVM...
Résultats pour SVM:
Exactitude: 0.7881
Recall (général): 0.7881
F1-score (général): 0.8061
Recall (classe 1): 0.7244
F1-score (classe 1): 0.5465
              precision    recall  f1-score   support

           0       0.93      0.80      0.86      1780
           1       0.44      0.72      0.55       381

    accuracy                           0.79      2161
   macro avg       0.69      0.76      0.70      2161
weighted avg       

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Résultats pour XGBoost:
Exactitude: 0.8616
Recall (général): 0.8616
F1-score (général): 0.8396
Recall (classe 1): 0.3360
F1-score (classe 1): 0.4613
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1780
           1       0.74      0.34      0.46       381

    accuracy                           0.86      2161
   macro avg       0.80      0.66      0.69      2161
weighted avg       0.85      0.86      0.84      2161


Entraînement du modèle LightGBM...
[LightGBM] [Info] Number of positive: 1524, number of negative: 7118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 8642, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.0



Résultats pour LightGBM:
Exactitude: 0.8607
Recall (général): 0.8607
F1-score (général): 0.8555
Recall (classe 1): 0.5197
F1-score (classe 1): 0.5681
              precision    recall  f1-score   support

           0       0.90      0.93      0.92      1780
           1       0.63      0.52      0.57       381

    accuracy                           0.86      2161
   macro avg       0.76      0.73      0.74      2161
weighted avg       0.85      0.86      0.86      2161


Meilleur classifieur: LightGBM avec un score de 0.5681

=== Test du modèle distilbert-base-uncased (embedding par: mean) ===
Création des embeddings (peut prendre du temps)...


Embeddings distilbert-base-uncased: 100%|██████████| 8642/8642 [05:36<00:00, 25.69it/s]
Embeddings distilbert-base-uncased: 100%|██████████| 2161/2161 [01:24<00:00, 25.53it/s]



Entraînement du modèle LogisticRegression...
Résultats pour LogisticRegression:
Exactitude: 0.8283
Recall (général): 0.8283
F1-score (général): 0.8412
Recall (classe 1): 0.7795
F1-score (classe 1): 0.6155
              precision    recall  f1-score   support

           0       0.95      0.84      0.89      1780
           1       0.51      0.78      0.62       381

    accuracy                           0.83      2161
   macro avg       0.73      0.81      0.75      2161
weighted avg       0.87      0.83      0.84      2161


Entraînement du modèle SVM...
Résultats pour SVM:
Exactitude: 0.8431
Recall (général): 0.8431
F1-score (général): 0.8538
Recall (classe 1): 0.7874
F1-score (classe 1): 0.6390
              precision    recall  f1-score   support

           0       0.95      0.86      0.90      1780
           1       0.54      0.79      0.64       381

    accuracy                           0.84      2161
   macro avg       0.74      0.82      0.77      2161
weighted avg       

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Résultats pour XGBoost:
Exactitude: 0.8792
Recall (général): 0.8792
F1-score (général): 0.8708
Recall (classe 1): 0.5118
F1-score (classe 1): 0.5991
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1780
           1       0.72      0.51      0.60       381

    accuracy                           0.88      2161
   macro avg       0.81      0.73      0.76      2161
weighted avg       0.87      0.88      0.87      2161


Entraînement du modèle LightGBM...
[LightGBM] [Info] Number of positive: 1524, number of negative: 7118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 8642, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.0



Résultats pour LightGBM:
Exactitude: 0.8741
Recall (général): 0.8741
F1-score (général): 0.8755
Recall (classe 1): 0.6719
F1-score (classe 1): 0.6531
              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1780
           1       0.64      0.67      0.65       381

    accuracy                           0.87      2161
   macro avg       0.78      0.79      0.79      2161
weighted avg       0.88      0.87      0.88      2161


Meilleur classifieur: LightGBM avec un score de 0.6531

=== Test du modèle distilbert-base-uncased (embedding par: cls) ===
Création des embeddings (peut prendre du temps)...


Embeddings distilbert-base-uncased: 100%|██████████| 8642/8642 [06:10<00:00, 23.33it/s]
Embeddings distilbert-base-uncased: 100%|██████████| 2161/2161 [01:29<00:00, 24.05it/s]



Entraînement du modèle LogisticRegression...
Résultats pour LogisticRegression:
Exactitude: 0.8390
Recall (général): 0.8390
F1-score (général): 0.8492
Recall (classe 1): 0.7559
F1-score (classe 1): 0.6234
              precision    recall  f1-score   support

           0       0.94      0.86      0.90      1780
           1       0.53      0.76      0.62       381

    accuracy                           0.84      2161
   macro avg       0.74      0.81      0.76      2161
weighted avg       0.87      0.84      0.85      2161


Entraînement du modèle SVM...
Résultats pour SVM:
Exactitude: 0.8366
Recall (général): 0.8366
F1-score (général): 0.8479
Recall (classe 1): 0.7717
F1-score (classe 1): 0.6249
              precision    recall  f1-score   support

           0       0.95      0.85      0.90      1780
           1       0.53      0.77      0.62       381

    accuracy                           0.84      2161
   macro avg       0.74      0.81      0.76      2161
weighted avg       

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Résultats pour XGBoost:
Exactitude: 0.8783
Recall (général): 0.8783
F1-score (général): 0.8698
Recall (classe 1): 0.5092
F1-score (classe 1): 0.5960
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1780
           1       0.72      0.51      0.60       381

    accuracy                           0.88      2161
   macro avg       0.81      0.73      0.76      2161
weighted avg       0.87      0.88      0.87      2161


Entraînement du modèle LightGBM...
[LightGBM] [Info] Number of positive: 1524, number of negative: 7118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 8642, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.0



Résultats pour LightGBM:
Exactitude: 0.8649
Recall (général): 0.8649
F1-score (général): 0.8654
Recall (classe 1): 0.6273
F1-score (classe 1): 0.6208
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1780
           1       0.61      0.63      0.62       381

    accuracy                           0.86      2161
   macro avg       0.77      0.77      0.77      2161
weighted avg       0.87      0.86      0.87      2161


Meilleur classifieur: SVM avec un score de 0.6249

=== Résumé des résultats ===
                    Modèle BERT Stratégie Classifieur  Recall (classe 1)  \
2       distilbert-base-uncased      mean    LightGBM           0.671916   
3       distilbert-base-uncased       cls         SVM           0.771654   
0  bert-base-multilingual-cased      mean         SVM           0.790026   
1  bert-base-multilingual-cased       cls    LightGBM           0.519685   

   F1 (classe 1)     Score  
2       0.653061  0.653061  
3  

Embeddings distilbert-base-uncased: 100%|██████████| 8642/8642 [06:14<00:00, 23.09it/s]
Embeddings distilbert-base-uncased: 100%|██████████| 2161/2161 [01:34<00:00, 22.92it/s]



=== Optimisation du classifieur LightGBM ===
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 1524, number of negative: 7118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 8642, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Meilleurs paramètres: {'learning_rate': 0.3, 'n_estimators': 200, 'num_leaves': 50}

Résultats après optimisation:
Exactitude: 0.8903
Recall (classe 1): 0.5669
F1-score (classe 1): 0.6457
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      1780
           1       0.75      0.57      0.65       381

    accuracy                           0.89    



Modèle exporté: bert_model.pkl
