In [1]:
# 1. Installation & Imports
!pip install -q torch accelerate transformers datasets evaluate scikit-learn matplotlib seaborn plotly xgboost lightgbm shap lime
!pip install -q huggingface_hub peft bitsandbytes loguru

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:


import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed
)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
)
import evaluate
from peft import LoraConfig, get_peft_model, TaskType
from loguru import logger
import shap, lime.lime_text

In [3]:
# 2. Chargement et Fusion des Données
# Twitter
ds = load_dataset("zeroshot/twitter-financial-news-sentiment", split="train")
df1 = ds.to_pandas()
df1["label"] = df1["label"].map({0:"negative", 1:"positive", 2:"neutral"})
df1["input"] = df1["text"]

# FinGPT
dataset = load_dataset("FinGPT/fingpt-sentiment-train", split="train")
df2 = dataset.to_pandas()
def simplify_9lbl(lbl):
    lbl = lbl.lower()
    if "very positive" in lbl or "strongly positive" in lbl:
        return "very positive"
    if "positive" in lbl:
        return "positive"
    if "very negative" in lbl or "strongly negative" in lbl:
        return "very negative"
    if "negative" in lbl:
        return "negative"
    return "neutral"
df2["label"] = df2["output"].apply(simplify_9lbl)
df2 = df2[["input","label"]]

# Fusion
df_combined = pd.concat([df1[["input","label"]], df2[["input","label"]]], axis=0).reset_index(drop=True)
print(df_combined["label"].value_counts())

README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

sent_train.csv:   0%|          | 0.00/859k [00:00<?, ?B/s]

sent_valid.csv:   0%|          | 0.00/217k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2388 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/529 [00:00<?, ?B/s]

(…)-00000-of-00001-dabab110260ac909.parquet:   0%|          | 0.00/6.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76772 [00:00<?, ? examples/s]

label
neutral     35393
positive    32433
negative    18489
Name: count, dtype: int64


In [4]:
# 3. Visualisation et Option Multi-labels
fig = px.histogram(df_combined, x="label", color="label", title="Distribution des classes")
fig.show()

In [5]:
# 4. Prétraitement
import re
def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = text.lower()
    return text.strip()
df_combined["input"] = df_combined["input"].astype(str).apply(clean_text)

In [6]:
# 5. Split Train/Val/Test (70/10/20)
train_df, temp_df = train_test_split(
    df_combined, test_size=0.30, random_state=42, stratify=df_combined["label"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=2/3, random_state=42, stratify=temp_df["label"]
)
print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

Train: 60420 Val: 8631 Test: 17264


In [7]:
# --- Vectorisation TF-IDF ---
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_df['input'])
X_val   = vectorizer.transform(val_df['input'])
X_test  = vectorizer.transform(test_df['input'])

# --- Encodage des labels textes en entiers ---
le = LabelEncoder()
y_train = le.fit_transform(train_df['label'])
y_val   = le.transform(val_df['label'])
y_test  = le.transform(test_df['label'])

# --- 1. Support Vector Machine (Linear SVC) ---
svm_params = {'C': [0.01, 0.1, 1, 10]}
svm_cv = GridSearchCV(
    LinearSVC(),
    param_grid=svm_params,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    error_score='raise'
)
svm_cv.fit(X_train, y_train)
print('--- SVM ---')
print(svm_cv.best_params_)
print(classification_report(y_test, svm_cv.predict(X_test), target_names=le.classes_, digits=3))

# Calibration pour obtenir predict_proba
calibrated_svm = CalibratedClassifierCV(svm_cv.best_estimator_, cv=3)
calibrated_svm.fit(X_train, y_train)

# --- 2. Random Forest ---
rf_params = {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}
rf = GridSearchCV(
    RandomForestClassifier(),
    param_grid=rf_params,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    error_score='raise'
)
rf.fit(X_train, y_train)
print('--- Random Forest ---')
print(rf.best_params_)
print(classification_report(y_test, rf.predict(X_test), target_names=le.classes_, digits=3))

# --- 3. XGBoost ---
xgb_params = {'n_estimators': [100, 200]}
xgb = GridSearchCV(
    XGBClassifier(tree_method='hist', use_label_encoder=False, eval_metric='mlogloss'),
    param_grid=xgb_params,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    error_score='raise'
)
xgb.fit(X_train, y_train)
print('--- XGBoost ---')
print(xgb.best_params_)
print(classification_report(y_test, xgb.predict(X_test), target_names=le.classes_, digits=3))

# --- 4. LightGBM ---
lgbm_params = {'n_estimators': [100, 200]}
lgbm = GridSearchCV(
    LGBMClassifier(),
    param_grid=lgbm_params,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    error_score='raise'
)
lgbm.fit(X_train, y_train)
print('--- LightGBM ---')
print(lgbm.best_params_)
print(classification_report(y_test, lgbm.predict(X_test), target_names=le.classes_, digits=3))

# --- 5. Voting Ensemble (Soft Voting) ---
voting = VotingClassifier(
    estimators=[
        ('svm', calibrated_svm),
        ('rf', rf.best_estimator_),
        ('xgb', xgb.best_estimator_),
        ('lgbm', lgbm.best_estimator_)
    ],
    voting='soft',
    n_jobs=-1
)
voting.fit(X_train, y_train)
print('--- Voting Ensemble ---')
print(classification_report(y_test, voting.predict(X_test), target_names=le.classes_, digits=3))


--- SVM ---
{'C': 10}
              precision    recall  f1-score   support

    negative      0.884     0.856     0.870      3698
     neutral      0.915     0.945     0.929      7079
    positive      0.905     0.889     0.897      6487

    accuracy                          0.905     17264
   macro avg      0.901     0.896     0.899     17264
weighted avg      0.904     0.905     0.904     17264

--- Random Forest ---
{'max_depth': None, 'n_estimators': 200}
              precision    recall  f1-score   support

    negative      0.967     0.790     0.870      3698
     neutral      0.918     0.966     0.941      7079
    positive      0.885     0.926     0.905      6487

    accuracy                          0.913     17264
   macro avg      0.923     0.894     0.905     17264
weighted avg      0.916     0.913     0.912     17264




A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



--- XGBoost ---
{'n_estimators': 200}
              precision    recall  f1-score   support

    negative      0.915     0.731     0.813      3698
     neutral      0.821     0.944     0.878      7079
    positive      0.873     0.832     0.852      6487

    accuracy                          0.856     17264
   macro avg      0.870     0.835     0.848     17264
weighted avg      0.861     0.856     0.854     17264




A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3.215798 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210462
[LightGBM] [Info] Number of data points in the train set: 60420, number of used features: 9908
[LightGBM] [Info] Start training from score -1.540842
[LightGBM] [Info] Start training from score -0.891485
[LightGBM] [Info] Start training from score -0.978823
--- LightGBM ---
{'n_estimators': 200}
              precision    recall  f1-score   support

    negative      0.894     0.762     0.823      3698
     neutral      0.841     0.933     0.885      7079
    positive      0.877     0.847     0.861      6487

    accuracy                          0.864     17264
   macro avg      0.871     0.847     0.856     17264
weighted avg      0.866     0.864     0.863     17264

--- Voting Ensemble ---
              precision    recall  f1

In [None]:
# Réglages globaux
set_seed(42)

# --- Préparation des labels ---
label_list = sorted(set(train_df['label']) | set(val_df['label']) | set(test_df['label']))
label2id = {lbl: i for i, lbl in enumerate(label_list)}
id2label = {i: lbl for lbl, i in label2id.items()}

# Fonction de mapping pour HuggingFace Dataset
def map_label(ex):
    ex['labels'] = label2id[ex.pop('label')]
    return ex

# Création des datasets HF
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True)).map(map_label, remove_columns=['label'])
val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True)).map(map_label, remove_columns=['label'])
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True)).map(map_label, remove_columns=['label'])

# Tokenization
def tokenize_map(ex, tokenizer):
    return tokenizer(
        ex['input'], padding='max_length', truncation=True, max_length=128
    )

# Boucle de fine-tuning avec régularisation et early stopping
def run_transformer(model_name):
    # Chargement du tokenizer et du modèle
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    # Augmentation du dropout
    model.config.hidden_dropout_prob = 0.3
    model.config.attention_probs_dropout_prob = 0.3

    # Préparation des données tokenisées
    collator = DataCollatorWithPadding(tokenizer)
    train_tok = train_ds.map(lambda ex: tokenize_map(ex, tokenizer), batched=True).remove_columns(['input'])
    val_tok   = val_ds.map(lambda ex: tokenize_map(ex, tokenizer), batched=True).remove_columns(['input'])
    test_tok  = test_ds.map(lambda ex: tokenize_map(ex, tokenizer), batched=True).remove_columns(['input'])

    # Arguments d'entraînement
    args = TrainingArguments(
        output_dir=f"./{model_name.replace('/', '_')}_sentiment",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
        num_train_epochs=5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=500,
        lr_scheduler_type="linear",
        logging_steps=100,
        fp16=True,
        report_to="none"
    )

    # Fonction de calcul des métriques
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, preds),
            'f1': f1_score(labels, preds, average='macro'),
            'precision': precision_score(labels, preds, average='macro'),
            'recall': recall_score(labels, preds, average='macro')
        }

    # Création du trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        tokenizer=tokenizer,
        data_collator=collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
    )

    # Entraînement
    trainer.train()

    # Évaluation finale
    preds = trainer.predict(test_tok)
    y_true = preds.label_ids
    y_pred = np.argmax(preds.predictions, axis=1)
    print(f"\n=== {model_name} sur test ===")
    print(classification_report(y_true, y_pred, target_names=label_list, digits=3))

    return trainer

# Liste des modèles à tester
model_names = [
    "ProsusAI/finbert",
    "bert-base-uncased",
    "distilbert-base-uncased",
    "roberta-base"
]

# Exécution
for name in model_names:
    try:
        run_transformer(name)
    except Exception as e:
        print(f"Erreur pour {name}: {e}")


Map:   0%|          | 0/60420 [00:00<?, ? examples/s]

Map:   0%|          | 0/8631 [00:00<?, ? examples/s]

Map:   0%|          | 0/17264 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/60420 [00:00<?, ? examples/s]

Map:   0%|          | 0/8631 [00:00<?, ? examples/s]

Map:   0%|          | 0/17264 [00:00<?, ? examples/s]


`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead


`tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2587,0.216519,0.92805,0.925698,0.928586,0.923535



Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



In [None]:
# 8. Explicabilité (SHAP, LIME)
# SHAP pour XGBoost
explainer = shap.Explainer(xgb.best_estimator_)
shap_values = explainer(X_test[:100])
shap.summary_plot(shap_values, X_test[:100], feature_names=vectorizer.get_feature_names_out())

# LIME pour SVM
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=label_list)
idx = 0
exp = explainer.explain_instance(test_df.iloc[idx]["input"], svm.predict_proba, num_features=10)
exp.show_in_notebook()

In [None]:
# 9. Analyse d’erreurs et Visualisation
# Exemples mal classés
test_df["pred"] = voting.predict(X_test)
erreurs = test_df[test_df["label"] != test_df["pred"]]
print(erreurs.sample(10))

# Matrice de confusion
cm = confusion_matrix(y_test, voting.predict(X_test), labels=label_list)
fig = ff.create_annotated_heatmap(
    z=cm, x=label_list, y=label_list, annotation_text=[[str(y) for y in x] for x in cm], colorscale="Blues"
)
fig.update_layout(title="Confusion Matrix - Voting Ensemble")
fig.show()

In [None]:
# 10. Export du meilleur modèle
import joblib
joblib.dump(voting, "voting_sentiment_model.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")