## Importation des bibliothèques

In [None]:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.multiclass import OneVsRestClassifier


##  Préparation des données

In [None]:

# Binary label matrix: every column of `data` from the third one onward is
# treated as a label (the original note says there are 6 of them).
y = data[data.columns[2:]].values

# Split row indices first so that the vectorizer can be fitted on the training
# rows only. train_test_split picks rows from the sample count and random_state,
# so this selects the same rows as splitting the feature matrix directly.
row_indices = np.arange(len(data))
train_idx, test_idx = train_test_split(row_indices, test_size=0.2, random_state=42)

# Cap the vocabulary at 5000 terms to keep memory usage manageable.
tfidf = TfidfVectorizer(max_features=5000)
# Learn the vocabulary and IDF weights from training comments only: fitting on
# the full corpus would leak test-set statistics into the features.
tfidf.fit(data['comment_text'].iloc[train_idx])

# Vectorize every comment once, then slice out the two partitions.
X_tfidf = tfidf.transform(data['comment_text'])
X_train_tfidf, X_test_tfidf = X_tfidf[train_idx], X_tfidf[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Sanity checks on the training partition
print("X_train_tfidf Shape:", X_train_tfidf.shape)
print("y_train Shape:", y_train.shape)
print("Sample of y_train:\n", y_train[:5])


## Entraînement du modèle baseline

In [None]:

# Baseline: a one-vs-rest wrapper around logistic regression, so each label
# column gets its own binary classifier. The solver is allowed up to 1000
# iterations.
baseline_model = OneVsRestClassifier(
    estimator=LogisticRegression(max_iter=1000),
)
baseline_model.fit(X=X_train_tfidf, y=y_train)


##  Test du modèle sur des exemples

In [None]:

# Hand-written comments used to eyeball the baseline's behaviour.
test_comments = [
    "You’re a disgusting idiot!",
    "Great job, keep it up!",
    "I could kill you for this… just kidding!",
]

# Label names are the same for every example, so look them up once.
label_names = data.columns[2:]

print("\nTest du modèle baseline sur des exemples :")
for comment in test_comments:
    # Vectorize the single comment with the already-fitted TF-IDF vocabulary.
    comment_features = tfidf.transform([comment])
    # Both calls return one row per input; keep the only row.
    hard_labels = baseline_model.predict(comment_features)[0]
    label_probas = baseline_model.predict_proba(comment_features)[0]

    print(f"\nComment: {comment}")
    print(
        "Prédictions (0/1):",
        {name: flag for name, flag in zip(label_names, hard_labels)},
    )
    print(
        "Probabilités:",
        {name: proba for name, proba in zip(label_names, label_probas)},
    )


##  Évaluation avec seuil fixe

In [None]:

# Hard 0/1 decisions and per-label probabilities for the held-out split.
y_pred = baseline_model.predict(X_test_tfidf)
y_pred_proba = baseline_model.predict_proba(X_test_tfidf)

# F1 on the hard decisions under both averaging schemes.
f1_macro_baseline, f1_micro_baseline = (
    f1_score(y_test, y_pred, average=scheme) for scheme in ('macro', 'micro')
)
# ROC-AUC is computed from the probabilities rather than the hard decisions.
roc_auc_baseline = roc_auc_score(y_test, y_pred_proba)

print(
    "\nÉvaluation du modèle baseline (seuil fixe 0.5):",
    f"F1 Score (macro): {f1_macro_baseline:.4f}",
    f"F1 Score (micro): {f1_micro_baseline:.4f}",
    f"ROC-AUC Score: {roc_auc_baseline:.4f}",
    sep="\n",
)


##  Optimisation des seuils pour F1-score macro

In [None]:

def optimize_thresholds(y_true, y_pred_proba, labels, *, candidate_thresholds=None):
    """Pick, for each label, the probability cut-off that maximizes its F1 score.

    Args:
        y_true: 2-D array-like (samples x labels) of 0/1 ground-truth flags.
        y_pred_proba: 2-D array-like (samples x labels) of predicted
            probabilities, with columns in the same order as ``y_true``.
        labels: Sequence of label names; only its length is used, to decide
            how many leading columns are tuned.
        candidate_thresholds: Optional iterable of cut-offs to try. Defaults
            to a grid from 0.10 up to (but excluding) 0.90 in steps of 0.05.

    Returns:
        A list of Python floats, one per label. A label for which no
        candidate reaches an F1 above 0 keeps the default cut-off of 0.5.

    Raises:
        ValueError: If the inputs are not 2-D or have fewer columns than
            there are labels.
    """
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    n_labels = len(labels)

    if y_true.ndim != 2 or y_pred_proba.ndim != 2:
        raise ValueError("y_true and y_pred_proba must be 2-D (samples x labels)")
    if min(y_true.shape[1], y_pred_proba.shape[1]) < n_labels:
        raise ValueError(
            f"expected at least {n_labels} label columns, got "
            f"{y_true.shape[1]} in y_true and {y_pred_proba.shape[1]} in y_pred_proba"
        )

    if candidate_thresholds is None:
        # np.arange with a float step accumulates rounding noise
        # (e.g. 0.15000000000000002); rounding gives a clean, readable grid.
        candidate_thresholds = np.round(np.arange(0.1, 0.9, 0.05), 2)
    # Build the grid once (it does not depend on the label) and use plain
    # floats so the returned thresholds print cleanly.
    grid = [float(value) for value in candidate_thresholds]

    best_thresholds = []
    for true_col, proba_col in zip(y_true[:, :n_labels].T, y_pred_proba[:, :n_labels].T):
        best_f1 = 0
        best_thresh = 0.5
        for thresh in grid:
            predicted = (proba_col > thresh).astype(int)
            # zero_division=0 keeps the score at 0 without emitting a warning
            # when a cut-off yields no positive predictions.
            f1 = f1_score(true_col, predicted, zero_division=0)
            # Strict comparison: on ties the lowest threshold wins.
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh
        best_thresholds.append(best_thresh)
    return best_thresholds

labels = data.columns[2:].tolist()

# Tune the thresholds on out-of-fold predictions for the training split rather
# than on the test split: thresholds chosen on y_test and then scored on y_test
# would make the "optimized" F1 optimistically biased. cross_val_predict fits
# clones of the estimator, so the already-fitted baseline_model is left untouched.
train_oof_proba = cross_val_predict(
    baseline_model, X_train_tfidf, y_train, cv=5, method='predict_proba'
)
thresholds_baseline = optimize_thresholds(y_train, train_oof_proba, labels)
print("\nSeuils optimisés par étiquette:", dict(zip(labels, thresholds_baseline)))


##  Évaluation avec seuils optimisés

In [None]:

# Compare every probability column against its own tuned cut-off in a single
# broadcasted operation. The result keeps y_pred_proba's dtype, matching the
# np.zeros_like buffer this replaces. Assumes one threshold per probability
# column.
per_label_cutoffs = np.asarray(thresholds_baseline)
y_pred_adjusted = (y_pred_proba > per_label_cutoffs).astype(y_pred_proba.dtype)

# F1 of the re-thresholded predictions under both averaging schemes.
f1_macro_opt_baseline, f1_micro_opt_baseline = (
    f1_score(y_test, y_pred_adjusted, average=scheme) for scheme in ('macro', 'micro')
)

print(
    "\nÉvaluation avec seuils optimisés:",
    f"F1 Score (macro): {f1_macro_opt_baseline:.4f}",
    f"F1 Score (micro): {f1_micro_opt_baseline:.4f}",
    sep="\n",
)
