# 🤖 02 - Modèle classique : TF-IDF + Régression Logistique
Ce notebook entraîne un modèle de référence (TF-IDF suivi d'une régression logistique) pour la classification de sentiment à partir des tweets nettoyés.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns

## 📥 Chargement des données nettoyées

In [None]:
# Load the cleaned tweets produced by the preprocessing step.
df = pd.read_csv("../data/processed/tweets_clean.csv")

# read_csv parses empty fields as NaN, so a tweet whose text was emptied by
# cleaning comes back as NaN. TfidfVectorizer rejects NaN documents and a NaN
# label breaks the stratified split, so drop incomplete rows up front.
df = df.dropna(subset=['clean_text', 'sentiment'])

X = df['clean_text']
y = df['sentiment']

# Hold out 20% of the rows for evaluation; stratify on the label so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## 🧪 Vectorisation avec TF-IDF

In [None]:
# Turn the raw tweet text into TF-IDF features, keeping at most 10,000 terms.
tfidf_params = {"max_features": 10000}
vectorizer = TfidfVectorizer(**tfidf_params)

# The vectorizer is fitted on the training split only; the test split is
# merely transformed with that fitted state.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

## 🚀 Entraînement du modèle avec MLflow

In [None]:
# Train the baseline classifier inside a single MLflow run so that its
# parameters, metrics and artifacts are all recorded together.
with mlflow.start_run(run_name="TFIDF_LogisticRegression"):

    model = LogisticRegression(max_iter=200)
    model.fit(X_train_vec, y_train)

    # y_pred stays at notebook scope: the later cells reuse it.
    y_pred = model.predict(X_test_vec)

    acc = model.score(X_test_vec, y_test)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Log parameters, metrics, model
    mlflow.log_param("vectorizer", "TF-IDF")
    mlflow.log_param("model", "LogisticRegression")
    # Record the hyperparameters actually used so runs can be compared.
    mlflow.log_params({
        "max_features": vectorizer.max_features,
        "max_iter": model.max_iter,
    })
    mlflow.log_metric("accuracy", acc)
    # NOTE(review): the "0"/"1" keys assume the sentiment labels are encoded
    # as 0 (negative) and 1 (positive) -- confirm against the dataset.
    mlflow.log_metrics({
        "precision_neg": report["0"]["precision"],
        "recall_neg": report["0"]["recall"],
        "f1_neg": report["0"]["f1-score"],
        "precision_pos": report["1"]["precision"],
        "recall_pos": report["1"]["recall"],
        "f1_pos": report["1"]["f1-score"]
    })

    mlflow.sklearn.log_model(model, "model")
    # The original call here was mlflow.log_artifact(""), which points at no
    # file and makes the run fail. Store what is actually needed to reuse the
    # run instead: the fitted vectorizer (the classifier cannot score raw text
    # without it) and the full per-class report.
    mlflow.sklearn.log_model(vectorizer, "vectorizer")
    mlflow.log_dict(report, "classification_report.json")

    print("✅ Modèle entraîné et loggé avec MLflow")

## 📊 Matrice de confusion

In [None]:
# Draw the test-set confusion matrix as an annotated heatmap with integer
# counts in each cell.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel("Prédiction")
ax.set_ylabel("Réel")
ax.set_title("Matrice de confusion")
plt.show()

## 🧾 Rapport de classification

In [None]:
# Print the plain-text classification report for the held-out test split.
report_text = classification_report(y_test, y_pred)
print(report_text)