# 🧠 03 - Modèle sur mesure avancé (Keras + embeddings)
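Ce notebook entraîne un modèle de deep learning simple (couche d'embedding, moyenne globale, puis couches denses) pour la classification binaire de sentiment, et enregistre l'exécution dans MLflow.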

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import mlflow
import mlflow.tensorflow

## 📥 Chargement des données nettoyées

In [None]:
# Load the cleaned tweets; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/processed/tweets_clean.csv")

# pandas parses empty CSV fields as NaN, so a tweet whose cleaned text ended up
# empty comes back as a float rather than a string. The Keras Tokenizer used in
# the next cell expects strings, so rows without text (or without a label) are
# removed here instead of failing later during tokenisation.
df = df.dropna(subset=['clean_text', 'sentiment'])

# Features are the cleaned tweet texts (forced to str), targets the sentiment labels.
X = df['clean_text'].astype(str)
y = df['sentiment']

# 80/20 split, stratified on the label so both splits keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## 🔠 Tokenisation et padding

In [None]:
# Vocabulary cap and fixed sequence length; both are reused by the model cell.
vocab_size = 20000
max_len = 50

# Build the word index from the training texts only, so nothing from the test
# split leaks into the vocabulary. "<OOV>" is the token configured for
# out-of-vocabulary words.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer sequences with the same fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to max_len, padding at the end (padding='post').
X_train_pad, X_test_pad = (
    pad_sequences(seqs, padding='post', maxlen=max_len)
    for seqs in (X_train_seq, X_test_seq)
)

## 🧠 Construction du modèle Keras

In [None]:
# Training hyperparameters are named here so that the values passed to Keras
# and the values recorded in MLflow cannot drift apart.
embedding_dim = 64
epochs = 5
batch_size = 128

with mlflow.start_run(run_name="NN_Keras_Embedding"):

    # Log hyperparameters first: a run that fails during training still
    # records the configuration it was started with.
    mlflow.log_params({
        "vocab_size": vocab_size,
        "embedding_dim": embedding_dim,
        "max_len": max_len,
        "epochs": epochs,
        "batch_size": batch_size,
    })

    # Embedding -> average over the sequence axis -> small dense head with a
    # single sigmoid output.
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
        GlobalAveragePooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    # NOTE(review): binary cross-entropy with a sigmoid output assumes the
    # 'sentiment' labels are encoded as 0/1 - confirm against the dataset.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # 10% of the training data is held out by Keras for per-epoch validation.
    history = model.fit(
        X_train_pad, y_train,
        validation_split=0.1,
        epochs=epochs,
        batch_size=batch_size,
        verbose=2
    )

    # Record the learning curves (training and validation loss/accuracy) so
    # they can be inspected in the MLflow UI, one point per epoch.
    for metric_name, values in history.history.items():
        for epoch, value in enumerate(values):
            mlflow.log_metric(metric_name, float(value), step=epoch)

    # Final score on the held-out test split.
    loss, acc = model.evaluate(X_test_pad, y_test, verbose=0)
    print(f"✅ Test accuracy: {acc:.4f}")

    # Log both test metrics; the loss was previously computed but discarded.
    mlflow.log_metrics({"test_loss": float(loss), "test_accuracy": float(acc)})
    mlflow.tensorflow.log_model(model, "model")

## 🧾 Évaluation du modèle

In [None]:
# Model outputs for the padded test sequences.
y_pred_probs = model.predict(X_test_pad)

# Hard labels: 1 where the predicted value is strictly above 0.5, otherwise 0.
y_pred = np.greater(y_pred_probs, 0.5).astype(int)

# Per-class precision / recall / F1 against the true test labels.
report = classification_report(y_test, y_pred)
print(report)