In [1]:
import os
import re
import joblib
import numpy as np
import pandas as pd

from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors

import xgboost as xgb
import mlflow
import mlflow.xgboost

In [2]:
# =========================
# ⚙️ Config
# =========================
DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
SQL_QUERY = """
SELECT movie_id, title, synopsis, rating, genres, release_year
FROM movies
WHERE synopsis IS NOT NULL
"""

EXPERIMENT_NAME = "movies_hybrid_like_dislike"
RUN_NAME = "xgb_hybrid_like_dislike"

LIKE_THRESHOLD = 4.0          # seuil like/dislike
TFIDF_MAX_FEATURES = 5000
SVD_COMPONENTS = 100
RANDOM_STATE = 42
SAMPLE_SIZE = 10_000          # None = tout le dataset

ARTIFACT_DIR = "model"
os.makedirs(ARTIFACT_DIR, exist_ok=True)


In [3]:
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return text

engine = create_engine(DATABASE_URL)
df = pd.read_sql(SQL_QUERY, engine)

df["synopsis_clean"] = df["synopsis"].fillna("").apply(preprocess_text)
df.head()


Unnamed: 0,movie_id,title,synopsis,rating,genres,release_year,synopsis_clean
0,911430,F1,Racing legend Sonny Hayes is coaxed out of ret...,7.8,Action,2025.0,racing legend sonny hayes is coaxed out of ret...
1,575265,Mission: Impossible - The Final Reckoning,Ethan Hunt and team continue their search for ...,7.2,Action,2025.0,ethan hunt and team continue their search for ...
2,1061474,Superman,"Superman, a journalist in Metropolis, embarks ...",7.6,Action,2025.0,superman a journalist in metropolis embarks ...
3,1151334,Eenie Meanie,A former teenage getaway driver gets dragged b...,6.8,Action,2025.0,a former teenage getaway driver gets dragged b...
4,1234821,Jurassic World Rebirth,Five years after the events of Jurassic World ...,6.4,Action,2025.0,five years after the events of jurassic world ...


In [4]:
# TF-IDF
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, stop_words="english")
tfidf_matrix_full = vectorizer.fit_transform(df["synopsis_clean"])

# Réduction SVD
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
tfidf_svd_full = svd.fit_transform(tfidf_matrix_full)

# Genres
df["genres_list"] = df["genres"].fillna("").apply(lambda x: x.split("|"))
mlb = MultiLabelBinarizer()
genres_encoded_full = mlb.fit_transform(df["genres_list"])

# Année
scaler_year = StandardScaler()
year_scaled_full = scaler_year.fit_transform(
    df[["release_year"]].fillna(df["release_year"].mean())
)

# Similarité kNN
nn_full = NearestNeighbors(metric="cosine", algorithm="brute")
nn_full.fit(tfidf_matrix_full)
distances_full, _ = nn_full.kneighbors(tfidf_matrix_full, n_neighbors=6)
neighbor_scores_full = 1 - distances_full[:, 1:]

sim_mean_full = neighbor_scores_full.mean(axis=1)
sim_max_full = neighbor_scores_full.max(axis=1)
sim_min_full = neighbor_scores_full.min(axis=1)
sim_std_full = neighbor_scores_full.std(axis=1)

sim_stats_full = np.column_stack([sim_mean_full, sim_max_full, sim_min_full, sim_std_full])


In [5]:
# Label binaire
y = (df["rating"] >= LIKE_THRESHOLD).astype(int).to_numpy()

# Features
X_full = np.column_stack([tfidf_svd_full, genres_encoded_full, year_scaled_full, sim_stats_full])

# Échantillonnage optionnel
if SAMPLE_SIZE and SAMPLE_SIZE < len(df):
    rng = np.random.default_rng(RANDOM_STATE)
    idx = rng.choice(len(df), size=SAMPLE_SIZE, replace=False)
    X = X_full[idx]
    y = y[idx]
    df_used = df.iloc[idx].reset_index(drop=True)
else:
    X = X_full
    df_used = df.reset_index(drop=True)

X.shape, y.shape


((10000, 124), (10000,))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
scale_pos_weight = (n_neg / n_pos) if n_pos > 0 else 1.0

print(f"Pos: {n_pos}, Neg: {n_neg}, scale_pos_weight={scale_pos_weight:.2f}")


Pos: 4903, Neg: 3097, scale_pos_weight=0.63


In [7]:
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME):
    params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "random_state": RANDOM_STATE,
        "n_jobs": -1,
        "use_label_encoder": False,
        "eval_metric": "logloss",
        "scale_pos_weight": scale_pos_weight,
    }
    mlflow.log_param("like_threshold", LIKE_THRESHOLD)
    mlflow.log_params(params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    mlflow.xgboost.log_model(model, "xgb_hybrid_like_dislike_model")

print(f"✅ Metrics — ACC: {acc:.4f} | ROC-AUC: {roc:.4f} | P: {prec:.4f} | R: {rec:.4f} | F1: {f1:.4f}")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  self.get_booster().save_model(fname)


✅ Metrics — ACC: 0.6675 | ROC-AUC: 0.6770 | P: 0.7049 | R: 0.7871 | F1: 0.7437


In [8]:
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Confusion Matrix:
 [[370 404]
 [261 965]]

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.48      0.53       774
           1       0.70      0.79      0.74      1226

    accuracy                           0.67      2000
   macro avg       0.65      0.63      0.64      2000
weighted avg       0.66      0.67      0.66      2000



In [None]:
joblib.dump(model, os.path.join(ARTIFACT_DIR, "xgb_classifier_model.joblib"))
joblib.dump(vectorizer, os.path.join(ARTIFACT_DIR, "reco_vectorizer.joblib"))
joblib.dump(svd, os.path.join(ARTIFACT_DIR, "svd_model.joblib"))
joblib.dump(tfidf_matrix_full, os.path.join(ARTIFACT_DIR, "tfidf_matrix_full.joblib"))
joblib.dump(mlb, os.path.join(ARTIFACT_DIR, "mlb_model.joblib"))
joblib.dump(scaler_year, os.path.join(ARTIFACT_DIR, "scaler_year.joblib"))
joblib.dump(nn_full, os.path.join(ARTIFACT_DIR, "nn_full.joblib"))

#df[["movie_id", "title"]].to_csv(os.path.join(ARTIFACT_DIR, "movie_index_full.csv"), index=False)
#df.to_csv(os.path.join(ARTIFACT_DIR, "movies_full.csv"), index=False)

print("🎉 Artefacts sauvegardés dans", ARTIFACT_DIR)


🎉 Artefacts sauvegardés dans model


In [13]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
from sqlalchemy import create_engine
import re
import os
import joblib
import nltk

nltk.download("stopwords")
nltk.download("wordnet")

# =========================
# 1. Charger les données
# =========================
DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
engine = create_engine(DATABASE_URL)
query = "SELECT movie_id, title, synopsis, rating, genres, release_year FROM movies WHERE synopsis IS NOT NULL"
df = pd.read_sql(query, engine)

# Nettoyage texte
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return text

df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

# =========================
# 2. TF-IDF + SVD sur tout le dataset
# =========================
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix_full = vectorizer.fit_transform(df["synopsis_clean"].fillna(""))

svd_full = TruncatedSVD(n_components=100, random_state=42)
tfidf_svd_full = svd_full.fit_transform(tfidf_matrix_full)

# =========================
# 3. Genres et années
# =========================
df["genres_list"] = df["genres"].fillna("").apply(lambda x: x.split("|"))

mlb = MultiLabelBinarizer()
genres_encoded_full = mlb.fit_transform(df["genres_list"])

scaler_year = StandardScaler()
year_scaled_full = scaler_year.fit_transform(df[["release_year"]].fillna(df["release_year"].mean()))

# =========================
# 4. Nearest Neighbors sur full dataset
# =========================
nn_full = NearestNeighbors(metric="cosine", algorithm="brute")
nn_full.fit(tfidf_matrix_full)
distances_full, indices_full = nn_full.kneighbors(tfidf_matrix_full, n_neighbors=6)
neighbor_scores_full = 1 - distances_full[:, 1:]

sim_mean_full = neighbor_scores_full.mean(axis=1)
sim_max_full = neighbor_scores_full.max(axis=1)
sim_min_full = neighbor_scores_full.min(axis=1)
sim_std_full = neighbor_scores_full.std(axis=1)

# =========================
# 5. Échantillon pour entraînement XGB
# =========================
df_sample = df.sample(10_000, random_state=42).reset_index(drop=True)

tfidf_matrix_sample = vectorizer.transform(df_sample["synopsis_clean"].fillna(""))
tfidf_svd_sample = svd_full.transform(tfidf_matrix_sample)

genres_encoded_sample = mlb.transform(df_sample["genres_list"])
year_scaled_sample = scaler_year.transform(df_sample[["release_year"]].fillna(df["release_year"].mean()))

nn_sample = NearestNeighbors(metric="cosine", algorithm="brute")
nn_sample.fit(tfidf_matrix_sample)
distances_sample, indices_sample = nn_sample.kneighbors(tfidf_matrix_sample, n_neighbors=6)
neighbor_scores_sample = 1 - distances_sample[:, 1:]

sim_mean_sample = neighbor_scores_sample.mean(axis=1)
sim_max_sample = neighbor_scores_sample.max(axis=1)
sim_min_sample = neighbor_scores_sample.min(axis=1)
sim_std_sample = neighbor_scores_sample.std(axis=1)

# Classification : like / dislike
threshold = 7.0  # note >= 7 -> "like"
y_class_sample = (df_sample["rating"] >= threshold).astype(int)

# Features
X_sample = np.column_stack([
    tfidf_svd_sample,
    genres_encoded_sample,
    year_scaled_sample,
    sim_mean_sample,
    sim_max_sample,
    sim_min_sample,
    sim_std_sample
])

X_train, X_test, y_train, y_test = train_test_split(X_sample, y_class_sample, test_size=0.2, random_state=42)

# =========================
# 6. MLflow & XGB
# =========================
mlflow.set_experiment("movies_reco_pipeline_classif")

with mlflow.start_run(run_name="xgb_hybrid_classif"):
    params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "random_state": 42,
        "n_jobs": -1,
        "use_label_encoder": False,
        "eval_metric": "logloss"
    }
    mlflow.log_params(params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Évaluation
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("auc", auc)
    mlflow.xgboost.log_model(model, "xgb_classifier_model")

print("✅ Pipeline classification terminé ! Accuracy:", acc, "| AUC:", auc)

# =========================
# 7. Sauvegarde des artefacts
# =========================
os.makedirs("model", exist_ok=True)

# Sauvegarde du modèle
joblib.dump(model, "model/xgb_classifier_model.joblib")
print("✅ Modèle XGB sauvegardé !")

# Sauvegarde du vectorizer
joblib.dump(vectorizer, "model/reco_vectorizer.joblib")
print("✅ TfidfVectorizer sauvegardé !")

# Sauvegarde du SVD
joblib.dump(svd_full, "model/svd_model.joblib")
print("✅ SVD sauvegardé !")

# Sauvegarde de la TF-IDF matrix complète
joblib.dump(tfidf_matrix_full, "model/tfidf_matrix_full.joblib")
print("✅ TF-IDF matrix complète sauvegardée !")

# Sauvegarde du movie index
df[["movie_id", "title"]].to_csv("model/movie_index.csv", index=False)
print("✅ Movie index complet sauvegardé !")

# Sauvegarde du MultiLabelBinarizer
joblib.dump(mlb, "model/mlb_model.joblib")
print("✅ MultiLabelBinarizer sauvegardé !")

# Sauvegarde du StandardScaler pour l'année
joblib.dump(scaler_year, "model/scaler_year.joblib")
print("✅ StandardScaler pour l'année sauvegardé !")

# Sauvegarde du NearestNeighbors complet
joblib.dump(nn_full, "model/nn_full.joblib")
print("✅ NearestNeighbors complet sauvegardé !")

# Sauvegarde du DataFrame complet
df.to_csv("model/movies_full.csv", index=False)
print("✅ DataFrame complet sauvegardé !")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loulo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\loulo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  self.get_booster().save_model(fname)


✅ Pipeline classification terminé ! Accuracy: 0.832 | AUC: 0.5694185487226879
✅ Modèle XGB sauvegardé !
✅ TfidfVectorizer sauvegardé !
✅ SVD sauvegardé !
✅ TF-IDF matrix complète sauvegardée !
✅ Movie index complet sauvegardé !
✅ MultiLabelBinarizer sauvegardé !
✅ StandardScaler pour l'année sauvegardé !
✅ NearestNeighbors complet sauvegardé !
✅ DataFrame complet sauvegardé !


In [11]:
# import pandas as pd
# from sqlalchemy import create_engine
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import (
#     accuracy_score,
#     roc_auc_score,
#     precision_score,
#     recall_score,
#     f1_score,
#     confusion_matrix,
#     classification_report
# )
# import xgboost as xgb
# import numpy as np
# import re
# import mlflow
# import mlflow.xgboost
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

# # =========================
# # 1️⃣ Charger les données
# # =========================
# DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
# engine = create_engine(DATABASE_URL)
# query = "SELECT title, synopsis, rating, genres, release_year FROM movies WHERE synopsis IS NOT NULL"
# df = pd.read_sql(query, engine)

# # =========================
# # 2️⃣ Nettoyage texte
# # =========================
# def preprocess_text(text: str) -> str:
#     text = text.lower()
#     text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
#     return text

# df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

# # =========================
# # 3️⃣ Like/Dislike
# # =========================
# threshold = 4.0
# df["like_dislike"] = df["rating"].apply(lambda x: 1 if x >= threshold else 0)

# # =========================
# # 4️⃣ Features : TF-IDF + genres + année
# # =========================
# vectorizer = TfidfVectorizer(max_features=2000, stop_words="english")
# tfidf_matrix = vectorizer.fit_transform(df["synopsis_clean"].fillna(""))

# df["genres_list"] = df["genres"].fillna("").apply(lambda x: x.split("|"))
# mlb = MultiLabelBinarizer()
# genres_encoded = mlb.fit_transform(df["genres_list"])

# scaler_year = StandardScaler()
# years_scaled = scaler_year.fit_transform(df[["release_year"]].fillna(df["release_year"].mean()))

# X = np.column_stack([tfidf_matrix.toarray(), genres_encoded, years_scaled])
# y = df["like_dislike"].values

# # =========================
# # 5️⃣ Train/Test split
# # =========================
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# # Calcul du scale_pos_weight après train/test split
# scale_pos_weight = y_train.tolist().count(0) / y_train.tolist().count(1)

# # =========================
# # 6️⃣ MLflow & XGBoost
# # =========================
# mlflow.set_experiment("movies_like_dislike_classif")

# with mlflow.start_run(run_name="xgb_like_dislike"):
#     # Log du seuil et du scale_pos_weight
#     mlflow.log_param("like_threshold", threshold)
#     mlflow.log_param("scale_pos_weight", scale_pos_weight)

#     # Paramètres du modèle
#     params = {
#         "n_estimators": 200,
#         "max_depth": 5,
#         "learning_rate": 0.05,
#         "random_state": 42,
#         "n_jobs": -1,
#         "use_label_encoder": False,
#         "eval_metric": "logloss",
#         "scale_pos_weight": scale_pos_weight
#     }
#     mlflow.log_params(params)

#     # Entraînement
#     model = xgb.XGBClassifier(**params)
#     model.fit(X_train, y_train)

#     # Prédictions
#     y_pred = model.predict(X_test)
#     y_prob = model.predict_proba(X_test)[:, 1]

#     # Metrics
#     acc = accuracy_score(y_test, y_pred)
#     roc = roc_auc_score(y_test, y_prob)
#     prec = precision_score(y_test, y_pred)
#     rec = recall_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     cm = confusion_matrix(y_test, y_pred)
#     report = classification_report(y_test, y_pred)

#     # Log des métriques
#     mlflow.log_metric("accuracy", acc)
#     mlflow.log_metric("roc_auc", roc)
#     mlflow.log_metric("precision", prec)
#     mlflow.log_metric("recall", rec)
#     mlflow.log_metric("f1_score", f1)

#     # Log du modèle
#     mlflow.xgboost.log_model(model, "xgb_like_dislike_model")

#     print("✅ Modèle loggé sur MLflow avec seuil et scale_pos_weight !")

# print("Accuracy:", acc)
# print("ROC AUC:", roc)
# print("Precision:", prec)
# print("Recall:", rec)
# print("F1-score:", f1)
# print("\nConfusion Matrix:\n", cm)
# print("\nClassification Report:\n", report)
