# Entraînement et Évaluation du Modèle ALS

Ce notebook couvre :
1. Lecture des données nettoyées  
2. Construction du jeu train/test  
3. Recherche des hyperparamètres (RMSE, MAE)  
4. Évaluation des métriques de ranking (Precision@10, Recall@10, MAP@10, NDCG@10)  
5. Sauvegarde du meilleur modèle dans HDFS  


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql.functions import expr, collect_list

# Build (or reuse) the notebook's Spark session: application name
# "ALS_Training", master "local[*]", with the driver memory, driver result-size
# cap and default Hadoop filesystem set below.
session_conf = {
    "spark.driver.memory": "4g",
    "spark.driver.maxResultSize": "1g",
    "spark.hadoop.fs.defaultFS": "hdfs://namenode:9000",
}

session_builder = SparkSession.builder.appName("ALS_Training").master("local[*]")
for conf_key, conf_value in session_conf.items():
    # Apply the settings one by one, in the order they are declared above.
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()


In [None]:
# Load the CSV ratings produced by the batch ETL job and keep only the three
# columns the recommender needs.
ratings_path = "hdfs://namenode:9000/movielens/processed/batch/ratings_csv"
raw_ratings = spark.read.csv(ratings_path, header=True, inferSchema=True)
ratings = raw_ratings.select("userId", "movieId", "rating")

# Quick sanity check: total row count and a preview of the first rows.
total_interactions = ratings.count()
print(f"Total interactions : {total_interactions}")
ratings.show(5, truncate=False)


In [None]:
# Hold out 20% of the interactions for evaluation; the split is seeded so it
# can be repeated.
train, test = ratings.randomSplit([0.8, 0.2], seed=42)

# Both splits are reused many times afterwards (counts, every ALS fit, every
# evaluation). Cache them so the CSV source is not re-read and re-parsed from
# HDFS on each of those actions; the counts below materialise the cache.
train = train.cache()
test = test.cache()
print(f"▷ Train: {train.count()} lignes  •  Test: {test.count()} lignes")


In [None]:
# Exhaustive grid search over ALS hyperparameters. Each combination is fitted
# on `train`, scored on `test` with RMSE and MAE, and the model with the lowest
# RMSE is kept in `best_model` (its settings in `best_params`).
import itertools

import pandas as pd

ranks = [10, 20, 30]
regs = [0.01, 0.1]
iters = [5, 10]
best_rmse = float("inf")
best_model = None
best_params = None
results = []

evaluator_rmse = RegressionEvaluator(metricName="rmse",
                                     labelCol="rating",
                                     predictionCol="prediction")
evaluator_mae = RegressionEvaluator(metricName="mae",
                                    labelCol="rating",
                                    predictionCol="prediction")

# NOTE(review): the winning model is selected on `test` itself, so the scores
# reported for it are optimistic. Consider carving a validation split out of
# `train` for model selection and keeping `test` for the final report.
# product() iterates rank (outermost), then reg, then n, i.e. the same order as
# three nested loops.
for rank, reg, n in itertools.product(ranks, regs, iters):
    als = ALS(userCol="userId",
              itemCol="movieId",
              ratingCol="rating",
              coldStartStrategy="drop",
              rank=rank,
              regParam=reg,
              maxIter=n,
              seed=42)  # fixed seed: factor initialisation is otherwise random
    model = als.fit(train)

    # The predictions feed two evaluators; cache them so the transform is not
    # recomputed for the second metric, then release the memory.
    preds = model.transform(test).cache()
    rmse = evaluator_rmse.evaluate(preds)
    mae = evaluator_mae.evaluate(preds)
    preds.unpersist()

    results.append((rank, reg, n, rmse, mae))
    print(f"rank={rank}  reg={reg}  iter={n}  →  RMSE={rmse:.4f}, MAE={mae:.4f}")
    if rmse < best_rmse:
        best_rmse, best_model, best_params = rmse, model, (rank, reg, n)

# A NaN RMSE never compares as smaller, so an all-NaN grid would leave
# best_model unset; fail here with a clear message instead of later on None.
if best_model is None:
    raise RuntimeError("Grid search did not select any model (no finite RMSE).")

# Summarise the grid; the last expression displays the five best runs by RMSE.
df_res = pd.DataFrame(results, columns=["rank", "regParam", "iter", "rmse", "mae"])
df_res.sort_values("rmse").head(5)


In [None]:
# Ranking evaluation of the selected model at a single cutoff.
top_k = 10

# Top-k recommended movie ids for every user, in the order returned by the
# model.
recs = (
    best_model.recommendForAllUsers(top_k)
    .select("userId",
            expr("transform(recommendations, x -> x.movieId) as pred"))
)

# Ground truth: every movie the user interacted with in the test split.
# NOTE(review): all test interactions count as relevant regardless of the
# rating value; consider filtering on a rating threshold. Also confirm whether
# movies already rated in `train` should be removed from `recs` before scoring.
actual = (
    test.groupBy("userId")
    .agg(collect_list("movieId").alias("actual"))
)

# RankingMetrics expects an RDD of (predicted list, ground-truth list) pairs.
# The inner join keeps only users present on both sides.
pred_and_labels = (
    recs.join(actual, "userId")
    .select("pred", "actual")
    .rdd.map(lambda r: (r.pred, r.actual))
)

metrics = RankingMetrics(pred_and_labels)
print(f"Precision@{top_k} : {metrics.precisionAt(top_k):.4f}")
print(f"Recall@{top_k}    : {metrics.recallAt(top_k):.4f}")
# Use the cutoff variant so the value matches its "MAP@k" label;
# `meanAveragePrecision` is the uncut MAP.
print(f"MAP@{top_k}       : {metrics.meanAveragePrecisionAt(top_k):.4f}")
print(f"NDCG@{top_k}      : {metrics.ndcgAt(top_k):.4f}")


In [None]:
# Persist the selected model to HDFS, overwriting any model already stored at
# that location.
model_path = "hdfs://namenode:9000/movielens/models/als_best"
model_writer = best_model.write().overwrite()
model_writer.save(model_path)
print("✅ Modèle sauvegardé dans HDFS : /movielens/models/als_best")
