In [0]:
%pyspark
# Importation des bibliothèques
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, variance
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [1]:


%pyspark
# Build the Hive-enabled SparkSession used by every following cell.
# The master URL, warehouse directory and default filesystem are fixed to the
# cluster endpoints below.
spark = (
    SparkSession.builder
    .appName("RandomForest")
    .master("spark://spark-master:7077")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .enableHiveSupport()
    .getOrCreate()
)




In [2]:
%pyspark
# Load the traffic dataset from its parquet file on HDFS.
# NOTE(review): the file name suggests the data is already cleaned and
# encoded upstream — confirm against the job that produces it.
parquet_path = "hdfs://namenode:9000/traffic_volume_cleaned_encoded.parquet"
traffic_df = spark.read.parquet(parquet_path)

In [3]:


%pyspark

# Target column; every other column of the dataset is used as a model input.
label_col = "traffic_volume"
# The loop variable is intentionally not called `col`: that name is the
# imported pyspark.sql.functions.col, which is called two statements below.
feature_cols = [name for name in traffic_df.columns if name != label_col]

# Pack the input columns into a single vector column named "features" and
# keep only that vector plus the target, renamed to "label".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(traffic_df).select("features", col(label_col).alias("label"))


In [4]:

%pyspark
# Split the assembled data into training (75%) and test (25%) sets.
# The seed is fixed so the split is the same on every run.
split_weights = [0.75, 0.25]
train_df, test_df = data.randomSplit(split_weights, seed=0)

In [5]:


%pyspark
# Train a random forest regressor and score the held-out test set.
# The seed is pinned (same value as the train/test split) so that the
# bootstrap sampling and feature sub-sampling are repeatable between runs.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label",
    numTrees=100,
    maxDepth=5,
    maxBins=50,
    seed=0,
)
rf_model = rf.fit(train_df)

# Score the test set. `predictions` is consumed by several later actions
# (the preview below, each evaluation metric, the mean/variance comparison),
# so it is cached to avoid re-running the forest for every one of them.
predictions = rf_model.transform(test_df).cache()
predictions.select("label", "prediction").show(5)



In [6]:


%pyspark
# Score the test-set predictions with the usual regression metrics.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# One evaluate() call per metric, in the order r2, mae, mse, rmse. The metric
# is supplied as a per-call param override, so `evaluator` itself is not
# reconfigured.
r2, mae, mse, rmse = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in ("r2", "mae", "mse", "rmse")
)

print(f"R Squared : {r2}")
print(f"Mean Absolute Error : {mae}")
print(f"Mean Squared Error : {mse}")
print(f"Root Mean Squared Error : {rmse}")


In [7]:


%pyspark
# Hyper-parameter search: 3-fold cross-validation over tree depth and number
# of trees (4 x 3 = 12 candidate settings), fitted on the training set only.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [3, 5, 7, 9])
    .addGrid(rf.numTrees, [5, 10, 15])
    .build()
)

# The seed fixes how rows are assigned to folds; without it the selected
# "best" parameters can change from one run to the next.
# NOTE(review): candidates are ranked with `evaluator` as configured when it
# was created (no explicit metricName, so presumably the RegressionEvaluator
# default) — confirm this is the metric you want to optimise.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=0,
)

cv_model = cv.fit(train_df)
best_model = cv_model.bestModel

print("Meilleurs paramètres :")
print(f"Max Depth : {best_model.getMaxDepth()}")
# `getNumTrees` is read without parentheses on purpose: PySpark exposes it as
# a property on tree-ensemble models (verify for the Spark version in use).
print(f"Num Trees : {best_model.getNumTrees}")


In [8]:


%pyspark
# Compare the mean ("bias") and the sample variance of the actual and the
# predicted values on the test set.
# `predictions` is the scored test set and still carries the "label" column,
# so all four statistics are computed in one aggregation (a single Spark job)
# instead of four separate select/collect round-trips.
moments_row = predictions.agg(
    mean("label").alias("mean_actual"),
    mean("prediction").alias("mean_pred"),
    variance("label").alias("var_actual"),
    variance("prediction").alias("var_pred"),
).first()
mean_actual, mean_pred, var_actual, var_pred = moments_row

print("Bias Error")
print(f"Actual value : {mean_actual}")
print(f"Predicted value : {mean_pred}")

print("Variance Error")
print(f"Actual value : {var_actual}")
print(f"Predicted value : {var_pred}")


In [9]:

%pyspark
# Create the Hive database and metrics table if they do not exist yet, then
# append one row holding this run's RandomForest metrics.
spark.sql("CREATE DATABASE IF NOT EXISTS traffic_ml")
spark.sql("USE traffic_ml")
spark.sql("CREATE TABLE IF NOT EXISTS model_metrics (model_name STRING, rmse DOUBLE, r2 DOUBLE, mae DOUBLE)")

# Build the row as a typed DataFrame rather than interpolating the numbers
# into SQL text: a metric that comes back as nan/inf would otherwise be
# pasted into the statement as a bare word and fail to parse.
metrics_row = spark.createDataFrame(
    [("RandomForest", float(rmse), float(r2), float(mae))],
    schema="model_name STRING, rmse DOUBLE, r2 DOUBLE, mae DOUBLE",
)
# insertInto resolves columns by position, so the tuple above must follow the
# column order of the CREATE TABLE statement. Each execution of this cell
# appends a new row.
metrics_row.write.mode("append").insertInto("traffic_ml.model_metrics")

In [10]:
%pyspark
# Display every row stored in the model-metrics table.
metrics_query = "SELECT * FROM traffic_ml.model_metrics"
spark.sql(metrics_query).show()