In [0]:
%pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [1]:
%pyspark

# Step 1: shut down the SparkContext behind the already-existing `spark`
# session, so that the session built below is not a reuse of the old one.
spark.sparkContext.stop()

# Step 2: build a replacement session with the desired configuration:
# application name, cluster master URL, Hive warehouse directory and the
# default Hadoop filesystem (both on the HDFS namenode), with Hive support.
session_builder = (
    SparkSession.builder
    .appName("TrafficVolume_DecisionTree")
    .master("spark://spark-master:7077")
)
for conf_key, conf_value in (
    ("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse"),
    ("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000"),
):
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.enableHiveSupport().getOrCreate()

In [2]:

%pyspark
# Read the cleaned (and encoded) traffic data from its Parquet location on HDFS.
traffic_df = spark.read.parquet(
    "hdfs://namenode:9000/traffic_volume_cleaned_encoded.parquet"
)

In [3]:


%pyspark
# Data preparation and train/test split.

# Expose the target column under the name "label" (the name the model and
# evaluators further down are configured with).
traffic_df = traffic_df.withColumnRenamed("traffic_volume", "label")

# 75 % / 25 % random split with a fixed seed.
split_weights = [0.75, 0.25]
train_df, test_df = traffic_df.randomSplit(split_weights, seed=0)

# Report the number of rows in each split (each count is a Spark action).
for caption, part_df in (("Train size:", train_df), ("Test size :", test_df)):
    print(caption, part_df.count())


In [4]:


%pyspark

# 1) Feature columns: numeric / already-encoded inputs only.
weather_measure_cols = ["temp", "rain_1h", "snow_1h"]
calendar_cols = ["Year", "Month", "Day", "Hour"]
# NOTE(review): these three are assumed to be the encoded columns - confirm
# against the upstream cleaning/encoding step.
encoded_cols = ["holiday", "weather_main", "weather_description"]
feature_cols = weather_measure_cols + calendar_cols + encoded_cols

# 2) Assembler that packs the feature columns into a single "features" column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# 3) Split traffic_df again (same weights and seed as the earlier split cell)
#    and run the assembler on each part. Splitting from traffic_df, rather than
#    reusing train_df/test_df, presumably keeps this cell re-runnable: its
#    inputs are never frames this cell has already added "features" to.
train_df, test_df = (
    assembler.transform(part_df)
    for part_df in traffic_df.randomSplit([0.75, 0.25], seed=0)
)

# 4) Sanity check: preview the assembled vector next to the label.
train_df.select("features", "label").show(5)



In [5]:

%pyspark

# 1) Create the Spark ML decision-tree regressor, reading the "features"
#    vector column and the "label" target column.
# NOTE(review): maxBins is set to 50 instead of Spark's default; presumably to
# cover the highest-cardinality encoded feature - confirm.
dt = DecisionTreeRegressor(featuresCol="features", labelCol="label", maxBins=50)

# 2) Train the model; train_df must already contain "features" and "label".
dt_model = dt.fit(train_df)

# 3) Predict on the test set.
#    The result is consumed by four separate Spark actions below (one show and
#    three evaluate calls). Without persistence each action would recompute the
#    whole lineage, so the predictions are cached once and reused.
predictions = dt_model.transform(test_df).cache()
predictions.select("label", "prediction").show(10)

# 4) Evaluate the model: one evaluator per metric, all comparing the "label"
#    column against the "prediction" column.
evaluator_rmse = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse"
)
evaluator_r2 = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="r2"
)
evaluator_mae = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="mae"
)

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)

# Print the three metrics.
print("R Squared (R²)        :", r2)
print("Mean Absolute Error    :", mae)
print("Root Mean Squared Error:", rmse)


In [6]:
%pyspark
# Create the Hive database and metrics table if they are missing, and make
# traffic_ml the session's current database.
for ddl_statement in (
    "CREATE DATABASE IF NOT EXISTS traffic_ml",
    "USE traffic_ml",
    "CREATE TABLE IF NOT EXISTS model_metrics (model_name STRING, rmse DOUBLE, r2 DOUBLE, mae DOUBLE)",
):
    spark.sql(ddl_statement)

# Record this model's metrics. INSERT INTO appends, so every execution of this
# cell adds one more 'DecisionTree' row to the table.
insert_metrics_sql = f"""
    INSERT INTO model_metrics VALUES (
        'DecisionTree',
        {rmse},
        {r2},
        {mae}
    )
"""
spark.sql(insert_metrics_sql)

In [7]:
%pyspark
# Display all rows currently stored in the model-metrics table.
metrics_df = spark.sql("SELECT * FROM traffic_ml.model_metrics")
metrics_df.show()