In [0]:
%pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sin, cos, when, sqrt, abs as spark_abs
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np
from pyspark.sql.functions import (
    col, when, sin, cos
)

In [1]:

%pyspark
# Step 1: stop the SparkContext behind the session currently bound to `spark`.
# NOTE(review): no visible cell assigns `spark` before this line; presumably the
# interpreter pre-defines it — confirm before running on a fresh kernel.
spark.sparkContext.stop()

# Step 2: build a new session with the cluster master, HDFS and Hive settings.
spark_conf = {
    "spark.sql.warehouse.dir": "hdfs://namenode:9000/user/hive/warehouse",
    "spark.hadoop.fs.defaultFS": "hdfs://namenode:9000",
}

builder = SparkSession.builder.appName("UrbanTrafficPrediction_GBT")
builder = builder.master("spark://spark-master:7077")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

spark = builder.enableHiveSupport().getOrCreate()

# Only log messages at ERROR level or above from here on.
spark.sparkContext.setLogLevel("ERROR")


In [2]:

%pyspark

# Feature engineering: derive calendar, cyclical, polynomial, interaction and
# threshold features from the raw columns of `df`, then list the model inputs.
#
# NOTE(review): `df` is not assigned in any visible cell above; this cell
# presumably relies on a DataFrame loaded elsewhere — confirm.


def _flag(condition):
    """Return an integer column that is 1 where `condition` is true, else 0."""
    return when(condition, 1).otherwise(0)


# Angles used by the sine/cosine encodings (period 24 for Hour, 7 for Day,
# 12 for Month). The "2nd" hour angle doubles the frequency.
hour_angle = 2 * np.pi * col("Hour") / 24
hour_angle_2nd = 4 * np.pi * col("Hour") / 24
day_angle = 2 * np.pi * col("Day") / 7
month_angle = 2 * np.pi * col("Month") / 12

# Ordered (column name, expression) pairs. Order matters: the interaction
# features further down read is_peak_hour / is_weekend, which must already
# have been added to the DataFrame when their withColumn call runs.
derived_columns = [
    # Peak hours: 7-9 or 16-19 (both bounds inclusive).
    ("is_peak_hour", _flag(col("Hour").between(7, 9) | col("Hour").between(16, 19))),
    # Weekend: Day equal to 5 or 6.
    # NOTE(review): assumes Day is a 0-based day-of-week with Monday = 0 — confirm.
    ("is_weekend", _flag(col("Day").isin([5, 6]))),
    # Cyclical encodings - hour
    ("hour_sin", sin(hour_angle)),
    ("hour_cos", cos(hour_angle)),
    ("hour_sin2", sin(hour_angle_2nd)),
    ("hour_cos2", cos(hour_angle_2nd)),
    # Cyclical encodings - day
    ("day_sin", sin(day_angle)),
    ("day_cos", cos(day_angle)),
    # Cyclical encodings - month
    ("month_sin", sin(month_angle)),
    ("month_cos", cos(month_angle)),
    # Quadratic terms
    ("Hour_sq", col("Hour") ** 2),
    ("temp_sq", col("temp") ** 2),
    ("rain_sq", col("rain_1h") ** 2),
    # Time interactions
    ("hour_peak", col("Hour") * col("is_peak_hour")),
    ("hour_day", col("Hour") * col("Day")),
    ("hour_temp", col("Hour") * col("temp")),
    # Weather interactions
    ("temp_peak", col("temp") * col("is_peak_hour")),
    ("temp_rain", col("temp") * col("rain_1h")),
    ("temp_snow", col("temp") * col("snow_1h")),
    # Period interaction
    ("holiday_weekend", col("holiday") * col("is_weekend")),
    # Extreme conditions (thresholds: temp < 0, temp > 25, rain_1h > 5)
    ("is_cold", _flag(col("temp") < 0)),
    ("is_hot", _flag(col("temp") > 25)),
    ("heavy_rain", _flag(col("rain_1h") > 5)),
]

for new_name, expression in derived_columns:
    df = df.withColumn(new_name, expression)

# Full feature list: the raw input columns followed by every derived column,
# in the order they were created.
# NOTE(review): VectorAssembler accepts numeric, boolean and vector columns;
# "weather_description" presumably is already numerically encoded — confirm.
raw_feature_cols = [
    "temp", "rain_1h", "snow_1h",
    "Hour", "Day", "Month", "Year",
    "holiday", "weather_description",
]
feature_cols = raw_feature_cols + [name for name, _ in derived_columns]

print(f"\nNombre de features: {len(feature_cols)}")

In [3]:

%pyspark
# Pack every column named in feature_cols into one vector column, "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_assembled = assembler.transform(df)

# Keep only what the model needs: the feature vector and the target,
# with traffic_volume renamed to "label".
label_column = F.col("traffic_volume").alias("label")
df_final = df_assembled.select("features", label_column)

# Train/test split with weights 0.8 / 0.2 and a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = df_final.randomSplit(split_weights, seed=42)

# Report the size of each split (each count() is a Spark action).
train_size = train_data.count()
print(f"\nTaille train: {train_size}")
test_size = test_data.count()
print(f"Taille test: {test_size}")

In [4]:

%pyspark
# Train a gradient-boosted trees regressor and score the test split.
gbt_params = {
    # Boosting setup
    "maxIter": 150,
    "stepSize": 0.05,
    "subsamplingRate": 0.8,
    # Tree shape
    "maxDepth": 7,
    "minInstancesPerNode": 2,
    "maxBins": 64,
    # Reproducibility
    "seed": 42,
    # Column wiring
    "featuresCol": "features",
    "labelCol": "label",
    "predictionCol": "prediction",
}
gbt = GBTRegressor(**gbt_params)

print("\nEntraînement du modèle en cours...")
model = gbt.fit(train_data)

# Add a "prediction" column to the test rows.
predictions = model.transform(test_data)

In [5]:

%pyspark
# Evaluate the GBT model on the test-set predictions.
print("\nÉchantillon de prédictions:")
predictions.select("label", "prediction").show(10)

# One evaluator per metric (R², RMSE, MAE); all compare "label" with "prediction".
evaluator_r2, evaluator_rmse, evaluator_mae = (
    RegressionEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("r2", "rmse", "mae")
)

# Compute the three metrics, in the order r2, rmse, mae.
r2, rmse, mae = (
    evaluator.evaluate(predictions)
    for evaluator in (evaluator_r2, evaluator_rmse, evaluator_mae)
)

# Summary banner.
separator = "=" * 60
print(f"\n{separator}")
print("RÉSULTATS DU MODÈLE")
print(separator)
print(f"R² Score : {r2:.4f}")
print(f"RMSE     : {rmse:.2f}")
print(f"MAE      : {mae:.2f}")
print(separator)


In [6]:

%pyspark
# Create the Hive database and metrics table if they do not exist yet, then
# append this model's metrics as one row and display the table.
spark.sql("CREATE DATABASE IF NOT EXISTS traffic_ml")
spark.sql("USE traffic_ml")
spark.sql("CREATE TABLE IF NOT EXISTS model_metrics (model_name STRING, rmse DOUBLE, r2 DOUBLE, mae DOUBLE)")

# Build the row as a typed DataFrame instead of interpolating the numbers into
# SQL text: a metric equal to nan/inf would be rendered as a bare word in the
# statement and make the INSERT fail, whereas DOUBLE columns store it as-is.
metrics_row = spark.createDataFrame(
    [("Gradient-Boosted Trees", float(rmse), float(r2), float(mae))],
    schema="model_name STRING, rmse DOUBLE, r2 DOUBLE, mae DOUBLE",
)

# insertInto matches columns by position, so the tuple above follows the table
# definition (model_name, rmse, r2, mae). It appends: every run of this cell
# adds one more row, exactly like the previous INSERT statement did.
metrics_row.write.insertInto("traffic_ml.model_metrics")

# Show every metrics row recorded so far.
spark.sql("SELECT * FROM traffic_ml.model_metrics").show()

In [7]:

%pyspark
import pandas as pd

# Feature importances of the trained model, converted to a NumPy array.
feature_importances = model.featureImportances.toArray()

# Pair each feature name with its importance and rank from highest to lowest.
# NOTE(review): assumes the importance vector has one entry per name in
# feature_cols (i.e. every assembled column is a scalar) — confirm.
importance_table = {
    "Feature": feature_cols,
    "Importance": feature_importances,
}
importance_df = pd.DataFrame(importance_table)
importance_df = importance_df.sort_values("Importance", ascending=False)

# Print the 15 highest-ranked features.
print("\nTop 15 features importantes:")
top_15 = importance_df.head(15)
print(top_15)

In [8]:

%pyspark
