In [0]:
%pyspark
# ---------------------------
# Import des librairies
# ---------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, variance
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [1]:
%pyspark
# ---------------------------
# Create / reset the SparkSession
# ---------------------------

# If a session object named `spark` already exists in this interpreter,
# stop it first (presumably so the builder settings below take effect on a
# fresh session instead of being ignored by getOrCreate() -- TODO confirm).
if 'spark' in globals():
    spark.stop()

# Build a Hive-enabled session against the standalone cluster master, using
# HDFS both as the default filesystem and as the SQL warehouse location.
spark = (
    SparkSession.builder
    .appName("TrafficVolume_Regression")
    .master("spark://spark-master:7077")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse")
    .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext of the new session.
sc = spark.sparkContext

In [2]:
%pyspark
# Quick look at the stored model metrics table (prints to the cell output).
model_metrics_preview = spark.sql("SELECT * FROM traffic_ml.model_metrics")
model_metrics_preview.show()

In [3]:
%pyspark
%matplotlib inline
# ---------------------------
# Metrics visualisation: load the data
# ---------------------------
# Read the whole metrics table through Spark SQL, then collect it to the
# driver as a pandas DataFrame (`data`) that the plotting cells below use.
metrics_df = spark.sql(
    "SELECT * FROM traffic_ml.model_metrics"
)
data = metrics_df.toPandas()



In [4]:
%pyspark
%matplotlib inline
# ---- Bar chart: RMSE per model ----
# One bar per row of `data`, labelled by `model_name`, height = `rmse`.
fig, ax = plt.subplots(figsize=(8,5))
ax.bar(data['model_name'], data['rmse'], color='skyblue')
ax.set_title("RMSE par modèle")
ax.set_ylabel("RMSE")
ax.set_xlabel("Modèle")
plt.show()

In [5]:
%pyspark
%matplotlib inline
# ---- Bar chart: MAE per model ----
# One bar per row of `data`, labelled by `model_name`, height = `mae`.
fig, ax = plt.subplots(figsize=(8,5))
ax.bar(data['model_name'], data['mae'], color='lightgreen')
ax.set_title("MAE par modèle")
ax.set_ylabel("MAE")
ax.set_xlabel("Modèle")
plt.show()

In [6]:
%pyspark
%matplotlib inline
# ---- Bar chart: R² per model ----
# One bar per row of `data`, labelled by `model_name`, height = `r2`.
plt.figure(figsize=(8,5))
plt.bar(data['model_name'], data['r2'], color='salmon')
plt.title("R² par modèle")
plt.ylabel("R²")
plt.xlabel("Modèle")
# R² is bounded above by 1 but can be negative when a model fits worse than
# the mean predictor. A fixed (0, 1) range would clip such bars out of view,
# so extend the lower limit down to the smallest observed value if needed.
# (min(0.0, nan) evaluates to 0.0, so an empty / all-NaN column keeps (0, 1).)
r2_floor = min(0.0, float(data['r2'].min()))
plt.ylim(r2_floor, 1)
if r2_floor < 0:
    # Mark the zero baseline so negative bars are easy to read.
    plt.axhline(0, color='black', linewidth=0.8)
plt.show()

In [7]:
%pyspark
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# ---- Combined chart: RMSE / MAE as grouped bars, R² as a line ----
labels = data['model_name']
rmse = data['rmse']
mae = data['mae']
r2 = data['r2']
x = np.arange(len(labels))  # one x position per model
width = 0.35                # width of each bar within a model's group

fig, ax1 = plt.subplots(figsize=(10,5))

# Primary axis: RMSE and MAE drawn side by side around each x position
ax1.bar(x - width/2, rmse, width, label='RMSE', color='tab:blue')
ax1.bar(x + width/2, mae, width, label='MAE', color='tab:orange')
ax1.set_xlabel('Modèle')
ax1.set_ylabel('RMSE / MAE', color='black')
ax1.set_xticks(x)
ax1.set_xticklabels(labels)
ax1.legend(loc='upper left')

# Secondary axis (shared x) for R², which lives on a different scale
ax2 = ax1.twinx()
ax2.plot(x, r2, color='tab:green', marker='o', linewidth=2, markersize=8, label='R²')
ax2.set_ylabel('R²', color='tab:green')
ax2.tick_params(axis='y', labelcolor='tab:green')
# R² is bounded above by 1 but can be negative for a model that fits worse
# than the mean predictor. A fixed (0, 1) range would push such points off
# the axis, so lower the bottom limit to the smallest observed value if needed.
# (min(0.0, nan) evaluates to 0.0, so an empty / all-NaN column keeps (0, 1).)
r2_floor = min(0.0, float(r2.min()))
ax2.set_ylim(r2_floor, 1)
ax2.legend(loc='upper right')

plt.title("Comparaison des métriques par modèle")
plt.tight_layout()
plt.show()

In [8]:
%pyspark
%matplotlib inline
# ---- Heatmap of the metrics ----
# Index the frame by model name and transpose it so that each remaining
# column of `data` becomes a heatmap row and each model becomes a column.
fig, ax = plt.subplots(figsize=(8,4))
metrics_by_model = data.set_index('model_name').T
sns.heatmap(metrics_by_model, annot=True, fmt=".2f", cmap="YlGnBu", ax=ax)
ax.set_title("Heatmap des métriques par modèle")
plt.show()