In [19]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session with adaptive query execution
# switched on through the spark.sql.adaptive.enabled setting.
spark = (
    SparkSession.builder
    .appName("NYC_Taxi_EDA")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

# Load the silver trips dataset from parquet and print a preview of its rows.
df_silver = spark.read.parquet("../data/silver/trips_selected_parquet")
df_silver.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+------------------+-----------+-----------+-----+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|cbd_congestion_fee|     trip_duration|pickup_hour|day_of_week|month|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+------------------+-----------+--

In [20]:
# Print summary statistics for the columns of the silver dataset.
df_silver.describe().show()



+-------+-------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-------------------+------------------+------------------+--------------------+------------------+------------------+---------------------+------------------+--------------------+-------------------+-------------------+--------------------+------------------+------------------+--------------------+
|summary|           VendorID|   passenger_count|     trip_distance|         RatecodeID|store_and_fwd_flag|      PULocationID|     DOLocationID|       payment_type|       fare_amount|             extra|             mta_tax|        tip_amount|      tolls_amount|improvement_surcharge|      total_amount|congestion_surcharge|        Airport_fee| cbd_congestion_fee|       trip_duration|       pickup_hour|       day_of_week|               month|
+-------+-------------------+------------------+------------------+-------------------+------------------+--------

                                                                                

In [21]:

# Columns retained for modelling: nine predictors followed by trip_duration.
features_to_keep = [
    "trip_distance", "RatecodeID", "tolls_amount",
    "fare_amount", "tip_amount", "total_amount",
    "Airport_fee", "pickup_hour", "day_of_week",
    "trip_duration",  # last entry is excluded from the feature count printed below
]

# The "- 1" removes trip_duration from the reported number of features.
print(f"\n✅ Features sélectionnées: {len(features_to_keep) - 1}")
# count() is a Spark action: it scans the dataset to obtain the row total.
print(f"✅ Dataset prêt avec {df_silver.count():,} lignes")


✅ Features sélectionnées: 9
✅ Dataset prêt avec 2,607,083 lignes


In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, round as spark_round
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from datetime import datetime
import json
import pickle


In [23]:
# Predictor columns passed to the VectorAssembler stages; the label is named
# separately in target_column.
# NOTE(review): fare_amount, tip_amount and total_amount are presumably only
# known once a trip has ended — confirm they are available at prediction time,
# otherwise they leak information about trip_duration into the model.
feature_columns = [
    "trip_distance",
    "RatecodeID",
    "tolls_amount",
    "fare_amount",
    "tip_amount",
    "total_amount",
    "Airport_fee",
    "pickup_hour",
    "day_of_week",
]
target_column = "trip_duration"

# Modelling frame: the predictors plus the label, nothing else.
df_Gold = df_silver.select(*feature_columns, target_column)

In [24]:
# Split the modelling frame 80/20 with a fixed seed so the partition is
# reproducible.
#
# Each split is marked for caching *before* it is counted. cache() is lazy, so
# the count() actions below are what actually materialize the data; every later
# stage (fit, transform, evaluate) then reuses the stored partitions instead of
# re-reading the parquet source and re-running the sampling for each action.
train_data, test_data = (
    split.cache() for split in df_Gold.randomSplit([0.8, 0.2], seed=42)
)
print(f"Nombre d'exemples d'entraînement: {train_data.count()}")
print(f"Nombre d'exemples de test: {test_data.count()}")

                                                                                

Nombre d'exemples d'entraînement: 2086061




Nombre d'exemples de test: 521022


                                                                                

In [25]:
# Mark both splits for caching so repeated actions can reuse them.
# cache() returns the DataFrame itself, so the cell displays the schema of
# test_data (the last expression) as its output.
train_data.cache()
test_data.cache()

DataFrame[trip_distance: double, RatecodeID: int, tolls_amount: double, fare_amount: double, tip_amount: double, total_amount: double, Airport_fee: double, pickup_hour: int, day_of_week: int, trip_duration: double]

In [26]:
# Pack the predictor columns into a single vector column named "features_raw",
# which the scaler stage consumes.
assembler = VectorAssembler(outputCol="features_raw", inputCols=feature_columns)

In [27]:
# Rescale the assembled vector to unit standard deviation without centering
# (withStd=True, withMean=False); the result is written to "features".
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    outputCol="features",
    inputCol="features_raw",
)

In [28]:
# Disabled linear-regression configuration, kept for reference only:
# lr = LinearRegression(
#     featuresCol='features',
#     labelCol=target_column,
#     maxIter=10,
#     regParam=0.3,
#     elasticNetParam=0.8
# )

# Active estimator: a random forest of 50 trees, each at most 10 levels deep,
# trained on the scaled "features" vector with a fixed seed.
rf = RandomForestRegressor(
    labelCol=target_column,
    featuresCol="features",
    maxDepth=10,
    numTrees=50,
    seed=42,
)

In [29]:
# Chain assembling, scaling and the estimator into one pipeline.
# NOTE: despite the "_lr" suffix, the final stage is the RandomForestRegressor `rf`.
pipeline_lr = Pipeline(stages=[assembler, scaler, rf])

In [30]:
# Fit every pipeline stage on the training split; returns the fitted pipeline model.
model_lr = pipeline_lr.fit(train_data)

26/01/11 14:25:52 WARN MemoryStore: Not enough space to cache rdd_125_2 in memory! (computed 94.4 MiB so far)
26/01/11 14:25:52 WARN BlockManager: Persisting block rdd_125_2 to disk instead.
26/01/11 14:25:52 WARN MemoryStore: Not enough space to cache rdd_125_1 in memory! (computed 94.4 MiB so far)
26/01/11 14:25:52 WARN BlockManager: Persisting block rdd_125_1 to disk instead.
26/01/11 14:25:52 WARN MemoryStore: Not enough space to cache rdd_125_0 in memory! (computed 94.4 MiB so far)
26/01/11 14:25:52 WARN BlockManager: Persisting block rdd_125_0 to disk instead.
26/01/11 14:26:02 WARN MemoryStore: Not enough space to cache rdd_125_0 in memory! (computed 94.4 MiB so far)
26/01/11 14:26:03 WARN MemoryStore: Not enough space to cache rdd_125_1 in memory! (computed 144.8 MiB so far)
26/01/11 14:26:10 WARN MemoryStore: Not enough space to cache rdd_125_0 in memory! (computed 94.4 MiB so far)
26/01/11 14:26:11 WARN MemoryStore: Not enough space to cache rdd_125_1 in memory! (computed 144

In [31]:
# Apply the fitted pipeline to the held-out split; the evaluators read its 'prediction' column.
predictions_lr = model_lr.transform(test_data)

In [32]:
# One evaluator per regression metric (MAE, RMSE, R²), all comparing the
# 'prediction' column against the target column.
evaluator_mae, evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(
        metricName=metric,
        predictionCol='prediction',
        labelCol=target_column,
    )
    for metric in ('mae', 'rmse', 'r2')
)

In [33]:
# Score the test-set predictions with each evaluator, in MAE, RMSE, R² order.
mae_lr, rmse_lr, r2_lr = (
    metric_evaluator.evaluate(predictions_lr)
    for metric_evaluator in (evaluator_mae, evaluator_rmse, evaluator_r2)
)

                                                                                

In [34]:
# Report the hold-out metrics of the model fitted from `pipeline_lr`.
# That pipeline ends in a RandomForestRegressor, so the heading names that
# model; the variables simply keep their historical "_lr" suffix.
print("\n✅ RÉSULTATS - Random Forest:")
print(f"   MAE:  {mae_lr:.2f} minutes")
print(f"   RMSE: {rmse_lr:.2f} minutes")
print(f"   R²:   {r2_lr:.4f}")


✅ RÉSULTATS - Régression Linéaire:
   MAE:  1.10 minutes
   RMSE: 1.77 minutes
   R²:   0.9360


### GBTRegressor

In [35]:
# NOTE(review): this rebinds the notebook-level name `assembler` to a new
# assembler that writes straight to "features" (no scaling stage here).
# Re-running an earlier cell that builds a pipeline from `assembler` after this
# one would pick up this object instead — consider a distinct name.
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)

# Gradient-boosted trees regressor trained on the unscaled feature vector.
gbt = GBTRegressor(
    labelCol=target_column,
    featuresCol="features",
    maxIter=50,           # number of boosting iterations (trees)
    maxDepth=6,           # tree depth (deeper = slower to train)
    stepSize=0.1,         # learning rate
    subsamplingRate=0.8,  # fraction of the training data used per tree
    seed=42,
)

# Assemble -> GBT; fit on the training split, then predict on the test split.
pipeline_gbt = Pipeline(stages=[assembler, gbt])
model_gbt = pipeline_gbt.fit(train_data)
predictions_gbt = model_gbt.transform(test_data)

                                                                                

In [36]:
# Rebuild the three metric evaluators (MAE, RMSE, R²) for the GBT predictions;
# each compares the "prediction" column with the target column.
evaluator_mae, evaluator_rmse, evaluator_r2 = (
    RegressionEvaluator(
        metricName=metric,
        predictionCol="prediction",
        labelCol=target_column,
    )
    for metric in ("mae", "rmse", "r2")
)

# Each evaluate() call runs its own pass over the predictions.
print("\n✅ RÉSULTATS - GBT")
print(f"MAE  : {evaluator_mae.evaluate(predictions_gbt):.2f}")
print(f"RMSE : {evaluator_rmse.evaluate(predictions_gbt):.2f}")
print(f"R²   : {evaluator_r2.evaluate(predictions_gbt):.4f}")



✅ RÉSULTATS - GBT


26/01/11 14:34:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

MAE  : 0.99


                                                                                

RMSE : 1.63




R²   : 0.9456


                                                                                