In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

pd.set_option('display.float_format', lambda x: f'{x:,.3f}')


In [2]:
traf = pd.read_csv("../cleaned dataset/traffic_daily_cleaned_member1_v2.csv")
wth  = pd.read_csv("../cleaned dataset/weather_daily_cleaned_member2_v2.csv")

# Make sure both dates are same type
traf["date"] = pd.to_datetime(traf["date"], errors="coerce").dt.date
wth["date"]  = pd.to_datetime(wth["date"],  errors="coerce").dt.date

print("Traffic rows:", len(traf), " | Weather rows:", len(wth))
print("Traffic date range:", traf["date"].min(), "to", traf["date"].max())
print("Weather date range:", wth["date"].min(), "to", wth["date"].max())


Traffic rows: 177  | Weather rows: 181
Traffic date range: 2023-01-03 to 2023-06-30
Weather date range: 2023-01-01 to 2023-06-30


In [3]:
df_merged = (
    traf.merge(wth, on="date", how="inner")
        .sort_values("date")
        .reset_index(drop=True)
)

print("NEW merged shape:", df_merged.shape)   # EXPECT ~ (177, 13)
df_merged.head()


NEW merged shape: (177, 13)


Unnamed: 0,date,daily_flow_total,daily_flow_mean,daily_cong_mean,daily_dsat_mean,num_records,rain_mm,tmax_c,tmin_c,tmean,wind_speed_knots,sunshine_hours,soil_temp_c
0,2023-01-03,649194,153.765,0.208,27.207,4222,1.5,13.3,5.6,9.45,12.5,0.0,6.125
1,2023-01-04,551602,90.19,0.173,21.307,6116,0.1,12.6,8.8,10.7,13.7,0.5,8.2
2,2023-01-05,464146,75.903,0.309,24.752,6115,2.7,12.6,6.7,9.65,10.3,0.0,8.025
3,2023-01-06,860701,140.937,0.49,25.563,6107,2.3,11.5,5.6,8.55,9.7,0.2,6.325
4,2023-01-07,713575,116.388,0.295,21.667,6131,1.9,10.7,6.4,8.55,12.1,5.2,6.675


In [4]:
out_path = "../cleaned dataset/merged_traffic_weather_main_dataset_clean.csv"
df_merged.to_csv(out_path, index=False)
print("Overwritten merged file:", out_path)


Overwritten merged file: ../cleaned dataset/merged_traffic_weather_main_dataset_clean.csv


In [5]:
df = pd.read_csv("../cleaned dataset/merged_traffic_weather_main_dataset_clean.csv")
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

print("FINAL merged shape:", df.shape)   # MUST be ~177
print(df.columns.tolist())
df.head()


FINAL merged shape: (177, 13)
['date', 'daily_flow_total', 'daily_flow_mean', 'daily_cong_mean', 'daily_dsat_mean', 'num_records', 'rain_mm', 'tmax_c', 'tmin_c', 'tmean', 'wind_speed_knots', 'sunshine_hours', 'soil_temp_c']


Unnamed: 0,date,daily_flow_total,daily_flow_mean,daily_cong_mean,daily_dsat_mean,num_records,rain_mm,tmax_c,tmin_c,tmean,wind_speed_knots,sunshine_hours,soil_temp_c
0,2023-01-03,649194,153.765,0.208,27.207,4222,1.5,13.3,5.6,9.45,12.5,0.0,6.125
1,2023-01-04,551602,90.19,0.173,21.307,6116,0.1,12.6,8.8,10.7,13.7,0.5,8.2
2,2023-01-05,464146,75.903,0.309,24.752,6115,2.7,12.6,6.7,9.65,10.3,0.0,8.025
3,2023-01-06,860701,140.937,0.49,25.563,6107,2.3,11.5,5.6,8.55,9.7,0.2,6.325
4,2023-01-07,713575,116.388,0.295,21.667,6131,1.9,10.7,6.4,8.55,12.1,5.2,6.675


In [6]:
target_col = "daily_flow_total"

X = df.drop(columns=["date", target_col])
y = df[target_col]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (177, 11)
y shape: (177,)


In [7]:
split_idx = int(len(df) * 0.8)

X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test,  y_test  = X.iloc[split_idx:], y.iloc[split_idx:]

print("Train rows:", len(X_train), "| Test rows:", len(X_test))
print("Train dates:", df["date"].iloc[0], "to", df["date"].iloc[split_idx-1])
print("Test dates :", df["date"].iloc[split_idx], "to", df["date"].iloc[-1])


Train rows: 141 | Test rows: 36
Train dates: 2023-01-03 00:00:00 to 2023-05-25 00:00:00
Test dates : 2023-05-26 00:00:00 to 2023-06-30 00:00:00


In [8]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
pred_dt = dt.predict(X_test)

rmse_dt = np.sqrt(mean_squared_error(y_test, pred_dt))
mae_dt  = mean_absolute_error(y_test, pred_dt)
r2_dt   = r2_score(y_test, pred_dt)

print("Decision Tree Results")
print(f"RMSE: {rmse_dt:,.3f} | MAE: {mae_dt:,.3f} | R²: {r2_dt:,.3f}")


Decision Tree Results
RMSE: 13,325.666 | MAE: 9,511.389 | R²: 0.985


In [9]:
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

gbr.fit(X_train, y_train)
pred_gbr = gbr.predict(X_test)

rmse_gbr = np.sqrt(mean_squared_error(y_test, pred_gbr))
mae_gbr  = mean_absolute_error(y_test, pred_gbr)
r2_gbr   = r2_score(y_test, pred_gbr)

print("Gradient Boosting Results")
print(f"RMSE: {rmse_gbr:,.3f} | MAE: {mae_gbr:,.3f} | R²: {r2_gbr:,.3f}")


Gradient Boosting Results
RMSE: 6,757.653 | MAE: 4,644.951 | R²: 0.996
