In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import warnings
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, VotingClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error, classification_report

# Silence every warning so the console shows only the script's own messages.
warnings.filterwarnings(action='ignore')

# Start-up banner: two text lines followed by one blank line.
print(
    "üöÄ STARTING TURN-KEY MASTER SCRIPT...",
    "   (Calculating Metrics, Training Models, and Generating Files)\n",
    sep="\n",
)

# --- HELPER: AUTOMATIC FILE FINDER ---
def find_file(filename):
    """Return the first existing location of ``filename``.

    Candidates are probed in this order: ``filename`` exactly as given,
    then ``data/<filename>``, ``../data/<filename>`` and ``../<filename>``.
    Relative candidates are resolved against the current working directory.

    Args:
        filename: Name (or path) of the file to look for.

    Returns:
        The first candidate path for which ``os.path.exists`` is true.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    candidates = (
        filename,
        *(f"{folder}/{filename}" for folder in ("data", "../data", "..")),
    )
    located = next((c for c in candidates if os.path.exists(c)), None)
    if located is None:
        raise FileNotFoundError(f"‚ùå Could not find {filename}")
    return located

# Resolve all three input CSVs up front so a missing file stops the run
# before any model is trained. Lookup order matches the unpacking order.
try:
    path_vio, path_cong, path_fcst = map(find_file, (
        'violation_ml_features_v2.csv',
        'congestion_classification_for_automl.csv',
        'traffic_timeseries_for_automl.csv',
    ))
except FileNotFoundError as e:
    # Show the short message first, then re-raise so execution halts.
    print(e)
    raise
else:
    print("‚úÖ All data files found.")

# ==============================================================================
# MODEL 1: VIOLATION FACTORS (Goal 1 & 2)
# ==============================================================================
print(
    "\n" + "="*60,
    "--- 1. VIOLATION MODEL (Random Forest Regressor) ---",
    "="*60,
    sep="\n",
)

# 1. Read the violation table and split it into predictors and target.
features_vio = ['Hour', 'Month', 'TempMax_C', 'Precipitation_mm', 'Snowfall_cm']
df_vio = pd.read_csv(path_vio)
X_vio, y_vio = df_vio[features_vio], df_vio['Violation_Count']

# 2. Hold out 20% of the rows (fixed seed) for validation.
X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(
    X_vio, y_vio, test_size=0.2, random_state=42
)

# 3. Fit on the training rows only, then score R2 on the held-out rows.
model_vio_val = RandomForestRegressor(n_estimators=100, random_state=42)
model_vio_val.fit(X_train_v, y_train_v)
y_pred_v = model_vio_val.predict(X_test_v)
r2_vio = r2_score(y_test_v, y_pred_v)

print(f"üìä VALIDATION METRIC:")
print(f"   R2 Score: {r2_vio:.4f} (Use this in your Report)")

# 4. Refit an identically configured forest on every row; this is the model
#    that gets pickled and whose importances are exported.
model_vio_final = RandomForestRegressor(n_estimators=100, random_state=42)
model_vio_final.fit(X_vio, y_vio)

# 5. Export the importances (largest first) and the fitted model.
importances = pd.DataFrame(
    {'Factor': features_vio, 'Importance': model_vio_final.feature_importances_}
)
importances = importances.sort_values(by='Importance', ascending=False)
importances.to_csv('Model1_Top_Factors.csv', index=False)

with open('python_model_violation.pkl', 'wb') as f:
    pickle.dump(model_vio_final, f)

print("‚úÖ Saved: 'Model1_Top_Factors.csv'")
print("‚úÖ Saved: 'python_model_violation.pkl'")


# ==============================================================================
# MODEL 2: CONGESTION CLASSIFICATION (Goal 4)
# ==============================================================================
print(
    "\n" + "="*60,
    "--- 2. CONGESTION MODEL (Voting Ensemble: XGB+LGBM) ---",
    "="*60,
    sep="\n",
)

# 1. Read the congestion table; the label is the IsHighCongestion column.
features_cong = ['Freeflow', 'Month', 'Year', 'Temp_Max_Mean_2025', 'Precip_Sum_mm_2025']
df_cong = pd.read_csv(path_cong)
X_cong, y_cong = df_cong[features_cong], df_cong['IsHighCongestion']

# 2. Hold out 20% of the rows (fixed seed) for validation.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_cong, y_cong, test_size=0.2, random_state=42
)

# Base learners shared by the validation and production soft-voting ensembles.
xgb_c = XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42)
lgbm_c = LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
voters_c = [('xgb', xgb_c), ('lgbm', lgbm_c)]

model_cong_val = VotingClassifier(estimators=list(voters_c), voting='soft')
model_cong_val.fit(X_train_c, y_train_c)

y_pred_c = model_cong_val.predict(X_test_c)
acc_c = accuracy_score(y_test_c, y_pred_c)

print(f"üìä VALIDATION METRIC:")
print(f"   Accuracy: {acc_c:.4f} ({acc_c*100:.1f}%) (Use this in your Report)")

# 3. Refit the same ensemble definition on every row for the saved artifact.
model_cong_final = VotingClassifier(estimators=list(voters_c), voting='soft')
model_cong_final.fit(X_cong, y_cong)

# 4. Export the per-year share of high-congestion rows and the fitted model.
stats = (
    df_cong.groupby('Year')['IsHighCongestion']
    .mean()
    .reset_index(name='High_Congestion_Rate')
)
stats.to_csv('Model2_Congestion_Stats.csv', index=False)

with open('python_model_congestion.pkl', 'wb') as f:
    pickle.dump(model_cong_final, f)

print("‚úÖ Saved: 'Model2_Congestion_Stats.csv'")
print("‚úÖ Saved: 'python_model_congestion.pkl'")


# ==============================================================================
# MODEL 3: TRAFFIC FORECASTING (Goal 5 & 6)
# ==============================================================================
print(
    "\n" + "="*60,
    "--- 3. FORECASTING MODEL (Voting Ensemble: XGB+LGBM) ---",
    "="*60,
    sep="\n",
)

# 1. Read the time-series table and parse its Date column into datetimes.
# NOTE(review): the lag features and the 80/20 split below are positional, so
# the rows are presumably already in chronological order - confirm for this CSV.
df_ts = pd.read_csv(path_fcst).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
# 2. Feature Engineering (Manual Lags)
def create_features(data):
    """Return a copy of ``data`` with calendar and lagged-traffic columns.

    Args:
        data: DataFrame with a datetime ``Date`` column and a
            ``Traffic_Count`` column. The lags are positional (previous
            row / two rows back), so rows are assumed to already be in
            chronological order - TODO confirm with the caller.

    Returns:
        A new DataFrame (the input is not modified) with these columns added:
        ``Year``, ``Month``, ``Week`` (ISO week number), ``Traffic_Lag1``
        and ``Traffic_Lag2``.
    """
    d = data.copy()
    calendar = d['Date'].dt
    d['Year'] = calendar.year
    d['Month'] = calendar.month
    d['Week'] = calendar.isocalendar().week

    # shift() leaves NaN in the leading rows; back-fill them with the first
    # available lag value. Series.bfill() replaces fillna(method='bfill'),
    # which is deprecated since pandas 2.1.
    traffic = d['Traffic_Count']
    d['Traffic_Lag1'] = traffic.shift(1).bfill()
    d['Traffic_Lag2'] = traffic.shift(2).bfill()
    return d

# Build the modelling table; the column order below is the order the model is
# fitted with and must be reused for every prediction input.
features_ts = ['Year', 'Month', 'Week', 'Traffic_Lag1', 'Traffic_Lag2', 'Temp_Max_Mean', 'Precip_Sum_mm']
df_ml = create_features(df_ts)
X_ts, y_ts = df_ml[features_ts], df_ml['Traffic_Count']

# 3. Validation Step (positional hold-out, no shuffling)
# The first 80% of the rows train the model; the remaining 20% test it.
train_size = int(len(X_ts) * 0.8)
X_tr, y_tr = X_ts.iloc[:train_size], y_ts.iloc[:train_size]
X_te, y_te = X_ts.iloc[train_size:], y_ts.iloc[train_size:]

# Base learners shared by the validation and production ensembles.
xgb_f = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
lgbm_f = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbose=-1)
voters_f = [('xgb', xgb_f), ('lgbm', lgbm_f)]

model_fcst_val = VotingRegressor(estimators=list(voters_f))
model_fcst_val.fit(X_tr, y_tr)

y_pred_te = model_fcst_val.predict(X_te)
rmse = np.sqrt(mean_squared_error(y_te, y_pred_te))
print(f"üìä VALIDATION METRIC:")
print(f"   Test RMSE: {rmse:,.0f} (Use this in your Report)")

# 4. Refit the same ensemble definition on every row for the saved artifact.
model_fcst_final = VotingRegressor(estimators=list(voters_f))
model_fcst_final.fit(X_ts, y_ts)

# 5. Recursive forecast: one row per week after the latest observed date.
print("üîÆ Generating 2-Year Recursive Forecast...")
FORECAST_WEEKS = 104
last_date = df_ml['Date'].max()
future_dates = [
    last_date + timedelta(weeks=step) for step in range(1, FORECAST_WEEKS + 1)
]

future_df = pd.DataFrame({'Date': future_dates})
future_df['Date'] = pd.to_datetime(future_df['Date'])
future_calendar = future_df['Date'].dt
future_df['Year'] = future_calendar.year
future_df['Month'] = future_calendar.month
future_df['Week'] = future_calendar.isocalendar().week

# Future weather is unknown, so every forecast week uses the historical median.
for weather_col in ('Temp_Max_Mean', 'Precip_Sum_mm'):
    future_df[weather_col] = df_ml[weather_col].median()

# Seed the lags with the last two rows of the history, then feed each
# prediction back in as the next step's Lag1 (the old Lag1 becomes Lag2).
forecast_values = []
current_lag1 = df_ml.iloc[-1]['Traffic_Count']
current_lag2 = df_ml.iloc[-2]['Traffic_Count']

for _, step_row in future_df.iterrows():
    feature_row = [
        step_row['Year'], step_row['Month'], step_row['Week'],
        current_lag1, current_lag2,
        step_row['Temp_Max_Mean'], step_row['Precip_Sum_mm'],
    ]
    input_data = pd.DataFrame([feature_row], columns=features_ts)

    pred = model_fcst_final.predict(input_data)[0]
    forecast_values.append(pred)

    current_lag2, current_lag1 = current_lag1, pred

future_df['Forecasted_Traffic'] = forecast_values
future_df['Facility'] = 'All Facilities'

# 6. Export the forecast table and the fitted model.
future_df.to_csv('Final_Forecast_2026_2027.csv', index=False)
with open('python_model_forecasting.pkl', 'wb') as f:
    pickle.dump(model_fcst_final, f)

print("‚úÖ Saved: 'Final_Forecast_2026_2027.csv'")
print("‚úÖ Saved: 'python_model_forecasting.pkl'")

print(
    "\n" + "="*60,
    "üéâ ALL PROCESSES COMPLETE. REPORT DATA IS READY.",
    "="*60,
    sep="\n",
)

🚀 STARTING TURN-KEY MASTER SCRIPT...
   (Calculating Metrics, Training Models, and Generating Files)

✅ All data files found.

--- 1. VIOLATION MODEL (Random Forest Regressor) ---
📊 VALIDATION METRIC:
   R2 Score: 0.1055 (Use this in your Report)
✅ Saved: 'Model1_Top_Factors.csv'
✅ Saved: 'python_model_violation.pkl'

--- 2. CONGESTION MODEL (Voting Ensemble: XGB+LGBM) ---
📊 VALIDATION METRIC:
   Accuracy: 0.9231 (92.3%) (Use this in your Report)
✅ Saved: 'Model2_Congestion_Stats.csv'
✅ Saved: 'python_model_congestion.pkl'

--- 3. FORECASTING MODEL (Voting Ensemble: XGB+LGBM) ---
📊 VALIDATION METRIC:
   Test RMSE: 12,137 (Use this in your Report)
🔮 Generating 2-Year Recursive Forecast...
✅ Saved: 'Final_Forecast_2026_2027.csv'
✅ Saved: 'python_model_forecasting.pkl'

🎉 ALL PROCESSES COMPLETE. REPORT DATA IS READY.
