In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error

In [3]:
df = pd.read_csv(r"C:\Users\tanzi\Personal Projects\cold_start_project_498r\data\clean_data.csv")
cluster_df = pd.read_csv(r"C:\Users\tanzi\Personal Projects\cold_start_project_498r\data\clustered_time_patterns.csv")

In [4]:
# Merge cluster info back to original data
df = df.merge(cluster_df[['hour', 'dayofweek', 'time_bucket', 'cluster']],
              on=['hour', 'dayofweek', 'time_bucket'], how='left')

In [5]:
# Features and targets
features = ['hour', 'dayofweek', 'is_weekend', 'cluster']
X = df[features]

In [6]:
# Cold start prediction
y_class = df['Cold_Start']
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)

In [7]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [8]:
print("=== Cold Start Classification Report ===")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

=== Cold Start Classification Report ===
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2059
           1       0.62      0.55      0.58        67

    accuracy                           0.98      2126
   macro avg       0.80      0.77      0.78      2126
weighted avg       0.97      0.98      0.97      2126

Confusion Matrix:
[[2036   23]
 [  30   37]]


In [9]:
# Delay prediction (regression)
y_reg = df['Delay (s)']
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)

In [10]:
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train_r, y_train_r)
y_pred_r = reg.predict(X_test_r)

In [11]:
mse = mean_squared_error(y_test_r, y_pred_r)
rmse = np.sqrt(mse)

In [12]:
print("\n=== Delay Regression ===")
print(f"RMSE: {rmse:.2f} seconds")


=== Delay Regression ===
RMSE: 817.08 seconds


In [13]:
import os
import joblib

# Make sure the directory exists
os.makedirs("../models", exist_ok=True)

# Save both models
joblib.dump(clf, '../models/cold_start_classifier.pkl')
joblib.dump(reg, '../models/delay_predictor.pkl')

['../models/delay_predictor.pkl']