In [None]:
# ml_training/train_delay_model.ipynb
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib
from data_preprocessing import load_data, clean_data, split_features_labels, scale_data
from model_evaluation import evaluate_classification

# 1. Load dataset
# Read the historical trip records and run them through the project's
# cleaning step before any modelling.
df = load_data("../database/past_trip_history.csv")
df = clean_data(df)

# Example columns: ['distance_km', 'traffic_index', 'avg_speed', 'delay_flag']
# Separate the "delay_flag" target from the feature columns.
X, y = split_features_labels(df, target="delay_flag")

# 2. Split
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# scale_data returns the scaled train/test features plus the scaler object.
# NOTE(review): presumably the scaler is fitted on X_train only -- confirm in
# data_preprocessing.scale_data.
X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)

# 3. Train
# The classifier is fitted on *scaled* features, so anything that uses the
# saved model must apply the same scaler to its inputs first.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# 4. Evaluate
# Score the model on the held-out (scaled) test features.
y_pred = clf.predict(X_test_scaled)
metrics = evaluate_classification(y_test, y_pred)
print("Delay Model Performance:", metrics)

# 5. Save
# Persist the classifier together with the scaler it was trained with; saving
# the model alone would leave no way to reproduce the preprocessing at
# inference time.
joblib.dump(clf, "../backend/models/delay_detector.pkl")
joblib.dump(scaler, "../backend/models/delay_scaler.pkl")
print("✅ Delay model saved to backend/models/delay_detector.pkl")
print("✅ Feature scaler saved to backend/models/delay_scaler.pkl")
