In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import joblib

# CSV that main() loads, and the path main() writes the joblib bundle to.
DATA_FILE = "bus_data.csv"
MODEL_FILE = "bus_model.joblib"

# Columns selected from the CSV as model inputs. main() also stores this list
# next to the fitted model in the saved bundle.
FEATURE_COLS = [
    "route_id",
    "distance_km",
    "hour_of_day",
    "day_of_week",
    "is_peak_hour",
    "weather",
    "traffic_level",
    "previous_delay_min",
]

# Column main() uses as the regression target.
TARGET_COL = "arrival_delay_min"

def main():
    """Train the delay regressor on DATA_FILE and save it to MODEL_FILE.

    Loads the CSV, holds out 20% of the rows for testing, fits a
    StandardScaler + RandomForestRegressor pipeline on the remainder, prints
    MAE and R^2 for the held-out rows, and dumps the fitted pipeline together
    with the feature column list via joblib.
    """
    frame = pd.read_csv(DATA_FILE)
    print("Data shape:", frame.shape)
    print(frame.head())

    features, target = frame[FEATURE_COLS], frame[TARGET_COL]

    # A fixed seed keeps the 80/20 split the same from run to run.
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    train_X, test_X, train_y, test_y = split

    # The scaler and the forest are chained so they are fitted, applied and
    # saved as a single estimator.
    forest_kwargs = {
        "n_estimators": 300,
        "random_state": 42,
        "max_depth": 12,
        "n_jobs": -1,
    }
    pipeline = make_pipeline(
        StandardScaler(),
        RandomForestRegressor(**forest_kwargs),
    )

    print("Training model...")
    pipeline.fit(train_X, train_y)

    # Both metrics are computed before anything is printed.
    predictions = pipeline.predict(test_X)
    abs_err = mean_absolute_error(test_y, predictions)
    r_squared = r2_score(test_y, predictions)

    print("\nEvaluation on test set:")
    print(f"MAE (minutes): {abs_err:.2f}")
    print(f"R^2 score    : {r_squared:.3f}")

    # The feature list is stored with the model in the same file.
    bundle = {"model": pipeline, "feature_cols": FEATURE_COLS}
    joblib.dump(bundle, MODEL_FILE)
    print(f"\nModel saved to {MODEL_FILE}")

# Run the training routine only when this file is executed as a script.
if __name__ == "__main__":
    main()

Data shape: (1000, 9)
   route_id  distance_km  hour_of_day  day_of_week  is_peak_hour  weather  \
0         1     5.716607           11            5             0        1   
1         3     3.545760           14            0             0        0   
2         2     8.290389            9            2             1        1   
3         2     7.971608           20            0             0        2   
4         2     7.914796           17            3             1        0   

   traffic_level  previous_delay_min  arrival_delay_min  
0              1            3.850026           1.813084  
1              1            3.964521           4.599542  
2              2            3.093026           9.901383  
3              2            2.127955           4.530515  
4              2            2.253114           6.847969  
Training model...

Evaluation on test set:
MAE (minutes): 1.90
R^2 score    : 0.708

Model saved to bus_model.joblib
