In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load datasets
itms_df = pd.read_csv("itms-22mar-2hours.csv")  # Replace with actual path
routes_df = pd.read_csv("konbert-output-3e875593.csv")  # Replace with actual path

# Convert datetime columns
itms_df['observationDateTime'] = pd.to_datetime(itms_df['observationDateTime'])
itms_df['trip_delay'] = pd.to_numeric(itms_df['trip_delay'], errors='coerce')

# Extract time-based features
itms_df['hour'] = itms_df['observationDateTime'].dt.hour
itms_df['minute'] = itms_df['observationDateTime'].dt.minute
itms_df['dayofweek'] = itms_df['observationDateTime'].dt.dayofweek

# Extract latitude and longitude
itms_df['latitude'] = itms_df['location.coordinates'].apply(
    lambda x: float(x.strip('[]').split(',')[1]) if isinstance(x, str) else np.nan)
itms_df['longitude'] = itms_df['location.coordinates'].apply(
    lambda x: float(x.strip('[]').split(',')[0]) if isinstance(x, str) else np.nan)

# Add previous stop delay feature
itms_df['prev_trip_delay'] = itms_df.groupby('trip_id')['trip_delay'].shift(1)
itms_df['prev_trip_delay'].fillna(0, inplace=True)

# Drop missing values
processed_df = itms_df[['speed', 'hour', 'minute', 'dayofweek',
                         'latitude', 'longitude', 'prev_trip_delay', 'trip_delay']].dropna()




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  itms_df['prev_trip_delay'].fillna(0, inplace=True)


In [15]:
# Define features (X) and target (y)
X = processed_df.drop(columns=['trip_delay'])
y = processed_df['trip_delay']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define XGBoost model
xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6,
                         subsample=0.8, colsample_bytree=0.8, random_state=42)

# Train model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)  # R² Score

# Print results
print(f"Mean Absolute Error (MAE): {mae:.0f} seconds")
print(f"Root Mean Squared Error (RMSE): {rmse:.0f} seconds")
print(f"R² Score: {r2:.4f}")



Mean Absolute Error (MAE): 49 seconds
Root Mean Squared Error (RMSE): 399 seconds
R² Score: 0.9533


In [19]:
import pickle

# Save the trained XGBoost model
with open("eta.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

In [21]:
import pickle

# Load the trained model
with open("eta.pkl", "rb") as f:
    model = pickle.load(f)
    new_data = pd.DataFrame({'speed': [30], 'hour': [14], 'minute': [5], 'dayofweek': [3],
                         'latitude': [12.9716], 'longitude': [77.9086], 'prev_trip_delay': [60]})
    predicted_delay = model.predict(new_data)
    print(f"Predicted Delay for New Data: {predicted_delay[0]:.0f} seconds")

Predicted Delay for New Data: 181 seconds
