In [None]:
import pandas as pd
from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

In [14]:
# Example for tab-separated file
df = pd.read_csv('stop_times.txt')

# Save as CSV
df.to_csv('stop_times.csv')

# Drop multiple columns
df = df.drop(['continuous_pickup','continuous_drop_off','stop_headsign','pickup_type','drop_off_type'], axis=1)

In [15]:
# Convert time columns
# Convert times as duration from midnight
df['departure_time'] = pd.to_timedelta(df['departure_time'])
df['arrival_time'] = pd.to_timedelta(df['arrival_time'])

In [16]:

# Calculate travel time (in seconds)
df['travel_time_sec'] = (df['departure_time'] - df['arrival_time']).dt.total_seconds().abs()
# Calculate distance difference to previous stop
df['distance_m'] = df.groupby('trip_id')['shape_dist_traveled'].diff().fillna(0)

In [17]:
# Calculate average speed
df['speed_kmph'] = (df['distance_m'] / 1000) / (df['travel_time_sec'] / 3600 + 1e-5)  # avoid div by zero

In [18]:
# Extract time features
df['hour'] = df['departure_time'].dt.components['hours']
df['minute'] = df['departure_time'].dt.components['minutes']

In [19]:
# Features and label
features = df[['hour', 'minute', 'stop_id', 'distance_m']]
label = df['travel_time_sec']

In [20]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)

In [21]:
# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [22]:
# Evaluate
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)

(mae, df[['hour', 'minute', 'stop_id', 'distance_m', 'travel_time_sec']].head())

(0.0,
    hour  minute  stop_id  distance_m  travel_time_sec
 0     5      28       21       0.000             20.0
 1     5      31       20    1202.405             20.0
 2     5      33       19    1278.345             20.0
 3     5      35       18     834.186             20.0
 4     5      38       17     985.280             20.0)

In [23]:
df.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,timepoint,travel_time_sec,distance_m,speed_kmph,hour,minute
0,0,0 days 05:28:08,0 days 05:28:28,21,0,0.0,1,20.0,0.0,0.0,5,28
1,0,0 days 05:30:58,0 days 05:31:18,20,1,1202.405,1,20.0,1202.405,216.044021,5,31
2,0,0 days 05:33:28,0 days 05:33:48,19,2,2480.75,1,20.0,1278.345,229.68866,5,33
3,0,0 days 05:35:33,0 days 05:35:53,18,3,3314.936,1,20.0,834.186,149.883689,5,35
4,0,0 days 05:37:53,0 days 05:38:13,17,4,4300.216,1,20.0,985.28,177.031743,5,38


<h1> Merging the stop data set for lat/lon </h1>

In [25]:
stops = pd.read_csv("stops.txt")

In [26]:
stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon
0,1,,Dilshad Garden,,28.675991,77.321495
1,2,,Jhilmil,,28.675648,77.312393
2,3,,Mansrover park,,28.675352,77.301178
3,4,,Shahdara,,28.673531,77.28727
4,5,,Welcome,,28.671986,77.277931


In [27]:
# Merge lat/lon into stop_times
merged = pd.merge(df, stops[['stop_id', 'stop_lat', 'stop_lon']], on='stop_id', how='left')

In [28]:
merged = merged.sort_values(by=['trip_id', 'stop_sequence']).reset_index(drop=True)
merged.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,timepoint,travel_time_sec,distance_m,speed_kmph,hour,minute,stop_lat,stop_lon
0,0,0 days 05:28:08,0 days 05:28:28,21,0,0.0,1,20.0,0.0,0.0,5,28,28.720821,77.105042
1,0,0 days 05:30:58,0 days 05:31:18,20,1,1202.405,1,20.0,1202.405,216.044021,5,31,28.715008,77.115746
2,0,0 days 05:33:28,0 days 05:33:48,19,2,2480.75,1,20.0,1278.345,229.68866,5,33,28.707941,77.125732
3,0,0 days 05:35:33,0 days 05:35:53,18,3,3314.936,1,20.0,834.186,149.883689,5,35,28.70318,77.132355
4,0,0 days 05:37:53,0 days 05:38:13,17,4,4300.216,1,20.0,985.28,177.031743,5,38,28.697943,77.140465


<h>Calculate time taken to travel from one stop to the next </h1>

In [30]:
def time_to_sec(t):
    h, m, s = map(int, t.split(':'))
    return h * 3600 + m * 60 + s

import pandas as pd

# Ensure 'arrival_time' is timedelta
merged['arrival_time'] = pd.to_timedelta(merged['arrival_time'])

# Convert to seconds
merged['arrival_sec'] = merged['arrival_time'].dt.total_seconds()

# Calculate travel time to next stop
merged['next_arrival_sec'] = merged.groupby('trip_id')['arrival_sec'].shift(-1)
merged['travel_time_sec'] = merged['next_arrival_sec'] - merged['arrival_sec']


In [31]:
merged.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,timepoint,travel_time_sec,distance_m,speed_kmph,hour,minute,stop_lat,stop_lon,arrival_sec,next_arrival_sec
0,0,0 days 05:28:08,0 days 05:28:28,21,0,0.0,1,170.0,0.0,0.0,5,28,28.720821,77.105042,19688.0,19858.0
1,0,0 days 05:30:58,0 days 05:31:18,20,1,1202.405,1,150.0,1202.405,216.044021,5,31,28.715008,77.115746,19858.0,20008.0
2,0,0 days 05:33:28,0 days 05:33:48,19,2,2480.75,1,125.0,1278.345,229.68866,5,33,28.707941,77.125732,20008.0,20133.0
3,0,0 days 05:35:33,0 days 05:35:53,18,3,3314.936,1,140.0,834.186,149.883689,5,35,28.70318,77.132355,20133.0,20273.0
4,0,0 days 05:37:53,0 days 05:38:13,17,4,4300.216,1,150.0,985.28,177.031743,5,38,28.697943,77.140465,20273.0,20423.0


In [32]:
merged['hour_of_day'] = merged['arrival_sec'] // 3600  # e.g., 5 AM = 5

<h1> Merging the trips  </h1>

In [34]:
trips = pd.read_csv("trips.txt")
merged = pd.merge(merged, trips[['trip_id', 'service_id', 'direction_id']], on='trip_id', how='left')

<h1> Clean Missing and Anomalous Values </h1>

In [36]:
# Remove rows where travel time is negative or null
cleaned = merged[(merged['travel_time_sec'] > 0) & (merged['travel_time_sec'].notnull())]

# Optional: Remove unreasonable travel times (> 30 minutes between stops)
cleaned = cleaned[cleaned['travel_time_sec'] < 1800]


In [37]:
cleaned.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,timepoint,travel_time_sec,distance_m,speed_kmph,hour,minute,stop_lat,stop_lon,arrival_sec,next_arrival_sec,hour_of_day,service_id,direction_id
0,0,0 days 05:28:08,0 days 05:28:28,21,0,0.0,1,170.0,0.0,0.0,5,28,28.720821,77.105042,19688.0,19858.0,5.0,weekday,
1,0,0 days 05:30:58,0 days 05:31:18,20,1,1202.405,1,150.0,1202.405,216.044021,5,31,28.715008,77.115746,19858.0,20008.0,5.0,weekday,
2,0,0 days 05:33:28,0 days 05:33:48,19,2,2480.75,1,125.0,1278.345,229.68866,5,33,28.707941,77.125732,20008.0,20133.0,5.0,weekday,
3,0,0 days 05:35:33,0 days 05:35:53,18,3,3314.936,1,140.0,834.186,149.883689,5,35,28.70318,77.132355,20133.0,20273.0,5.0,weekday,
4,0,0 days 05:37:53,0 days 05:38:13,17,4,4300.216,1,150.0,985.28,177.031743,5,38,28.697943,77.140465,20273.0,20423.0,5.0,weekday,


In [None]:
cleaned.to_parquet("gtfs_cleaned.parquet")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"✅ MAE: {mae:.2f} sec")
print(f"✅ R² Score: {r2:.3f}")

joblib.dump(model, "models/rf_model.pkl")
print("🎉 Model saved as models/rf_model.pkl")