In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Load and preview the dataset
# Update the file path to match your dataset location
file_path = './DBtrainrides_final_result.csv'
train_ride_df = pd.read_csv(file_path)

# Display the first few rows of the dataset
train_ride_df.head()

Unnamed: 0,ID_Base,ID_Timestamp,stop_number,IBNR,long,lat,arrival_plan,departure_plan,arrival_delay_m,transformed_info_message,prev_arrival_delay_m,prev_departure_delay_m,weighted_avg_prev_delay,max_station_number,station_progress
0,-1001326572688500578,2407082041,2,8011118.0,13.375988,52.509379,2024-07-08 20:44:00,2024-07-08 20:45:00,0.0,No message,0.0,0.0,0.0,7,0.285714
1,-1001326572688500578,2407082041,3,8011160.0,9.095851,48.849792,,,,No message,0.0,0.0,0.0,7,0.428571
2,-1001326572688500578,2407082041,4,8011167.0,13.299437,52.530276,2024-07-08 20:55:00,2024-07-08 20:56:00,0.0,No message,0.0,0.0,0.0,7,0.571429
3,-1001326572688500578,2407082041,5,8010404.0,13.196898,52.534648,2024-07-08 21:00:00,2024-07-08 21:03:00,2.0,No message,0.0,0.0,0.0,7,0.714286
4,-1001326572688500578,2407082041,6,8080040.0,13.128917,52.549396,2024-07-08 21:06:00,2024-07-08 21:07:00,1.0,No message,2.0,0.0,0.666667,7,0.857143


In [3]:
# Handle missing values and prepare features and target
# Fill missing values with 0 as a default strategy
train_ride_df.fillna(0, inplace=True)

# Define features and target variable
features = [
    'stop_number', 'IBNR', 'long', 'lat', 
    'prev_arrival_delay_m', 'prev_departure_delay_m', 
    'weighted_avg_prev_delay', 'max_station_number', 'station_progress'
]
target = 'arrival_delay_m'

X = train_ride_df[features]
y = train_ride_df[target]

In [4]:
# Display the feature matrix and target to verify
X.head()

Unnamed: 0,stop_number,IBNR,long,lat,prev_arrival_delay_m,prev_departure_delay_m,weighted_avg_prev_delay,max_station_number,station_progress
0,2,8011118.0,13.375988,52.509379,0.0,0.0,0.0,7,0.285714
1,3,8011160.0,9.095851,48.849792,0.0,0.0,0.0,7,0.428571
2,4,8011167.0,13.299437,52.530276,0.0,0.0,0.0,7,0.571429
3,5,8010404.0,13.196898,52.534648,0.0,0.0,0.0,7,0.714286
4,6,8080040.0,13.128917,52.549396,2.0,0.0,0.666667,7,0.857143


In [5]:
y.head()

0    0.0
1    0.0
2    0.0
3    2.0
4    1.0
Name: arrival_delay_m, dtype: float64

In [6]:
# Standardise the features
# StandardScaler ensures features are scaled appropriately for Lasso
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [7]:
# Hyperparameter tuning with GridSearchCV
# Define the Lasso regression model
lasso = Lasso()

# Define the parameter grid for alpha (regularisation strength)
param_grid = {'alpha': np.logspace(-4, 0, 50)}  # Testing alphas from 0.0001 to 1

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, scoring='r2', cv=5, verbose=1)
grid_search.fit(X_train, y_train)

# Display the best alpha and best score
best_alpha = grid_search.best_params_['alpha']
best_score = grid_search.best_score_

print(f"Best Alpha: {best_alpha}")
print(f"Best Cross-Validation R-squared Score: {best_score}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Alpha: 0.0001
Best Cross-Validation R-squared Score: 0.5768869974226097


In [8]:
# Evaluate the tuned Lasso model on the test set
# Use the best model from GridSearchCV
best_lasso = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_lasso.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score on Test Set: {r2}")

Mean Squared Error: 3.582413054681181
R-squared Score on Test Set: 0.563326250147486


In [11]:
# Analyse feature importance
# Display coefficients for each feature from the best model
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': best_lasso.coef_})
coefficients

Unnamed: 0,Feature,Coefficient
0,stop_number,-0.021734
1,IBNR,0.02409
2,long,-0.021775
3,lat,-0.080226
4,prev_arrival_delay_m,-0.41792
5,prev_departure_delay_m,1.662737
6,weighted_avg_prev_delay,1.00354
7,max_station_number,0.026706
8,station_progress,0.055107
