### Baseline for Modelcomparison
Calculates prediction based on either the average delay time in general and based on the average delay of a station. Predicts at one station the predicted delay and compares it with the actual delay.

In [38]:
import pandas as pd
import numpy as np
base_line_data = pd.read_csv("../DBtrainrides_final_result.csv")
base_line_data

Unnamed: 0,ID_Base,ID_Timestamp,stop_number,IBNR,long,lat,arrival_plan,departure_plan,arrival_delay_m,transformed_info_message,prev_arrival_delay_m,prev_departure_delay_m,weighted_avg_prev_delay,max_station_number,station_progress
0,-1001326572688500578,2407082041,2,8011118.0,13.375988,52.509379,2024-07-08 20:44:00,2024-07-08 20:45:00,0.0,No message,0.0,0.0,0.000000,7,0.285714
1,-1001326572688500578,2407082041,3,8011160.0,9.095851,48.849792,,,,No message,0.0,0.0,0.000000,7,0.428571
2,-1001326572688500578,2407082041,4,8011167.0,13.299437,52.530276,2024-07-08 20:55:00,2024-07-08 20:56:00,0.0,No message,0.0,0.0,0.000000,7,0.571429
3,-1001326572688500578,2407082041,5,8010404.0,13.196898,52.534648,2024-07-08 21:00:00,2024-07-08 21:03:00,2.0,No message,0.0,0.0,0.000000,7,0.714286
4,-1001326572688500578,2407082041,6,8080040.0,13.128917,52.549396,2024-07-08 21:06:00,2024-07-08 21:07:00,1.0,No message,2.0,0.0,0.666667,7,0.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2430246,999976718847540977,2407100447,6,8005649.0,7.110814,49.274763,2024-07-10 05:01:00,2024-07-10 05:02:00,1.0,No message,0.0,0.0,0.000000,6,1.000000
2430247,999976718847540977,2407120447,2,8005241.0,7.018788,49.230425,2024-07-12 04:50:00,2024-07-12 04:51:00,0.0,No message,0.0,0.0,0.000000,6,0.333333
2430248,999976718847540977,2407120447,3,8005306.0,8.243728,50.070788,,,,No message,0.0,0.0,0.000000,6,0.500000
2430249,999976718847540977,2407120447,4,8005332.0,7.057083,49.244018,2024-07-12 04:55:00,2024-07-12 04:56:00,0.0,No message,0.0,0.0,0.000000,6,0.666667


In [39]:
# Drop all delay based features except for the target variable arrival_delay_m
base_line_data.drop(columns=["arrival_plan","departure_plan","prev_arrival_delay_m","prev_departure_delay_m", "weighted_avg_prev_delay"],inplace=True)

# Encode info_messages
message_order = ['No message', 'Information', 'Bauarbeiten', 'Störung', 'Großstörung']

base_line_data['transformed_info_message'] = pd.Categorical(
    base_line_data['transformed_info_message'],
    categories=message_order,
    ordered=True
)
base_line_data['transformed_info_message'] = base_line_data['transformed_info_message'].cat.codes

# Fill arrival_delay_m NaN values with 0
base_line_data["arrival_delay_m"] = base_line_data["arrival_delay_m"].fillna(0.0)
base_line_data

Unnamed: 0,ID_Base,ID_Timestamp,stop_number,IBNR,long,lat,arrival_delay_m,transformed_info_message,max_station_number,station_progress
0,-1001326572688500578,2407082041,2,8011118.0,13.375988,52.509379,0.0,0,7,0.285714
1,-1001326572688500578,2407082041,3,8011160.0,9.095851,48.849792,0.0,0,7,0.428571
2,-1001326572688500578,2407082041,4,8011167.0,13.299437,52.530276,0.0,0,7,0.571429
3,-1001326572688500578,2407082041,5,8010404.0,13.196898,52.534648,2.0,0,7,0.714286
4,-1001326572688500578,2407082041,6,8080040.0,13.128917,52.549396,1.0,0,7,0.857143
...,...,...,...,...,...,...,...,...,...,...
2430246,999976718847540977,2407100447,6,8005649.0,7.110814,49.274763,1.0,0,6,1.000000
2430247,999976718847540977,2407120447,2,8005241.0,7.018788,49.230425,0.0,0,6,0.333333
2430248,999976718847540977,2407120447,3,8005306.0,8.243728,50.070788,0.0,0,6,0.500000
2430249,999976718847540977,2407120447,4,8005332.0,7.057083,49.244018,0.0,0,6,0.666667


In [40]:
# Step 1: Create a unique identifier for groups
base_line_data['Group'] = base_line_data['ID_Base'].astype(str) + "_" + base_line_data['ID_Timestamp'].astype(str)

# Step 2: Get unique groups and shuffle them
unique_groups = base_line_data['Group'].unique()
np.random.seed(42)  # For reproducibility
np.random.shuffle(unique_groups)

# Step 3: Split groups into train and test
split_idx = int(0.8 * len(unique_groups))  # 80-20 split
train_groups = unique_groups[:split_idx]
test_groups = unique_groups[split_idx:]

# Step 4: Assign rows to train and test sets
train_data = base_line_data[base_line_data['Group'].isin(train_groups)]
test_data = base_line_data[~base_line_data['Group'].isin(train_groups)]

# Drop the helper column if necessary
train_data = train_data.drop(columns=['Group'])
test_data = test_data.drop(columns=['Group'])
train_data

Unnamed: 0,ID_Base,ID_Timestamp,stop_number,IBNR,long,lat,arrival_delay_m,transformed_info_message,max_station_number,station_progress
0,-1001326572688500578,2407082041,2,8011118.0,13.375988,52.509379,0.0,0,7,0.285714
1,-1001326572688500578,2407082041,3,8011160.0,9.095851,48.849792,0.0,0,7,0.428571
2,-1001326572688500578,2407082041,4,8011167.0,13.299437,52.530276,0.0,0,7,0.571429
3,-1001326572688500578,2407082041,5,8010404.0,13.196898,52.534648,2.0,0,7,0.714286
4,-1001326572688500578,2407082041,6,8080040.0,13.128917,52.549396,1.0,0,7,0.857143
...,...,...,...,...,...,...,...,...,...,...
2430246,999976718847540977,2407100447,6,8005649.0,7.110814,49.274763,1.0,0,6,1.000000
2430247,999976718847540977,2407120447,2,8005241.0,7.018788,49.230425,0.0,0,6,0.333333
2430248,999976718847540977,2407120447,3,8005306.0,8.243728,50.070788,0.0,0,6,0.500000
2430249,999976718847540977,2407120447,4,8005332.0,7.057083,49.244018,0.0,0,6,0.666667


In [41]:
# Function to compute average delay for each IBNR and replace NaN with 0
def compute_avg_delay(data):
    # Group by IBNR and calculate the mean of arrival_delay_m
    avg_delay_series = data.groupby('IBNR')['arrival_delay_m'].mean()
    # Replace NaN values with 0
    avg_delay_series = avg_delay_series.fillna(0)
    # Convert to dictionary
    avg_delay_dict = avg_delay_series.to_dict()
    return avg_delay_dict

# Compute for train data
train_avg_delay = compute_avg_delay(train_data)
train_avg_delay

{8000001.0: 0.6363636363636364,
 8000002.0: 0.0,
 8000004.0: 0.9273356401384083,
 8000007.0: 1.2789855072463767,
 8000009.0: 1.613941018766756,
 8000010.0: 2.557823129251701,
 8000011.0: 0.5110619469026548,
 8000012.0: 1.7638483965014577,
 8000013.0: 0.9806701030927835,
 8000014.0: 1.8618618618618619,
 8000015.0: 0.8130434782608695,
 8000016.0: 0.8571428571428571,
 8000017.0: 1.7316053511705685,
 8000020.0: 1.5256916996047432,
 8000021.0: 1.4515503875968991,
 8000022.0: 0.27586206896551724,
 8000023.0: 1.169398907103825,
 8000025.0: 1.581888246628131,
 8000026.0: 0.0,
 8000027.0: 2.48506151142355,
 8000028.0: 0.9827586206896551,
 8000029.0: 0.2773722627737226,
 8000031.0: 2.9741100323624594,
 8000032.0: 1.9983221476510067,
 8000033.0: 2.6625155666251556,
 8000034.0: 1.3579676674364896,
 8000036.0: 1.2820512820512822,
 8000037.0: 3.2413793103448274,
 8000038.0: 1.4612546125461254,
 8000039.0: 3.1108374384236455,
 8000041.0: 1.2890428211586902,
 8000042.0: 0.44537815126050423,
 8000043.0

In [42]:
test_data["arrival_delay_prediction"] = test_data['IBNR'].map(train_avg_delay)
test_data

Unnamed: 0,ID_Base,ID_Timestamp,stop_number,IBNR,long,lat,arrival_delay_m,transformed_info_message,max_station_number,station_progress,arrival_delay_prediction
29,-1001326572688500578,2407132041,2,8011118.0,13.375988,52.509379,0.0,0,7,0.285714,1.467162
30,-1001326572688500578,2407132041,3,8011160.0,8.679678,53.008390,0.0,0,7,0.428571,1.018745
31,-1001326572688500578,2407132041,4,8011167.0,13.299437,52.530276,0.0,0,7,0.571429,1.393365
32,-1001326572688500578,2407132041,5,8010404.0,13.196898,52.534648,0.0,0,7,0.714286,1.649462
33,-1001326572688500578,2407132041,6,8080040.0,13.128917,52.549396,0.0,0,7,0.857143,1.758454
...,...,...,...,...,...,...,...,...,...,...,...
2430228,999976718847540977,2407040447,2,8005241.0,7.018788,49.230425,0.0,0,6,0.333333,1.073239
2430229,999976718847540977,2407040447,3,8005306.0,9.741016,52.376763,0.0,0,6,0.500000,0.000000
2430230,999976718847540977,2407040447,4,8005332.0,7.057083,49.244018,0.0,0,6,0.666667,1.146766
2430231,999976718847540977,2407040447,5,8005044.0,8.675741,49.403779,0.0,0,6,0.833333,0.000000


In [44]:
# Calculates for every stop at a specific station the expected average delay for this station and compares it with actual via MSE
from sklearn.metrics import mean_squared_error

# Ensure there are no NaN values in the columns (optional, depending on your data)
baseline_test_data = test_data.dropna(subset=['arrival_delay_prediction', 'arrival_delay_m'])

# Calculate MSE
mse = mean_squared_error(
    baseline_test_data['arrival_delay_m'],
    baseline_test_data['arrival_delay_prediction']
)

print(f"Mean Squared Error: {mse}")


Mean Squared Error: 7.248143130417075


In [45]:
# Predicts for every stop at a specific station the global average delay and compares it with actual via MSE
# Average delay in train dataset
test_data["avg_delay"] = 0.8
# Ensure there are no NaN values in the columns (optional, depending on your data)
baseline_test_data = test_data.dropna(subset=['avg_delay', 'arrival_delay_m'])

# Calculate MSE
mse = mean_squared_error(
    baseline_test_data['arrival_delay_m'],
    baseline_test_data['avg_delay']
)

print(f"Mean Squared Error: {mse}")


Mean Squared Error: 7.851329627694372
