In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw train-ride records.
# NOTE(review): 'zip' is left to dtype inference and appears as 4-digit values (e.g. 1067) in the
# zip distribution shown further down, so leading zeros were presumably lost on load — confirm
# whether this column should be read as a string instead.
train_rides_df = pd.read_csv('DBtrainrides.csv')

In [3]:
# Break 'ID' apart at its last two hyphens into base / timestamp / stop number.
# Splitting from the right keeps a leading '-' (negative base value) inside 'ID_Base'.
id_parts = train_rides_df['ID'].str.rsplit('-', n=2, expand=True)
train_rides_df['ID_Base'] = id_parts[0]
train_rides_df['ID_Timestamp'] = id_parts[1]
# Only the stop number is converted to a numeric dtype; the other two parts stay strings
train_rides_df['ID_Stop_Number'] = pd.to_numeric(id_parts[2])
del id_parts

In [4]:
# Remove the original 'ID' column and move the three ID parts to the front,
# keeping every other column in its existing relative order
id_columns = ['ID_Base', 'ID_Timestamp', 'ID_Stop_Number']
train_rides_df = train_rides_df.drop(columns=['ID'])
new_column_order = id_columns + [col for col in train_rides_df.columns if col not in id_columns]
train_rides_df = train_rides_df[new_column_order]

In [5]:
# Keep an independent snapshot of the full frame before columns are dropped from train_rides_df below;
# the snapshot is used later for the 'canceled' flag and the per-stop timing/position columns.
train_rides_df_copy = train_rides_df.copy()

In [6]:
# Rename 'eva_nr' to 'starting_station_IBNR' and keep only the ID parts, line, path, station number and zip
columns_not_needed = [
    'station', 'state', 'city', 'long', 'lat', 'category',
    'arrival_plan', 'departure_plan', 'arrival_change', 'departure_change',
    'arrival_delay_m', 'departure_delay_m', 'info',
    'arrival_delay_check', 'departure_delay_check',
]
train_rides_df = (
    train_rides_df
    .rename(columns={'eva_nr': 'starting_station_IBNR'})
    .drop(columns=columns_not_needed)
)


In [7]:
# Display the first few rows of the reduced train rides DataFrame to verify the rename and column drop
train_rides_df.head()

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,path,starting_station_IBNR,zip
0,1573967790757085557,2407072312,14,20,Stolberg(Rheinl)Hbf Gl.44|Eschweiler-St.Jöris|...,8000001,52064
1,349781417030375472,2407080017,1,18,,8000001,52064
2,7157250219775883918,2407072120,25,1,Hamm(Westf)Hbf|Kamen|Kamen-Methler|Dortmund-Ku...,8000406,52066
3,349781417030375472,2407080017,2,18,Aachen Hbf,8000404,52072
4,1983158592123451570,2407080010,3,33,Herzogenrath|Kohlscheid,8000404,52072


In [8]:
# Count non-null values per column; a count below the row total indicates missing values in that column
train_rides_df.count()

ID_Base                  2061357
ID_Timestamp             2061357
ID_Stop_Number           2061357
line                     2061357
path                     1850002
starting_station_IBNR    2061357
zip                      2061357
dtype: int64

In [9]:
# Number of rows per 'zip' value, as a two-column frame ('zip', 'count') ordered by zip
rows_per_zip = train_rides_df.groupby('zip').size()
zip_df = rows_per_zip.reset_index(name='count')
zip_df

Unnamed: 0,zip,count
0,1067,2458
1,1069,2045
2,1097,3305
3,1109,1800
4,1127,597
...,...,...
1646,99817,453
1647,99867,494
1648,99880,424
1649,99947,453


In [10]:
# Frequency table: how many rows reference each 'starting_station_IBNR', most frequent first
ibnr_frequencies = train_rides_df['starting_station_IBNR'].value_counts()
station_mapping_df = ibnr_frequencies.reset_index()
# Name the columns explicitly so the result does not depend on the names reset_index() produces
station_mapping_df.columns = ['starting_station_IBNR', 'count']
station_mapping_df

Unnamed: 0,starting_station_IBNR,count
0,8004128,8732
1,8089047,8312
2,8000262,7814
3,8004132,7598
4,8004131,7382
...,...,...
1991,8007768,95
1992,8005644,74
1993,8005543,49
1994,8010035,5


In [11]:
# Count the distinct 'starting_station_IBNR' values in the frequency table (i.e. the number of different stations)
station_mapping_df['starting_station_IBNR'].nunique()

1996

In [12]:
# For every ride (ID_Base + ID_Timestamp) keep only the row with the highest 'ID_Stop_Number'
ride_keys = ['ID_Base', 'ID_Timestamp']
last_stop_labels = train_rides_df.groupby(ride_keys)['ID_Stop_Number'].idxmax()
max_stop_numbers_df = train_rides_df.loc[last_stop_labels]
# Guard against more than one row per ride, then order by station and ride
max_stop_numbers_df = max_stop_numbers_df.drop_duplicates(subset=ride_keys, keep='first')
max_stop_numbers_df = max_stop_numbers_df.sort_values(by=['starting_station_IBNR'] + ride_keys)

max_stop_numbers_df.head()

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,path,starting_station_IBNR,zip
286645,-2065137557584893414,2407082237,1,29,,8000001,52064
595517,-2065137557584893414,2407092237,1,29,,8000001,52064
906467,-2065137557584893414,2407102237,1,29,,8000001,52064
1216357,-2065137557584893414,2407112237,1,29,,8000001,52064
1523701,-2065137557584893414,2407122237,1,29,,8000001,52064


In [13]:
# Count non-null entries in each column of max_stop_numbers_df (one row per ride);
# 'path' is the only column expected to show fewer entries than the others
max_stop_numbers_df.count()

ID_Base                  257818
ID_Timestamp             257818
ID_Stop_Number           257818
line                     257818
path                     217022
starting_station_IBNR    257818
zip                      257818
dtype: int64

In [14]:
# Turn the '|'-separated 'path' of each ride into one row per station and number the rows per ride
stations_per_ride = max_stop_numbers_df['path'].str.split('|')
max_stop_numbers_df = max_stop_numbers_df.assign(last_station=stations_per_ride)

# One row per listed station; rows with a missing path stay as a single row with a missing station
exploded_stations_df = max_stop_numbers_df.explode('last_station').drop(columns=['path'])
# Position of the station within its ride, starting at 1
exploded_stations_df = exploded_stations_df.assign(
    stop_number=exploded_stations_df.groupby(['ID_Base', 'ID_Timestamp']).cumcount() + 1
)
exploded_stations_df = exploded_stations_df.sort_values(
    by=['starting_station_IBNR', 'ID_Base', 'ID_Timestamp', 'stop_number']
)
exploded_stations_df = exploded_stations_df.reset_index(drop=True)

# Preview the result for verification
exploded_stations_df.head()

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,starting_station_IBNR,zip,last_station,stop_number
0,-2065137557584893414,2407082237,1,29,8000001,52064,,1
1,-2065137557584893414,2407092237,1,29,8000001,52064,,1
2,-2065137557584893414,2407102237,1,29,8000001,52064,,1
3,-2065137557584893414,2407112237,1,29,8000001,52064,,1
4,-2065137557584893414,2407122237,1,29,8000001,52064,,1


In [15]:
# Display a preview of exploded_stations_df
# NOTE(review): this repeats the preview already shown at the end of the previous cell
exploded_stations_df.head()

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,starting_station_IBNR,zip,last_station,stop_number
0,-2065137557584893414,2407082237,1,29,8000001,52064,,1
1,-2065137557584893414,2407092237,1,29,8000001,52064,,1
2,-2065137557584893414,2407102237,1,29,8000001,52064,,1
3,-2065137557584893414,2407112237,1,29,8000001,52064,,1
4,-2065137557584893414,2407122237,1,29,8000001,52064,,1


In [16]:
# Count non-null entries in each column of exploded_stations_df;
# the gap between 'last_station' and the other columns is the number of rows without a station name
exploded_stations_df.count()

ID_Base                  2785189
ID_Timestamp             2785189
ID_Stop_Number           2785189
line                     2785189
starting_station_IBNR    2785189
zip                      2785189
last_station             2744393
stop_number              2785189
dtype: int64

In [17]:
# Load the station-name -> IBNR index and normalise the join keys on both sides
# (surrounding whitespace removed, lower-cased) so the name-based merge is not defeated by formatting
ibnr_index_df = pd.read_csv('ibnr_stations_index.csv')
for frame, name_column in ((exploded_stations_df, 'last_station'), (ibnr_index_df, 'Station Name')):
    frame[name_column] = frame[name_column].str.strip().str.lower()

In [18]:
# Display the normalised index; one IBNR can be listed under several station names
ibnr_index_df

Unnamed: 0,IBNR,Station Name
0,8000001,aachen hbf
1,8000001,ac
2,8000001,aken c
3,8000001,aquisgrana
4,8000001,aix-la-chapelle
...,...,...
9128,8098553,hamburg-altona(s)
9129,8098555,bensersiel ne
9130,8099503,hildesheim gbf
9131,8099506,stolberg(rheinl)gbf


In [19]:
# Look up the IBNR of every station name via a left join on the normalised name,
# so rows without a matching name are kept with a missing 'IBNR'
exploded_stations_df_with_ibnr_df = exploded_stations_df.merge(
    ibnr_index_df,
    how='left',
    left_on='last_station',
    right_on='Station Name',
)
# 'Station Name' duplicates 'last_station' after the join; 'ID_Stop_Number' is not needed downstream
exploded_stations_df_with_ibnr_df = exploded_stations_df_with_ibnr_df.drop(
    columns=['ID_Stop_Number', 'Station Name']
)

In [20]:
# Display the first rows of exploded_stations_df_with_ibnr_df to check the newly added 'IBNR' column
exploded_stations_df_with_ibnr_df.head()

Unnamed: 0,ID_Base,ID_Timestamp,line,starting_station_IBNR,zip,last_station,stop_number,IBNR
0,-2065137557584893414,2407082237,29,8000001,52064,,1,
1,-2065137557584893414,2407092237,29,8000001,52064,,1,
2,-2065137557584893414,2407102237,29,8000001,52064,,1,
3,-2065137557584893414,2407112237,29,8000001,52064,,1,
4,-2065137557584893414,2407122237,29,8000001,52064,,1,


In [21]:
# Count non-null values in exploded_stations_df_with_ibnr_df;
# comparing 'IBNR' with 'last_station' shows how many station names found no IBNR match
exploded_stations_df_with_ibnr_df.count()

ID_Base                  2785189
ID_Timestamp             2785189
line                     2785189
starting_station_IBNR    2785189
zip                      2785189
last_station             2744393
stop_number              2785189
IBNR                     2588987
dtype: int64

In [22]:
# Treat empty station names as missing values in 'last_station'
exploded_stations_df_with_ibnr_df['last_station'] = (
    exploded_stations_df_with_ibnr_df['last_station'].replace({'': pd.NA})
)

# Count the rows whose 'last_station' is missing
missing_station_mask = exploded_stations_df_with_ibnr_df['last_station'].isna()
nan_station_count = missing_station_mask.sum()
print(f"Number of rows with NaN station names: {nan_station_count}")

Number of rows with NaN station names: 40796


In [23]:
# Keep the rows for which no IBNR could be attached and display the first 10 of them.
# The filter tests the 'IBNR' column itself, so it covers station names that found no match in
# ibnr_index_df as well as the rows with a missing 'last_station' shown in the preview.
exploded_stations_df_with_no_ibnr_df = exploded_stations_df_with_ibnr_df[
    exploded_stations_df_with_ibnr_df['IBNR'].isna()]
exploded_stations_df_with_no_ibnr_df.head(10)

Unnamed: 0,ID_Base,ID_Timestamp,line,starting_station_IBNR,zip,last_station,stop_number,IBNR
0,-2065137557584893414,2407082237,29,8000001,52064,,1,
1,-2065137557584893414,2407092237,29,8000001,52064,,1,
2,-2065137557584893414,2407102237,29,8000001,52064,,1,
3,-2065137557584893414,2407112237,29,8000001,52064,,1,
4,-2065137557584893414,2407122237,29,8000001,52064,,1,
5,-2065137557584893414,2407132237,29,8000001,52064,,1,
6,-2065137557584893414,2407142237,29,8000001,52064,,1,
7,-3561454673811003901,2407082137,29,8000001,52064,,1,
8,-3561454673811003901,2407092137,29,8000001,52064,,1,
9,-3561454673811003901,2407102137,29,8000001,52064,,1,


In [24]:
# Add a boolean 'canceled' column that is True exactly where 'path' is missing
# NOTE(review): the number of True values is later printed as "Number of first stations" —
# confirm whether a missing path means a canceled stop or the first stop of a ride
train_rides_df_copy['canceled'] = train_rides_df_copy['path'].isnull()

In [25]:
# Show 'path' next to the derived 'canceled' flag to verify that the flag is True where the path is missing
train_rides_df_copy[['path', 'canceled']]

Unnamed: 0,path,canceled
0,Stolberg(Rheinl)Hbf Gl.44|Eschweiler-St.Jöris|...,False
1,,True
2,Hamm(Westf)Hbf|Kamen|Kamen-Methler|Dortmund-Ku...,False
3,Aachen Hbf,False
4,Herzogenrath|Kohlscheid,False
...,...,...
2061352,Bottrop Hbf|Bottrop-Boy|Gladbeck West,False
2061353,Lübeck-Travemünde Strand|Lübeck-Travemünde Haf...,False
2061354,Bad Oldesloe|Reinfeld(Holst),False
2061355,,True


In [26]:
# Reduce train_rides_df_copy to the ID parts plus position, timing, delay, info and 'canceled' columns
copy_columns_not_needed = [
    'station', 'zip', 'state', 'city', 'category',
    'line', 'path', 'eva_nr',
    'arrival_delay_check', 'departure_delay_check',
]
train_rides_df_copy = train_rides_df_copy.drop(columns=copy_columns_not_needed)
train_rides_df_copy.head()

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,long,lat,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,canceled
0,1573967790757085557,2407072312,14,6.091499,50.7678,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,False
1,349781417030375472,2407080017,1,6.091499,50.7678,,2024-07-08 00:17:00,,,0,0,,True
2,7157250219775883918,2407072120,25,6.116475,50.770202,2024-07-08 00:03:00,2024-07-08 00:04:00,2024-07-08 00:03:00,2024-07-08 00:04:00,0,0,,False
3,349781417030375472,2407080017,2,6.070715,50.78036,2024-07-08 00:20:00,2024-07-08 00:21:00,,,0,0,,False
4,1983158592123451570,2407080010,3,6.070715,50.78036,2024-07-08 00:20:00,2024-07-08 00:21:00,2024-07-08 00:20:00,2024-07-08 00:21:00,0,0,,False


In [27]:
# Attach the per-stop columns of train_rides_df_copy: a path position ('stop_number') is matched to the
# row of the same ride carrying the same 'ID_Stop_Number'. Positions without a match keep NaN (left join).
# NOTE(review): the non-null count of 'ID_Base' grows from 2,785,189 before this merge to 2,811,480 after it,
# so some (ID_Base, ID_Timestamp, ID_Stop_Number) keys apparently occur more than once on the right side —
# confirm that this row duplication is intended.
merge_ride_keys = ['ID_Base', 'ID_Timestamp']
exploded_stations_df_with_ibnr_time_df = pd.merge(
    exploded_stations_df_with_ibnr_df,
    train_rides_df_copy,
    how='left',
    left_on=merge_ride_keys + ['stop_number'],
    right_on=merge_ride_keys + ['ID_Stop_Number'],
)
exploded_stations_df_with_ibnr_time_df = exploded_stations_df_with_ibnr_time_df.sort_values(
    by=['starting_station_IBNR'] + merge_ride_keys
)

In [28]:
# Drop the right-hand merge key, which duplicates 'stop_number' for matched rows.
# errors='ignore' keeps this cell re-runnable: on a second run the column is already gone.
exploded_stations_df_with_ibnr_time_df = exploded_stations_df_with_ibnr_time_df.drop(
    columns=['ID_Stop_Number'], errors='ignore'
)

# Place 'stop_number' directly after 'ID_Timestamp'
columns = [col for col in exploded_stations_df_with_ibnr_time_df.columns if col != 'stop_number']
columns.insert(columns.index('ID_Timestamp') + 1, 'stop_number')
exploded_stations_df_with_ibnr_time_df = exploded_stations_df_with_ibnr_time_df[columns]

# Sort by station and ride, and by 'stop_number' within a ride so the stop order is explicit
# rather than a by-product of the incoming row order
exploded_stations_df_with_ibnr_time_df = exploded_stations_df_with_ibnr_time_df.sort_values(
    by=['starting_station_IBNR', 'ID_Base', 'ID_Timestamp', 'stop_number']
)

# Display first 10 rows
exploded_stations_df_with_ibnr_time_df.head(10)

Unnamed: 0,ID_Base,ID_Timestamp,stop_number,line,starting_station_IBNR,zip,last_station,IBNR,long,lat,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,canceled
0,-2065137557584893414,2407082237,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-08 22:37:00,,,0.0,0.0,,True
1,-2065137557584893414,2407092237,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-09 22:37:00,,,0.0,0.0,,True
2,-2065137557584893414,2407102237,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-10 22:37:00,,,0.0,0.0,,True
3,-2065137557584893414,2407112237,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-11 22:37:00,,,0.0,0.0,,True
4,-2065137557584893414,2407122237,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-12 22:37:00,,,0.0,0.0,,True
5,-2065137557584893414,2407132237,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-13 22:37:00,,,0.0,0.0,,True
6,-2065137557584893414,2407142237,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-14 22:37:00,,,0.0,0.0,,True
7,-3561454673811003901,2407082137,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-08 21:37:00,,,0.0,0.0,,True
8,-3561454673811003901,2407092137,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-09 21:37:00,,,0.0,0.0,,True
9,-3561454673811003901,2407102137,1,29,8000001,52064,,,6.091499,50.7678,,2024-07-10 21:37:00,,,0.0,0.0,,True


In [29]:
# Count non-null entries per column after attaching the per-stop columns;
# the columns coming from train_rides_df_copy are missing wherever the left join found no matching stop
exploded_stations_df_with_ibnr_time_df.count()

ID_Base                  2811480
ID_Timestamp             2811480
stop_number              2811480
line                     2811480
starting_station_IBNR    2811480
zip                      2811480
last_station             2770684
IBNR                     2614240
long                     1839163
lat                      1839163
arrival_plan             1627808
departure_plan           1839163
arrival_change           1388311
departure_change         1529307
arrival_delay_m          1839163
departure_delay_m        1839163
info                      591705
canceled                 1839163
dtype: int64

In [30]:
# Display the merged frame (the rendering is truncated to its first and last rows)
exploded_stations_df_with_ibnr_time_df

Unnamed: 0,ID_Base,ID_Timestamp,stop_number,line,starting_station_IBNR,zip,last_station,IBNR,long,lat,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,canceled
0,-2065137557584893414,2407082237,1,29,8000001,52064,,,6.091499,50.767800,,2024-07-08 22:37:00,,,0.0,0.0,,True
1,-2065137557584893414,2407092237,1,29,8000001,52064,,,6.091499,50.767800,,2024-07-09 22:37:00,,,0.0,0.0,,True
2,-2065137557584893414,2407102237,1,29,8000001,52064,,,6.091499,50.767800,,2024-07-10 22:37:00,,,0.0,0.0,,True
3,-2065137557584893414,2407112237,1,29,8000001,52064,,,6.091499,50.767800,,2024-07-11 22:37:00,,,0.0,0.0,,True
4,-2065137557584893414,2407122237,1,29,8000001,52064,,,6.091499,50.767800,,2024-07-12 22:37:00,,,0.0,0.0,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2811475,6234297817509604666,2407112012,7,70,8098360,68642,stockstadt(rhein),8005740.0,,,,,,,,,,
2811476,6234297817509604666,2407112012,8,70,8098360,68642,biebesheim,8000951.0,8.473978,49.781977,2024-07-11 20:45:00,2024-07-11 20:45:00,2024-07-11 20:45:00,2024-07-11 20:45:00,0.0,0.0,Information,False
2811477,6234297817509604666,2407112012,9,70,8098360,68642,gernsheim,8002249.0,,,,,,,,,,
2811478,6234297817509604666,2407112012,10,70,8098360,68642,groß-rohrheim,,,,,,,,,,,


In [31]:
# Count the rows whose 'canceled' flag is True.
# After the left merge the column holds True/False plus missing values for unmatched rows, so the
# comparison against True is explicit; missing values compare as False and are not counted.
# A single vectorised count replaces the earlier duplicated computation, whose first result was
# overwritten immediately and whose second built a full filtered copy of the frame just to take its length.
count_first_stations = int(exploded_stations_df_with_ibnr_time_df['canceled'].eq(True).sum())

print(f"Number of first stations: {count_first_stations}")

Number of first stations: 211355


In [32]:
#TODO Merge with others