In [1]:
import pandas as pd

In [2]:
# Load the train-ride records from the CSV file into a DataFrame.
train_rides_df = pd.read_csv(filepath_or_buffer='DBtrainrides.csv')

In [3]:
# Break 'ID' apart at its last two hyphens into 'ID_Base', 'ID_Timestamp' and
# 'ID_Stop_Number'. Splitting from the right (rsplit, n=2) keeps any hyphen
# further left -- such as the leading minus of a negative base -- inside 'ID_Base'.
id_parts = train_rides_df['ID'].str.rsplit('-', n=2, expand=True)
train_rides_df[['ID_Base', 'ID_Timestamp', 'ID_Stop_Number']] = id_parts

# The split yields strings; turn 'ID_Stop_Number' into a numeric column
stop_number_text = train_rides_df['ID_Stop_Number']
train_rides_df['ID_Stop_Number'] = pd.to_numeric(stop_number_text)

In [4]:
# Keep 'eva_nr' under the name 'starting_station_IBNR' and discard the starting
# station's name string together with its accompanying columns.
train_rides_df = (
    train_rides_df
    .rename(columns={'eva_nr': 'starting_station_IBNR'})
    .drop(columns=['station', 'state', 'city', 'long', 'lat'])
)

In [5]:
# TODO: Potentially Drop arrival_delay_m, departure_delay_m, arrival_delay_check, departure_delay_check

In [6]:
# Move 'ID_Base', 'ID_Timestamp' and 'ID_Stop_Number' to the front and discard
# the original 'ID' column.
# Selecting `new_column_order` already leaves 'ID' out, so no separate drop() is
# needed; omitting it also keeps this cell re-runnable, because
# drop(columns=['ID']) raises KeyError once 'ID' is gone.
leading_id_columns = ['ID_Base', 'ID_Timestamp', 'ID_Stop_Number']
excluded_columns = {'ID', *leading_id_columns}
new_column_order = leading_id_columns + [
    col for col in train_rides_df.columns if col not in excluded_columns
]
train_rides_df = train_rides_df[new_column_order]

In [7]:
# Preview the first five rows of the reshaped train rides DataFrame
train_rides_df.head(n=5)

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,path,starting_station_IBNR,category,zip,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check
0,1573967790757085557,2407072312,14,20,Stolberg(Rheinl)Hbf Gl.44|Eschweiler-St.Jöris|...,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time
1,349781417030375472,2407080017,1,18,,8000001,2,52064,,2024-07-08 00:17:00,,,0,0,,on_time,on_time
2,7157250219775883918,2407072120,25,1,Hamm(Westf)Hbf|Kamen|Kamen-Methler|Dortmund-Ku...,8000406,4,52066,2024-07-08 00:03:00,2024-07-08 00:04:00,2024-07-08 00:03:00,2024-07-08 00:04:00,0,0,,on_time,on_time
3,349781417030375472,2407080017,2,18,Aachen Hbf,8000404,5,52072,2024-07-08 00:20:00,2024-07-08 00:21:00,,,0,0,,on_time,on_time
4,1983158592123451570,2407080010,3,33,Herzogenrath|Kohlscheid,8000404,5,52072,2024-07-08 00:20:00,2024-07-08 00:21:00,2024-07-08 00:20:00,2024-07-08 00:21:00,0,0,,on_time,on_time


In [8]:
# Count the non-null values in every column; a column whose count is below the
# total number of rows contains missing values.
train_rides_df.count(axis=0)

ID_Base                  2061357
ID_Timestamp             2061357
ID_Stop_Number           2061357
line                     2061357
path                     1850002
starting_station_IBNR    2061357
category                 2061357
zip                      2061357
arrival_plan             1850002
departure_plan           2061357
arrival_change           1585727
departure_change         1721431
arrival_delay_m          2061357
departure_delay_m        2061357
info                      645341
arrival_delay_check      2061357
departure_delay_check    2061357
dtype: int64

In [9]:
# Number of rows per 'zip' value, as a two-column frame ('zip', 'count')
rows_per_zip = train_rides_df.groupby('zip').size()
zip_df = rows_per_zip.reset_index(name='count')
zip_df

Unnamed: 0,zip,count
0,1067,2458
1,1069,2045
2,1097,3305
3,1109,1800
4,1127,597
...,...,...
1646,99817,453
1647,99867,494
1648,99880,424
1649,99947,453


### Station Mapping
We create a mapping of stations to understand how often each station ID (`starting_station_IBNR`, originally `eva_nr`) appears in the dataset.

In [10]:
# Build a frame listing every 'starting_station_IBNR' together with the number
# of rows it appears in (value_counts puts the most frequent first).
station_mapping_df = (
    train_rides_df['starting_station_IBNR']
    .value_counts()
    .rename_axis('starting_station_IBNR')
    .reset_index(name='count')
)
station_mapping_df

Unnamed: 0,starting_station_IBNR,count
0,8004128,8732
1,8089047,8312
2,8000262,7814
3,8004132,7598
4,8004131,7382
...,...,...
1991,8007768,95
1992,8005644,74
1993,8005543,49
1994,8010035,5


In [11]:
# station_mapping_df has one row per distinct 'starting_station_IBNR', so the
# per-column count equals the number of distinct stations.
station_mapping_df.count(axis=0)

starting_station_IBNR    1996
count                    1996
dtype: int64

In [12]:
# Replace missing 'path' values with '' so that every row can be split
train_rides_df['path'] = train_rides_df['path'].fillna('')

# Turn each '|'-separated path into a list of station names
# (an empty path becomes the single-element list [''])
station_lists = train_rides_df['path'].str.split('|')
train_rides_df = train_rides_df.assign(current_station=station_lists)

# One output row per listed station; the raw 'path' text is no longer needed
split_stations_df = train_rides_df.drop(columns=['path']).explode('current_station')

# Number the exploded rows 1, 2, 3, ... within each
# (ID_Base, ID_Timestamp, ID_Stop_Number) group
ride_stop_key = ['ID_Base', 'ID_Timestamp', 'ID_Stop_Number']
split_stations_df['stop_number'] = split_stations_df.groupby(ride_stop_key).cumcount().add(1)

# Show the first rows of the exploded frame
split_stations_df.head(n=5)

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,starting_station_IBNR,category,zip,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number
0,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Stolberg(Rheinl)Hbf Gl.44,1
0,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Eschweiler-St.Jöris,2
0,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Alsdorf Poststraße,3
0,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Alsdorf-Mariadorf,4
0,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Alsdorf-Kellersberg,5


In [13]:
# Load the lookup table that pairs an IBNR with a station name.
ibnr_index_df = pd.read_csv(filepath_or_buffer='ibnr_stations_index.csv')

In [14]:
# Bring both name columns to the same form (surrounding whitespace removed,
# lower case) so that they can be compared by exact string equality.
for frame, name_column in (
    (split_stations_df, 'current_station'),
    (ibnr_index_df, 'Station Name'),
):
    frame[name_column] = frame[name_column].str.strip().str.lower()

In [15]:
# Inspect the normalised IBNR lookup table (the rendered output is truncated
# by pandas for long frames). Several names can share one IBNR.
ibnr_index_df

Unnamed: 0,IBNR,Station Name
0,8000001,aachen hbf
1,8000001,ac
2,8000001,aken c
3,8000001,aquisgrana
4,8000001,aix-la-chapelle
...,...,...
9128,8098553,hamburg-altona(s)
9129,8098555,bensersiel ne
9130,8099503,hildesheim gbf
9131,8099506,stolberg(rheinl)gbf


In [16]:
# Left-join the exploded stops onto the IBNR lookup, matching 'current_station'
# against 'Station Name'. Stops without a matching name keep NaN in 'IBNR' and
# 'Station Name'.
# NOTE(review): if ibnr_index_df holds the same 'Station Name' more than once,
# this join multiplies the matching stop rows -- confirm the names are unique.
split_station_with_ibnr_df = pd.merge(
    split_stations_df,
    ibnr_index_df,
    how='left',
    left_on='current_station',
    right_on='Station Name',
)

In [17]:
# Preview the first ten rows of the merged frame
split_station_with_ibnr_df.head(n=10)

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,starting_station_IBNR,category,zip,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number,IBNR,Station Name
0,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,stolberg(rheinl)hbf gl.44,1,,
1,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,eschweiler-st.jöris,2,8001917.0,eschweiler-st.jöris
2,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf poststraße,3,8000510.0,alsdorf poststraße
3,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf-mariadorf,4,8000527.0,alsdorf-mariadorf
4,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf-kellersberg,5,8000521.0,alsdorf-kellersberg
5,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf-annapark,6,8000502.0,alsdorf-annapark
6,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf-busch,7,8000507.0,alsdorf-busch
7,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,herzogenrath-august-schmidt-platz,8,8002816.0,herzogenrath-august-schmidt-platz
8,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,herzogenrath-alt-merkstein,9,8002804.0,herzogenrath-alt-merkstein
9,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,herzogenrath,10,8002806.0,herzogenrath


In [18]:
# List the column labels of the merged frame
split_station_with_ibnr_df.columns

Index(['ID_Base', 'ID_Timestamp', 'ID_Stop_Number', 'line',
       'starting_station_IBNR', 'category', 'zip', 'arrival_plan',
       'departure_plan', 'arrival_change', 'departure_change',
       'arrival_delay_m', 'departure_delay_m', 'info', 'arrival_delay_check',
       'departure_delay_check', 'current_station', 'stop_number', 'IBNR',
       'Station Name'],
      dtype='object')

In [19]:
# Non-null values per column of the merged frame; the 'IBNR' and 'Station Name'
# counts show how many rows found a match in the lookup table.
split_station_with_ibnr_df.count(axis=0)

ID_Base                  19714107
ID_Timestamp             19714107
ID_Stop_Number           19714107
line                     19714107
starting_station_IBNR    19714107
category                 19714107
zip                      19714107
arrival_plan             19502752
departure_plan           19714107
arrival_change           17953196
departure_change         17972570
arrival_delay_m          19714107
departure_delay_m        19714107
info                      6886047
arrival_delay_check      19714107
departure_delay_check    19714107
current_station          19714107
stop_number              19714107
IBNR                     18584536
Station Name             18584536
dtype: int64

In [20]:
# Turn empty strings in 'current_station' back into missing values
station_names = split_station_with_ibnr_df['current_station']
split_station_with_ibnr_df['current_station'] = station_names.replace(to_replace='', value=pd.NA)

In [21]:
# How many rows have no 'current_station' value?
current_station_missing = split_station_with_ibnr_df['current_station'].isna()
nan_station_count = current_station_missing.sum()
print(f"Number of rows with NaN station names: {nan_station_count}")

Number of rows with NaN station names: 211355


In [22]:
# Rows that do have a 'current_station' but found no match in the lookup
# table (their 'Station Name' is NaN after the left merge)
no_lookup_match = split_station_with_ibnr_df['Station Name'].isna()
has_current_station = split_station_with_ibnr_df['current_station'].notna()
count_nan_station_name = len(split_station_with_ibnr_df[no_lookup_match & has_current_station])
print(f"Number of rows with NaN IBNR that have a current station: {count_nan_station_name}")

Number of rows with NaN IBNR that have a current station: 918216


In [23]:
# Total rows whose 'Station Name' is NaN, i.e. rows without a lookup match.
# Note: this reassigns `nan_station_count`, which previously held the
# 'current_station' NaN count.
station_name_missing = split_station_with_ibnr_df['Station Name'].isna()
nan_station_count = station_name_missing.sum()
print(f"Number of rows with NaN station names: {nan_station_count}")

Number of rows with NaN station names: 1129571


In [24]:
# Keep only the rows whose 'Station Name' is NaN (no lookup match); this also
# includes rows whose 'current_station' itself is missing.
missing_ibnr_mask = split_station_with_ibnr_df['Station Name'].isna()
split_station_with_no_ibnr_df = split_station_with_ibnr_df.loc[missing_ibnr_mask]
split_station_with_no_ibnr_df.head(n=10)

Unnamed: 0,ID_Base,ID_Timestamp,ID_Stop_Number,line,starting_station_IBNR,category,zip,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number,IBNR,Station Name
0,1573967790757085557,2407072312,14,20,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,stolberg(rheinl)hbf gl.44,1,,
13,349781417030375472,2407080017,1,18,8000001,2,52064,,2024-07-08 00:17:00,,,0,0,,on_time,on_time,,1,,
14,7157250219775883918,2407072120,25,1,8000406,4,52066,2024-07-08 00:03:00,2024-07-08 00:04:00,2024-07-08 00:03:00,2024-07-08 00:04:00,0,0,,on_time,on_time,hamm(westf)hbf,1,,
47,-2100556839975301087,2407072307,13,18,8000404,5,52072,2024-07-08 00:37:00,2024-07-08 00:41:00,2024-07-08 00:37:00,2024-07-08 00:41:00,0,0,,on_time,on_time,liège-guillemins,1,,
48,-2100556839975301087,2407072307,13,18,8000404,5,52072,2024-07-08 00:37:00,2024-07-08 00:41:00,2024-07-08 00:37:00,2024-07-08 00:41:00,0,0,,on_time,on_time,bressoux,2,,
49,-2100556839975301087,2407072307,13,18,8000404,5,52072,2024-07-08 00:37:00,2024-07-08 00:41:00,2024-07-08 00:37:00,2024-07-08 00:41:00,0,0,,on_time,on_time,vise,3,,
50,-2100556839975301087,2407072307,13,18,8000404,5,52072,2024-07-08 00:37:00,2024-07-08 00:41:00,2024-07-08 00:37:00,2024-07-08 00:41:00,0,0,,on_time,on_time,eijsden,4,,
51,-2100556839975301087,2407072307,13,18,8000404,5,52072,2024-07-08 00:37:00,2024-07-08 00:41:00,2024-07-08 00:37:00,2024-07-08 00:41:00,0,0,,on_time,on_time,maastricht randwyck,5,,
52,-2100556839975301087,2407072307,13,18,8000404,5,52072,2024-07-08 00:37:00,2024-07-08 00:41:00,2024-07-08 00:37:00,2024-07-08 00:41:00,0,0,,on_time,on_time,maastricht,6,,
53,-2100556839975301087,2407072307,13,18,8000404,5,52072,2024-07-08 00:37:00,2024-07-08 00:41:00,2024-07-08 00:37:00,2024-07-08 00:41:00,0,0,,on_time,on_time,meerssen,7,,


In [25]:
# TODO Find the latest version of the routes and remove all older versions