## Data Loading
This section covers the initial loading of the dataset and importing the necessary libraries.

In [8]:
import pandas as pd

In [15]:
train_rides_df = pd.read_csv('DBtrainrides.csv')

In [16]:
# Split the ID column into 'ID_Base' and 'ID_Number'
train_rides_df[['ID_Base', 'ID_Timestamp', 'ID_Stop_Number']] = train_rides_df['ID'].str.rsplit('-', n=2, expand=True)

# Convert 'ID_Number' to numeric
train_rides_df['ID_Stop_Number'] = pd.to_numeric(train_rides_df['ID_Stop_Number'])

In [17]:
# Rename 'eva_nr' to 'starting_station_ibnr'
train_rides_df = train_rides_df.rename(columns={'eva_nr': 'starting_station_ibnr'})

# Drop starting station string and accompanying data 
train_rides_df.drop(['station', 'state', 'city', 'long', 'lat'], axis=1, inplace=True)

In [18]:
# TODO: Potentially Drop arrival_delay_m, departure_delay_m, arrival_delay_check, departure_delay_check

In [19]:
# Remove 'ID' column and reorder columns so 'ID_Base', 'ID_Number', 'ID_Timestamp' are at the start
new_column_order = ['ID_Base', 'ID_Stop_Number', 'ID_Timestamp'] + [col for col in train_rides_df.columns if
                                                               col not in ['ID', 'ID_Base', 'ID_Stop_Number',
                                                                           'ID_Timestamp']]
train_rides_df = train_rides_df.drop(columns=['ID'])
train_rides_df = train_rides_df[new_column_order]

In [20]:
# Display the first few rows of the train rides DataFrame
train_rides_df.head()

Unnamed: 0,ID_Base,ID_Stop_Number,ID_Timestamp,line,path,starting_station_ibnr,category,zip,arrival_plan,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check
0,1573967790757085557,14,2407072312,20,Stolberg(Rheinl)Hbf Gl.44|Eschweiler-St.Jöris|...,8000001,2,52064,2024-07-08 00:00:00,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time
1,349781417030375472,1,2407080017,18,,8000001,2,52064,,2024-07-08 00:17:00,,,0,0,,on_time,on_time
2,7157250219775883918,25,2407072120,1,Hamm(Westf)Hbf|Kamen|Kamen-Methler|Dortmund-Ku...,8000406,4,52066,2024-07-08 00:03:00,2024-07-08 00:04:00,2024-07-08 00:03:00,2024-07-08 00:04:00,0,0,,on_time,on_time
3,349781417030375472,2,2407080017,18,Aachen Hbf,8000404,5,52072,2024-07-08 00:20:00,2024-07-08 00:21:00,,,0,0,,on_time,on_time
4,1983158592123451570,3,2407080010,33,Herzogenrath|Kohlscheid,8000404,5,52072,2024-07-08 00:20:00,2024-07-08 00:21:00,2024-07-08 00:20:00,2024-07-08 00:21:00,0,0,,on_time,on_time


## Data Cleaning
In this section, we clean and preprocess the data to prepare it for analysis.

In [21]:
# Count missing values to understand data quality
train_rides_df.count()

ID_Base                  2061357
ID_Stop_Number           2061357
ID_Timestamp             2061357
line                     2061357
path                     1850002
starting_station_ibnr    2061357
category                 2061357
zip                      2061357
arrival_plan             1850002
departure_plan           2061357
arrival_change           1585727
departure_change         1721431
arrival_delay_m          2061357
departure_delay_m        2061357
info                      645341
arrival_delay_check      2061357
departure_delay_check    2061357
dtype: int64

In [22]:
# Group by 'zip' to understand distribution
zip_df = train_rides_df.groupby('zip').size().reset_index(name='count')
zip_df

Unnamed: 0,zip,count
0,1067,2458
1,1069,2045
2,1097,3305
3,1109,1800
4,1127,597
...,...,...
1646,99817,453
1647,99867,494
1648,99880,424
1649,99947,453


### Station Mapping
We create a mapping of stations to understand how often each 'eva_nr' appears in the dataset.

In [23]:
# Step 2: Create a new DataFrame with a count of how often each 'eva_nr' is mentioned
station_mapping_df = train_rides_df['starting_station_ibnr'].value_counts().reset_index()
station_mapping_df.columns = ['starting_station_ibnr', 'count']
station_mapping_df

Unnamed: 0,starting_station_ibnr,count
0,8004128,8732
1,8089047,8312
2,8000262,7814
3,8004132,7598
4,8004131,7382
...,...,...
1991,8007768,95
1992,8005644,74
1993,8005543,49
1994,8010035,5


In [24]:
# Count number of unique 'eva_nr' to understand the spread across different stations
station_mapping_df.count()

starting_station_ibnr    1996
count                    1996
dtype: int64

### Station Stops Analysis
We now perform grouping and transformation operations to extract insights from the station-related data.

In [8]:
# Fill missing values in 'path' with empty strings
train_rides_df['path'] = train_rides_df['path'].fillna('')

# Split the 'path' into individual stations by '|', creating a list in each row
train_rides_df = train_rides_df.assign(current_station=train_rides_df['path'].str.split('|'))

# Explode the 'current_station' column to create individual rows for each station
split_stations_df = train_rides_df.explode('current_station').drop(columns=['path'])

# Add a counter to keep track of the sequence of stops within each path
split_stations_df['stop_number'] = split_stations_df.groupby('ID').cumcount() + 1

# Display the result
split_stations_df.head()

Unnamed: 0,ID,line,eva_nr,category,station,state,city,zip,long,lat,...,departure_plan,arrival_change,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.7678,...,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Stolberg(Rheinl)Hbf Gl.44,1
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.7678,...,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Eschweiler-St.Jöris,2
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.7678,...,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Alsdorf Poststraße,3
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.7678,...,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Alsdorf-Mariadorf,4
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.7678,...,2024-07-08 00:01:00,2024-07-08 00:03:00,2024-07-08 00:04:00,3,3,,on_time,on_time,Alsdorf-Kellersberg,5


In [12]:
eva_df = pd.read_csv('stations.csv')
eva_df

Unnamed: 0,Station Name,IBNR
0,Aachen Hbf,8000001
1,Aachen Schanz,8070704
2,Aachen Süd(Gr),8000403
3,Aachen West,8000404
4,Aachen-Rothe Erde,8000406
...,...,...
7662,Zwingenberg(Bergstr),8006687
7663,Zwönitz,8013435
7664,Zwota,8013437
7665,Zwota-Zechenbach,8013436


In [13]:
# Entferne mögliche führende oder nachgestellte Leerzeichen und setze auf Kleinbuchstaben
split_stations_df['current_station'] = split_stations_df['current_station'].str.strip().str.lower()
eva_df['Station Name'] = eva_df['Station Name'].str.strip().str.lower()

# Updated merge using 'station' instead of 'Station Name'
station_external_eva_mapping_df = split_stations_df.merge(
    eva_df,
    how='left',
    left_on='current_station',
    right_on='Station Name'
)

In [14]:
station_external_eva_mapping_df

Unnamed: 0,ID,line,eva_nr,category,station,state,city,zip,long,lat,...,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number,Station Name,IBNR
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,2024-07-08 00:04:00,3,3,,on_time,on_time,stolberg(rheinl)hbf gl.44,1,,
1,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,2024-07-08 00:04:00,3,3,,on_time,on_time,eschweiler-st.jöris,2,eschweiler-st.jöris,8001917.0
2,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf poststraße,3,alsdorf poststraße,8000510.0
3,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf-mariadorf,4,alsdorf-mariadorf,8000527.0
4,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,2024-07-08 00:04:00,3,3,,on_time,on_time,alsdorf-kellersberg,5,alsdorf-kellersberg,8000521.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19785712,3370285438001482281-2407142234-7,8,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,2024-07-14 23:12:00,1,1,Information,on_time,on_time,lübeck hbf,6,lübeck hbf,8000237.0
19785713,-8774053210575864323-2407142305-3,80,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,2024-07-14 23:18:00,0,0,Information,on_time,on_time,bad oldesloe,1,bad oldesloe,8000023.0
19785714,-8774053210575864323-2407142305-3,80,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,2024-07-14 23:18:00,0,0,Information,on_time,on_time,reinfeld(holst),2,reinfeld(holst),8005019.0
19785715,-1537118689903044118-2407142354-1,11,8001580,4,Düsseldorf Flughafen Terminal,Nordrhein-Westfalen,Düsseldorf,40474,6.766979,51.278517,...,,0,0,Information. (Quelle: zuginfo.nrw),on_time,on_time,,1,,


In [15]:
station_external_eva_mapping_df.count()

ID                       19785717
line                     19785717
eva_nr                   19785717
category                 19785717
station                  19785717
state                    19785717
city                     19785717
zip                      19785717
long                     19785717
lat                      19785717
arrival_plan             19574362
departure_plan           19785717
arrival_change           18020530
departure_change         18039856
arrival_delay_m          19785717
departure_delay_m        19785717
info                      6933524
arrival_delay_check      19785717
departure_delay_check    19785717
current_station          19785717
stop_number              19785717
Station Name             18475441
IBNR                     18475441
dtype: int64

In [16]:
# Count where 'Station Name' is NaN and 'current_station' is not NaN
count_nan_station_name = station_external_eva_mapping_df[
    (station_external_eva_mapping_df['Station Name'].isna()) &
    (station_external_eva_mapping_df['current_station'].notna())
    ].shape[0]
count_nan_station_name

1310276

In [17]:
station_external_eva_mapping_df.columns

Index(['ID', 'line', 'eva_nr', 'category', 'station', 'state', 'city', 'zip',
       'long', 'lat', 'arrival_plan', 'departure_plan', 'arrival_change',
       'departure_change', 'arrival_delay_m', 'departure_delay_m', 'info',
       'arrival_delay_check', 'departure_delay_check', 'current_station',
       'stop_number', 'Station Name', 'IBNR'],
      dtype='object')

In [18]:
# Convert empty strings back to NaN in 'path' column
station_external_eva_mapping_df['current_station'] = station_external_eva_mapping_df['current_station'].replace('',
                                                                                                                pd.NA)

# Count rows where 'station' is NaN
nan_station_count = station_external_eva_mapping_df['current_station'].isna().sum()
print(f"Number of rows with NaN station names: {nan_station_count}")

Number of rows with NaN station names: 211355


In [19]:
# Count rows where 'station' is NaN
nan_station_count = station_external_eva_mapping_df['Station Name'].isna().sum()
print(f"Number of rows with NaN station names: {nan_station_count}")

Number of rows with NaN station names: 1310276


In [20]:
filtered_df = station_external_eva_mapping_df[station_external_eva_mapping_df['Station Name'].isna()]

In [21]:
filtered_df

Unnamed: 0,ID,line,eva_nr,category,station,state,city,zip,long,lat,...,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number,Station Name,IBNR
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,2024-07-08 00:04:00,3,3,,on_time,on_time,stolberg(rheinl)hbf gl.44,1,,
13,349781417030375472-2407080017-1,18,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,,0,0,,on_time,on_time,,1,,
14,7157250219775883918-2407072120-25,1,8000406,4,Aachen-Rothe Erde,Nordrhein-Westfalen,Aachen,52066,6.116475,50.770202,...,2024-07-08 00:04:00,0,0,,on_time,on_time,hamm(westf)hbf,1,,
47,-2100556839975301087-2407072307-13,18,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:41:00,0,0,,on_time,on_time,liège-guillemins,1,,
48,-2100556839975301087-2407072307-13,18,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:41:00,0,0,,on_time,on_time,bressoux,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19785699,-4498532330426324655-2407142201-14,RE18,8003105,5,Jaderberg,Niedersachsen,Jaderberg,26349,8.184538,53.344878,...,2024-07-14 23:56:00,0,0,Bauarbeiten,on_time,on_time,oldenburg(oldb)hbf,12,,
19785701,-5558360799253050120-2407142310-4,RE18,8003105,5,Jaderberg,Niedersachsen,Jaderberg,26349,8.184538,53.344878,...,2024-07-14 23:33:00,0,0,Bauarbeiten,on_time,on_time,wilhelmshaven,1,,
19785711,3370285438001482281-2407142234-7,8,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,2024-07-14 23:12:00,1,1,Information,on_time,on_time,lübeck-dänischburg ikea,5,,
19785715,-1537118689903044118-2407142354-1,11,8001580,4,Düsseldorf Flughafen Terminal,Nordrhein-Westfalen,Düsseldorf,40474,6.766979,51.278517,...,,0,0,Information. (Quelle: zuginfo.nrw),on_time,on_time,,1,,


In [22]:
eva_v2_df = pd.read_csv('stations_v2.csv')
eva_v2_df['Station Name'] = eva_df['Station Name'].str.strip().str.lower()
eva_v2_df

Unnamed: 0,IBNR,Station Name
0,8000001,aachen hbf
1,8000001,aachen schanz
2,8000001,aachen süd(gr)
3,8000001,aachen west
4,8000001,aachen-rothe erde
...,...,...
9128,8098553,
9129,8098555,
9130,8099503,
9131,8099506,


In [23]:
# Updated merge using 'station' instead of 'Station Name'
station_external_eva_mapping_v2_df = split_stations_df.merge(
    eva_v2_df,
    how='left',
    left_on='current_station',
    right_on='Station Name'
)

In [24]:
# Convert empty strings in the 'current_station' column to NaN values
station_external_eva_mapping_v2_df['current_station'] = station_external_eva_mapping_v2_df['current_station'].replace(
    '', pd.NA)

# Count and display the number of rows with NaN values in the 'current_station' column
nan_current_station_count = station_external_eva_mapping_v2_df['current_station'].isna().sum()
print(f"Number of rows with NaN values in 'current_station': {nan_current_station_count}")

# Count and display the number of rows with NaN values in the 'Station Name' column
nan_station_name_count = station_external_eva_mapping_v2_df['Station Name'].isna().sum()
print(f"Number of rows with NaN values in 'Station Name': {nan_station_name_count}")

Number of rows with NaN values in 'current_station': 211355
Number of rows with NaN values in 'Station Name': 1310276


In [25]:
filtered_v2_df = station_external_eva_mapping_df[
    station_external_eva_mapping_df['Station Name'].isna() & station_external_eva_mapping_df['current_station'].notna()]
filtered_v2_df

Unnamed: 0,ID,line,eva_nr,category,station,state,city,zip,long,lat,...,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number,Station Name,IBNR
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,2024-07-08 00:04:00,3,3,,on_time,on_time,stolberg(rheinl)hbf gl.44,1,,
14,7157250219775883918-2407072120-25,1,8000406,4,Aachen-Rothe Erde,Nordrhein-Westfalen,Aachen,52066,6.116475,50.770202,...,2024-07-08 00:04:00,0,0,,on_time,on_time,hamm(westf)hbf,1,,
47,-2100556839975301087-2407072307-13,18,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:41:00,0,0,,on_time,on_time,liège-guillemins,1,,
48,-2100556839975301087-2407072307-13,18,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:41:00,0,0,,on_time,on_time,bressoux,2,,
49,-2100556839975301087-2407072307-13,18,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:41:00,0,0,,on_time,on_time,vise,3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19785381,5391302266099657590-2407142320-8,68,8006283,5,Weinheim-Sulzbach,Baden-Württemberg,Weinheim,69469,8.656360,49.575800,...,2024-07-14 23:51:00,0,1,Information,on_time,on_time,neu-edingen/friedrichsfeld,3,,
19785385,5391302266099657590-2407142320-8,68,8006283,5,Weinheim-Sulzbach,Baden-Württemberg,Weinheim,69469,8.656360,49.575800,...,2024-07-14 23:51:00,0,1,Information,on_time,on_time,weinheim(bergstr)hbf,7,,
19785699,-4498532330426324655-2407142201-14,RE18,8003105,5,Jaderberg,Niedersachsen,Jaderberg,26349,8.184538,53.344878,...,2024-07-14 23:56:00,0,0,Bauarbeiten,on_time,on_time,oldenburg(oldb)hbf,12,,
19785701,-5558360799253050120-2407142310-4,RE18,8003105,5,Jaderberg,Niedersachsen,Jaderberg,26349,8.184538,53.344878,...,2024-07-14 23:33:00,0,0,Bauarbeiten,on_time,on_time,wilhelmshaven,1,,


In [40]:
station_external_eva_mapping_v2_df

Unnamed: 0,ID,line,eva_nr,category,station,state,city,zip,long,lat,...,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number,IBNR,Station Name,ID_Base,ID_Number
0,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,3,,on_time,on_time,stolberg(rheinl)hbf gl.44,1,,,1573967790757085557-2407072312,14
1,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,3,,on_time,on_time,eschweiler-st.jöris,2,8001854.0,eschweiler-st.jöris,1573967790757085557-2407072312,14
2,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,3,,on_time,on_time,alsdorf poststraße,3,8000047.0,alsdorf poststraße,1573967790757085557-2407072312,14
3,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,3,,on_time,on_time,alsdorf-mariadorf,4,8000049.0,alsdorf-mariadorf,1573967790757085557-2407072312,14
4,1573967790757085557-2407072312-14,20,8000001,2,Aachen Hbf,Nordrhein-Westfalen,Aachen,52064,6.091499,50.767800,...,3,,on_time,on_time,alsdorf-kellersberg,5,8000049.0,alsdorf-kellersberg,1573967790757085557-2407072312,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19785712,3370285438001482281-2407142234-7,8,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,1,Information,on_time,on_time,lübeck hbf,6,8004905.0,lübeck hbf,3370285438001482281-2407142234,7
19785713,-8774053210575864323-2407142305-3,80,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,0,Information,on_time,on_time,bad oldesloe,1,8000218.0,bad oldesloe,-8774053210575864323-2407142305,3
19785714,-8774053210575864323-2407142305-3,80,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,0,Information,on_time,on_time,reinfeld(holst),2,8007335.0,reinfeld(holst),-8774053210575864323-2407142305,3
19785715,-1537118689903044118-2407142354-1,11,8001580,4,Düsseldorf Flughafen Terminal,Nordrhein-Westfalen,Düsseldorf,40474,6.766979,51.278517,...,0,Information. (Quelle: zuginfo.nrw),on_time,on_time,,1,,,-1537118689903044118-2407142354,1


In [27]:
# station_external_eva_mapping_v2_df.head(50)

In [28]:
station_external_eva_mapping_v2_df.count()

ID                       19785717
line                     19785717
eva_nr                   19785717
category                 19785717
station                  19785717
state                    19785717
city                     19785717
zip                      19785717
long                     19785717
lat                      19785717
arrival_plan             19574362
departure_plan           19785717
arrival_change           18020530
departure_change         18039856
arrival_delay_m          19785717
departure_delay_m        19785717
info                      6933524
arrival_delay_check      19785717
departure_delay_check    19785717
current_station          19574362
stop_number              19785717
IBNR                     18475441
Station Name             18475441
dtype: int64

In [29]:
filtered_v2_df.count()

ID                       1098921
line                     1098921
eva_nr                   1098921
category                 1098921
station                  1098921
state                    1098921
city                     1098921
zip                      1098921
long                     1098921
lat                      1098921
arrival_plan             1098921
departure_plan           1098921
arrival_change            983602
departure_change          963084
arrival_delay_m          1098921
departure_delay_m        1098921
info                      274345
arrival_delay_check      1098921
departure_delay_check    1098921
current_station          1098921
stop_number              1098921
Station Name                   0
IBNR                           0
dtype: int64

In [39]:
# Split the ID column into 'ID_Base' and 'ID_Number'
station_external_eva_mapping_v2_df[['ID_Base', 'ID_Number']] = station_external_eva_mapping_v2_df['ID'].str.rsplit('-',
                                                                                                                   n=1,
                                                                                                                   expand=True)

# Convert 'ID_Number' to numeric
station_external_eva_mapping_v2_df['ID_Number'] = pd.to_numeric(station_external_eva_mapping_v2_df['ID_Number'])

In [37]:
# Get the indices of the rows with the highest 'ID_Number' for each 'ID_Base'
idx_max_id_number = station_external_eva_mapping_v2_df.groupby('ID_Base')['ID_Number'].idxmax()

In [38]:
# Step 1: Identify all IDs with at least one missing IBNR
ids_with_missing_ibnr = station_external_eva_mapping_v2_df[
    station_external_eva_mapping_v2_df['IBNR'].isna()
]['ID'].unique()

# Step 2: Filter out all rows belonging to those IDs
filtered_routes_df = station_external_eva_mapping_v2_df[
    ~station_external_eva_mapping_v2_df['ID'].isin(ids_with_missing_ibnr)
]

# Get the indices of the rows with the highest 'ID_Number' for each 'ID_Base'
idx_max_id_number = filtered_routes_df.groupby('ID_Base')['ID_Number'].idxmax()

# Use the indices to filter the original DataFrame
filtered_routes_df = filtered_routes_df.loc[idx_max_id_number]

Unnamed: 0,ID,line,eva_nr,category,station,state,city,zip,long,lat,...,departure_change,arrival_delay_m,departure_delay_m,info,arrival_delay_check,departure_delay_check,current_station,stop_number,IBNR,Station Name
38,349781417030375472-2407080017-2,18,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,,0,0,,on_time,on_time,aachen hbf,1,8000001.0,aachen hbf
39,1983158592123451570-2407080010-3,33,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:21:00,0,0,,on_time,on_time,herzogenrath,1,8003378.0,herzogenrath
40,1983158592123451570-2407080010-3,33,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:21:00,0,0,,on_time,on_time,kohlscheid,2,8004167.0,kohlscheid
41,-5293934437045765939-2407080023-2,4,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,2024-07-08 00:31:00,0,0,Bauarbeiten. (Quelle: zuginfo.nrw),on_time,on_time,herzogenrath,1,8003378.0,herzogenrath
42,6845762881043426854-2407072357-6,RB33,8000404,5,Aachen West,Nordrhein-Westfalen,Aachen,52072,6.070715,50.780360,...,,0,0,,on_time,on_time,lindern,1,8004765.0,lindern
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19785704,-3877986638624297828-2407142237-4,S9,8002795,5,Herten (Westf),Nordrhein-Westfalen,Herten,45699,7.139053,51.597508,...,,0,0,,on_time,on_time,bottrop hbf,1,8000746.0,bottrop hbf
19785705,-3877986638624297828-2407142237-4,S9,8002795,5,Herten (Westf),Nordrhein-Westfalen,Herten,45699,7.139053,51.597508,...,,0,0,,on_time,on_time,bottrop-boy,2,8000749.0,bottrop-boy
19785706,-3877986638624297828-2407142237-4,S9,8002795,5,Herten (Westf),Nordrhein-Westfalen,Herten,45699,7.139053,51.597508,...,,0,0,,on_time,on_time,gladbeck west,3,8002461.0,gladbeck west
19785713,-8774053210575864323-2407142305-3,80,8003775,5,Lübeck-Moisling,Schleswig-Holstein,Lübeck,23560,10.629500,53.836800,...,2024-07-14 23:18:00,0,0,Information,on_time,on_time,bad oldesloe,1,8000218.0,bad oldesloe


In [32]:
# # Step 1: Identify all IDs with at least one missing IBNR
# ids_with_missing_ibnr = station_external_eva_mapping_v2_df[
#     station_external_eva_mapping_v2_df['IBNR'].isna()
# ]['ID'].unique()
#
# # Step 2: Filter out all rows belonging to those IDs
# filtered_routes_df = station_external_eva_mapping_v2_df[
#     ~station_external_eva_mapping_v2_df['ID'].isin(ids_with_missing_ibnr)
# ].copy()  # Use `.copy()` to create an explicit copy
#
# # Split the ID column into 'ID_Base' and 'ID_Number'
# filtered_routes_df[['ID_Base', 'ID_Number']] = filtered_routes_df['ID'].str.rsplit('-', n=1, expand=True)
#
# # Convert 'ID_Number' to numeric for comparison
# filtered_routes_df['ID_Number'] = pd.to_numeric(filtered_routes_df['ID_Number'])
#
# # Sort the DataFrame by 'ID_Base' and 'ID_Number' to ensure correct ordering
# filtered_routes_df = filtered_routes_df.sort_values(by=['ID_Base', 'ID_Number'], ascending=[True, False])
#
# # Get the indices of the rows with the highest 'ID_Number' for each 'ID_Base'
# idx_max_id_number = filtered_routes_df.groupby('ID_Base')['ID_Number'].idxmax()
#
# # Use the indices to filter the original DataFrame
# filtered_routes_df = filtered_routes_df.loc[idx_max_id_number]

In [33]:
filtered_routes_df.head(1000)