# Data Exploration

### Import libraries

In [1]:
import pandas as pd

### Load & combine data

In [2]:
# Read the four monthly parquet files (months 01-04 of 2025) and stack them
# into a single frame with a fresh 0..n-1 index.
parquet_paths = [f'../data/data-2025-{i:02}.parquet' for i in range(1, 5)]
df = pd.concat(map(pd.read_parquet, parquet_paths), ignore_index=True)

# Show every column when a frame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

### Display data shape

In [3]:
# (number of rows, number of columns) of the combined frame
df.shape

(7748819, 13)

### Display first 5 rows

In [4]:
# Preview the first rows of the combined frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,station,train_name,final_destination_station,delay_in_min,time,is_canceled,train_type,train_line_ride_id,train_line_station_num,arrival_planned_time,arrival_change_time,departure_planned_time,departure_change_time
0,Augsburg Hbf,NJ,Hamburg-Altona,45,2025-01-01 00:10:00,False,NJ,-8677182982400525824-2412312046,7,2024-12-31 23:23:00,2024-12-31 23:46:00,2024-12-31 23:25:00,2025-01-01 00:10:00
1,Augsburg Hbf,NJ,Amsterdam Centraal,45,2025-01-01 00:10:00,False,NJ,3272146161175325165-2412312046,7,2024-12-31 23:23:00,2024-12-31 23:46:00,2024-12-31 23:25:00,2025-01-01 00:10:00
2,Bielefeld Hbf,ICE 102,Hannover Hbf,48,2025-01-01 00:17:00,False,ICE,-9170319321262747816-2412311713,16,2024-12-31 23:28:00,2025-01-01 00:16:00,2024-12-31 23:29:00,2025-01-01 00:17:00
3,Bietigheim-Bissingen,MEX 18,Heilbronn Hbf,0,2025-01-01 00:00:00,False,MEX,-5488879328755233091-2412312233,13,2024-12-31 23:58:00,2024-12-31 23:59:00,2025-01-01 00:00:00,2025-01-01 00:00:00
4,Bietigheim-Bissingen,MEX 17a,Bietigheim-Bissingen,1,2025-01-01 00:00:00,False,MEX,-2703871813082427678-2412312323,12,2024-12-31 23:59:00,2025-01-01 00:00:00,NaT,NaT


In [5]:
# Column dtypes as loaded from the parquet files
df.dtypes

station                      string[python]
train_name                   string[python]
final_destination_station    string[python]
delay_in_min                          int32
time                         datetime64[ns]
is_canceled                         boolean
train_type                   string[python]
train_line_ride_id           string[python]
train_line_station_num                int32
arrival_planned_time         datetime64[ns]
arrival_change_time          datetime64[ns]
departure_planned_time       datetime64[ns]
departure_change_time        datetime64[ns]
dtype: object

### Create time columns for better analysis

In [6]:
# Create time columns
# `time` is already datetime64[ns] (see the dtypes output), so the .dt accessor
# is used directly; re-parsing the column with pd.to_datetime for every derived
# feature is unnecessary work on a frame of this size.

# English month name ("January" ... "December"); month_name() defaults to the
# English locale, matching the names previously produced by a manual mapping.
df["month"] = df["time"].dt.month_name()

# English weekday name ("Monday" ... "Sunday")
df["weekday"] = df["time"].dt.day_name()

# Numeric calendar / clock components
df["day"] = df["time"].dt.day
df["hour"] = df["time"].dt.hour
df["minute"] = df["time"].dt.minute

### Display one specific ride

In [7]:
# Show up to 20 rows (station stops) that share one ride id
example_ride_id = "-9170319321262747816-2501011713"
df.loc[df['train_line_ride_id'].eq(example_ride_id)].head(20)

Unnamed: 0,station,train_name,final_destination_station,delay_in_min,time,is_canceled,train_type,train_line_ride_id,train_line_station_num,arrival_planned_time,arrival_change_time,departure_planned_time,departure_change_time,month,weekday,day,hour,minute
1994,Bielefeld Hbf,ICE 102,Hannover Hbf,15,2025-01-01 23:33:00,False,ICE,-9170319321262747816-2501011713,16,2025-01-01 23:17:00,2025-01-01 23:32:00,2025-01-01 23:18:00,2025-01-01 23:33:00,January,Wednesday,1,23,33
5748,Dortmund Hbf,ICE 102,Hannover Hbf,18,2025-01-01 22:50:00,False,ICE,-9170319321262747816-2501011713,13,2025-01-01 22:30:00,2025-01-01 22:48:00,2025-01-01 22:32:00,2025-01-01 22:50:00,January,Wednesday,1,22,50
6744,Düsseldorf Hbf,ICE 102,Hannover Hbf,1,2025-01-01 21:35:00,False,ICE,-9170319321262747816-2501011713,10,2025-01-01 21:32:00,2025-01-01 21:32:00,2025-01-01 21:34:00,2025-01-01 21:35:00,January,Wednesday,1,21,35
7541,Duisburg Hbf,ICE 102,Hannover Hbf,6,2025-01-01 21:53:00,False,ICE,-9170319321262747816-2501011713,11,2025-01-01 21:45:00,2025-01-01 21:50:00,2025-01-01 21:47:00,2025-01-01 21:53:00,January,Wednesday,1,21,53
9522,Essen Hbf,ICE 102,Hannover Hbf,7,2025-01-01 22:07:00,False,ICE,-9170319321262747816-2501011713,12,2025-01-01 21:58:00,2025-01-01 22:03:00,2025-01-01 22:00:00,2025-01-01 22:07:00,January,Wednesday,1,22,7
10929,Freiburg(Breisgau) Hbf,ICE 102,Hannover Hbf,5,2025-01-01 18:00:00,False,ICE,-9170319321262747816-2501011713,3,2025-01-01 17:53:00,2025-01-01 17:58:00,2025-01-01 17:55:00,2025-01-01 18:00:00,January,Wednesday,1,18,0
14347,Hamm(Westf)Hbf,ICE 102,Hannover Hbf,18,2025-01-01 23:06:00,False,ICE,-9170319321262747816-2501011713,14,2025-01-01 22:46:00,2025-01-01 23:04:00,2025-01-01 22:48:00,2025-01-01 23:06:00,January,Wednesday,1,23,6
16253,Herford,ICE 102,Hannover Hbf,13,2025-01-01 23:41:00,False,ICE,-9170319321262747816-2501011713,17,2025-01-01 23:26:00,2025-01-01 23:40:00,2025-01-01 23:28:00,2025-01-01 23:41:00,January,Wednesday,1,23,41
18294,Karlsruhe Hbf,ICE 102,Hannover Hbf,3,2025-01-01 19:04:00,False,ICE,-9170319321262747816-2501011713,6,2025-01-01 18:58:00,2025-01-01 19:02:00,2025-01-01 19:01:00,2025-01-01 19:04:00,January,Wednesday,1,19,4
20334,Köln Hbf,ICE 102,Hannover Hbf,1,2025-01-01 21:12:00,False,ICE,-9170319321262747816-2501011713,9,2025-01-01 21:05:00,2025-01-01 21:05:00,2025-01-01 21:11:00,2025-01-01 21:12:00,January,Wednesday,1,21,12


### Add holiday column

In [11]:
# Add holiday column
# Rows whose (month, day) matches one of the dates below get holiday = 1, all
# other rows get 0. The dates are specific to 2025, the year of the loaded
# files (the Easter dates move every year).
#
# The mask is built from vectorised column comparisons: df.apply(..., axis=1)
# invokes a Python lambda once per row, which is very slow on millions of rows.
holiday_dates = [
    ("January", 1),  # New Year's Day
    ("April", 18),   # Good Friday
    ("April", 20),   # Easter Sunday
    ("April", 21),   # Easter Monday (was entered as 21 May, which is not a holiday)
    ("May", 1),      # Labour Day
]

is_holiday = pd.Series(False, index=df.index)
for holiday_month, holiday_day in holiday_dates:
    is_holiday = is_holiday | ((df["month"] == holiday_month) & (df["day"] == holiday_day))

# Keep the 1/0 integer encoding used by the rest of the notebook
df["holiday"] = is_holiday.astype("int64")

In [12]:
# Show the last rows of the frame, including the derived columns added above
df.tail()

Unnamed: 0,station,train_name,final_destination_station,delay_in_min,time,is_canceled,train_type,train_line_ride_id,train_line_station_num,arrival_planned_time,arrival_change_time,departure_planned_time,departure_change_time,month,weekday,day,hour,minute,holiday
7748814,Hamburg-Altona,S 1,Hamburg-Blankenese,0,2025-04-30 23:21:00,False,S,1196912460244153290-2504302242,17,2025-04-30 23:20:00,2025-04-30 23:20:00,2025-04-30 23:21:00,2025-04-30 23:21:00,April,Wednesday,30,23,21,0
7748815,Hamburg-Altona,S 1,Wedel(Holst),0,2025-04-30 23:51:00,False,S,2348175559750561464-2504302312,17,2025-04-30 23:50:00,2025-04-30 23:50:00,2025-04-30 23:51:00,2025-04-30 23:51:00,April,Wednesday,30,23,51,0
7748816,Hamburg-Altona,S 2,Aumühle,0,2025-04-30 23:32:00,False,S,2422774709714075071-2504302332,1,NaT,NaT,2025-04-30 23:32:00,2025-04-30 23:32:00,April,Wednesday,30,23,32,0
7748817,Hamburg-Altona,S 1,Wedel(Holst),0,2025-04-30 23:31:00,False,S,8805686558806681772-2504302248,20,2025-04-30 23:30:00,2025-04-30 23:30:00,2025-04-30 23:31:00,2025-04-30 23:31:00,April,Wednesday,30,23,31,0
7748818,Hamburg-Altona,S 2,Aumühle,0,2025-04-30 23:52:00,False,S,-8259733845561586295-2504302352,1,NaT,NaT,2025-04-30 23:52:00,2025-04-30 23:52:00,April,Wednesday,30,23,52,0


In [13]:
# Preview the first 20 rows recorded on 18 April
df.loc[df['day'].eq(18) & df['month'].eq('April')].head(20)

Unnamed: 0,station,train_name,final_destination_station,delay_in_min,time,is_canceled,train_type,train_line_ride_id,train_line_station_num,arrival_planned_time,arrival_change_time,departure_planned_time,departure_change_time,month,weekday,day,hour,minute,holiday
6876642,Aschaffenburg Hbf,ICE 20,Frankfurt(Main)Hbf,84,2025-04-18 00:29:00,False,ICE,5903101010412136106-2504171713,11,2025-04-17 23:04:00,2025-04-18 00:28:00,2025-04-17 23:05:00,2025-04-18 00:29:00,April,Friday,18,0,29,1
6876643,Aschaffenburg Hbf,RE 54,Frankfurt(Main)Hbf,40,2025-04-18 00:31:00,False,RE,9117006994590991223-2504172236,17,2025-04-17 23:50:00,2025-04-18 00:16:00,2025-04-17 23:51:00,2025-04-18 00:31:00,April,Friday,18,0,31,1
6876647,Aschaffenburg Hbf,ICE 520,Frankfurt(Main)Hbf,43,2025-04-18 00:18:00,False,ICE,8573811631662133764-2504172013,4,2025-04-17 23:34:00,2025-04-18 00:17:00,2025-04-17 23:35:00,2025-04-18 00:18:00,April,Friday,18,0,18,1
6877353,Augsburg Hbf,ICE 801,München Hbf,66,2025-04-18 00:53:00,False,ICE,7074087403146415227-2504171413,15,2025-04-17 23:46:00,2025-04-18 00:52:00,2025-04-17 23:47:00,2025-04-18 00:53:00,April,Friday,18,0,53,1
6877365,Augsburg Hbf,ICE 885,München Hbf,50,2025-04-18 00:11:00,False,ICE,3763506680802059955-2504171812,7,2025-04-17 23:19:00,2025-04-18 00:09:00,2025-04-17 23:21:00,2025-04-18 00:11:00,April,Friday,18,0,11,1
6877367,Augsburg Hbf,RE 9,Ulm Hbf,18,2025-04-18 00:09:00,False,RE,9157866427519413881-2504172258,11,2025-04-17 23:46:00,2025-04-18 00:04:00,2025-04-17 23:51:00,2025-04-18 00:09:00,April,Friday,18,0,9,1
6878702,Bietigheim-Bissingen,MEX 18,Heilbronn Hbf,0,2025-04-18 00:00:00,False,MEX,-5488879328755233091-2504172233,13,2025-04-17 23:58:00,2025-04-18 00:01:00,2025-04-18 00:00:00,2025-04-18 00:00:00,April,Friday,18,0,0,1
6879606,Braunschweig Hbf,erx RB47,Braunschweig Hbf,27,2025-04-18 00:21:00,False,erx,8128416888604096807-2504172208,16,2025-04-17 23:54:00,2025-04-18 00:21:00,NaT,NaT,April,Friday,18,0,21,1
6880198,Bremen Hbf,RE 9,Bremen Hbf,24,2025-04-18 00:08:00,False,RE,7968851056265898096-2504172227,10,2025-04-17 23:44:00,2025-04-18 00:08:00,NaT,NaT,April,Friday,18,0,8,1
6881073,Darmstadt Hbf,S 6,Darmstadt Hbf,19,2025-04-18 00:15:00,False,S,-7007015678719047037-2504172255,27,2025-04-17 23:56:00,2025-04-18 00:15:00,NaT,NaT,April,Friday,18,0,15,1
