# Data Exploration

### Import libraries

In [5]:
import pandas as pd

### Load data

In [6]:
# Render every column when a frame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

# Read the monthly parquet extract into the working frame used by all later cells.
data_path = '../data/data-2025-01.parquet'
df = pd.read_parquet(data_path)

### Display data shape

In [8]:
# (number of rows, number of columns) of the loaded frame
df.shape

(2016880, 13)

### Display first 5 rows

In [9]:
# Preview the first five rows (5 is also the pandas default for head()).
df.head(n=5)

Unnamed: 0,station,train_name,final_destination_station,delay_in_min,time,is_canceled,train_type,train_line_ride_id,train_line_station_num,arrival_planned_time,arrival_change_time,departure_planned_time,departure_change_time
0,Augsburg Hbf,NJ,Hamburg-Altona,45,2025-01-01 00:10:00,False,NJ,-8677182982400525824-2412312046,7,2024-12-31 23:23:00,2024-12-31 23:46:00,2024-12-31 23:25:00,2025-01-01 00:10:00
1,Augsburg Hbf,NJ,Amsterdam Centraal,45,2025-01-01 00:10:00,False,NJ,3272146161175325165-2412312046,7,2024-12-31 23:23:00,2024-12-31 23:46:00,2024-12-31 23:25:00,2025-01-01 00:10:00
2,Bielefeld Hbf,ICE 102,Hannover Hbf,48,2025-01-01 00:17:00,False,ICE,-9170319321262747816-2412311713,16,2024-12-31 23:28:00,2025-01-01 00:16:00,2024-12-31 23:29:00,2025-01-01 00:17:00
3,Bietigheim-Bissingen,MEX 18,Heilbronn Hbf,0,2025-01-01 00:00:00,False,MEX,-5488879328755233091-2412312233,13,2024-12-31 23:58:00,2024-12-31 23:59:00,2025-01-01 00:00:00,2025-01-01 00:00:00
4,Bietigheim-Bissingen,MEX 17a,Bietigheim-Bissingen,1,2025-01-01 00:00:00,False,MEX,-2703871813082427678-2412312323,12,2024-12-31 23:59:00,2025-01-01 00:00:00,NaT,NaT


In [10]:
# Per-column dtypes, to check how each column was typed when the file was loaded
df.dtypes

station                      string[python]
train_name                   string[python]
final_destination_station    string[python]
delay_in_min                          int32
time                         datetime64[ns]
is_canceled                         boolean
train_type                   string[python]
train_line_ride_id           string[python]
train_line_station_num                int32
arrival_planned_time         datetime64[ns]
arrival_change_time          datetime64[ns]
departure_planned_time       datetime64[ns]
departure_change_time        datetime64[ns]
dtype: object

### Create time columns for better analysis

In [11]:
# Derive calendar features from the "time" column.
#
# The timestamp is converted once and reused; the original re-ran
# pd.to_datetime on the full column for every derived feature. The call is
# kept (rather than using df["time"] directly) so the cell still works if the
# column is ever loaded as strings instead of datetime64.
event_time = pd.to_datetime(df["time"])

# English month / weekday names ("January", ..., "Monday", ...) come straight
# from the .dt accessor, replacing the hand-written number -> name tables.
df["month"] = event_time.dt.month_name()
df["weekday"] = event_time.dt.day_name()

# Numeric components of the timestamp.
df["day"] = event_time.dt.day
df["hour"] = event_time.dt.hour
df["minute"] = event_time.dt.minute

# TODO: add a column for public holidays (Feiertage)

### Display one specific ride

In [12]:
# Show up to 20 rows whose train_line_ride_id matches one specific ride.
ride_id = "-9170319321262747816-2501011713"
is_selected_ride = df['train_line_ride_id'] == ride_id
df.loc[is_selected_ride].head(20)

Unnamed: 0,station,train_name,final_destination_station,delay_in_min,time,is_canceled,train_type,train_line_ride_id,train_line_station_num,arrival_planned_time,arrival_change_time,departure_planned_time,departure_change_time,month,weekday,day,hour,minute
1994,Bielefeld Hbf,ICE 102,Hannover Hbf,15,2025-01-01 23:33:00,False,ICE,-9170319321262747816-2501011713,16,2025-01-01 23:17:00,2025-01-01 23:32:00,2025-01-01 23:18:00,2025-01-01 23:33:00,January,Wednesday,1,23,33
5748,Dortmund Hbf,ICE 102,Hannover Hbf,18,2025-01-01 22:50:00,False,ICE,-9170319321262747816-2501011713,13,2025-01-01 22:30:00,2025-01-01 22:48:00,2025-01-01 22:32:00,2025-01-01 22:50:00,January,Wednesday,1,22,50
6744,Düsseldorf Hbf,ICE 102,Hannover Hbf,1,2025-01-01 21:35:00,False,ICE,-9170319321262747816-2501011713,10,2025-01-01 21:32:00,2025-01-01 21:32:00,2025-01-01 21:34:00,2025-01-01 21:35:00,January,Wednesday,1,21,35
7541,Duisburg Hbf,ICE 102,Hannover Hbf,6,2025-01-01 21:53:00,False,ICE,-9170319321262747816-2501011713,11,2025-01-01 21:45:00,2025-01-01 21:50:00,2025-01-01 21:47:00,2025-01-01 21:53:00,January,Wednesday,1,21,53
9522,Essen Hbf,ICE 102,Hannover Hbf,7,2025-01-01 22:07:00,False,ICE,-9170319321262747816-2501011713,12,2025-01-01 21:58:00,2025-01-01 22:03:00,2025-01-01 22:00:00,2025-01-01 22:07:00,January,Wednesday,1,22,7
10929,Freiburg(Breisgau) Hbf,ICE 102,Hannover Hbf,5,2025-01-01 18:00:00,False,ICE,-9170319321262747816-2501011713,3,2025-01-01 17:53:00,2025-01-01 17:58:00,2025-01-01 17:55:00,2025-01-01 18:00:00,January,Wednesday,1,18,0
14347,Hamm(Westf)Hbf,ICE 102,Hannover Hbf,18,2025-01-01 23:06:00,False,ICE,-9170319321262747816-2501011713,14,2025-01-01 22:46:00,2025-01-01 23:04:00,2025-01-01 22:48:00,2025-01-01 23:06:00,January,Wednesday,1,23,6
16253,Herford,ICE 102,Hannover Hbf,13,2025-01-01 23:41:00,False,ICE,-9170319321262747816-2501011713,17,2025-01-01 23:26:00,2025-01-01 23:40:00,2025-01-01 23:28:00,2025-01-01 23:41:00,January,Wednesday,1,23,41
18294,Karlsruhe Hbf,ICE 102,Hannover Hbf,3,2025-01-01 19:04:00,False,ICE,-9170319321262747816-2501011713,6,2025-01-01 18:58:00,2025-01-01 19:02:00,2025-01-01 19:01:00,2025-01-01 19:04:00,January,Wednesday,1,19,4
20334,Köln Hbf,ICE 102,Hannover Hbf,1,2025-01-01 21:12:00,False,ICE,-9170319321262747816-2501011713,9,2025-01-01 21:05:00,2025-01-01 21:05:00,2025-01-01 21:11:00,2025-01-01 21:12:00,January,Wednesday,1,21,12
