In [None]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import calendar

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the raw punctuality export; the path is relative to the notebook's working directory.
train_delay_df = pd.read_csv('Data_raw_punctuality_202201.csv')


In [None]:
# Preview 10 randomly chosen rows (no seed is set, so the rows differ between runs).
train_delay_df.sample(10)


In [None]:
# (rows, columns) of the frame as loaded, before any cleaning.
train_delay_df.shape


In [None]:
# Column names, dtypes and non-null counts of the frame as loaded.
train_delay_df.info()


In [None]:
# Number of missing values in each column.
train_delay_df.isnull().sum()


In [None]:
# Discard every row that contains at least one missing value.
train_delay_df = train_delay_df.dropna(axis=0, how='any')


In [None]:
# Re-check the per-column missing-value counts after the rows with gaps were dropped.
train_delay_df.isnull().sum()


In [None]:
# Real arrival/departure clock times: parsed to full datetimes (kept as
# datetimes here so the `.dt` accessor can still be used on them afterwards).
for time_col in ('REAL_TIME_ARR', 'REAL_TIME_DEP'):
    train_delay_df[time_col] = pd.to_datetime(train_delay_df[time_col], format='%H:%M:%S')

# Planned clock times: parsed the same way, then reduced to plain time-of-day objects.
for time_col in ('PLANNED_TIME_ARR', 'PLANNED_TIME_DEP'):
    train_delay_df[time_col] = pd.to_datetime(train_delay_df[time_col], format='%H:%M:%S').dt.time

# Date columns: day + abbreviated month name + 4-digit year.
# DATDEP is intentionally left unparsed.
for date_col in ('PLANNED_DATE_ARR', 'PLANNED_DATE_DEP', 'REAL_DATE_ARR', 'REAL_DATE_DEP'):
    train_delay_df[date_col] = pd.to_datetime(train_delay_df[date_col], format='%d%b%Y')


In [None]:
# Route text that follows the "<train type>: " prefix of RELATION_DIRECTION.
route = train_delay_df['RELATION_DIRECTION'].map(lambda text: text.split(": ")[1])

# Start location: everything in front of the " ->" arrow.
train_delay_df['START_LOCATION'] = route.map(lambda text: text.split(" ->")[0])

# Final stop: everything behind the "> " of the arrow.
train_delay_df['FINAL_STOP_LOCATION'] = route.map(lambda text: text.split("> ")[1])

# The combined column is no longer needed once both parts are extracted.
train_delay_df = train_delay_df.drop(columns="RELATION_DIRECTION")


In [None]:
# Weekday name of the real departure / arrival date.
# `.dt.day_name()` is vectorised and returns English names regardless of the
# machine's locale, which keeps these columns in sync with the hard-coded
# English weekday lists used by the plotting cells further down.
train_delay_df["DAY_OF_DEPARTURE"] = train_delay_df["REAL_DATE_DEP"].dt.day_name()
train_delay_df["DAY_OF_ARRIVAL"] = train_delay_df["REAL_DATE_ARR"].dt.day_name()

# Hour of day (0-23) of the real departure and arrival.
train_delay_df['HOUR_OF_DEPARTURE'] = train_delay_df['REAL_TIME_DEP'].dt.hour.astype(int)
train_delay_df['HOUR_OF_ARRIVAL'] = train_delay_df['REAL_TIME_ARR'].dt.hour.astype(int)

# Only now reduce the real times to plain time-of-day objects: the hour
# extraction above needs the datetime dtype, so this must stay last.
train_delay_df['REAL_TIME_ARR'] = train_delay_df['REAL_TIME_ARR'].dt.time
train_delay_df['REAL_TIME_DEP'] = train_delay_df['REAL_TIME_DEP'].dt.time


In [None]:
# Categorical delay flags: 'Yes' when the recorded delay is strictly positive,
# otherwise 'No'. np.where evaluates the whole column at once instead of
# calling a Python lambda per row, and also works on an empty frame.
train_delay_df['DELAYED_ARRIVAL'] = np.where(train_delay_df['DELAY_ARR'] > 0, 'Yes', 'No')
train_delay_df['DELAYED_DEPARTURE'] = np.where(train_delay_df['DELAY_DEP'] > 0, 'Yes', 'No')


In [None]:
# Build full timestamps by joining each real date with its real time-of-day
# as text ("<date> <time>") and parsing the result.
for target_col, date_col, time_col in (
    ('ARRIVAL_DATE_TIME', 'REAL_DATE_ARR', 'REAL_TIME_ARR'),
    ('DEPARTURE_DATE_TIME', 'REAL_DATE_DEP', 'REAL_TIME_DEP'),
):
    stamp_text = train_delay_df[date_col].astype(str) + ' ' + train_delay_df[time_col].astype(str)
    train_delay_df[target_col] = pd.to_datetime(stamp_text)


In [None]:
# Spot-check 5 random rows to confirm the newly derived columns look sensible.
train_delay_df.sample(5)


In [None]:
# Columns that are not used by the rest of the analysis.
drop_cols = ['PTCAR_NO', 'DATDEP', 'TRAIN_NO']
# Rebind instead of mutating in place, matching how RELATION_DIRECTION is
# dropped earlier in the notebook.
train_delay_df = train_delay_df.drop(columns=drop_cols)


In [None]:
# (rows, columns) after the feature engineering and column drops.
train_delay_df.shape


In [None]:
# Dtypes and non-null counts after the feature engineering and column drops.
train_delay_df.info()


In [None]:
# Distinct TRAIN_SERV values (the railway operators plotted below).
train_delay_df['TRAIN_SERV'].unique()


In [None]:
# Distinct RELATION values (treated as train types in this notebook).
train_delay_df['RELATION'].unique()


In [None]:
def rail_ops_trains(railop, df=None):
    """Return the distinct RELATION values operated by one railway operator.

    Parameters
    ----------
    railop : str
        Value to match exactly against the ``TRAIN_SERV`` column.
    df : pandas.DataFrame, optional
        Frame with ``TRAIN_SERV`` and ``RELATION`` columns. When omitted, the
        notebook-level ``train_delay_df`` is used, so existing one-argument
        calls keep working.

    Returns
    -------
    numpy.ndarray
        Unique ``RELATION`` values in order of first appearance; empty when
        no row matches ``railop``.
    """
    if df is None:
        df = train_delay_df
    return df.loc[df['TRAIN_SERV'] == railop, 'RELATION'].unique()


In [None]:
# RELATION values run by the operator 'SNCB/NMBS' (shown as the raw array).
rail_ops_trains('SNCB/NMBS')


In [None]:
# RELATION values run by the operator 'EUROSTARFR', shown as a plain list.
list(rail_ops_trains('EUROSTARFR'))


In [None]:
# RELATION values run by the operator 'THI-FACT', shown as a plain list.
list(rail_ops_trains('THI-FACT'))


In [None]:
# Row count per railway operator, tallest bar first.
operator_fig = px.histogram(train_delay_df, x="TRAIN_SERV", title="most active railway operator")
operator_fig.update_xaxes(categoryorder='total descending')


In [None]:
# Row count per RELATION for the operator 'SNCB/NMBS' only, tallest bar first.
sncb_rows = train_delay_df[train_delay_df['TRAIN_SERV'] == 'SNCB/NMBS']
sncb_fig = px.histogram(sncb_rows, x="RELATION", title="most busy train type")
sncb_fig.update_xaxes(categoryorder='total descending')


In [None]:
# Pie chart: share of rows flagged as delayed vs. not delayed on departure.
departure_flag_counts = train_delay_df['DELAYED_DEPARTURE'].value_counts()
departure_flag_counts.plot.pie(autopct='%1.0f%%')


In [None]:
# Pie chart: share of rows flagged as delayed vs. not delayed on arrival.
arrival_flag_counts = train_delay_df['DELAYED_ARRIVAL'].value_counts()
arrival_flag_counts.plot.pie(autopct='%1.0f%%')


In [None]:
# creating a function to plot a bar chart
def most_busy_day(df, column):
    """Draw a bar chart of how many rows fall on each weekday.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of a column holding English weekday names
        (for example 'DAY_OF_DEPARTURE').

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    Bars are drawn in calendar order, Monday to Sunday, and each tick label
    comes from the same ordered list as the bar heights, so a label can never
    end up under another day's bar.
    """
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Count rows per weekday and force calendar order. Values that are not a
    # weekday name (missing values, the literal string 'nan') are dropped by
    # the reindex; weekdays absent from the data get a zero-height bar.
    counts = df[str(column)].value_counts().reindex(weekdays, fill_value=0)

    #plotting graph
    plt.figure(figsize=(10, 5))
    ax = sns.barplot(x=weekdays, y=counts.to_numpy(), order=weekdays,
                     palette='dark', edgecolor="black")
    # Write the exact count on top of every bar.
    for p in ax.patches:
        ax.text(p.get_x() + p.get_width()/2., p.get_height(), '%d' % int(p.get_height()),
                fontsize=11, ha='center', va='bottom')
    plt.title('Most busy train day', fontsize=14)
    plt.xlabel('Days')
    plt.ylabel('Train depature count')
    plt.xticks(rotation=90)
    plt.show()

# Call Function
most_busy_day(train_delay_df,'DAY_OF_DEPARTURE')


In [None]:
# Two stacked count plots: rows per hour of day, split by the delay flag.
fig, ax = plt.subplots(nrows=2, figsize=[10,10])

# (hour column, delay-flag column, legend title) for the top and bottom panel.
panels = (
    ('HOUR_OF_ARRIVAL', 'DELAYED_ARRIVAL', 'Delayed arrival hours'),
    ('HOUR_OF_DEPARTURE', 'DELAYED_DEPARTURE', 'Delayed depature hours'),
)
for panel_ax, (hour_col, flag_col, legend_title) in zip(ax, panels):
    sns.countplot(data=train_delay_df, x=hour_col, hue=flag_col, ax=panel_ax)
    panel_ax.legend(title=legend_title)

fig.suptitle('Count of train delays by hours', fontsize=18)
plt.show()


In [None]:
# Grouped bars: rows per departure weekday, one bar per DELAYED_DEPARTURE value.
departure_day_fig = px.histogram(
    train_delay_df,
    x="DAY_OF_DEPARTURE",
    color='DELAYED_DEPARTURE',
    barmode='group',
    title="Day of most depature delay",
)
departure_day_fig


In [None]:
# Box plot of arrival delay per weekday, delayed arrivals only, outliers hidden.
order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
delayed_arrivals = train_delay_df[train_delay_df['DELAYED_ARRIVAL']=='Yes']
sns.boxplot(data=delayed_arrivals, x="DAY_OF_ARRIVAL", y="DELAY_ARR",
            order=order, showfliers=False);


In [None]:
# Box plot of arrival delay per hour of day, delayed arrivals only, outliers hidden.
delayed_arrivals = train_delay_df[train_delay_df['DELAYED_ARRIVAL']=='Yes']
sns.boxplot(data=delayed_arrivals, x="HOUR_OF_ARRIVAL", y="DELAY_ARR", showfliers=False);


In [None]:
# Rows whose real arrival falls in hour 2, reduced to the columns of interest.
early_travel_cols = [
    'RELATION', 'LINE_NO_DEP', 'LINE_NO_ARR', 'DELAY_ARR', 'DELAY_DEP',
    'START_LOCATION', 'FINAL_STOP_LOCATION', 'HOUR_OF_ARRIVAL',
]
early_travel = train_delay_df[train_delay_df['HOUR_OF_ARRIVAL'] == 2]
early_travel[early_travel_cols]


In [None]:
# Arrivals that were NOT flagged as delayed, counted per RELATION (tallest bar
# first). The title says so explicitly, so this chart cannot be mistaken for
# the delayed-arrival chart in the next cell.
px.histogram(train_delay_df[train_delay_df['DELAYED_ARRIVAL'] != 'Yes'], x="RELATION",
             title="Non-delayed arrival count").update_xaxes(categoryorder='total descending')


In [None]:
# Arrivals whose delay flag is anything other than 'No', counted per RELATION
# (tallest bar first).
flagged_arrivals = train_delay_df[train_delay_df['DELAYED_ARRIVAL'] != 'No']
delayed_fig = px.histogram(flagged_arrivals, x="RELATION", title="Delay count")
delayed_fig.update_xaxes(categoryorder='total descending')


In [None]:
#extract details of IC 25 from dataframe and create line plot
ic_25 = train_delay_df[train_delay_df['RELATION'] == 'IC 25']
# Keep the departure AND the arrival line number; listing one label twice
# would produce a frame with duplicate column names.
ic_25 = ic_25[['RELATION', 'ARRIVAL_DATE_TIME', 'LINE_NO_DEP', 'LINE_NO_ARR', 'DELAY_ARR', 'DELAY_DEP']]
ic_25.sample(10)


In [None]:
# Order the IC 25 rows by arrival timestamp so the line is drawn left to right in time.
ic_25 = ic_25.sort_values(by='ARRIVAL_DATE_TIME')
fig = px.line(data_frame=ic_25, x='ARRIVAL_DATE_TIME', y="DELAY_ARR")
fig.show()


In [None]:
# create function to plot time delay on any train type
def delay_line_plot(relation):
    """Show a line chart of arrival delay over time for one RELATION value.

    Rows of the notebook-level ``train_delay_df`` whose ``RELATION`` equals
    ``relation`` are sorted by ``ARRIVAL_DATE_TIME`` and ``DELAY_ARR`` is
    plotted against that timestamp. Returns whatever ``fig.show()`` returns.
    """
    is_relation = train_delay_df['RELATION'] == relation
    delays = (
        train_delay_df
        .loc[is_relation, ['ARRIVAL_DATE_TIME', 'DELAY_ARR', 'DELAY_DEP']]
        .sort_values(by='ARRIVAL_DATE_TIME')
    )
    return px.line(delays, x='ARRIVAL_DATE_TIME', y="DELAY_ARR").show()


In [None]:
# Arrival-delay timeline for the relation 'IC 01'.
delay_line_plot('IC 01')


In [None]:
# Row count per arrival line number, tallest bar first.
line_fig = px.histogram(train_delay_df, x="LINE_NO_ARR", title="Most used rail line")
line_fig.update_xaxes(categoryorder='total descending')


In [None]:
# Distinct RELATION values among rows whose arrival line number is the string '161'.
on_line_161 = train_delay_df["LINE_NO_ARR"] == '161'
list(train_delay_df.loc[on_line_161, 'RELATION'].unique())
