In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/fromAPI/cleaned_hourly_all.csv")

In [3]:
date_format = "%Y-%m-%d %H:%M:%S"
df["arrival_plan"] = pd.to_datetime(df["arrival_plan"], format=date_format)
df["departure_plan"] = pd.to_datetime(df["departure_plan"], format=date_format)
df["arrival_change"] = pd.to_datetime(df["arrival_change"], format=date_format)
df["departure_change"] = pd.to_datetime(df["departure_change"], format=date_format)
df["arrival_plan_hour"] = df["arrival_plan"].dt.hour


In [4]:
df.dropna(subset=["arrival_plan"], inplace=True)

In [5]:
ml_arrival = df.drop([
    'ID', 'train', 'path', 'eva_nr', 'category', 
    #'name', 
    'state', 
    'city',
    'zip', 
    #'long', 'lat', 
    'arrival_plan', 'departure_plan',
    'arrival_change', 'departure_change', 'arrival_delay_m',
    'depature_delay_m', 
    'hour', 
    #'arrival_delay_check',
    'departure_delay_check'
    #'arrival_plan_hour'
    ], axis=1)

In [6]:
ml_arrival.loc[ml_arrival["arrival_delay_check"] == "long_delay", ["arrival_delay_check"]] = "delay"

In [7]:
ml_arrival.loc[ml_arrival["arrival_delay_check"] == "delay", ["arrival_delay_check"]] = 1
ml_arrival.loc[ml_arrival["arrival_delay_check"] == "on_time", ["arrival_delay_check"]] = 0
ml_arrival["arrival_delay_check"] = ml_arrival["arrival_delay_check"].astype(int)

In [9]:
ml_arrival.head()

Unnamed: 0,name,long,lat,arrival_delay_check,arrival_plan_hour
0,Aachen Hbf,6.091499,50.7678,0,13.0
1,Aachen Hbf,6.091499,50.7678,0,13.0
7,Aachen Hbf,6.091499,50.7678,0,13.0
8,Aachen Hbf,6.091499,50.7678,0,13.0
10,Aachen Hbf,6.091499,50.7678,1,13.0


In [10]:
ml_arrival.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1241603 entries, 0 to 1370586
Data columns (total 5 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   name                 1241603 non-null  object 
 1   long                 1241603 non-null  float64
 2   lat                  1241603 non-null  float64
 3   arrival_delay_check  1241603 non-null  int64  
 4   arrival_plan_hour    1241603 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 56.8+ MB


In [11]:
ml_arrival.name.nunique()

1996

In [12]:
ml_arrival.arrival_delay_check.value_counts()

arrival_delay_check
0    1135941
1     105662
Name: count, dtype: int64

In [13]:
ml_arrival_bystations = pd.get_dummies(ml_arrival, columns=["name"])

In [14]:
ml_arrival_bystations.head()

Unnamed: 0,long,lat,arrival_delay_check,arrival_plan_hour,name_Aachen Hbf,name_Aachen Schanz,name_Aachen West,name_Aachen-Rothe Erde,name_Aalen Hbf,name_Achern,...,name_Zorneding,name_Zossen,name_Zweibrücken Hbf,name_Zwickau (Sachs) Hbf,name_Zwingenberg (Bergstr),name_Züssow,name_Öhringen Hbf,name_Ötigheim,name_Übach-Palenberg,name_Übersee
0,6.091499,50.7678,0,13.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,6.091499,50.7678,0,13.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,6.091499,50.7678,0,13.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,6.091499,50.7678,0,13.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10,6.091499,50.7678,1,13.0,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
X = ml_arrival_bystations.drop('arrival_delay_check',axis=1)
y = ml_arrival_bystations['arrival_delay_check']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True, stratify=y)

In [None]:
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

In [None]:
y_pred = dtree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

In [None]:
import seaborn as sns
import numpy as np

In [None]:
cmat = confusion_matrix(y_test, y_pred)
fig = sns.heatmap(cmat, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_pred), yticklabels=np.unique(y_pred))
fig.set_title('Confusion Matrix Tree')
fig.set_xlabel('Predicted')
fig.set_ylabel('True')

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
model = DecisionTreeClassifier(random_state=42)

In [24]:
param_grid = { 
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [32, 64, None],
    #'min_samples_split': [2,8,16],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, 
    scoring="f1_macro",
    verbose=3,
    
    
    ) #scoring defaults to accuracy if not stated otherwise, n_jobs=-1 means use all available cores for computing.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END ......criterion=gini, max_depth=32;, score=0.501 total time= 1.5min
[CV 2/3] END ......criterion=gini, max_depth=32;, score=0.503 total time= 1.3min
[CV 3/3] END ......criterion=gini, max_depth=32;, score=0.500 total time= 1.6min
[CV 1/3] END ......criterion=gini, max_depth=64;, score=0.501 total time= 1.5min
[CV 2/3] END ......criterion=gini, max_depth=64;, score=0.504 total time= 1.6min
[CV 3/3] END ......criterion=gini, max_depth=64;, score=0.501 total time= 2.5min
[CV 1/3] END ....criterion=gini, max_depth=None;, score=0.501 total time= 1.6min
[CV 2/3] END ....criterion=gini, max_depth=None;, score=0.504 total time= 1.7min
[CV 3/3] END ....criterion=gini, max_depth=None;, score=0.501 total time= 2.7min
[CV 1/3] END ...criterion=entropy, max_depth=32;, score=0.501 total time= 1.5min
[CV 2/3] END ...criterion=entropy, max_depth=32;, score=0.503 total time= 1.3min
[CV 3/3] END ...criterion=entropy, max_depth=32;,

In [19]:
param_grid = { 
    'criterion': ['gini'],
    'max_depth': [None],
    'min_samples_split': [2,8,16],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, 
    scoring="f1_macro",
    verbose=3,
    
    
    ) #scoring defaults to accuracy if not stated otherwise, n_jobs=-1 means use all available cores for computing.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV 1/3] END criterion=gini, max_depth=None, min_samples_split=2;, score=0.501 total time= 1.6min
[CV 2/3] END criterion=gini, max_depth=None, min_samples_split=2;, score=0.504 total time= 1.7min
[CV 3/3] END criterion=gini, max_depth=None, min_samples_split=2;, score=0.501 total time= 2.8min
[CV 1/3] END criterion=gini, max_depth=None, min_samples_split=8;, score=0.501 total time= 1.6min
[CV 2/3] END criterion=gini, max_depth=None, min_samples_split=8;, score=0.504 total time= 1.6min
[CV 3/3] END criterion=gini, max_depth=None, min_samples_split=8;, score=0.502 total time= 2.8min
[CV 1/3] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.501 total time= 1.7min
[CV 2/3] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.504 total time= 1.9min
[CV 3/3] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.502 total time= 2.8min


In [21]:
param_grid = { 
    'criterion': ['gini'],
    'max_depth': [None],
    'min_samples_split': [16,32,64],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, 
    scoring="f1_macro",
    verbose=3,
    
    
    ) #scoring defaults to accuracy if not stated otherwise, n_jobs=-1 means use all available cores for computing.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.501 total time= 1.9min
[CV 2/5] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.500 total time= 1.8min
[CV 3/5] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.503 total time= 2.1min
[CV 4/5] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.500 total time= 2.4min
[CV 5/5] END criterion=gini, max_depth=None, min_samples_split=16;, score=0.501 total time= 2.8min
[CV 1/5] END criterion=gini, max_depth=None, min_samples_split=32;, score=0.500 total time= 1.9min
[CV 2/5] END criterion=gini, max_depth=None, min_samples_split=32;, score=0.499 total time= 2.2min
[CV 3/5] END criterion=gini, max_depth=None, min_samples_split=32;, score=0.501 total time= 2.3min
[CV 4/5] END criterion=gini, max_depth=None, min_samples_split=32;, score=0.499 total time= 2.3min
[CV 5/5] END criterion=gini, max_depth=None, min_

In [22]:
print(best_params)

{'criterion': 'gini', 'max_depth': None, 'min_samples_split': 16}


In [23]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    227189
           1       0.42      0.02      0.04     21132

    accuracy                           0.91    248321
   macro avg       0.67      0.51      0.50    248321
weighted avg       0.87      0.91      0.88    248321

Confusion Matrix:
[[226498    691]
 [ 20633    499]]
