# Prediction of tram delays in Cracow

## Load dataset

In [4]:
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so steps that draw from it are repeatable.
np.random.seed(0)

In [5]:
# Load both splits from HDF5; the second file is the test split, which has no labels.
df_train, df_test = (
    pd.read_hdf(h5_path)
    for h5_path in ('../input/tram.train.h5', '../input/tram.test.h5')
)

## Concat datasets

Concatenate `df_train` with `df_test` and assign the result to `df`, so that every feature is computed once for both splits.

In [6]:
# Stack the training rows on top of the test rows (original index labels are kept).
df = pd.concat(objs=[df_train, df_test])
df.shape

(308152, 11)

In [7]:
# Eyeball eight random rows of the combined frame.
df.sample(n=8)

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,seq_num
46049,46049,180.0,2018-07-23 22:46:12,409,Centralna,22,Kombinat,2018-07-23 22:43:00,6.352185e+18,6351558574044899599,
70448,70448,,,367,Francesco Nullo,14,Mistrzejowice,2018-07-24 14:09:00,6.352185e+18,6351558574044727817,18.0
82986,82986,,,89,Bronowice,4,Wzgórza K.,2018-07-24 18:33:00,6.352185e+18,6351558574044457485,4.0
260805,260805,0.0,2018-07-30 21:18:35,2691,Chmieleniec,11,Czerwone Maki P+R,2018-07-30 21:19:00,6.352185e+18,6351558574044655637,21.0
151230,151230,,,79,Plac Inwalidów,24,Kurdwanów P+R,2018-07-26 11:06:00,6.352185e+18,6351558574047583239,9.0
229003,229003,0.0,2018-07-30 08:50:34,320,Reymana,20,Cichy Kącik,2018-07-30 08:50:00,6.352185e+18,6351558574044835847,18.0
61238,61238,,,129,Cystersów,4,Wzgórza K.,2018-07-24 10:14:00,6.352185e+18,6351558574044482053,16.0
193317,193317,240.0,2018-07-27 11:01:29,567,Kuklińskiego,20,Mały Płaszów,2018-07-27 10:57:00,6.352185e+18,6351558574046678282,15.0


## Missing Values

In [8]:
# Number of missing values per column.
df.isnull().sum()

id                   0
delay           132166
datetime        132166
stop                 0
stop_name            0
number               0
direction            0
planned_time         0
vehicle_id           0
trip_id              0
seq_num           2382
dtype: int64

## Dealing with features (`feature engineering`)

In [9]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refit for each column, so once this cell has run
# `le` only holds the classes of the last column it was fitted on (`trip_id`).
le = LabelEncoder()

# Integer codes for the two text columns.
# (`Series.factorize()[0]` is an alternative way to obtain integer codes.)
for text_col in ("stop_name", "direction"):
    df[text_col + "_cat"] = le.fit_transform(df[text_col])

# `trip_id`: keep its base-10 logarithm and an integer code per distinct trip.
df['trip_id_log'] = df['trip_id'].map(np.log10)
df["trip_id_cat"] = le.fit_transform(df["trip_id"])

In [10]:
# Parse the planned timestamp and derive calendar / time-of-day features from it.
df['planned_time'] = pd.to_datetime(df['planned_time'], format='%Y-%m-%d %H:%M:%S')
df['day'] = df['planned_time'].dt.dayofweek
df['month'] = df['planned_time'].dt.month
df['hour'] = df['planned_time'].dt.hour
df['minute'] = df['planned_time'].dt.minute

# Rush hours: hour strictly between 5 and 10, or strictly between 14 and 19,
# i.e. 06:00-09:59 and 15:00-18:59. `between` is inclusive on both ends.
df['rush_hours'] = (df['hour'].between(6, 9) | df['hour'].between(15, 18)).astype('int64')
# Night hours: hour below 6 or above 21, i.e. 22:00-05:59.
df['night_hours'] = ((df['hour'] < 6) | (df['hour'] > 21)).astype('int64')

# Natural log of the vehicle identifier, plus an integer code per distinct value.
df['vehicle_id_log'] = df['vehicle_id'].map(np.log)
df['vehicle_id_cat'] = pd.factorize(df['vehicle_id_log'])[0]

# Rows planned in hour 0 would divide by zero and produce inf, which a later
# `fillna(-1)` does not replace. Turn hour 0 into NaN so those rows end up as
# ordinary missing values instead of infinities in the feature matrix.
df['stop_hour'] = df['stop'] / df['hour'].replace(0, np.nan)

In [11]:
# Flag rows whose sequence number exceeds 20. This runs before the fill below, so rows
# with a missing `seq_num` stay False (NaN > 20 is False) instead of matching the 99 filler.
# NOTE(review): the flag is boolean, so the `select_dtypes("number")` call used later for
# feature selection does not pick it up - confirm whether that is intended.
df['high_seq_num'] = df['seq_num'].gt(20)
# Missing sequence numbers are replaced by the placeholder 99.
df['seq_num'] = df['seq_num'].fillna(99)
# Interaction term; the factors are multiplied left to right.
df['trip_id_seq_num_number'] = df['trip_id_log'].mul(df['seq_num']).mul(df['number'])

In [12]:
# Seconds between each row's planned time and that of the row directly above it
# (plain row order); the first row has no predecessor and is set to 0.
df['time_diff'] = df['planned_time'].diff().dt.total_seconds().fillna(0)
# Row-to-row change of the stop id (the first row stays NaN).
df['stop_diff'] = df['stop'].diff()

df['time_diff'].value_counts()

 0.0         84271
 60.0        59102
-60.0        57264
 120.0       29546
-120.0       28559
 180.0       13618
-180.0       12921
 240.0        6114
-240.0        5795
 300.0        2719
-300.0        2611
 360.0        1308
-360.0        1214
-420.0         684
 420.0         658
 480.0         365
-480.0         360
-540.0         244
 540.0         236
-600.0         126
 600.0         125
 660.0          68
-660.0          60
 720.0          48
-720.0          39
-780.0          32
 780.0          28
 840.0          10
-840.0          10
 86520.0         2
 86400.0         1
 16680.0         1
 1140.0          1
-960.0           1
 187800.0        1
-691620.0        1
 86640.0         1
 86820.0         1
-1080.0          1
 15660.0         1
 15420.0         1
 15300.0         1
 14280.0         1
-900.0           1
 259080.0        1
Name: time_diff, dtype: int64

### Tram stops
In the previous starter notebook I already computed delay statistics per stop. Now we can go one step further, but to make the aggregation easier, let's first define a helper function, `df_group_delay`.

In [13]:
def df_group_delay(df_train, groupby_feats):
    """Aggregate `delay` statistics for every combination of `groupby_feats`.

    Parameters
    ----------
    df_train : DataFrame holding the `groupby_feats` columns and a `delay` column.
    groupby_feats : list of column names to group by (not modified).

    Returns
    -------
    DataFrame with one row per group: the grouping columns followed by the mean,
    median, count and standard deviation of `delay`, the number of delays equal
    to zero and the share of delays equal to zero. Statistic columns are named
    `<stat>_<feat1>_<feat2>..._delay`.
    """
    suffix = "{}_delay".format("_".join(groupby_feats))

    def zero_count(vals):
        # How many delays in the group are exactly zero.
        return int((vals == 0).sum())

    def zero_share(vals):
        # Fraction of the group's rows whose delay is exactly zero.
        return (vals == 0).mean()

    stats = [
        ("mean", "mean"),
        ("median", "median"),
        ("count", "count"),
        ("std", "std"),
        ("count_zeros", zero_count),
        ("prob_zeros", zero_share),
    ]
    agg_params = {
        "{}_{}".format(prefix, suffix): ("delay", func) for prefix, func in stats
    }

    grouped = df_train[groupby_feats + ["delay"]].groupby(groupby_feats)
    return grouped.agg(**agg_params).reset_index()

Now let's use `df_group_delay` to reproduce the per-stop statistics we computed previously.

In [14]:
# Per-stop delay statistics, computed from the training rows and joined onto every row.
df_tmp = df_group_delay(df_train, ["stop_name"])

# Guard against merging a second time when the cell is re-run. The generated column is
# named `mean_stop_name_delay` ("stop_name" keeps its underscore), so test for exactly
# that name; a misspelled name makes the guard always true and a re-run would append
# duplicate `_x` / `_y` columns.
if "mean_stop_name_delay" not in df:
    df = pd.merge(df, df_tmp, on="stop_name", how="left")

Now let's add the driving direction (the `direction` column). Trams at the same stop can be heading in different directions, so each (stop, direction) pair is treated as a separate case.

In [15]:
# Delay statistics per (stop, direction) pair, computed from the training rows.
df_tmp = df_group_delay(df_train, ["stop_name", "direction"])
# Re-run guard: the generated column is `mean_stop_name_direction_delay`, so test for
# exactly that name; otherwise a re-run merges again and creates `_x` / `_y` duplicates.
if "mean_stop_name_direction_delay" not in df:
    df = pd.merge(df, df_tmp, on=["stop_name", "direction"], how="left")

## Select features

In addition, we can ignore some features, because if you check them more carefully, it turns out that they do not contribute much.

In [16]:
# Columns kept out of the model: identifiers, the target itself and features that did
# not contribute much. Names that are absent from `df` are simply ignored.
black_list = ["id", "delay", "trip_id", 'trip_id_log', 'vehicle_id', 'super_seq', 'stop_name_cat', 'est_delay', 'minute', 'seq_num_^2_median', 
              'seq_num_tmp','no_delays_stops','seq_num_power2', 'trip_id_seq_num_number_m_frec','day_frec','trip_id_cat', 'stop_diff']

# Start from every numeric column and drop the excluded ones, keeping column order.
numeric_cols = df.select_dtypes(include="number").columns
feats = [col for col in numeric_cols if col not in black_list]

feats

['stop',
 'number',
 'seq_num',
 'direction_cat',
 'day',
 'month',
 'hour',
 'rush_hours',
 'night_hours',
 'vehicle_id_log',
 'vehicle_id_cat',
 'stop_hour',
 'trip_id_seq_num_number',
 'time_diff',
 'mean_stop_name_delay',
 'median_stop_name_delay',
 'count_stop_name_delay',
 'std_stop_name_delay',
 'count_zeros_stop_name_delay',
 'prob_zeros_stop_name_delay',
 'mean_stop_name_direction_delay',
 'median_stop_name_direction_delay',
 'count_stop_name_direction_delay',
 'std_stop_name_direction_delay',
 'count_zeros_stop_name_direction_delay',
 'prob_zeros_stop_name_direction_delay']

## Preparing `X` and `y`

In [17]:
# Rows with a known delay form the training set; rows without one are the test set.
# Every remaining NaN (including the absent test labels) is replaced by the placeholder -1.
labelled = df["delay"].notnull()
df_train = df[labelled].copy().fillna(-1)
df_test = df[~labelled].copy().fillna(-1)

# Both frames have already been filled above, so the feature matrices are taken directly.
X_train = df_train[feats].values
y_train = df_train["delay"].values
X_test = df_test[feats].values

## Building the model

In [19]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb

In [20]:
# Wrap the training matrix and labels in XGBoost's DMatrix container.
training_data = xgb.DMatrix(data=X_train, label=y_train)

# Baseline hyper-parameters for the regression model.
params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5
}

# 4-fold cross-validation over 200 boosting rounds, tracking mean absolute error.
# The `params` dict defined above is passed in: the name `untuned_params` used before
# is not defined in any visible cell and raises NameError on Restart & Run All.
cv_results_mae = xgb.cv(
    dtrain=training_data,
    params=params,
    nfold=4,
    num_boost_round=200,
    metrics='mae',
    as_pandas=True
)

In [21]:
# Average each MAE column over all boosting rounds -> (train, test).
# NOTE(review): this averages every round, not only the last one; the final row of
# `cv_results_mae` holds the score after all rounds - confirm which figure is intended.
tuple(cv_results_mae[col].mean() for col in ('train-mae-mean', 'test-mae-mean'))

(32.33015102125, 32.600306885)