In [1]:
import pandas as pd
import numpy as np
np.random.seed(0)
import xgboost as xgb
from sklearn.model_selection import cross_val_score

In [2]:
# Training rows: read the HDF store and parse the scheduled timestamp in the same expression.
df_train = pd.read_hdf('../input/tram.train.h5').assign(
    planned_time=lambda frame: pd.to_datetime(frame["planned_time"])
)
# Test rows are loaded untouched; their timestamp is parsed after both frames are combined.
df_test = pd.read_hdf('../input/tram.test.h5')

In [3]:
# Stack train and test rows so every feature below is engineered identically for both.
df = pd.concat([df_train, df_test])

# df_test's planned_time was not parsed on load, so normalise the combined column here.
df["planned_time"] = pd.to_datetime(df["planned_time"])

# Calendar-day key ("YYYY-MM-DD"). The vectorised .dt accessor replaces a per-row
# Python lambda and yields a missing value instead of raising if a timestamp is NaT.
df["planned_time_ymd"] = df["planned_time"].dt.strftime("%Y-%m-%d")

In [4]:
# Feature engineering: hour of the scheduled time, plus integer codes for the two
# text columns (pd.factorize returns (codes, uniques); only the codes are kept).
df["planned_time_hour"] = df["planned_time"].dt.hour
df["stop_name_cat"] = pd.factorize(df["stop_name"])[0]
df["direction_cat"] = pd.factorize(df["direction"])[0]

In [5]:
## group
def df_group_delay(df_train, groupby_feats):
    """Summarise the ``delay`` column per group of ``groupby_feats``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame holding the columns named in ``groupby_feats`` and a ``delay`` column.
    groupby_feats : sequence of str
        Column names to group by (a list, tuple or any other sequence).

    Returns
    -------
    pandas.DataFrame
        One row per group: the grouping columns followed by
        ``mean_``, ``median_``, ``count_``, ``std_``, ``count_zeros_`` and
        ``prob_zeros_`` columns, each suffixed with ``<feats joined by "_">_delay``.
    """
    keys = list(groupby_feats)
    suffix = "{}_delay".format("_".join(keys))

    agg_params = {
        "mean_" + suffix: ("delay", "mean"),
        "median_" + suffix: ("delay", "median"),
        "count_" + suffix: ("delay", "count"),
        "std_" + suffix: ("delay", "std"),
        # Vectorised comparisons instead of per-element Python loops:
        # number of exactly-zero delays in the group ...
        "count_zeros_" + suffix: ("delay", lambda vals: (vals == 0).sum()),
        # ... and their share among all rows of the group.
        "prob_zeros_" + suffix: ("delay", lambda vals: (vals == 0).mean()),
    }

    grouped = df_train[keys + ["delay"]].groupby(keys)
    return grouped.agg(**agg_params).reset_index()

# Attach delay statistics learned from the training rows to every row of df.
# Each guard checks the column name that df_group_delay really produces
# ("mean_stop_name_delay", not "mean_stopname_delay"); with the misspelled name the
# guard was always true, so re-running the cell merged again and created _x/_y duplicates.
if "mean_stop_name_delay" not in df:
    df_tmp = df_group_delay(df_train, ["stop_name"])
    df = pd.merge(df, df_tmp, on="stop_name", how="left")

if "mean_stop_name_direction_delay" not in df:
    df_tmp = df_group_delay(df_train, ["stop_name", "direction"])
    df = pd.merge(df, df_tmp, on=["stop_name", "direction"], how="left")

In [18]:
# Display whatever aggregate table df_tmp currently holds.
# NOTE(review): the output recorded below belongs to execution count In [18] and lists the
# previous-day ("1d") aggregates built by the shift cell further down. On a top-to-bottom
# run this cell would show the stop_name/direction aggregates instead.
# TODO: move this cell after the shift cell or drop it.
df_tmp

Unnamed: 0,planned_time_1d_ymd,planned_time_1d_hour,number,mean_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,median_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,count_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,std_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,count_1d_zeros_planned_time_1d_ymd_planned_time_1d_hour_number_delay,prob_1d_zeros_planned_time_1d_ymd_planned_time_1d_hour_number_delay
0,2018-07-24,5,1,120.000000,120,2,0.000000,0,0.000000
1,2018-07-24,5,5,180.000000,180,1,,0,0.000000
2,2018-07-24,5,14,120.000000,120,1,,0,0.000000
3,2018-07-24,5,50,90.000000,90,2,42.426407,0,0.000000
4,2018-07-24,6,1,57.623762,60,202,57.978354,77,0.381188
...,...,...,...,...,...,...,...,...,...
1601,2018-07-31,23,52,42.857143,0,49,57.445626,28,0.571429
1602,2018-08-01,0,13,0.000000,0,1,,1,1.000000
1603,2018-08-01,0,24,0.000000,0,1,,1,1.000000
1604,2018-08-02,0,10,660.000000,660,4,0.000000,0,0.000000


In [6]:
### shift
def df_group_delay_shift(df_train, groupby_feats, shift):
    """Aggregate ``delay`` per (shifted day, shifted hour, ``groupby_feats``).

    ``planned_time`` is moved forward by ``shift`` days before grouping, so the
    statistics of a given day are keyed by the date ``shift`` days later and can be
    joined onto rows of that later date.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with a datetime ``planned_time`` column, a ``delay`` column and the
        columns named in ``groupby_feats``. It is not modified.
    groupby_feats : sequence of str
        Extra grouping columns besides the shifted day and hour.
    shift : int
        Number of days added to ``planned_time``.

    Returns
    -------
    pandas.DataFrame
        Columns ``planned_time_<shift>d_ymd``, ``planned_time_<shift>d_hour``, the
        ``groupby_feats`` columns, then ``mean_``, ``median_``, ``count_``, ``std_``,
        ``count_..._zeros_`` and ``prob_..._zeros_`` aggregates whose names carry the
        ``<shift>d`` tag. For ``shift=1`` the names are the same as with the former
        hard-coded ``1d`` tag.
    """
    extra_keys = list(groupby_feats)

    # Every derived name uses the actual shift; the "1d" tag used to be hard-coded,
    # which broke (or silently reused stale columns for) any shift other than 1.
    tag = "{}d".format(shift)
    ymd_col = "planned_time_{}_ymd".format(tag)
    hour_col = "planned_time_{}_hour".format(tag)

    shifted_time = df_train["planned_time"] + pd.Timedelta(days=shift)

    # Work on a narrow copy so the caller's frame does not gain helper columns.
    shifted = df_train[extra_keys + ["delay"]].copy()
    shifted[ymd_col] = shifted_time.dt.strftime("%Y-%m-%d")
    shifted[hour_col] = shifted_time.dt.hour

    keys = [ymd_col, hour_col] + extra_keys
    joined = "_".join(keys)

    agg_params = {
        "mean_{}_{}_delay".format(tag, joined): ("delay", "mean"),
        "median_{}_{}_delay".format(tag, joined): ("delay", "median"),
        "count_{}_{}_delay".format(tag, joined): ("delay", "count"),
        "std_{}_{}_delay".format(tag, joined): ("delay", "std"),
        # Count and share of exactly-zero delays in each group.
        "count_{}_zeros_{}_delay".format(tag, joined): ("delay", lambda vals: (vals == 0).sum()),
        "prob_{}_zeros_{}_delay".format(tag, joined): ("delay", lambda vals: (vals == 0).mean()),
    }

    return shifted[keys + ["delay"]].groupby(keys).agg(**agg_params).reset_index()




# Join the shifted aggregates once: training statistics are keyed by the date one day
# later, so each row is matched with the figures of the same hour and line one day earlier.
# NOTE(review): the right-hand key columns (planned_time_1d_ymd, planned_time_1d_hour)
# remain in df after the merge, and planned_time_1d_hour shows up in the printed
# feature list further down. Confirm that this is intended.
if "mean_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay" not in df.columns:
    df_tmp = df_group_delay_shift(df_train, ["number"], shift=1)

    df = df.merge(
        df_tmp,
        how="left",
        left_on=["planned_time_ymd", "planned_time_hour", "number"],
        right_on=["planned_time_1d_ymd", "planned_time_1d_hour", "number"],
    )

In [7]:
# Candidate features: every numeric column except the identifiers and the target.
black_list = ["id", "delay", "vehicle_id", "trip_id"]
feats = [col for col in df.select_dtypes("number").columns if col not in black_list]
print(feats)

['stop', 'number', 'seq_num', 'planned_time_hour', 'stop_name_cat', 'direction_cat', 'mean_stop_name_delay', 'median_stop_name_delay', 'count_stop_name_delay', 'std_stop_name_delay', 'count_zeros_stop_name_delay', 'prob_zeros_stop_name_delay', 'mean_stop_name_direction_delay', 'median_stop_name_direction_delay', 'count_stop_name_direction_delay', 'std_stop_name_direction_delay', 'count_zeros_stop_name_direction_delay', 'prob_zeros_stop_name_direction_delay', 'planned_time_1d_hour', 'mean_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay', 'median_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay', 'count_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay', 'std_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay', 'count_1d_zeros_planned_time_1d_ymd_planned_time_1d_hour_number_delay', 'prob_1d_zeros_planned_time_1d_ymd_planned_time_1d_hour_number_delay']


In [8]:
# Rows with a known delay form the training set; rows without one are the ones to predict.
df_train = df.loc[df["delay"].notna()].copy()
df_test = df.loc[df["delay"].isna()].copy()

# Missing feature values are encoded as -1 before handing plain arrays to the model.
y_train = df_train["delay"].to_numpy()
X_train = df_train[feats].fillna(-1).to_numpy()
X_test = df_test[feats].fillna(-1).to_numpy()

In [9]:
# Spot-check five randomly chosen rows of the combined frame after all merges.
df.sample(5)

Unnamed: 0,id,delay,datetime,stop,stop_name,number,direction,planned_time,vehicle_id,trip_id,...,count_zeros_stop_name_direction_delay,prob_zeros_stop_name_direction_delay,planned_time_1d_ymd,planned_time_1d_hour,mean_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,median_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,count_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,std_1d_planned_time_1d_ymd_planned_time_1d_hour_number_delay,count_1d_zeros_planned_time_1d_ymd_planned_time_1d_hour_number_delay,prob_1d_zeros_planned_time_1d_ymd_planned_time_1d_hour_number_delay
46049,46049,180.0,2018-07-23 22:46:12,409,Centralna,22,Kombinat,2018-07-23 22:43:00,6.352185e+18,6351558574044899599,...,2.0,0.181818,,,,,,,,
199219,70448,,2018-07-24 14:10:42,367,Francesco Nullo,14,Mistrzejowice,2018-07-24 14:09:00,6.352185e+18,6351558574044727817,...,62.0,0.455882,2018-07-24,14.0,93.284672,60.0,137.0,111.045231,43.0,0.313869
211757,82986,,2018-07-24 18:33:52,89,Bronowice,4,Wzgórza K.,2018-07-24 18:33:00,6.352185e+18,6351558574044457485,...,82.0,0.473988,2018-07-24,18.0,48.214286,60.0,168.0,53.447847,74.0,0.440476
172286,260805,0.0,2018-07-30 21:18:35,2691,Chmieleniec,11,Czerwone Maki P+R,2018-07-30 21:19:00,6.352185e+18,6351558574044655637,...,433.0,0.601389,,,,,,,,
235797,151230,,2018-07-26 11:07:25,79,Plac Inwalidów,24,Kurdwanów P+R,2018-07-26 11:06:00,6.352185e+18,6351558574047583239,...,94.0,0.398305,2018-07-26,11.0,132.66055,90.0,218.0,125.427506,47.0,0.215596


In [12]:
#train & predict
# Fit on all labelled rows, then predict the unlabelled ones.
model = xgb.XGBRegressor(
    max_depth=5,
    n_estimators=50,
    random_state=0,
)
model.fit(X_train, y_train)

# Negative predictions are floored at zero before being stored on the test frame.
y_pred = np.clip(model.predict(X_test), 0, None)
df_test["delay"] = y_pred

In [13]:
# Local validation: 3-fold cross-validation scored with negative mean absolute error
# (values closer to zero are better), on a fresh model with the same settings as above.
# NOTE(review): the group-delay features were aggregated over all training rows before
# this split, so validation rows may have contributed to their own features and the
# score is probably optimistic. Confirm before relying on it.
model = xgb.XGBRegressor(n_estimators=50, max_depth=5, random_state=0)

scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    cv=3,
)
print("local score", scores.mean(), scores.std())

local score -48.523974809616924 0.8137735643888347


In [22]:
# Write the submission file: the id and predicted delay of every test row, no index column.
df_test.to_csv(
    '../output/prob_stopnames_direct_number_hour_1d_xgboost.csv',
    columns=["id", "delay"],
    index=False,
)