In [1]:
import warnings
import pandas as pd
import numpy as np

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:


def merge(orders, nodes_path):
    nodes = pd.read_csv(nodes_path)
    mean_speed_by_hour = pd.read_csv("data/mean_speed_by_hour_concat.csv", )
    merged = pd.merge(nodes, orders, how='left', on='Id')
    merged = pd.merge(merged, mean_speed_by_hour, how='left', on='Id')
    merged = merged.dropna()
    
    return merged
    
train_orders = pd.read_csv("data/orders.csv")
test_orders = pd.read_csv("data/final_test.csv")

train_df = merge(train_orders, "data/nodes.csv")
test_df =  merge(test_orders, 'data/nodes_test.csv')

df = pd.concat([train_df, test_df])


#### Weather

In [4]:
import re

wearher_df = pd.read_excel('data/weather.xlsx')

cols_to_process = ['Temperature', 'Dew Point', 'Humidity', 'Wind Speed', 'Wind Gust', 'Pressure', 'Precip.']

def process_numerical(x):
    return re.compile(r'(\d+(?:\.\d+)?)').findall(x)[0]

for col in cols_to_process:
    wearher_df[col] = wearher_df[col].apply(lambda x: process_numerical(x)).astype(float)

    
wearher_df['isWindy'] = wearher_df['Condition'].apply(lambda x: 1 if len(x.split('/'))==2 else 0)
wearher_df['Condition'] = wearher_df['Condition'].apply(lambda x: x.split('/')[-1] if len(x.split('/'))==2 else x)

wearher_df['time_h_m'] = pd.to_datetime(wearher_df['Time']).apply(lambda x: x.strftime("%H:%M"))
wearher_df.drop('Time', axis=1, inplace=True)

#### Merge weather

In [6]:

def merge_weather(merged):
    merged['time_h_m'] = pd.to_datetime(pd.to_datetime(merged['running_time'])
                                       .apply(lambda x: x.strftime("%H:%M")))
    wearher_df['time_h_m'] = pd.to_datetime(wearher_df['time_h_m'])

    merged.sort_values('time_h_m', inplace=True)
    wearher_df.sort_values('time_h_m', inplace=True)

    merged = pd.merge_asof(merged, wearher_df, on='time_h_m')
    merged.drop('time_h_m', axis=1, inplace=True)

    wind = pd.get_dummies(merged['Wind'])

    merged.drop(['Wind'], axis=1, inplace=True)

    condition = pd.get_dummies(merged['Condition'])

    merged.drop(['Condition'], axis=1, inplace=True)

    merged = pd.concat([merged, wind, condition],axis=1)
    
    return merged

df = merge_weather(df)

In [7]:
def time_preprocess(df):
    df['running_time'] = pd.to_datetime(df['running_time'])
    df['seconds'] = df['running_time'].apply(lambda x: x.hour * 3600 + x.minute * 60 + x.second)

    df['mean_time'] = df['route_distance_km']/df['speed']
    
    return df


df = time_preprocess(df)


In [8]:

def agregate_df(df):
    agregated_df = pd.DataFrame()

    gpby = df.groupby('Id')

    for name in df.columns:
        if name == 'distance':
            agregated_df[name] = gpby[name].sum()
        elif name in ['node_start','node_finish','running_time','completed_time']:
            pass
        else:
            agregated_df[name] = gpby[name].mean()
    
    agregated_df['node_list'] = gpby['node_start'].apply(list)+gpby['node_finish'].apply(list)
            
    for i in agregated_df.index.tolist():
        agregated_df.at[i, 'node_list'] = list(set(agregated_df['node_list'][i]))
    
    try:
        df = df.drop(['Id','node_start','node_finish','running_time','completed_time'],axis=1)
    except:
        df = df.drop(['Id', 'node_start','node_finish','running_time'],axis=1)
    
    return agregated_df

df = agregate_df(df)

# OSM

In [9]:
import pandas as pd
import ast

def add_osm(df):
    osm_fin = pd.read_csv('data/osm.csv',index_col=0)

    # shop replace travel_agency 2 yes

    del_list = [
    'opening_hours', 'name:en', 'mapillary','name:uk','addr:postcode','addr:city','traffic_sign:forward','motor_vehicle:conditional',
    'name:ru','addr:housenumber','local_ref','man_made','addr:street','natural','website','source','water_source','description','cuisine','power',
    'ref_name','phone','old_name','name:be', 'addr:state','name','ref','operator','maxheight']

    osm_fin = osm_fin[[x for x in list(osm_fin) if x not in del_list]]

    df_dummies = pd.get_dummies(osm_fin, columns=list(osm_fin)[1:])
    df_dummies = df_dummies.drop(columns=[s for s in list(df_dummies) if "_0" in s])
    df_dummies = df_dummies.drop(columns=[s for s in list(df_dummies) if "_no" in s])

    for index in list(df_dummies)[1:]:
        df[index] = 0

    slice_list = []

    for index in range(len(df)):
        ans = df_dummies[df_dummies.id.isin(df.node_list.values[index])].sum().values.tolist()[1:]
        slice_list.append(ans)

    for count, index in enumerate(df.index.tolist()):
        df.loc[index, list(df_dummies)[1:]] = slice_list[count]

    df = df.drop(columns=['node_list'])
    return df

# train_df = add_osm(train_df)
# test_df =  add_osm(test_df)

df = add_osm(df)

  osm_fin = pd.read_csv('data/osm.csv',index_col=0)


In [10]:
test_df = df[df['Id'].isin(test_df['Id'].unique())]
train_df = df[~df['Id'].isin(test_df['Id'].unique())]

In [11]:

train_df.drop(['Id'],axis=1, inplace=True)
test_df.drop(['Id'],axis=1, inplace=True)

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

y_train = np.array(train_df['delta_time']).reshape(-1, 1)
X_train = train_df.drop(['delta_time'], axis=1)

# y_test = np.array(test_df['delta_time']).reshape(-1, 1)
X_test = test_df.drop(['delta_time'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(['Id'],axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(['Id'],axis=1, inplace=True)


In [12]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)

X_test = X_scaler.transform(X_test)


In [54]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

params = {'max_depth': 3,
                'min_child_weight': 4,
                'gamma': 0.15,
                'colsample_bytree': 0.85,
                'subsample': 1.0}

kf = KFold(n_splits=10, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
for train_index, test_index in kf.split(X_train):
    X_tr, X_val = X_train[train_index], X_train[test_index]
    y_tr, y_val = y_train[train_index], y_train[test_index]
    regressor = XGBRegressor(**params)
    regressor.fit(X_tr, y_tr)
    predictions_array.append(regressor.predict(X_test))

predictions = np.mean(predictions_array,axis=0)


In [20]:
from pytorch_tabnet.tab_model import TabNetRegressor

kf = KFold(n_splits=10, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
for train_index, test_index in kf.split(X_train):
    X_tr, X_val = X_train[train_index], X_train[test_index]
    y_tr, y_val = y_train[train_index], y_train[test_index]
    regressor = TabNetRegressor(verbose=0,seed=42)
    regressor.fit(X_train=X_tr, y_train=y_tr,
              eval_set=[(X_val, y_val)],
              patience=300, max_epochs=300,
              eval_metric=['rmse'])
    CV_score_array.append(regressor.best_cost)
    predictions_array.append(regressor.predict(X_test))

predictions = np.mean(predictions_array,axis=0)

Stop training because you reached max_epochs = 300 with best_epoch = 85 and best_val_0_rmse = 123.96305




Stop training because you reached max_epochs = 300 with best_epoch = 118 and best_val_0_rmse = 124.33405




Stop training because you reached max_epochs = 300 with best_epoch = 140 and best_val_0_rmse = 121.84334




Stop training because you reached max_epochs = 300 with best_epoch = 93 and best_val_0_rmse = 121.82551




Stop training because you reached max_epochs = 300 with best_epoch = 77 and best_val_0_rmse = 127.39133




Stop training because you reached max_epochs = 300 with best_epoch = 136 and best_val_0_rmse = 118.4771




Stop training because you reached max_epochs = 300 with best_epoch = 133 and best_val_0_rmse = 114.03483




Stop training because you reached max_epochs = 300 with best_epoch = 80 and best_val_0_rmse = 125.45485




Stop training because you reached max_epochs = 300 with best_epoch = 144 and best_val_0_rmse = 123.33119




Stop training because you reached max_epochs = 300 with best_epoch = 87 and best_val_0_rmse = 125.00476


