In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv("trainData.csv")

In [3]:
df_train.shape

(10000, 11)

In [4]:
df_train.head()

Unnamed: 0,ID,tpep_pickup_datetime,tpep_dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,latitude_difference,longitude_difference,distance
0,10000,2016-03-10 08:01:18,2016-03-10 08:11:26,-74.002968,40.760349,-73.985107,40.760189,10,-0.00016,0.01786,1.245105
1,10001,2016-03-10 08:01:18,2016-03-10 08:16:50,-73.965683,40.774269,-73.966766,40.758808,16,-0.015461,-0.001083,1.143103
2,10002,2016-03-10 08:01:19,2016-03-11 07:51:43,-73.958603,40.800125,-73.958023,40.784805,1430,-0.01532,0.00058,1.09856
3,10003,2016-03-10 08:01:20,2016-03-10 08:12:16,-73.99044,40.75621,-73.972229,40.75943,11,0.00322,0.018211,1.480736
4,10004,2016-03-10 08:01:20,2016-03-10 08:09:50,-73.982124,40.774906,-73.96917,40.798092,8,0.023186,0.012955,2.497062


As trip duration is alsready calculated pickup_datetime, dropoff_datetime are excluded from training

In [5]:
X = df_train.drop(["trip_duration", "tpep_pickup_datetime", "tpep_dropoff_datetime"], axis=1)
Y = df_train["trip_duration"]

In [10]:
#Spliting the data into training, test, and valdiation sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)
X_train, X_eval, Y_train, Y_eval = train_test_split(X_train, Y_train, test_size=0.25, random_state=100)

In [6]:
#evaluation metric
def evalMatric(y_actual, y_pred):
    assert len(y_actual) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_actual + 1)).mean() ** 0.5

In [7]:
#parameters for XGboost 
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'evalMatric'
}

In [8]:
nrounds = 1000

In [11]:
#Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(Y_train+1))
dval = xgb.DMatrix(X_eval, np.log(Y_eval+1))

#track the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [12]:
#Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

[0]	eval-rmse:2.07228	train-rmse:2.04992
[1]	eval-rmse:1.97555	train-rmse:1.95416
[2]	eval-rmse:1.88184	train-rmse:1.86128
[3]	eval-rmse:1.79268	train-rmse:1.77284
[4]	eval-rmse:1.70833	train-rmse:1.68930
[5]	eval-rmse:1.62851	train-rmse:1.60990
[6]	eval-rmse:1.55414	train-rmse:1.53541
[7]	eval-rmse:1.48236	train-rmse:1.46407
[8]	eval-rmse:1.41408	train-rmse:1.39660
[9]	eval-rmse:1.35004	train-rmse:1.33242
[10]	eval-rmse:1.28980	train-rmse:1.27180
[11]	eval-rmse:1.23253	train-rmse:1.21389
[12]	eval-rmse:1.17788	train-rmse:1.15914
[13]	eval-rmse:1.12659	train-rmse:1.10709
[14]	eval-rmse:1.07844	train-rmse:1.05849
[15]	eval-rmse:1.03408	train-rmse:1.01281
[16]	eval-rmse:0.99044	train-rmse:0.96845
[17]	eval-rmse:0.94986	train-rmse:0.92611
[18]	eval-rmse:0.91121	train-rmse:0.88637
[19]	eval-rmse:0.87577	train-rmse:0.84907
[20]	eval-rmse:0.84128	train-rmse:0.81320
[21]	eval-rmse:0.80908	train-rmse:0.77924
[22]	eval-rmse:0.77833	train-rmse:0.74711
[23]	eval-rmse:0.74983	train-rmse:0.71611
[2

[194]	eval-rmse:0.37468	train-rmse:0.05131
[195]	eval-rmse:0.37471	train-rmse:0.05099
[196]	eval-rmse:0.37473	train-rmse:0.05045
[197]	eval-rmse:0.37472	train-rmse:0.04979
[198]	eval-rmse:0.37468	train-rmse:0.04943
[199]	eval-rmse:0.37466	train-rmse:0.04916
[200]	eval-rmse:0.37470	train-rmse:0.04868
[201]	eval-rmse:0.37474	train-rmse:0.04833
[202]	eval-rmse:0.37474	train-rmse:0.04794
[203]	eval-rmse:0.37476	train-rmse:0.04748
[204]	eval-rmse:0.37475	train-rmse:0.04712
[205]	eval-rmse:0.37477	train-rmse:0.04687
[206]	eval-rmse:0.37478	train-rmse:0.04637
[207]	eval-rmse:0.37480	train-rmse:0.04620
[208]	eval-rmse:0.37483	train-rmse:0.04584
[209]	eval-rmse:0.37483	train-rmse:0.04541
[210]	eval-rmse:0.37484	train-rmse:0.04513
[211]	eval-rmse:0.37483	train-rmse:0.04495
[212]	eval-rmse:0.37486	train-rmse:0.04447
[213]	eval-rmse:0.37489	train-rmse:0.04407
[214]	eval-rmse:0.37493	train-rmse:0.04385
[215]	eval-rmse:0.37495	train-rmse:0.04340
[216]	eval-rmse:0.37502	train-rmse:0.04312
[217]	eval-

[385]	eval-rmse:0.37576	train-rmse:0.01447
[386]	eval-rmse:0.37574	train-rmse:0.01435
[387]	eval-rmse:0.37575	train-rmse:0.01425
[388]	eval-rmse:0.37574	train-rmse:0.01417
[389]	eval-rmse:0.37574	train-rmse:0.01412
[390]	eval-rmse:0.37574	train-rmse:0.01405
[391]	eval-rmse:0.37574	train-rmse:0.01402
[392]	eval-rmse:0.37574	train-rmse:0.01398
[393]	eval-rmse:0.37574	train-rmse:0.01393
[394]	eval-rmse:0.37574	train-rmse:0.01389
[395]	eval-rmse:0.37574	train-rmse:0.01384
[396]	eval-rmse:0.37574	train-rmse:0.01374
[397]	eval-rmse:0.37572	train-rmse:0.01363
[398]	eval-rmse:0.37572	train-rmse:0.01359
[399]	eval-rmse:0.37573	train-rmse:0.01348
[400]	eval-rmse:0.37574	train-rmse:0.01335
[401]	eval-rmse:0.37574	train-rmse:0.01323
[402]	eval-rmse:0.37574	train-rmse:0.01313
[403]	eval-rmse:0.37574	train-rmse:0.01310
[404]	eval-rmse:0.37575	train-rmse:0.01305
[405]	eval-rmse:0.37575	train-rmse:0.01301
[406]	eval-rmse:0.37576	train-rmse:0.01295
[407]	eval-rmse:0.37576	train-rmse:0.01292
[408]	eval-

[576]	eval-rmse:0.37606	train-rmse:0.00603
[577]	eval-rmse:0.37605	train-rmse:0.00602
[578]	eval-rmse:0.37605	train-rmse:0.00595
[579]	eval-rmse:0.37606	train-rmse:0.00587
[580]	eval-rmse:0.37607	train-rmse:0.00582
[581]	eval-rmse:0.37607	train-rmse:0.00581
[582]	eval-rmse:0.37607	train-rmse:0.00580
[583]	eval-rmse:0.37607	train-rmse:0.00579
[584]	eval-rmse:0.37607	train-rmse:0.00576
[585]	eval-rmse:0.37607	train-rmse:0.00574
[586]	eval-rmse:0.37607	train-rmse:0.00571
[587]	eval-rmse:0.37607	train-rmse:0.00568
[588]	eval-rmse:0.37607	train-rmse:0.00565
[589]	eval-rmse:0.37607	train-rmse:0.00563
[590]	eval-rmse:0.37607	train-rmse:0.00561
[591]	eval-rmse:0.37607	train-rmse:0.00560
[592]	eval-rmse:0.37607	train-rmse:0.00558
[593]	eval-rmse:0.37606	train-rmse:0.00557
[594]	eval-rmse:0.37606	train-rmse:0.00555
[595]	eval-rmse:0.37607	train-rmse:0.00553
[596]	eval-rmse:0.37607	train-rmse:0.00552
[597]	eval-rmse:0.37607	train-rmse:0.00548
[598]	eval-rmse:0.37607	train-rmse:0.00545
[599]	eval-

[767]	eval-rmse:0.37616	train-rmse:0.00340
[768]	eval-rmse:0.37616	train-rmse:0.00336
[769]	eval-rmse:0.37616	train-rmse:0.00335
[770]	eval-rmse:0.37616	train-rmse:0.00335
[771]	eval-rmse:0.37616	train-rmse:0.00335
[772]	eval-rmse:0.37617	train-rmse:0.00330
[773]	eval-rmse:0.37617	train-rmse:0.00330
[774]	eval-rmse:0.37617	train-rmse:0.00329
[775]	eval-rmse:0.37617	train-rmse:0.00329
[776]	eval-rmse:0.37617	train-rmse:0.00329
[777]	eval-rmse:0.37617	train-rmse:0.00329
[778]	eval-rmse:0.37617	train-rmse:0.00329
[779]	eval-rmse:0.37617	train-rmse:0.00329
[780]	eval-rmse:0.37617	train-rmse:0.00329
[781]	eval-rmse:0.37617	train-rmse:0.00326
[782]	eval-rmse:0.37617	train-rmse:0.00325
[783]	eval-rmse:0.37617	train-rmse:0.00325
[784]	eval-rmse:0.37617	train-rmse:0.00325
[785]	eval-rmse:0.37617	train-rmse:0.00324
[786]	eval-rmse:0.37617	train-rmse:0.00324
[787]	eval-rmse:0.37617	train-rmse:0.00324
[788]	eval-rmse:0.37617	train-rmse:0.00323
[789]	eval-rmse:0.37617	train-rmse:0.00323
[790]	eval-

[958]	eval-rmse:0.37623	train-rmse:0.00222
[959]	eval-rmse:0.37623	train-rmse:0.00222
[960]	eval-rmse:0.37623	train-rmse:0.00222
[961]	eval-rmse:0.37623	train-rmse:0.00222
[962]	eval-rmse:0.37623	train-rmse:0.00222
[963]	eval-rmse:0.37623	train-rmse:0.00221
[964]	eval-rmse:0.37623	train-rmse:0.00221
[965]	eval-rmse:0.37623	train-rmse:0.00221
[966]	eval-rmse:0.37623	train-rmse:0.00220
[967]	eval-rmse:0.37623	train-rmse:0.00220
[968]	eval-rmse:0.37624	train-rmse:0.00217
[969]	eval-rmse:0.37624	train-rmse:0.00216
[970]	eval-rmse:0.37624	train-rmse:0.00216
[971]	eval-rmse:0.37624	train-rmse:0.00216
[972]	eval-rmse:0.37624	train-rmse:0.00216
[973]	eval-rmse:0.37623	train-rmse:0.00216
[974]	eval-rmse:0.37623	train-rmse:0.00216
[975]	eval-rmse:0.37623	train-rmse:0.00216
[976]	eval-rmse:0.37623	train-rmse:0.00216
[977]	eval-rmse:0.37623	train-rmse:0.00216
[978]	eval-rmse:0.37624	train-rmse:0.00214
[979]	eval-rmse:0.37624	train-rmse:0.00213
[980]	eval-rmse:0.37624	train-rmse:0.00213
[981]	eval-

In [13]:
#prediction using test data
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [15]:
#Use mean absolute error to get a basic estimate of the error
mae = (abs(pred - Y_test)).mean()
mae

7.44418228884538

In [16]:
filename = "xgb_routeOptimization.sav"
pickle.dump(gbm, open(filename, 'wb'))