In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv("trainData.csv")

In [3]:
df_train.shape

(10000, 11)

In [4]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,latitude_difference,longitude_difference,distance
0,10000,2016-03-10 08:01:18,2016-03-10 08:11:26,-74.002968,40.760349,-73.985107,40.760189,10.0,-0.00016,0.01786,1.245105
1,10001,2016-03-10 08:01:18,2016-03-10 08:16:50,-73.965683,40.774269,-73.966766,40.758808,16.0,-0.015461,-0.001083,1.143103
2,10002,2016-03-10 08:01:19,2016-03-11 07:51:43,-73.958603,40.800125,-73.958023,40.784805,1430.0,-0.01532,0.00058,1.09856
3,10003,2016-03-10 08:01:20,2016-03-10 08:12:16,-73.99044,40.75621,-73.972229,40.75943,11.0,0.00322,0.018211,1.480736
4,10004,2016-03-10 08:01:20,2016-03-10 08:09:50,-73.982124,40.774906,-73.96917,40.798092,8.0,0.023186,0.012955,2.497062


As trip duration is alsready calculated pickup_datetime, dropoff_datetime are excluded from training

In [5]:
X = df_train.drop(["trip_duration", "tpep_pickup_datetime", "tpep_dropoff_datetime","Unnamed: 0"], axis=1)
Y = df_train["trip_duration"]

In [6]:
#Spliting the data into training, test, and valdiation sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)
X_train, X_eval, Y_train, Y_eval = train_test_split(X_train, Y_train, test_size=0.25, random_state=100)

In [7]:
#evaluation metric
def evalMatric(y_actual, y_pred):
    assert len(y_actual) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_actual + 1)).mean() ** 0.5

In [8]:
#parameters for XGboost 
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'evalMatric'
}

In [9]:
nrounds = 1000

In [10]:
#Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(Y_train+1))
dval = xgb.DMatrix(X_eval, np.log(Y_eval+1))

#track the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [11]:
#Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

[0]	eval-rmse:2.07188	train-rmse:2.05001
[1]	eval-rmse:1.97390	train-rmse:1.95273
[2]	eval-rmse:1.88263	train-rmse:1.86198
[3]	eval-rmse:1.79574	train-rmse:1.77554
[4]	eval-rmse:1.71130	train-rmse:1.69187
[5]	eval-rmse:1.63130	train-rmse:1.61254
[6]	eval-rmse:1.55734	train-rmse:1.53896
[7]	eval-rmse:1.48549	train-rmse:1.46775
[8]	eval-rmse:1.41769	train-rmse:1.40006
[9]	eval-rmse:1.35337	train-rmse:1.33615
[10]	eval-rmse:1.29216	train-rmse:1.27523
[11]	eval-rmse:1.23480	train-rmse:1.21783
[12]	eval-rmse:1.18024	train-rmse:1.16305
[13]	eval-rmse:1.12870	train-rmse:1.11140
[14]	eval-rmse:1.08027	train-rmse:1.06259
[15]	eval-rmse:1.03436	train-rmse:1.01588
[16]	eval-rmse:0.99191	train-rmse:0.97238
[17]	eval-rmse:0.95077	train-rmse:0.93021
[18]	eval-rmse:0.91231	train-rmse:0.89054
[19]	eval-rmse:0.87577	train-rmse:0.85286
[20]	eval-rmse:0.84133	train-rmse:0.81705
[21]	eval-rmse:0.80955	train-rmse:0.78363
[22]	eval-rmse:0.77946	train-rmse:0.75156
[23]	eval-rmse:0.75103	train-rmse:0.72148
[2

[194]	eval-rmse:0.36662	train-rmse:0.11100
[195]	eval-rmse:0.36661	train-rmse:0.11078
[196]	eval-rmse:0.36651	train-rmse:0.11053
[197]	eval-rmse:0.36653	train-rmse:0.11024
[198]	eval-rmse:0.36649	train-rmse:0.10999
[199]	eval-rmse:0.36652	train-rmse:0.10973
[200]	eval-rmse:0.36652	train-rmse:0.10944
[201]	eval-rmse:0.36646	train-rmse:0.10920
[202]	eval-rmse:0.36647	train-rmse:0.10909
[203]	eval-rmse:0.36647	train-rmse:0.10890
[204]	eval-rmse:0.36653	train-rmse:0.10866
[205]	eval-rmse:0.36655	train-rmse:0.10853
[206]	eval-rmse:0.36656	train-rmse:0.10836
[207]	eval-rmse:0.36658	train-rmse:0.10824
[208]	eval-rmse:0.36659	train-rmse:0.10809
[209]	eval-rmse:0.36661	train-rmse:0.10791
[210]	eval-rmse:0.36659	train-rmse:0.10781
[211]	eval-rmse:0.36659	train-rmse:0.10773
[212]	eval-rmse:0.36659	train-rmse:0.10753
[213]	eval-rmse:0.36658	train-rmse:0.10741
[214]	eval-rmse:0.36654	train-rmse:0.10719
[215]	eval-rmse:0.36654	train-rmse:0.10695
[216]	eval-rmse:0.36651	train-rmse:0.10686
[217]	eval-

[385]	eval-rmse:0.36728	train-rmse:0.09626
[386]	eval-rmse:0.36728	train-rmse:0.09624
[387]	eval-rmse:0.36728	train-rmse:0.09622
[388]	eval-rmse:0.36729	train-rmse:0.09621
[389]	eval-rmse:0.36729	train-rmse:0.09619
[390]	eval-rmse:0.36728	train-rmse:0.09617
[391]	eval-rmse:0.36728	train-rmse:0.09615
[392]	eval-rmse:0.36727	train-rmse:0.09614
[393]	eval-rmse:0.36727	train-rmse:0.09612
[394]	eval-rmse:0.36728	train-rmse:0.09610
[395]	eval-rmse:0.36728	train-rmse:0.09608
[396]	eval-rmse:0.36728	train-rmse:0.09606
[397]	eval-rmse:0.36728	train-rmse:0.09604
[398]	eval-rmse:0.36728	train-rmse:0.09602
[399]	eval-rmse:0.36728	train-rmse:0.09599
[400]	eval-rmse:0.36729	train-rmse:0.09598
[401]	eval-rmse:0.36730	train-rmse:0.09596
[402]	eval-rmse:0.36730	train-rmse:0.09595
[403]	eval-rmse:0.36729	train-rmse:0.09594
[404]	eval-rmse:0.36728	train-rmse:0.09591
[405]	eval-rmse:0.36729	train-rmse:0.09591
[406]	eval-rmse:0.36727	train-rmse:0.09589
[407]	eval-rmse:0.36728	train-rmse:0.09588
[408]	eval-

[576]	eval-rmse:0.36747	train-rmse:0.09475
[577]	eval-rmse:0.36747	train-rmse:0.09475
[578]	eval-rmse:0.36746	train-rmse:0.09475
[579]	eval-rmse:0.36746	train-rmse:0.09474
[580]	eval-rmse:0.36747	train-rmse:0.09474
[581]	eval-rmse:0.36747	train-rmse:0.09474
[582]	eval-rmse:0.36748	train-rmse:0.09474
[583]	eval-rmse:0.36748	train-rmse:0.09473
[584]	eval-rmse:0.36749	train-rmse:0.09473
[585]	eval-rmse:0.36749	train-rmse:0.09473
[586]	eval-rmse:0.36749	train-rmse:0.09473
[587]	eval-rmse:0.36748	train-rmse:0.09473
[588]	eval-rmse:0.36747	train-rmse:0.09472
[589]	eval-rmse:0.36747	train-rmse:0.09472
[590]	eval-rmse:0.36748	train-rmse:0.09472
[591]	eval-rmse:0.36749	train-rmse:0.09472
[592]	eval-rmse:0.36750	train-rmse:0.09472
[593]	eval-rmse:0.36750	train-rmse:0.09472
[594]	eval-rmse:0.36750	train-rmse:0.09472
[595]	eval-rmse:0.36750	train-rmse:0.09472
[596]	eval-rmse:0.36750	train-rmse:0.09471
[597]	eval-rmse:0.36750	train-rmse:0.09471
[598]	eval-rmse:0.36751	train-rmse:0.09471
[599]	eval-

[767]	eval-rmse:0.36752	train-rmse:0.09456
[768]	eval-rmse:0.36752	train-rmse:0.09456
[769]	eval-rmse:0.36752	train-rmse:0.09456
[770]	eval-rmse:0.36752	train-rmse:0.09456
[771]	eval-rmse:0.36752	train-rmse:0.09456
[772]	eval-rmse:0.36752	train-rmse:0.09456
[773]	eval-rmse:0.36752	train-rmse:0.09456
[774]	eval-rmse:0.36752	train-rmse:0.09456
[775]	eval-rmse:0.36751	train-rmse:0.09456
[776]	eval-rmse:0.36752	train-rmse:0.09456
[777]	eval-rmse:0.36752	train-rmse:0.09456
[778]	eval-rmse:0.36751	train-rmse:0.09456
[779]	eval-rmse:0.36752	train-rmse:0.09456
[780]	eval-rmse:0.36752	train-rmse:0.09456
[781]	eval-rmse:0.36750	train-rmse:0.09456
[782]	eval-rmse:0.36751	train-rmse:0.09456
[783]	eval-rmse:0.36751	train-rmse:0.09456
[784]	eval-rmse:0.36751	train-rmse:0.09456
[785]	eval-rmse:0.36751	train-rmse:0.09456
[786]	eval-rmse:0.36752	train-rmse:0.09455
[787]	eval-rmse:0.36752	train-rmse:0.09455
[788]	eval-rmse:0.36751	train-rmse:0.09455
[789]	eval-rmse:0.36751	train-rmse:0.09455
[790]	eval-

[958]	eval-rmse:0.36756	train-rmse:0.09453
[959]	eval-rmse:0.36755	train-rmse:0.09453
[960]	eval-rmse:0.36756	train-rmse:0.09453
[961]	eval-rmse:0.36756	train-rmse:0.09453
[962]	eval-rmse:0.36754	train-rmse:0.09453
[963]	eval-rmse:0.36754	train-rmse:0.09453
[964]	eval-rmse:0.36754	train-rmse:0.09453
[965]	eval-rmse:0.36755	train-rmse:0.09453
[966]	eval-rmse:0.36755	train-rmse:0.09453
[967]	eval-rmse:0.36754	train-rmse:0.09453
[968]	eval-rmse:0.36755	train-rmse:0.09453
[969]	eval-rmse:0.36755	train-rmse:0.09453
[970]	eval-rmse:0.36755	train-rmse:0.09453
[971]	eval-rmse:0.36756	train-rmse:0.09453
[972]	eval-rmse:0.36755	train-rmse:0.09453
[973]	eval-rmse:0.36755	train-rmse:0.09453
[974]	eval-rmse:0.36755	train-rmse:0.09453
[975]	eval-rmse:0.36754	train-rmse:0.09453
[976]	eval-rmse:0.36753	train-rmse:0.09453
[977]	eval-rmse:0.36753	train-rmse:0.09453
[978]	eval-rmse:0.36754	train-rmse:0.09453
[979]	eval-rmse:0.36754	train-rmse:0.09453
[980]	eval-rmse:0.36755	train-rmse:0.09453
[981]	eval-

In [12]:
#prediction using test data
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [13]:
#Use mean absolute error to get a basic estimate of the error
mae = (abs(pred - Y_test)).mean()
mae

7.391758913000425

In [14]:
feature_scores = gbm.get_fscore()
summ = 0
for key in feature_scores:
    summ = summ + feature_scores[key]

for key in feature_scores:
    feature_scores[key] = feature_scores[key] / summ

feature_scores

{'pickup_longitude': 0.16814912169132715,
 'dropoff_latitude': 0.14836421192464777,
 'pickup_latitude': 0.15538785489181894,
 'longitude_difference': 0.13413603538672433,
 'latitude_difference': 0.13226494820593265,
 'dropoff_longitude': 0.1480533061997428,
 'distance': 0.11364452169980639}

In [15]:
filename = "xgb_routeOptimization.sav"
pickle.dump(gbm, open(filename, 'wb'))