In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb

import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [2]:
sample_df = pd.read_csv("Solar1.csv",encoding='latin1')

In [3]:
sample_df.head()

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup_date,dropoff_date,trip_duration
0,9.426832,-0.927321,,,22-01-20,23-01-20,0
1,9.426832,-0.927321,,,22-01-20,23-01-20,0
2,9.411328,-0.850727,9.426832,-0.927321,22-01-20,23-01-20,300
3,9.411328,-0.850727,9.426832,-0.927321,23-01-20,24-01-20,1860
4,9.415693,-0.851178,9.426832,-0.927321,23-01-20,24-01-20,960


In [4]:
sample_df.shape

(449, 7)

In [5]:
sample_df.isnull().sum()  #Null values

pickup_latitude       4
pickup_longitude      4
dropoff_latitude     27
dropoff_longitude    27
pickup_date           8
dropoff_date          8
trip_duration         0
dtype: int64

In [6]:
sample_df.dtypes  #data types 

pickup_latitude      float64
pickup_longitude     float64
dropoff_latitude     float64
dropoff_longitude    float64
pickup_date           object
dropoff_date          object
trip_duration          int64
dtype: object

In [7]:
sample_df.dropna(inplace=True)   #dropping rows with null values

In [8]:
sample_df.isnull().sum() #confirm

pickup_latitude      0
pickup_longitude     0
dropoff_latitude     0
dropoff_longitude    0
pickup_date          0
dropoff_date         0
trip_duration        0
dtype: int64

In [9]:
sample_df.duplicated().sum()  #check duplicated values 

4

In [10]:
sample_df[sample_df.duplicated()]   #dataframe for duplicated values 

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup_date,dropoff_date,trip_duration
62,9.426832,-0.927321,9.426832,-0.927321,25-01-20,26-01-20,600
257,9.462402,-0.865485,9.462402,-0.865485,07-12-19,07-12-19,480
366,9.462402,-0.865485,9.462402,-0.865485,20-12-19,20-12-19,480
371,9.462402,-0.865485,9.462402,-0.865485,20-12-19,20-12-19,420


In [11]:
sample_df['pickup_latitude'].duplicated().sum() #check app duplication

214

In [12]:
sample_df.drop_duplicates(inplace=True)  #remove duplicate

In [13]:
sample_df.duplicated().sum()    #confirm 

0

In [14]:
sample_df.shape

(412, 7)

In [15]:
#Get latitude and longitude differences 
sample_df["latitude_difference"] = sample_df["dropoff_latitude"] - sample_df["pickup_latitude"]
sample_df["longitude_difference"] = sample_df["dropoff_longitude"] - sample_df["pickup_longitude"]

In [16]:
#Convert duration to minutes for easier interpretation
sample_df["trip_duration"] = sample_df["trip_duration"].apply(lambda x: round(x/60)) 

In [17]:
sample_df

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup_date,dropoff_date,trip_duration,latitude_difference,longitude_difference
2,9.411328,-0.850727,9.426832,-0.927321,22-01-20,23-01-20,5,0.015503,-0.076593
3,9.411328,-0.850727,9.426832,-0.927321,23-01-20,24-01-20,31,0.015503,-0.076593
4,9.415693,-0.851178,9.426832,-0.927321,23-01-20,24-01-20,16,0.011138,-0.076142
5,9.426832,-0.927321,9.426832,-0.927321,23-01-20,24-01-20,74,0.000000,0.000000
6,9.411328,-0.850727,9.426832,-0.927321,23-01-20,24-01-20,10,0.015503,-0.076593
...,...,...,...,...,...,...,...,...,...
436,9.404727,-0.841894,9.456058,-0.860088,24-12-19,24-12-19,9,0.051331,-0.018194
437,9.414402,-0.848086,9.403919,-0.841275,24-12-19,24-12-19,29,-0.010483,0.006811
438,9.462402,-0.865485,9.417728,-0.850240,24-12-19,24-12-19,23,-0.044674,0.015245
439,9.462402,-0.865485,9.462402,-0.865485,24-12-19,24-12-19,8,0.000000,0.000000


In [18]:
#Convert trip distance from longitude and latitude differences to Manhattan distance.
sample_df["trip_distance"] = 0.621371 * 6371 * (abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
                                     abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2))), 
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2)))))))

In [19]:
sample_df

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup_date,dropoff_date,trip_duration,latitude_difference,longitude_difference,trip_distance
2,9.411328,-0.850727,9.426832,-0.927321,22-01-20,23-01-20,5,0.015503,-0.076593,6.363279
3,9.411328,-0.850727,9.426832,-0.927321,23-01-20,24-01-20,31,0.015503,-0.076593,6.363279
4,9.415693,-0.851178,9.426832,-0.927321,23-01-20,24-01-20,16,0.011138,-0.076142,6.030519
5,9.426832,-0.927321,9.426832,-0.927321,23-01-20,24-01-20,74,0.000000,0.000000,0.000000
6,9.411328,-0.850727,9.426832,-0.927321,23-01-20,24-01-20,10,0.015503,-0.076593,6.363279
...,...,...,...,...,...,...,...,...,...,...
436,9.404727,-0.841894,9.456058,-0.860088,24-12-19,24-12-19,9,0.051331,-0.018194,4.803713
437,9.414402,-0.848086,9.403919,-0.841275,24-12-19,24-12-19,29,-0.010483,0.006811,1.194877
438,9.462402,-0.865485,9.417728,-0.850240,24-12-19,24-12-19,23,-0.044674,0.015245,4.139979
439,9.462402,-0.865485,9.462402,-0.865485,24-12-19,24-12-19,8,0.000000,0.000000,0.000000


In [20]:
sample_df.shape

(412, 10)

In [21]:
sample_df.drop([])

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,pickup_date,dropoff_date,trip_duration,latitude_difference,longitude_difference,trip_distance
2,9.411328,-0.850727,9.426832,-0.927321,22-01-20,23-01-20,5,0.015503,-0.076593,6.363279
3,9.411328,-0.850727,9.426832,-0.927321,23-01-20,24-01-20,31,0.015503,-0.076593,6.363279
4,9.415693,-0.851178,9.426832,-0.927321,23-01-20,24-01-20,16,0.011138,-0.076142,6.030519
5,9.426832,-0.927321,9.426832,-0.927321,23-01-20,24-01-20,74,0.000000,0.000000,0.000000
6,9.411328,-0.850727,9.426832,-0.927321,23-01-20,24-01-20,10,0.015503,-0.076593,6.363279
...,...,...,...,...,...,...,...,...,...,...
436,9.404727,-0.841894,9.456058,-0.860088,24-12-19,24-12-19,9,0.051331,-0.018194,4.803713
437,9.414402,-0.848086,9.403919,-0.841275,24-12-19,24-12-19,29,-0.010483,0.006811,1.194877
438,9.462402,-0.865485,9.417728,-0.850240,24-12-19,24-12-19,23,-0.044674,0.015245,4.139979
439,9.462402,-0.865485,9.462402,-0.865485,24-12-19,24-12-19,8,0.000000,0.000000,0.000000


In [22]:
X = sample_df.drop(["trip_duration","pickup_date", "dropoff_date"], axis=1)
y = sample_df["trip_duration"]

In [23]:
print(X)

     pickup_latitude  pickup_longitude  dropoff_latitude  dropoff_longitude  \
2           9.411328         -0.850727          9.426832          -0.927321   
3           9.411328         -0.850727          9.426832          -0.927321   
4           9.415693         -0.851178          9.426832          -0.927321   
5           9.426832         -0.927321          9.426832          -0.927321   
6           9.411328         -0.850727          9.426832          -0.927321   
..               ...               ...               ...                ...   
436         9.404727         -0.841894          9.456058          -0.860088   
437         9.414402         -0.848086          9.403919          -0.841275   
438         9.462402         -0.865485          9.417728          -0.850240   
439         9.462402         -0.865485          9.462402          -0.865485   
440         9.462402         -0.865485          9.417540          -0.849999   

     latitude_difference  longitude_difference  tri

In [24]:
print(y)

2       5
3      31
4      16
5      74
6      10
       ..
436     9
437    29
438    23
439     8
440    19
Name: trip_duration, Length: 412, dtype: int64


In [25]:
#Split the data into training, test, and valdiation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [26]:
print(X_train)

     pickup_latitude  pickup_longitude  dropoff_latitude  dropoff_longitude  \
180         9.392212         -0.818977          9.462402          -0.865485   
438         9.462402         -0.865485          9.417728          -0.850240   
273         9.417641         -0.850397          9.404611          -0.860418   
259         9.417839         -0.850402          9.404780          -0.860259   
418         9.417752         -0.850277          9.462402          -0.865485   
..               ...               ...               ...                ...   
305         9.462402         -0.865485          9.416295          -0.849474   
391         9.420258         -0.852484          9.462402          -0.865485   
264         9.394955         -0.823792          9.413039          -0.814505   
119         9.430649         -1.070065          9.411328          -0.850727   
44          9.411328         -0.850727          9.287833          -1.591846   

     latitude_difference  longitude_difference  tri

In [27]:
print(y_train)

180     2
438    23
273     3
259    19
418    16
       ..
305     0
391    11
264     5
119     5
44     24
Name: trip_duration, Length: 154, dtype: int64


In [28]:
print(X_test)

     pickup_latitude  pickup_longitude  dropoff_latitude  dropoff_longitude  \
336         9.462402         -0.865485          9.401405          -0.845859   
113         9.452230         -0.857609          9.404612          -0.862615   
65          9.411328         -0.850727          9.426832          -0.927321   
358         9.401223         -0.846822          9.462402          -0.865485   
356         9.462402         -0.865485          9.450021          -0.858755   
..               ...               ...               ...                ...   
286         9.462402         -0.865485          9.426917          -0.879002   
54          9.411328         -0.850727          9.393014          -0.825670   
195         9.415896         -0.822287          9.462402          -0.865485   
386         9.462402         -0.865485          9.417714          -0.850234   
436         9.404727         -0.841894          9.456058          -0.860088   

     latitude_difference  longitude_difference  tri

In [29]:
print(y_test)

336     4
113    30
65      3
358     5
356     7
       ..
286    15
54      9
195    11
386     5
436     9
Name: trip_duration, Length: 206, dtype: int64


In [30]:
#Define evaluation metric
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

In [31]:
#XGBoost parameters 
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'rmsle'
}

In [32]:
nrounds = 2000

In [33]:
#Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(y_train+1))
dval = xgb.DMatrix(X_val, np.log(y_val+1))

#this is for tracking the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [34]:
#Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

[0]	eval-rmse:2.10793	train-rmse:2.11387
[1]	eval-rmse:2.03129	train-rmse:2.02792
[2]	eval-rmse:1.96650	train-rmse:1.95230
[3]	eval-rmse:1.89698	train-rmse:1.87813
[4]	eval-rmse:1.83848	train-rmse:1.80352
[5]	eval-rmse:1.77810	train-rmse:1.73181
[6]	eval-rmse:1.73457	train-rmse:1.66680
[7]	eval-rmse:1.68092	train-rmse:1.60512
[8]	eval-rmse:1.64195	train-rmse:1.54621
[9]	eval-rmse:1.59663	train-rmse:1.48915
[10]	eval-rmse:1.55781	train-rmse:1.43592
[11]	eval-rmse:1.52489	train-rmse:1.38174
[12]	eval-rmse:1.49646	train-rmse:1.33498
[13]	eval-rmse:1.46295	train-rmse:1.28829
[14]	eval-rmse:1.43946	train-rmse:1.24670
[15]	eval-rmse:1.41844	train-rmse:1.20208
[16]	eval-rmse:1.40457	train-rmse:1.16382
[17]	eval-rmse:1.38630	train-rmse:1.12400
[18]	eval-rmse:1.36333	train-rmse:1.08709
[19]	eval-rmse:1.34165	train-rmse:1.04956
[20]	eval-rmse:1.33098	train-rmse:1.01424
[21]	eval-rmse:1.31940	train-rmse:0.98388
[22]	eval-rmse:1.30383	train-rmse:0.95414
[23]	eval-rmse:1.29151	train-rmse:0.92408
[2

[194]	eval-rmse:1.21724	train-rmse:0.30979
[195]	eval-rmse:1.21736	train-rmse:0.30978
[196]	eval-rmse:1.21742	train-rmse:0.30976
[197]	eval-rmse:1.21731	train-rmse:0.30974
[198]	eval-rmse:1.21754	train-rmse:0.30973
[199]	eval-rmse:1.21717	train-rmse:0.30975
[200]	eval-rmse:1.21708	train-rmse:0.30974
[201]	eval-rmse:1.21718	train-rmse:0.30974
[202]	eval-rmse:1.21691	train-rmse:0.30976
[203]	eval-rmse:1.21698	train-rmse:0.30975
[204]	eval-rmse:1.21731	train-rmse:0.30973
[205]	eval-rmse:1.21733	train-rmse:0.30973
[206]	eval-rmse:1.21736	train-rmse:0.30972
[207]	eval-rmse:1.21756	train-rmse:0.30972
[208]	eval-rmse:1.21747	train-rmse:0.30972
[209]	eval-rmse:1.21741	train-rmse:0.30972
[210]	eval-rmse:1.21708	train-rmse:0.30976
[211]	eval-rmse:1.21612	train-rmse:0.30980
[212]	eval-rmse:1.21638	train-rmse:0.30980
[213]	eval-rmse:1.21613	train-rmse:0.30979
[214]	eval-rmse:1.21656	train-rmse:0.30977
[215]	eval-rmse:1.21663	train-rmse:0.30977
[216]	eval-rmse:1.21711	train-rmse:0.30976
[217]	eval-

[385]	eval-rmse:1.21789	train-rmse:0.30966
[386]	eval-rmse:1.21791	train-rmse:0.30967
[387]	eval-rmse:1.21803	train-rmse:0.30970
[388]	eval-rmse:1.21801	train-rmse:0.30970
[389]	eval-rmse:1.21796	train-rmse:0.30968
[390]	eval-rmse:1.21822	train-rmse:0.30968
[391]	eval-rmse:1.21850	train-rmse:0.30971
[392]	eval-rmse:1.21823	train-rmse:0.30970
[393]	eval-rmse:1.21795	train-rmse:0.30970
[394]	eval-rmse:1.21803	train-rmse:0.30970
[395]	eval-rmse:1.21747	train-rmse:0.30967
[396]	eval-rmse:1.21754	train-rmse:0.30966
[397]	eval-rmse:1.21722	train-rmse:0.30964
[398]	eval-rmse:1.21721	train-rmse:0.30964
[399]	eval-rmse:1.21708	train-rmse:0.30963
[400]	eval-rmse:1.21729	train-rmse:0.30965
[401]	eval-rmse:1.21709	train-rmse:0.30964
[402]	eval-rmse:1.21709	train-rmse:0.30964
[403]	eval-rmse:1.21697	train-rmse:0.30966
[404]	eval-rmse:1.21707	train-rmse:0.30963
[405]	eval-rmse:1.21737	train-rmse:0.30963
[406]	eval-rmse:1.21726	train-rmse:0.30962
[407]	eval-rmse:1.21718	train-rmse:0.30963
[408]	eval-

[576]	eval-rmse:1.21736	train-rmse:0.30964
[577]	eval-rmse:1.21734	train-rmse:0.30964
[578]	eval-rmse:1.21698	train-rmse:0.30965
[579]	eval-rmse:1.21720	train-rmse:0.30963
[580]	eval-rmse:1.21761	train-rmse:0.30965
[581]	eval-rmse:1.21751	train-rmse:0.30965
[582]	eval-rmse:1.21764	train-rmse:0.30964
[583]	eval-rmse:1.21738	train-rmse:0.30964
[584]	eval-rmse:1.21743	train-rmse:0.30964
[585]	eval-rmse:1.21765	train-rmse:0.30964
[586]	eval-rmse:1.21774	train-rmse:0.30965
[587]	eval-rmse:1.21803	train-rmse:0.30965
[588]	eval-rmse:1.21821	train-rmse:0.30968
[589]	eval-rmse:1.21795	train-rmse:0.30965
[590]	eval-rmse:1.21786	train-rmse:0.30965
[591]	eval-rmse:1.21820	train-rmse:0.30969
[592]	eval-rmse:1.21816	train-rmse:0.30969
[593]	eval-rmse:1.21797	train-rmse:0.30968
[594]	eval-rmse:1.21811	train-rmse:0.30969
[595]	eval-rmse:1.21792	train-rmse:0.30966
[596]	eval-rmse:1.21816	train-rmse:0.30967
[597]	eval-rmse:1.21803	train-rmse:0.30971
[598]	eval-rmse:1.21825	train-rmse:0.30971
[599]	eval-

[767]	eval-rmse:1.21934	train-rmse:0.30964
[768]	eval-rmse:1.21928	train-rmse:0.30965
[769]	eval-rmse:1.21927	train-rmse:0.30965
[770]	eval-rmse:1.21914	train-rmse:0.30965
[771]	eval-rmse:1.21910	train-rmse:0.30967
[772]	eval-rmse:1.21876	train-rmse:0.30964
[773]	eval-rmse:1.21890	train-rmse:0.30965
[774]	eval-rmse:1.21917	train-rmse:0.30965
[775]	eval-rmse:1.21939	train-rmse:0.30966
[776]	eval-rmse:1.21927	train-rmse:0.30966
[777]	eval-rmse:1.21892	train-rmse:0.30964
[778]	eval-rmse:1.21939	train-rmse:0.30964
[779]	eval-rmse:1.21939	train-rmse:0.30964
[780]	eval-rmse:1.21920	train-rmse:0.30964
[781]	eval-rmse:1.21919	train-rmse:0.30964
[782]	eval-rmse:1.21914	train-rmse:0.30965
[783]	eval-rmse:1.21913	train-rmse:0.30963
[784]	eval-rmse:1.21914	train-rmse:0.30963
[785]	eval-rmse:1.21923	train-rmse:0.30963
[786]	eval-rmse:1.21931	train-rmse:0.30963
[787]	eval-rmse:1.21926	train-rmse:0.30963
[788]	eval-rmse:1.21933	train-rmse:0.30963
[789]	eval-rmse:1.21935	train-rmse:0.30964
[790]	eval-

[958]	eval-rmse:1.21747	train-rmse:0.30966
[959]	eval-rmse:1.21774	train-rmse:0.30966
[960]	eval-rmse:1.21772	train-rmse:0.30966
[961]	eval-rmse:1.21773	train-rmse:0.30967
[962]	eval-rmse:1.21769	train-rmse:0.30967
[963]	eval-rmse:1.21833	train-rmse:0.30972
[964]	eval-rmse:1.21845	train-rmse:0.30973
[965]	eval-rmse:1.21828	train-rmse:0.30972
[966]	eval-rmse:1.21812	train-rmse:0.30971
[967]	eval-rmse:1.21807	train-rmse:0.30975
[968]	eval-rmse:1.21810	train-rmse:0.30974
[969]	eval-rmse:1.21824	train-rmse:0.30976
[970]	eval-rmse:1.21821	train-rmse:0.30975
[971]	eval-rmse:1.21791	train-rmse:0.30979
[972]	eval-rmse:1.21795	train-rmse:0.30978
[973]	eval-rmse:1.21746	train-rmse:0.30976
[974]	eval-rmse:1.21757	train-rmse:0.30975
[975]	eval-rmse:1.21755	train-rmse:0.30974
[976]	eval-rmse:1.21785	train-rmse:0.30974
[977]	eval-rmse:1.21749	train-rmse:0.30971
[978]	eval-rmse:1.21757	train-rmse:0.30971
[979]	eval-rmse:1.21780	train-rmse:0.30972
[980]	eval-rmse:1.21770	train-rmse:0.30972
[981]	eval-

[1146]	eval-rmse:1.21653	train-rmse:0.30972
[1147]	eval-rmse:1.21653	train-rmse:0.30971
[1148]	eval-rmse:1.21630	train-rmse:0.30970
[1149]	eval-rmse:1.21608	train-rmse:0.30968
[1150]	eval-rmse:1.21606	train-rmse:0.30970
[1151]	eval-rmse:1.21586	train-rmse:0.30969
[1152]	eval-rmse:1.21584	train-rmse:0.30969
[1153]	eval-rmse:1.21613	train-rmse:0.30966
[1154]	eval-rmse:1.21599	train-rmse:0.30966
[1155]	eval-rmse:1.21637	train-rmse:0.30968
[1156]	eval-rmse:1.21641	train-rmse:0.30967
[1157]	eval-rmse:1.21640	train-rmse:0.30967
[1158]	eval-rmse:1.21628	train-rmse:0.30967
[1159]	eval-rmse:1.21653	train-rmse:0.30966
[1160]	eval-rmse:1.21653	train-rmse:0.30966
[1161]	eval-rmse:1.21696	train-rmse:0.30970
[1162]	eval-rmse:1.21693	train-rmse:0.30969
[1163]	eval-rmse:1.21692	train-rmse:0.30970
[1164]	eval-rmse:1.21682	train-rmse:0.30970
[1165]	eval-rmse:1.21615	train-rmse:0.30972
[1166]	eval-rmse:1.21598	train-rmse:0.30976
[1167]	eval-rmse:1.21617	train-rmse:0.30971
[1168]	eval-rmse:1.21606	train-r

[1333]	eval-rmse:1.21803	train-rmse:0.30963
[1334]	eval-rmse:1.21817	train-rmse:0.30964
[1335]	eval-rmse:1.21826	train-rmse:0.30964
[1336]	eval-rmse:1.21811	train-rmse:0.30963
[1337]	eval-rmse:1.21853	train-rmse:0.30964
[1338]	eval-rmse:1.21884	train-rmse:0.30966
[1339]	eval-rmse:1.21896	train-rmse:0.30967
[1340]	eval-rmse:1.21891	train-rmse:0.30966
[1341]	eval-rmse:1.21915	train-rmse:0.30970
[1342]	eval-rmse:1.21899	train-rmse:0.30969
[1343]	eval-rmse:1.21859	train-rmse:0.30971
[1344]	eval-rmse:1.21861	train-rmse:0.30969
[1345]	eval-rmse:1.21852	train-rmse:0.30967
[1346]	eval-rmse:1.21888	train-rmse:0.30972
[1347]	eval-rmse:1.21890	train-rmse:0.30971
[1348]	eval-rmse:1.21873	train-rmse:0.30972
[1349]	eval-rmse:1.21833	train-rmse:0.30970
[1350]	eval-rmse:1.21823	train-rmse:0.30966
[1351]	eval-rmse:1.21806	train-rmse:0.30966
[1352]	eval-rmse:1.21762	train-rmse:0.30969
[1353]	eval-rmse:1.21752	train-rmse:0.30969
[1354]	eval-rmse:1.21775	train-rmse:0.30968
[1355]	eval-rmse:1.21779	train-r

[1520]	eval-rmse:1.21342	train-rmse:0.30983
[1521]	eval-rmse:1.21344	train-rmse:0.30981
[1522]	eval-rmse:1.21365	train-rmse:0.30975
[1523]	eval-rmse:1.21372	train-rmse:0.30975
[1524]	eval-rmse:1.21362	train-rmse:0.30971
[1525]	eval-rmse:1.21360	train-rmse:0.30971
[1526]	eval-rmse:1.21360	train-rmse:0.30970
[1527]	eval-rmse:1.21362	train-rmse:0.30969
[1528]	eval-rmse:1.21353	train-rmse:0.30966
[1529]	eval-rmse:1.21377	train-rmse:0.30967
[1530]	eval-rmse:1.21377	train-rmse:0.30967
[1531]	eval-rmse:1.21391	train-rmse:0.30969
[1532]	eval-rmse:1.21384	train-rmse:0.30967
[1533]	eval-rmse:1.21398	train-rmse:0.30969
[1534]	eval-rmse:1.21413	train-rmse:0.30969
[1535]	eval-rmse:1.21361	train-rmse:0.30968
[1536]	eval-rmse:1.21378	train-rmse:0.30967
[1537]	eval-rmse:1.21323	train-rmse:0.30968
[1538]	eval-rmse:1.21350	train-rmse:0.30965
[1539]	eval-rmse:1.21345	train-rmse:0.30964
[1540]	eval-rmse:1.21366	train-rmse:0.30964
[1541]	eval-rmse:1.21377	train-rmse:0.30963
[1542]	eval-rmse:1.21368	train-r

[1707]	eval-rmse:1.21280	train-rmse:0.30963
[1708]	eval-rmse:1.21256	train-rmse:0.30965
[1709]	eval-rmse:1.21245	train-rmse:0.30968
[1710]	eval-rmse:1.21294	train-rmse:0.30967
[1711]	eval-rmse:1.21296	train-rmse:0.30967
[1712]	eval-rmse:1.21283	train-rmse:0.30967
[1713]	eval-rmse:1.21285	train-rmse:0.30968
[1714]	eval-rmse:1.21333	train-rmse:0.30967
[1715]	eval-rmse:1.21350	train-rmse:0.30969
[1716]	eval-rmse:1.21366	train-rmse:0.30967
[1717]	eval-rmse:1.21379	train-rmse:0.30968
[1718]	eval-rmse:1.21376	train-rmse:0.30967
[1719]	eval-rmse:1.21351	train-rmse:0.30968
[1720]	eval-rmse:1.21335	train-rmse:0.30969
[1721]	eval-rmse:1.21336	train-rmse:0.30968
[1722]	eval-rmse:1.21339	train-rmse:0.30968
[1723]	eval-rmse:1.21321	train-rmse:0.30973
[1724]	eval-rmse:1.21317	train-rmse:0.30973
[1725]	eval-rmse:1.21303	train-rmse:0.30978
[1726]	eval-rmse:1.21312	train-rmse:0.30976
[1727]	eval-rmse:1.21317	train-rmse:0.30974
[1728]	eval-rmse:1.21324	train-rmse:0.30974
[1729]	eval-rmse:1.21349	train-r

[1894]	eval-rmse:1.21271	train-rmse:0.30967
[1895]	eval-rmse:1.21264	train-rmse:0.30967
[1896]	eval-rmse:1.21248	train-rmse:0.30966
[1897]	eval-rmse:1.21276	train-rmse:0.30964
[1898]	eval-rmse:1.21274	train-rmse:0.30965
[1899]	eval-rmse:1.21278	train-rmse:0.30964
[1900]	eval-rmse:1.21305	train-rmse:0.30964
[1901]	eval-rmse:1.21259	train-rmse:0.30965
[1902]	eval-rmse:1.21210	train-rmse:0.30968
[1903]	eval-rmse:1.21240	train-rmse:0.30965
[1904]	eval-rmse:1.21241	train-rmse:0.30964
[1905]	eval-rmse:1.21257	train-rmse:0.30963
[1906]	eval-rmse:1.21234	train-rmse:0.30964
[1907]	eval-rmse:1.21244	train-rmse:0.30963
[1908]	eval-rmse:1.21234	train-rmse:0.30963
[1909]	eval-rmse:1.21236	train-rmse:0.30962
[1910]	eval-rmse:1.21197	train-rmse:0.30964
[1911]	eval-rmse:1.21220	train-rmse:0.30963
[1912]	eval-rmse:1.21230	train-rmse:0.30963
[1913]	eval-rmse:1.21229	train-rmse:0.30962
[1914]	eval-rmse:1.21262	train-rmse:0.30961
[1915]	eval-rmse:1.21255	train-rmse:0.30962
[1916]	eval-rmse:1.21224	train-r

In [35]:
#Test predictions
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [36]:
#Use mean absolute error to get a basic estimate of the error
mae = (abs(pred - y_test)).mean()
mae

15.880267682584744

In [37]:
#Take a look at feature importance
feature_scores = gbm.get_fscore()
feature_scores

{'pickup_longitude': 7272,
 'dropoff_longitude': 5784,
 'trip_distance': 6927,
 'dropoff_latitude': 6585,
 'pickup_latitude': 8202,
 'longitude_difference': 7160,
 'latitude_difference': 7731}

In [38]:
#This is not very telling, so let's scale the features
summ = 0
for key in feature_scores:
    summ = summ + feature_scores[key]

for key in feature_scores:
    feature_scores[key] = feature_scores[key] / summ

feature_scores

{'pickup_longitude': 0.14643281448218926,
 'dropoff_longitude': 0.11646966432411751,
 'trip_distance': 0.13948571313505567,
 'dropoff_latitude': 0.13259902136485371,
 'pickup_latitude': 0.16515978333098408,
 'longitude_difference': 0.14417752361007632,
 'latitude_difference': 0.15567547975272347}

In [39]:
filename = "xgb_model.sav"
pickle.dump(gbm, open(filename, 'wb'))