## NYC taxi dataset competition
# Implement the solution using XGBoost
Code referenced from: https://www.kaggle.com/sandeepkumar121995/eda-data-cleaning-xg-boost

# load some default Python modules

In [1]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

plt.style.use('seaborn-whitegrid')

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive inside the Colab VM; the dataset is read from
# /content/drive/MyDrive/... in the loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading the NYC taxi dataset

In [3]:
import pandas
import random


# Read the sampled training data from the mounted Drive folder, then write a
# copy of it to the current working directory.
sample_csv_path = '/content/drive/MyDrive/new-york-city-taxi-fare-prediction/Copy of train-sample.csv'
df = pandas.read_csv(sample_csv_path)
df.to_csv(path_or_buf="train_sample.csv")

In [4]:
# datatypes
df.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [5]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,1048575.0,1048575.0,1048575.0,1048565.0,1048565.0,1048575.0
mean,11.34548,-72.52724,39.93094,-72.5275,39.92496,1.684902
std,9.820072,12.00798,7.725806,11.41154,8.529585,1.323155
min,-44.9,-3377.681,-3116.285,-3383.297,-3114.339,0.0
25%,6.0,-73.99207,40.73496,-73.99138,40.73406,1.0
50%,8.5,-73.9818,40.75267,-73.98014,40.75318,1.0
75%,12.5,-73.96711,40.76714,-73.96367,40.76812,2.0
max,500.0,2522.271,2621.628,1717.003,1989.728,208.0


In [6]:
# Drop rows with missing values (describe() above shows a handful of rows
# without drop-off coordinates).
df = df.dropna()

## Keep a test set for final testing (TFX internally splits train and validation data)
np.random.seed(seed=2)
msk = np.random.rand(len(df)) < 0.9
# .copy() gives each split its own data, so the feature-engineering cells can
# add columns to `train` / `test` without writing into slices of `df`.
train = df[msk].copy()
test = df[~msk].copy()

print(len(train))
print(len(test))

train.to_csv("/content/data.csv", index=False, header=True)
# The hold-out file is later read back with pd.read_csv("/content/test.csv")
# and indexed by column name ('key'), so it needs a header row and has to be
# written to that same absolute path.
test.to_csv("/content/test.csv", index=False, header=True)
test.head()


943389
105176


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
40,39:49.0,8.1,2011-03-29 15:39:49 UTC,-73.984267,40.744961,-73.981646,40.759608,1
49,26:00.0,9.0,2013-01-29 12:26:00 UTC,-73.992253,40.742657,-73.983597,40.755947,1
58,53:14.0,13.7,2012-07-19 23:53:14 UTC,-74.002113,40.739582,-73.971918,40.791899,2
59,13:29.0,6.5,2009-03-23 12:13:29 UTC,-73.98543,40.736011,-73.982501,40.724802,1
68,19:05.0,11.5,2015-04-12 10:19:05 UTC,-73.979279,40.723438,-74.004608,40.746948,6


In [7]:

print("shape of test data", test.shape)
test.head()


shape of test data (105176, 8)


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
40,39:49.0,8.1,2011-03-29 15:39:49 UTC,-73.984267,40.744961,-73.981646,40.759608,1
49,26:00.0,9.0,2013-01-29 12:26:00 UTC,-73.992253,40.742657,-73.983597,40.755947,1
58,53:14.0,13.7,2012-07-19 23:53:14 UTC,-74.002113,40.739582,-73.971918,40.791899,2
59,13:29.0,6.5,2009-03-23 12:13:29 UTC,-73.98543,40.736011,-73.982501,40.724802,1
68,19:05.0,11.5,2015-04-12 10:19:05 UTC,-73.979279,40.723438,-74.004608,40.746948,6


# Feature engineering

In [8]:
def prepare_time_features(df):
    """Return a copy of ``df`` with calendar features derived from ``pickup_datetime``.

    The input frame is left untouched. ``pickup_datetime`` in the result is a
    UTC timestamp truncated to the minute, and four integer columns are added:
    ``hour_of_day``, ``month``, ``year`` and ``weekday`` (Monday == 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column, either as strings such as
        ``'2011-03-29 15:39:49 UTC'`` or already parsed to datetimes.

    Returns
    -------
    pandas.DataFrame
        A new frame with the parsed timestamp and the added feature columns.
    """
    # Work on a copy: the callers pass slices of a larger frame, and writing
    # new columns into those in place mutates data the caller still holds.
    df = df.copy()
    pickup = df['pickup_datetime']
    if not pd.api.types.is_datetime64_any_dtype(pickup):
        # Keep only 'YYYY-MM-DD HH:MM' so one explicit format parses every row.
        # The guard makes a second call on already-parsed data a no-op instead
        # of failing on the `.str` accessor.
        pickup = pd.to_datetime(pickup.str.slice(0, 16), utc=True, format='%Y-%m-%d %H:%M')
    df['pickup_datetime'] = pickup
    df['hour_of_day'] = pickup.dt.hour
    df['month'] = pickup.dt.month
    df["year"] = pickup.dt.year
    df["weekday"] = pickup.dt.weekday

    return df

In [9]:
# Derive the hour/month/year/weekday columns from pickup_datetime for both splits.
train = prepare_time_features(train)
test = prepare_time_features(test)

In [10]:
def distance(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance in miles between two lat/lon points.

    Coordinates are given in degrees. Each argument may be a scalar or a
    numpy array / pandas Series; the arithmetic is element-wise.
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    lat_term = 0.5 - np.cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = (np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad)
                * (1 - np.cos((lon2 - lon1) * deg_to_rad)) / 2)
    haversine = lat_term + lon_term
    # 12742 is 2*R (Earth's diameter in km); 0.6213712 converts km to miles.
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(haversine))

In [11]:
# Pickup-to-drop-off distance in miles for the training split.
train['distance_miles'] = distance(
    train['pickup_latitude'], train['pickup_longitude'],
    train['dropoff_latitude'], train['dropoff_longitude'],
)

In [12]:
# Pickup-to-drop-off distance in miles for the hold-out split.
test['distance_miles'] = distance(
    test['pickup_latitude'], test['pickup_longitude'],
    test['dropoff_latitude'], test['dropoff_longitude'],
)

In [13]:
def transform(data):
    """Add pickup and drop-off distances to the nearby airports.

    Six columns named ``<pickup|dropoff>_distance_to_<jfk|ewr|lgr>`` are
    written into ``data`` in place, computed with ``distance``; the same
    frame is returned.
    """
    # (longitude, latitude) per airport, keyed by the column-name suffix.
    airports = {
        'jfk': (-73.7781, 40.6413),
        'ewr': (-74.1745, 40.6895),
        'lgr': (-73.8740, 40.7769),
    }

    for code, (airport_lon, airport_lat) in airports.items():
        for endpoint in ('pickup', 'dropoff'):
            data[endpoint + '_distance_to_' + code] = distance(
                airport_lat, airport_lon,
                data[endpoint + '_latitude'], data[endpoint + '_longitude'])

    return data

# Add the airport-distance columns to both splits.
train = transform(train)
test = transform(test)

In [14]:
# Remove training rows where both the trip distance and the fare are zero.
print("old size: %d" % len(train))
zero_trip_and_fare = train['distance_miles'].eq(0) & train['fare_amount'].eq(0)
train = train.drop(index=train.index[zero_trip_and_fare])
print("New size: %d" % len(train))

old size: 943389
New size: 943385


In [15]:
train[train['fare_amount']==0].shape

(24, 19)

In [16]:
# Remove the remaining training rows with a fare of exactly zero.
print("old size: %d" % len(train))
zero_fare = train['fare_amount'].eq(0)
train = train.drop(index=train.index[zero_fare])
print("New size: %d" % len(train))

old size: 943385
New size: 943361


In [17]:
train[train['fare_amount'] < 2.5].shape

(50, 19)

In [18]:
# Remove training rows whose fare is below 2.5.
print("old size: %d" % len(train))
below_min_fare = train['fare_amount'].lt(2.5)
train = train.drop(index=train.index[below_min_fare])
print("New size: %d" % len(train))

old size: 943361
New size: 943311


In [19]:
train[train.passenger_count >= 7]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour_of_day,month,year,weekday,distance_miles,pickup_distance_to_jfk,dropoff_distance_to_jfk,pickup_distance_to_ewr,dropoff_distance_to_ewr,pickup_distance_to_lgr,dropoff_distance_to_lgr
929022,54:00.0,3.3,2009-07-30 11:54:00+00:00,0.0,0.0,0.0,0.0,208,11,7,2009,3,0.0,5372.816826,5372.816826,5393.849915,5393.849915,5379.493468,5379.493468
1007609,13:00.0,104.0,2014-06-24 15:13:00+00:00,-74.01578,40.71542,-74.17028,40.70834,9,15,6,2014,1,8.106351,13.466019,21.066468,8.504471,1.320355,8.551259,16.217073


In [20]:
# Remove training rows reporting seven or more passengers.
print("old size: %d" % len(train))
too_many_passengers = train['passenger_count'].ge(7)
train = train.drop(index=train.index[too_many_passengers])
print("New size: %d" % len(train))

old size: 943311
New size: 943309


In [21]:
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fare_amount,943309.0,11.348479,9.828406,2.5,6.0,8.5,12.5,500.0
pickup_longitude,943309.0,-72.528629,12.15126,-3377.680935,-73.992073,-73.981803,-73.967115,2522.271325
pickup_latitude,943309.0,39.93182,7.872775,-3116.285383,40.734974,40.75266,40.767147,2621.62843
dropoff_longitude,943309.0,-72.52818,11.442532,-3383.296608,-73.991387,-73.980141,-73.963663,1717.003405
dropoff_latitude,943309.0,39.925218,8.74642,-3114.338567,40.734055,40.753175,40.768137,1989.728077
passenger_count,943309.0,1.683736,1.306773,0.0,1.0,1.0,2.0,6.0
hour_of_day,943309.0,13.506272,6.514113,0.0,9.0,14.0,19.0,23.0
month,943309.0,6.268767,3.435246,1.0,3.0,6.0,9.0,12.0
year,943309.0,2011.739801,1.860328,2009.0,2010.0,2012.0,2013.0,2015.0
weekday,943309.0,3.039395,1.949948,0.0,1.0,3.0,5.0,6.0


In [22]:
# Build model-input copies of both splits without the trip key and the raw
# timestamp column.
non_feature_cols = ['key', 'pickup_datetime']
df_train = train.drop(columns=non_feature_cols).copy()
df_test = test.drop(columns=non_feature_cols).copy()
for frame in (df_train, df_test):
    print(frame.shape)

(943309, 17)
(105176, 17)


In [23]:
from sklearn.model_selection import train_test_split 
# Split the cleaned training frame 80/20; the 20% part is passed to the
# training cell as its evaluation set.
features = df_train.drop(columns='fare_amount')
target = df_train['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(754647, 16)
(188662, 16)
(754647,)
(188662,)


In [24]:
import xgboost as xgb

In [25]:
# Booster settings handed to xgb.train.
params = {
    'max_depth': 7,
    'gamma': 0,
    'eta': .03,
    'subsample': 1,
    'colsample_bytree': 0.9,
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # 'silent' was replaced by 'verbosity' (0 = silent, 1 = warnings).
    'verbosity': 1,
}

# Defining the XGBoost model

In [27]:
def XGBmodel(X_train, X_test, y_train, y_test, params,
             num_boost_round=5000, early_stopping_rounds=10):
    """Train an XGBoost booster with early stopping on a held-out split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the booster.
    X_test, y_test
        Features and target of the evaluation split watched during training.
    params : dict
        Booster parameters forwarded to ``xgb.train``.
    num_boost_round : int, optional
        Upper bound on the number of boosting rounds (default 5000).
    early_stopping_rounds : int, optional
        Stop once the evaluation metric has not improved for this many
        rounds (default 10).

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    matrix_test = xgb.DMatrix(X_test, label=y_test)
    # The evaluation split is listed last: the training log reports that
    # 'test-rmse' is the metric used for early stopping.
    watchlist = [(matrix_train, 'train'), (matrix_test, 'test')]
    model = xgb.train(params=params,
                      dtrain=matrix_train,
                      num_boost_round=num_boost_round,
                      early_stopping_rounds=early_stopping_rounds,
                      evals=watchlist)
    return model

# Fit on the 80% split; the 20% split passed as X_test/y_test is the evaluation set used for early stopping.
model = XGBmodel(X_train,X_test,y_train,y_test,params)

[0]	train-rmse:14.2401	test-rmse:14.2651
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 10 rounds.
[1]	train-rmse:13.8592	test-rmse:13.8861
[2]	train-rmse:13.5131	test-rmse:13.5425
[3]	train-rmse:13.1584	test-rmse:13.1886
[4]	train-rmse:12.8124	test-rmse:12.8461
[5]	train-rmse:12.4783	test-rmse:12.5151
[6]	train-rmse:12.1572	test-rmse:12.1962
[7]	train-rmse:11.8462	test-rmse:11.8873
[8]	train-rmse:11.5456	test-rmse:11.5894
[9]	train-rmse:11.2547	test-rmse:11.3006
[10]	train-rmse:10.9916	test-rmse:11.0399
[11]	train-rmse:10.7201	test-rmse:10.7705
[12]	train-rmse:10.458	test-rmse:10.5115
[13]	train-rmse:10.204	test-rmse:10.2609
[14]	train-rmse:9.96049	test-rmse:10.02
[15]	train-rmse:9.72559	test-rmse:9.78753
[16]	train-rmse:9.49896	test-rmse:9.56377
[17]	train-rmse:9.27757	test-rmse:9.34554
[18]	train-rmse:9.06739	test-rmse:9.13773
[19]	train-rmse:8.86378	test-rmse:8.93715
[20]	train-rmse:8.66542	test-rm

In [None]:
# Score the hold-out split with the trained booster. The features come from
# df_test (built with the same columns as df_train) minus the target, so they
# line up with what the model was trained on. The in-memory `test` frame
# still carries the trip keys, so there is no need to re-read test.csv.
holdout_features = df_test.drop(columns='fare_amount')
prediction = model.predict(xgb.DMatrix(holdout_features))
holdout = pd.DataFrame({'key': test['key'].to_numpy(), 'fare_amount': prediction})
holdout.to_csv('xgb_4m_utc_with_cleaning.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the booster's per-feature scores, smallest first.
feature_scores = model.get_fscore()
fscores = pd.DataFrame({'X': list(feature_scores.keys()), 'Y': list(feature_scores.values())})
fscores.sort_values(by='Y').plot.bar(x='X')