In [1]:
import numpy as np
import pandas as pd
import os
import math
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Enter Your Name Here
myname = "Rohan B M" 
Roll_No = "BM21MTECH14003"

#load the training dataset
df = pd.read_csv('train.csv', nrows = 5000000)
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [5]:
#checking Null value in data
df[pd.isnull(df)].sum()

key                    0
fare_amount          0.0
pickup_datetime        0
pickup_longitude     0.0
pickup_latitude      0.0
dropoff_longitude    0.0
dropoff_latitude     0.0
passenger_count      0.0
dtype: object

In [6]:
# Selecting only positive fare amount
df=df.loc[df['fare_amount']>=0]
print(df.shape)

(4999789, 8)


In [7]:
#Data preprocessing
def haversian_distance(lat1, lat2, lon1,lon2):
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))
df['hav_distance'] = df.apply(lambda row:haversian_distance(row['pickup_latitude'],row['dropoff_latitude'],row['pickup_longitude'],row['dropoff_longitude']),axis=1)

df['euc_distance'] = 69 * np.sqrt((np.array(df.dropoff_longitude) - np.array(df.pickup_longitude))**2 + (np.array(df.pickup_latitude) - np.array(df.dropoff_latitude))**2)

df['pickup_datetime'] =  pd.to_datetime(df['pickup_datetime'],format='%Y-%m-%d %H:%M:%S %Z')

#Build out date values into separate columns
df['pickup_year'] =df['pickup_datetime'].dt.year
df['pickup_quarter'] = df['pickup_datetime'].dt.quarter
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_day'] = df['pickup_datetime'].dt.day
df['pickup_hour'] = df['pickup_datetime'].dt.hour

#Drop columns we don't require
df.drop(['key', 'pickup_datetime'], axis=1, inplace=True)

df['pickup_month_cos']=np.cos((df['pickup_month']-1)*(2*(np.pi/12)))
df['pickup_month_sin']=np.sin((df['pickup_month']-1)*(2*(np.pi/12)))
df['pickup_day_cos']=np.cos((df['pickup_day']-1)*(2*(np.pi/30)))
df['pickup_day_sin']=np.sin((df['pickup_day']-1)*(2*(np.pi/30)))
df['pickup_quarter_cos']=np.cos((df['pickup_quarter']-1)*(2*(np.pi/4)))
df['pickup_quarter_sin']=np.sin((df['pickup_quarter']-1)*(2*(np.pi/4)))
df['pickup_hour_cos']=np.cos((df['pickup_hour']-1)*(2*(np.pi/24)))
df['pickup_hour_sin']=np.sin((df['pickup_hour']-1)*(2*(np.pi/24)))

#Convert Year into Number of Years Historically from 2021.
df['pickup_year_age']=2021-df['pickup_year']

#Drop existing pickup date fields and use just cyclical ones going forward
df.drop(['pickup_year','pickup_quarter','pickup_month','pickup_day','pickup_hour'], axis=1, inplace=True)

#Ensure no negative passengers or where there are more than 10 passengers as most limos would have a 10 person maximum
df = df[(df.passenger_count > 0) & (df.passenger_count < 10)]

def distance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude):
    radius = 6371
    pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude = map(np.radians,[pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude])
    distance_latitude = dropoff_latitude - pickup_latitude
    distance_longitude = dropoff_longitude - pickup_longitude
    calculation = np.sin(distance_latitude/2.0)**2 + np.cos(pickup_latitude) * np.cos(dropoff_latitude) * np.sin(distance_longitude/2.0)**2
    
    return 2 * radius * np.arcsin(np.sqrt(calculation))

df['distance'] = distance(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])

# Workbook from https://www.kaggle.com/gunbl4d3/xgboost-ing-taxi-fares mentioned to filter out inappropriate locations outside of the range in NYC
df = df[(df.pickup_longitude > -80) & (df.pickup_longitude < -70) & (df.pickup_latitude > 35) & (df.pickup_latitude < 45) & (df.dropoff_longitude > -80) & (df.dropoff_longitude < -70) &
        (df.dropoff_latitude > 35) & (df.dropoff_latitude < 45)]

df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hav_distance,euc_distance,pickup_month_cos,pickup_month_sin,pickup_day_cos,pickup_day_sin,pickup_quarter_cos,pickup_quarter_sin,pickup_hour_cos,pickup_hour_sin,pickup_year_age,distance
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,0.640487,0.651073,-0.8660254,0.5,-0.978148,0.207912,6.123234000000001e-17,1.0,-0.5,-0.866025,12,1.030764
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,5.25067,5.499001,1.0,0.0,0.669131,0.743145,1.0,0.0,-0.7071068,-0.707107,11,8.450134
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,0.863411,0.943509,-0.8660254,-0.5,-0.913545,-0.406737,-1.0,1.224647e-16,0.9659258,-0.258819,10,1.389525
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,1.739386,1.748493,6.123234000000001e-17,1.0,-0.5,-0.866025,6.123234000000001e-17,1.0,0.7071068,0.707107,9,2.79927
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,1.242218,1.343397,0.5,0.866025,-0.104528,0.994522,1.0,0.0,6.123234000000001e-17,1.0,11,1.999157


In [8]:
#Split training data into training records and validation records. We cannot use test set as we do not know the outcome
x_train, x_val, y_train, y_val = train_test_split(df.iloc[:, 1:], df['fare_amount'], test_size=0.2, random_state=42)
x_train.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hav_distance,euc_distance,pickup_month_cos,pickup_month_sin,pickup_day_cos,pickup_day_sin,pickup_quarter_cos,pickup_quarter_sin,pickup_hour_cos,pickup_hour_sin,pickup_year_age,distance
654930,-74.001462,40.746437,-74.009082,40.730324,1,1.182614,1.229853,1.0,0.0,-0.913545,0.406737,1.0,0.0,0.707107,0.707107,10,1.903233
2799728,-73.992728,40.737538,-73.983247,40.747483,1,0.847633,0.948072,-1.0,1.224647e-16,0.669131,0.743145,-1.0,1.224647e-16,-0.866025,-0.5,8,1.364134
3250659,-73.989445,40.743058,-73.979262,40.761885,3,1.405776,1.476905,0.5,0.8660254,-0.913545,-0.406737,1.0,0.0,-0.258819,0.965926,9,2.262377
4922325,-73.953636,40.772266,-74.008392,40.74765,1,3.332381,4.142413,0.866025,0.5,0.809017,-0.587785,1.0,0.0,-0.866025,0.5,6,5.362947
283682,-73.970925,40.746083,-73.971385,40.74543,4,0.051141,0.055114,0.5,0.8660254,0.309017,-0.951057,1.0,0.0,-0.258819,-0.965926,12,0.082304


In [33]:
%%time
#Model 1: Linear Regression
linear_regression = linear_model.LinearRegression()
linear_regression.fit(x_train,y_train)
y_pred_linear_regression = np.round(linear_regression.predict(x_val),2)
print('Implementation of Linear Regression for the given is successful...')
print('Root Mean Squared Error: %.2f'
      % math.sqrt(mean_squared_error(y_val, y_pred_linear_regression)))

Implementation of Linear Regression for the given is successful...
Root Mean Squared Error: 6.42
Wall time: 2.73 s


In [10]:
%%time
#Model 2: Random Forest Regression
RF = RandomForestRegressor(max_depth=8,n_estimators=300, random_state=47)
RF.fit(x_train,y_train)
y_pred_rf = np.round(RF.predict(x_val),2)
print('Root Mean Squared Error: %.2f'
      % math.sqrt(mean_squared_error(y_val, y_pred_rf)))

Random Forest - Root Mean Squared Error: 4.08
Wall time: 3h 51min 29s


In [16]:
%%time
#Model 3:Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
Dtree_regression = DecisionTreeRegressor()
Dtree_regression.fit(x_train,y_train)
y_pred_Dtree_regression = Dtree_regression.predict(x_val)
print('Decision Tree Regression - Root Mean Squared Error: %.2f'
      % math.sqrt(mean_squared_error(y_val, y_pred_Dtree_regression)))

Decision Tree Regression - Root Mean Squared Error: 5.34
Wall time: 2min 59s


In [17]:
%%time
#Model 4:Gradient Boosting Regression
gradient_boost = GradientBoostingRegressor()
gradient_boost.fit(x_train,y_train)
y_pred_gradient_boost = np.round(gradient_boost.predict(x_val),2)
print('Gradient Boost - Root Mean Squared Error: %.2f'
      % math.sqrt(mean_squared_error(y_val, y_pred_gradient_boost)))

Gradient Boost - Root Mean Squared Error: 4.04
Wall time: 36min 50s


In [18]:
%%time
#Model 5:XG_Boost Regression
xg_boost = XGBRegressor(objective='reg:squarederror')
xg_boost.fit(x_train,y_train)
y_pred_xg_boost = np.round(xg_boost.predict(x_val),2)
print('XG Boost - Root Mean Squared Error: %.2f'
      % math.sqrt(mean_squared_error(y_val, y_pred_xg_boost)))

XG Boost - Root Mean Squared Error: 3.74
Wall time: 12min 47s


In [19]:
%%time
#Model 6: Light Gradient Boosting Regression
light_gbm = LGBMRegressor()
light_gbm.fit(x_train,y_train)
y_pred_light_gbm = np.round(light_gbm.predict(x_val),2)
print('Light GBM - Root Mean Squared Error: %.2f'
      % math.sqrt(mean_squared_error(y_val, y_pred_light_gbm)))

Light GBM - Root Mean Squared Error: 3.91
Wall time: 35.4 s
Parser   : 189 ms


In [21]:
#Applying same preprocession method to test data
df1 = pd.read_csv('test.csv')
df1['hav_distance'] = df1.apply(lambda row:haversian_distance(row['pickup_latitude'],row['dropoff_latitude'],row['pickup_longitude'],row['dropoff_longitude']),axis=1)
df1['euc_distance'] = 69 * np.sqrt((np.array(df1.dropoff_longitude) - np.array(df1.pickup_longitude))**2 + (np.array(df1.pickup_latitude) - np.array(df1.dropoff_latitude))**2)

df1['pickup_datetime'] =  pd.to_datetime(df1['pickup_datetime'],format='%Y-%m-%d %H:%M:%S %Z')
df1['pickup_year'] = df1['pickup_datetime'].dt.year
df1['pickup_quarter'] = df1['pickup_datetime'].dt.quarter
df1['pickup_month'] = df1['pickup_datetime'].dt.month
df1['pickup_day'] = df1['pickup_datetime'].dt.day
df1['pickup_hour'] = df1['pickup_datetime'].dt.hour

df1.drop(['key', 'pickup_datetime'], axis=1, inplace=True)

df1['pickup_month_cos']=np.cos((df1['pickup_month']-1)*(2*(np.pi/12)))
df1['pickup_month_sin']=np.sin((df1['pickup_month']-1)*(2*(np.pi/12)))
df1['pickup_day_cos']=np.cos((df1['pickup_day']-1)*(2*(np.pi/30)))
df1['pickup_day_sin']=np.sin((df1['pickup_day']-1)*(2*(np.pi/30)))
df1['pickup_quarter_cos']=np.cos((df1['pickup_quarter']-1)*(2*(np.pi/4)))
df1['pickup_quarter_sin']=np.sin((df1['pickup_quarter']-1)*(2*(np.pi/4)))
df1['pickup_hour_cos']=np.cos((df1['pickup_hour']-1)*(2*(np.pi/24)))
df1['pickup_hour_sin']=np.sin((df1['pickup_hour']-1)*(2*(np.pi/24)))
df1['pickup_year_age']=2021-df1['pickup_year']
df1.drop(['pickup_year','pickup_quarter','pickup_month','pickup_day','pickup_hour'], axis=1, inplace=True)
df1 = df1[(df1.passenger_count > 0) & (df1.passenger_count < 10)]
df1['distance'] = distance(df1['pickup_latitude'], df1['pickup_longitude'],df1['dropoff_latitude'], df1['dropoff_longitude'])
df1 = df1[(df1.pickup_longitude > -80) & (df1.pickup_longitude < -70) & (df1.pickup_latitude > 35) & (df1.pickup_latitude < 45) & (df1.dropoff_longitude > -80) & (df1.dropoff_longitude < -70) &
        (df1.dropoff_latitude > 35) & (df1.dropoff_latitude < 45)]
df1.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hav_distance,euc_distance,pickup_month_cos,pickup_month_sin,pickup_day_cos,pickup_day_sin,pickup_quarter_cos,pickup_quarter_sin,pickup_hour_cos,pickup_hour_sin,pickup_year_age,distance
0,-73.97332,40.763805,-73.98143,40.743835,1,1.443607,1.48722,1.0,0.0,0.669131,-0.743145,1.0,0.0,-1.0,1.224647e-16,6,2.32326
1,-73.986862,40.719383,-73.998886,40.739201,1,1.507044,1.599405,1.0,0.0,0.669131,-0.743145,1.0,0.0,-1.0,1.224647e-16,6,2.425353
2,-73.982524,40.75126,-73.979654,40.746139,1,0.384398,0.405057,-1.83697e-16,-1.0,0.104528,0.994522,-1.83697e-16,-1.0,-0.866025,0.5,10,0.618628
3,-73.98116,40.767807,-73.990448,40.751635,1,1.218529,1.286809,0.8660254,-0.5,1.0,0.0,-1.83697e-16,-1.0,0.5,-0.8660254,9,1.961033
4,-73.966046,40.789775,-73.988565,40.744427,1,3.347514,3.493572,0.8660254,-0.5,1.0,0.0,-1.83697e-16,-1.0,0.5,-0.8660254,9,5.387301


In [24]:
x_train.shape

(3902949, 17)

In [26]:
#Make predictions on Test Set for best 2 models
y_predictions_xg = np.round(xg_boost.predict(df1.iloc[:, :]),2)
y_predictions_lgbm = np.round(light_gbm.predict(df1.iloc[:, :]),2)

In [27]:
test_data=pd.read_csv('test.csv')

In [28]:
df1['fare_amount'] = y_predictions_xg
df_sub = pd.DataFrame(test_data['key'])
df_sub['fare_amount'] = y_predictions_xg
df_sub.to_csv('BM21MTECH14003_Q6.XGBoost.csv',index=False)

Kaggle score for XG Boost is 3.13236

In [29]:
del df1['fare_amount']

In [30]:
df1['fare_amount'] = y_predictions_lgbm
df1_sub = pd.DataFrame(test_data['key'])
df1_sub['fare_amount'] = y_predictions_lgbm
df1_sub.to_csv('BM21MTECH14003_Q6.LightGBM.csv',index=False)

Kaggle score for Light GBM is 3.33215