In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('train_EDA.csv')
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_dayofweek,pickup_hour,pickup_slot,dropoff_month,dropoff_dayofweek,dropoff_hour,dropoff_slot
0,1,1,-73.982155,40.767937,-73.96463,40.765602,N,455,3,0,17,Evening,3,0,17,Evening
1,0,1,-73.980415,40.738564,-73.999481,40.731152,N,663,6,6,0,Late_Night,6,6,0,Late_Night
2,1,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,1,1,11,Morning,1,1,12,Noon
3,1,1,-74.01004,40.719971,-74.012268,40.706718,N,429,4,2,19,Night,4,2,19,Night
4,1,1,-73.973053,40.793209,-73.972923,40.78252,N,435,3,5,13,Noon,3,5,13,Noon


In [5]:
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map({'Y':1, 'N':0})
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_dayofweek,pickup_hour,pickup_slot,dropoff_month,dropoff_dayofweek,dropoff_hour,dropoff_slot
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,3,0,17,Evening,3,0,17,Evening
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,6,6,0,Late_Night,6,6,0,Late_Night
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,1,1,11,Morning,1,1,12,Noon
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,4,2,19,Night,4,2,19,Night
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,3,5,13,Noon,3,5,13,Noon


In [6]:
d_pickup_month = pd.get_dummies(df['pickup_month'], prefix='pickup_month', drop_first=True)
d_pickup_month.head()

Unnamed: 0,pickup_month_2,pickup_month_3,pickup_month_4,pickup_month_5,pickup_month_6
0,0,1,0,0,0
1,0,0,0,0,1
2,0,0,0,0,0
3,0,0,1,0,0
4,0,1,0,0,0


In [8]:
df = pd.concat([df, d_pickup_month], axis=1)
df = df.drop('pickup_month', axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_dayofweek,...,pickup_month_2,pickup_month_3,pickup_month_4,pickup_month_5,pickup_month_6,pickup_month_2.1,pickup_month_3.1,pickup_month_4.1,pickup_month_5.1,pickup_month_6.1
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,3,0,...,0,1,0,0,0,0,1,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,6,6,...,0,0,0,0,1,0,0,0,0,1
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,4,2,...,0,0,1,0,0,0,0,1,0,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,3,5,...,0,1,0,0,0,0,1,0,0,0


In [9]:
df = df.loc[:, ~df.columns.duplicated()]
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_dayofweek,...,pickup_slot,dropoff_month,dropoff_dayofweek,dropoff_hour,dropoff_slot,pickup_month_2,pickup_month_3,pickup_month_4,pickup_month_5,pickup_month_6
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,3,0,...,Evening,3,0,17,Evening,0,1,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,6,6,...,Late_Night,6,6,0,Late_Night,0,0,0,0,1
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,1,1,...,Morning,1,1,12,Noon,0,0,0,0,0
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,4,2,...,Night,4,2,19,Night,0,0,1,0,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,3,5,...,Noon,3,5,13,Noon,0,1,0,0,0


In [13]:
d_pickup_dayofweek = pd.get_dummies(df['pickup_dayofweek'], prefix='pickup_dayofweek', drop_first=True)
d_pickup_dayofweek.head()

Unnamed: 0,pickup_dayofweek_1,pickup_dayofweek_2,pickup_dayofweek_3,pickup_dayofweek_4,pickup_dayofweek_5,pickup_dayofweek_6
0,0,0,0,0,0,0
1,0,0,0,0,0,1
2,1,0,0,0,0,0
3,0,1,0,0,0,0
4,0,0,0,0,1,0


In [14]:
df = pd.concat([df, d_pickup_dayofweek], axis=1)
df = df.drop('pickup_dayofweek', axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,pickup_slot,...,pickup_month_3,pickup_month_4,pickup_month_5,pickup_month_6,pickup_dayofweek_1,pickup_dayofweek_2,pickup_dayofweek_3,pickup_dayofweek_4,pickup_dayofweek_5,pickup_dayofweek_6
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,17,Evening,...,1,0,0,0,0,0,0,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,0,Late_Night,...,0,0,0,1,0,0,0,0,0,1
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,11,Morning,...,0,0,0,0,1,0,0,0,0,0
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,19,Night,...,0,1,0,0,0,1,0,0,0,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,13,Noon,...,1,0,0,0,0,0,0,0,1,0


In [15]:
d_pickup_slot = pd.get_dummies(df['pickup_slot'], prefix='pickup_slot', drop_first=True)
d_pickup_slot.head()

Unnamed: 0,pickup_slot_Evening,pickup_slot_Late_Night,pickup_slot_Midnight,pickup_slot_Morning,pickup_slot_Night,pickup_slot_Noon
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,1,0,0
3,0,0,0,0,1,0
4,0,0,0,0,0,1


In [16]:
df = pd.concat([df, d_pickup_slot], axis=1)
df = df.drop('pickup_slot', axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,dropoff_month,...,pickup_dayofweek_3,pickup_dayofweek_4,pickup_dayofweek_5,pickup_dayofweek_6,pickup_slot_Evening,pickup_slot_Late_Night,pickup_slot_Midnight,pickup_slot_Morning,pickup_slot_Night,pickup_slot_Noon
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,17,3,...,0,0,0,0,1,0,0,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,0,6,...,0,0,0,1,0,1,0,0,0,0
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,11,1,...,0,0,0,0,0,0,0,1,0,0
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,19,4,...,0,0,0,0,0,0,0,0,1,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,13,3,...,0,0,1,0,0,0,0,0,0,1


In [17]:
d_dropoff_month = pd.get_dummies(df['dropoff_month'], prefix='dropoff_month', drop_first=True)
d_dropoff_month.head()

Unnamed: 0,dropoff_month_2,dropoff_month_3,dropoff_month_4,dropoff_month_5,dropoff_month_6,dropoff_month_7
0,0,1,0,0,0,0
1,0,0,0,0,1,0
2,0,0,0,0,0,0
3,0,0,1,0,0,0
4,0,1,0,0,0,0


In [18]:
df = pd.concat([df, d_dropoff_month], axis=1)
df = df.drop('dropoff_month', axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,dropoff_dayofweek,...,pickup_slot_Midnight,pickup_slot_Morning,pickup_slot_Night,pickup_slot_Noon,dropoff_month_2,dropoff_month_3,dropoff_month_4,dropoff_month_5,dropoff_month_6,dropoff_month_7
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,17,0,...,0,0,0,0,0,1,0,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,0,6,...,0,0,0,0,0,0,0,0,1,0
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,11,1,...,0,1,0,0,0,0,0,0,0,0
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,19,2,...,0,0,1,0,0,0,1,0,0,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,13,5,...,0,0,0,1,0,1,0,0,0,0


In [19]:
d_dropoff_dayofweek = pd.get_dummies(df['dropoff_dayofweek'], prefix='dropoff_dayofweek', drop_first=True)
d_dropoff_dayofweek.head()

Unnamed: 0,dropoff_dayofweek_1,dropoff_dayofweek_2,dropoff_dayofweek_3,dropoff_dayofweek_4,dropoff_dayofweek_5,dropoff_dayofweek_6
0,0,0,0,0,0,0
1,0,0,0,0,0,1
2,1,0,0,0,0,0
3,0,1,0,0,0,0
4,0,0,0,0,1,0


In [20]:
df = pd.concat([df, d_dropoff_dayofweek], axis=1)
df = df.drop('dropoff_dayofweek', axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,dropoff_hour,...,dropoff_month_4,dropoff_month_5,dropoff_month_6,dropoff_month_7,dropoff_dayofweek_1,dropoff_dayofweek_2,dropoff_dayofweek_3,dropoff_dayofweek_4,dropoff_dayofweek_5,dropoff_dayofweek_6
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,17,17,...,0,0,0,0,0,0,0,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,0,0,...,0,0,1,0,0,0,0,0,0,1
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,11,12,...,0,0,0,0,1,0,0,0,0,0
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,19,19,...,1,0,0,0,0,1,0,0,0,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,13,13,...,0,0,0,0,0,0,0,0,1,0


In [21]:
d_dropoff_slot = pd.get_dummies(df['dropoff_slot'], prefix='dropoff_slot', drop_first=True)
d_dropoff_slot.head()

Unnamed: 0,dropoff_slot_Evening,dropoff_slot_Late_Night,dropoff_slot_Midnight,dropoff_slot_Morning,dropoff_slot_Night,dropoff_slot_Noon
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,0,1
3,0,0,0,0,1,0
4,0,0,0,0,0,1


In [22]:
df = pd.concat([df, d_dropoff_slot], axis=1)
df = df.drop('dropoff_slot', axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,dropoff_hour,...,dropoff_dayofweek_3,dropoff_dayofweek_4,dropoff_dayofweek_5,dropoff_dayofweek_6,dropoff_slot_Evening,dropoff_slot_Late_Night,dropoff_slot_Midnight,dropoff_slot_Morning,dropoff_slot_Night,dropoff_slot_Noon
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,455,17,17,...,0,0,0,0,1,0,0,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,663,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,2124,11,12,...,0,0,0,0,0,0,0,0,0,1
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,429,19,19,...,0,0,0,0,0,0,0,0,1,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,435,13,13,...,0,0,1,0,0,0,0,0,0,1


In [23]:
df.corr()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_hour,dropoff_hour,...,dropoff_dayofweek_3,dropoff_dayofweek_4,dropoff_dayofweek_5,dropoff_dayofweek_6,dropoff_slot_Evening,dropoff_slot_Late_Night,dropoff_slot_Midnight,dropoff_slot_Morning,dropoff_slot_Night,dropoff_slot_Noon
vendor_id,1.0,0.287415,0.00782,0.001742,0.001528,0.004496,-0.079872,0.020304,0.009299,0.00925,...,-0.001663,-0.001568,0.00032,0.003171,0.003028,-0.004555,0.003263,0.005189,0.006424,-0.005031
passenger_count,0.287415,1.0,0.002169,-0.005125,-0.000343,-0.002762,-0.021815,0.008471,0.009101,0.008449,...,-0.008558,-0.001059,0.02069,0.016182,0.000967,0.009285,0.008807,-0.010919,0.004026,0.003568
pickup_longitude,0.00782,0.002169,1.0,0.022568,0.783582,0.10019,0.010799,0.026542,0.01015,0.010758,...,-0.000766,-0.002606,-0.017186,0.000307,0.013377,-0.020925,-0.003967,0.003977,-0.00205,0.004406
pickup_latitude,0.001742,-0.005125,0.022568,1.0,0.114884,0.494038,-0.008223,-0.029204,0.010603,0.017043,...,0.009408,0.003884,-0.021352,-0.023573,0.022594,-0.089781,-0.044264,0.053353,-0.015519,0.041982
dropoff_longitude,0.001528,-0.000343,0.783582,0.114884,1.0,0.124873,0.008226,0.014678,-0.022455,-0.023361,...,0.000587,0.001111,-0.008221,0.006201,-0.001046,0.019787,3.4e-05,-0.015425,-0.013387,-0.007126
dropoff_latitude,0.004496,-0.002762,0.10019,0.494038,0.124873,1.0,-0.009818,-0.020677,0.013612,0.017246,...,0.003835,-6.8e-05,-0.017494,-0.012811,0.022803,-0.054037,-0.025746,0.021535,-0.008407,0.027961
store_and_fwd_flag,-0.079872,-0.021815,0.010799,-0.008223,0.008226,-0.009818,1.0,0.001724,0.002245,0.002932,...,0.002677,0.001,-0.00316,-0.003242,0.005998,-0.00284,-0.001423,-0.001582,-0.002727,0.004785
trip_duration,0.020304,0.008471,0.026542,-0.029204,0.014678,-0.020677,0.001724,1.0,0.00369,0.002652,...,0.002323,0.002421,-0.00445,-0.001517,0.007825,0.001131,-0.002528,-0.001877,-0.003644,0.007281
pickup_hour,0.009299,0.009101,0.01015,0.010603,-0.022455,0.013612,0.002245,0.00369,1.0,0.933977,...,0.028961,0.001514,-0.03082,-0.081204,0.218331,-0.555289,0.463645,-0.31802,0.448673,-0.026819
dropoff_hour,0.00925,0.008449,0.010758,0.017043,-0.023361,0.017246,0.002932,0.002652,0.933977,1.0,...,0.031947,-0.000932,-0.037533,-0.088731,0.232406,-0.655928,0.470406,-0.296618,0.459671,-0.006814


In [24]:
import statsmodels.api as sm

In [25]:
y_train = df.pop('trip_duration')
X_train = df

In [26]:
X_train.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_hour,dropoff_hour,pickup_month_2,...,dropoff_dayofweek_3,dropoff_dayofweek_4,dropoff_dayofweek_5,dropoff_dayofweek_6,dropoff_slot_Evening,dropoff_slot_Late_Night,dropoff_slot_Midnight,dropoff_slot_Morning,dropoff_slot_Night,dropoff_slot_Noon
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,17,17,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,11,12,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,19,19,0,...,0,0,0,0,0,0,0,0,1,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,13,13,0,...,0,0,1,0,0,0,0,0,0,1


In [28]:
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_hour,dropoff_hour,pickup_month_2,...,dropoff_dayofweek_3,dropoff_dayofweek_4,dropoff_dayofweek_5,dropoff_dayofweek_6,dropoff_slot_Evening,dropoff_slot_Late_Night,dropoff_slot_Midnight,dropoff_slot_Morning,dropoff_slot_Night,dropoff_slot_Noon
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,17,17,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,11,12,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,19,19,0,...,0,0,0,0,0,0,0,0,1,0
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,13,13,0,...,0,0,1,0,0,0,0,0,0,1


In [29]:
df = pd.concat([X_train, y_train], axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_hour,dropoff_hour,pickup_month_2,...,dropoff_dayofweek_4,dropoff_dayofweek_5,dropoff_dayofweek_6,dropoff_slot_Evening,dropoff_slot_Late_Night,dropoff_slot_Midnight,dropoff_slot_Morning,dropoff_slot_Night,dropoff_slot_Noon,trip_duration
0,1,1,-73.982155,40.767937,-73.96463,40.765602,0,17,17,0,...,0,0,0,1,0,0,0,0,0,455
1,0,1,-73.980415,40.738564,-73.999481,40.731152,0,0,0,0,...,0,0,1,0,1,0,0,0,0,663
2,1,1,-73.979027,40.763939,-74.005333,40.710087,0,11,12,0,...,0,0,0,0,0,0,0,0,1,2124
3,1,1,-74.01004,40.719971,-74.012268,40.706718,0,19,19,0,...,0,0,0,0,0,0,0,1,0,429
4,1,1,-73.973053,40.793209,-73.972923,40.78252,0,13,13,0,...,0,1,0,0,0,0,0,0,1,435


In [30]:
df.to_csv('train_LR_prep.csv', index=False)