In [3]:
%matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split,KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge,Lasso
from sklearn.neighbors import KNeighborsClassifier


                        

Using matplotlib backend: <object object at 0x0000029F54EF5D50>


In [5]:
df=pd.read_csv('datasets/nyc_taxi_trip_duration.csv')

In [6]:
df.head()


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


### Preprocessing and Feature Extraction

#### DateTime conversion

In [9]:
#converting strings into datetime features
df['pickup_datetime']=pd.to_datetime(df.pickup_datetime)
df['dropoff_datetime']=pd.to_datetime(df.dropoff_datetime)


In [10]:
#Log Transform the Y values

df['trip_duration_log']=np.log(df['trip_duration']+1)
df['trip_duration_log']

0         5.993961
1         7.003974
2         7.400010
3         7.040536
4         6.744059
            ...   
729317    5.693732
729318    5.755742
729319    6.513230
729320    6.104793
729321    7.110696
Name: trip_duration_log, Length: 729322, dtype: float64

In [11]:
#Add some datetime features
df['pickup_weekday']=df['pickup_datetime'].dt.weekday
df['pickup_weekofyear']=df['pickup_datetime'].dt.strftime('%V').astype('float')
df['pickup_hour']=df['pickup_datetime'].dt.hour
df['pickup_minute']=df['pickup_datetime'].dt.minute
df['pickup_secs']=(df['pickup_datetime']-df['pickup_datetime'].min()).dt.total_seconds()#min() is minimum
df['pickup_week_hours']=df['pickup_weekday']*24+df['pickup_hour']#  Adds the hour of the day to the total hours since the start of the week.

### Distance features

##### Euclidean distance

In [14]:
#displacement
y_dist=df['pickup_longitude']-df['dropoff_longitude']
x_dist=df['pickup_latitude']-df['dropoff_latitude']

#square distance
df['distance']=np.sqrt(y_dist**2 + x_dist**2)

#### Haversine distance

In [16]:
def haversine_array(lat1,lng1,lat2,lng2):
    lat1,lng1,lat2,lng2=map(np.radians,(lat1,lng1,lat2,lng2))
    Average_earth_radius=6371# in kms
    lat=lat2-lat1
    lng=lng2-lng1
    d=np.sin(lat/2)**2 + np.cos(lat1)*np.cos(lat1)*np.cos(lat2)*np.sin(lng/2)**2
    h=2*Average_earth_radius*np.arcsin(np.sqrt(d))
    return h

def direction_array(lat1,lng1,lat2,lng2):
    lat1,lng1,lat2,lng2=map(np.radians,(lat1,lng1,lat2,lng2))
    Average_earth_radius=6371# in kms
    lng_delta_rad=np.radians(lng2-lng1)
    y=np.sin(lng_delta_rad)*np.cos(lat2)
    x=np.cos(lat1)*np.sin(lat2)-np.sin(lat1)*np.cos(lat2)*np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y,x))


df['haversine _distance']=haversine_array(df['pickup_latitude'].values,
                                           df['pickup_longitude'].values,
                                           df['dropoff_latitude'].values,
                                           df['dropoff_longitude'].values)

df['direction']=direction_array(df['pickup_latitude'].values,
                                           df['pickup_longitude'].values,
                                           df['dropoff_latitude'].values,
                                           df['dropoff_longitude'].values)

        
    



In [19]:
!pip install geopy

from geopy.distance import great_circle

def cal_distance(pickup_lat,pickup_lon,dropoff_lat,dropoff_lon):
    start_coordinates=[pickup_lat,pickup_lon]
    stop_coordinates=[dropoff_lat,dropoff_lon]
    return great_circle(start_coordinates,stop_coordinates).km

df['distance']=df.apply(lambda x: cal_distance(x['pickup_latitude'],x['pickup_longitude'],x['dropoff_latitude'],x['dropoff_longitude']),axis=1)







[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


#### Binning

In [21]:
### roundup latitude and longitude values
df['pickup_latitude_rounded']=np.round(df['pickup_latitude'],3)
df['pickup_longitude_rounded']=np.round(df['pickup_longitude'],3)

df['dropoff_latitude_rounded']=np.round(df['dropoff_latitude'],3)
df['dropoff_longitude_rounded']=np.round(df['dropoff_longitude'],3)

#df['trip_duration_in_hours']=df['trip_duration'].values/3600
#trip_per_hour=pd.cut(df['trip_duration_in_hours'],bins=range(0,25),labels=range(1,25))


#df['trip_duration_for_hours_binning']=trip_per_hour

In [22]:
#trip_per_hour

In [25]:
#duration_hours_count=df.trip_duration_for_hours_binning.value_counts()

In [26]:
#df['trip_hours_rides_count']=df['trip_duration_for_hours_binning'].apply(lambda x: duration_hours_count[x])

In [27]:
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,pickup_minute,pickup_secs,pickup_week_hours,distance,haversine _distance,direction,pickup_latitude_rounded,pickup_longitude_rounded,dropoff_latitude_rounded,dropoff_longitude_rounded
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,...,40,5157547.0,16,1.199074,1.125673,-179.022057,40.779,-73.954,40.771,-73.964
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,...,35,6132863.0,119,4.129117,4.120462,-179.867376,40.732,-73.988,40.695,-73.995
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,...,59,4471099.0,161,7.250763,6.957011,0.69813,40.721,-73.997,40.775,-73.948
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,...,44,380597.0,33,2.361101,2.35237,0.177111,40.76,-73.962,40.781,-73.957
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,...,42,4084869.0,54,4.32854,4.159001,0.681702,40.708,-74.017,40.741,-73.988


In [28]:
pd.set_option('display.max_columns',None)
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_duration_log,pickup_weekday,pickup_weekofyear,pickup_hour,pickup_minute,pickup_secs,pickup_week_hours,distance,haversine _distance,direction,pickup_latitude_rounded,pickup_longitude_rounded,dropoff_latitude_rounded,dropoff_longitude_rounded
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400,5.993961,0,9.0,16,40,5157547.0,16,1.199074,1.125673,-179.022057,40.779,-73.954,40.771,-73.964
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100,7.003974,4,10.0,23,35,6132863.0,119,4.129117,4.120462,-179.867376,40.732,-73.988,40.695,-73.995
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635,7.40001,6,7.0,17,59,4471099.0,161,7.250763,6.957011,0.69813,40.721,-73.997,40.775,-73.948
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141,7.040536,1,1.0,9,44,380597.0,33,2.361101,2.35237,0.177111,40.76,-73.962,40.781,-73.957
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848,6.744059,2,7.0,6,42,4084869.0,54,4.32854,4.159001,0.681702,40.708,-74.017,40.741,-73.988


In [43]:
pickup_hour_bins=[0,6,10,17,21,24]
df['hourly_bins']=pd.cut(df['pickup_hour'],bins=pickup_hour_bins,right=False)#right=False means eg[0,6) 6 is excluded only upto 5,True means included
ride_counts=df['hourly_bins'].value_counts() 
df['hourly_rides_count']=df['hourly_bins'].apply(lambda x: ride_counts[x])

In [45]:
df['pickup_hour'].value_counts()
#df['pickup_hour'].unique()

pickup_hour
18    45404
19    45262
20    42165
21    42045
22    40293
17    38313
14    37120
12    35820
15    35687
13    35630
23    35069
11    34061
9     33821
8     33612
10    32713
16    32147
7     27907
0     26726
1     19243
6     16552
2     13960
3     10424
4      7827
5      7521
Name: count, dtype: int64

In [47]:
df['hourly_rides_count']

0         243178
1         117407
2         171144
3         111892
4         111892
           ...  
729317    243178
729318     85701
729319    171144
729320    111892
729321    171144
Name: hourly_rides_count, Length: 729322, dtype: category
Categories (5, int64): [85701 < 111892 < 243178 < 171144 < 117407]

In [24]:
df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'trip_duration_log', 'pickup_weekday',
       'pickup_weekofyear', 'pickup_hour', 'pickup_minute', 'pickup_secs',
       'pickup_week_hours', 'distance', 'haversine _distance', 'direction',
       'pickup_latitude_rounded', 'pickup_longitude_rounded',
       'dropoff_latitude_rounded', 'dropoff_longitude_rounded', 'hourly_bins',
       'hourly_rides_count'],
      dtype='object')

In [25]:
df.isnull().sum()

id                           0
vendor_id                    0
pickup_datetime              0
dropoff_datetime             0
passenger_count              0
pickup_longitude             0
pickup_latitude              0
dropoff_longitude            0
dropoff_latitude             0
store_and_fwd_flag           0
trip_duration                0
trip_duration_log            0
pickup_weekday               0
pickup_weekofyear            0
pickup_hour                  0
pickup_minute                0
pickup_secs                  0
pickup_week_hours            0
distance                     0
haversine _distance          0
direction                    0
pickup_latitude_rounded      0
pickup_longitude_rounded     0
dropoff_latitude_rounded     0
dropoff_longitude_rounded    0
hourly_bins                  0
hourly_rides_count           0
dtype: int64

In [26]:
#df.dropna(subset=['trip_duration_for_hours_binning','trip_hours_rides_count'],inplace=True)

In [27]:
df.isnull().sum()

id                           0
vendor_id                    0
pickup_datetime              0
dropoff_datetime             0
passenger_count              0
pickup_longitude             0
pickup_latitude              0
dropoff_longitude            0
dropoff_latitude             0
store_and_fwd_flag           0
trip_duration                0
trip_duration_log            0
pickup_weekday               0
pickup_weekofyear            0
pickup_hour                  0
pickup_minute                0
pickup_secs                  0
pickup_week_hours            0
distance                     0
haversine _distance          0
direction                    0
pickup_latitude_rounded      0
pickup_longitude_rounded     0
dropoff_latitude_rounded     0
dropoff_longitude_rounded    0
hourly_bins                  0
hourly_rides_count           0
dtype: int64

## Model Building

In [29]:
from sklearn.metrics import mean_squared_error as mse
from math import sqrt

In [30]:
df.dtypes

id                                   object
vendor_id                             int64
pickup_datetime              datetime64[ns]
dropoff_datetime             datetime64[ns]
passenger_count                       int64
pickup_longitude                    float64
pickup_latitude                     float64
dropoff_longitude                   float64
dropoff_latitude                    float64
store_and_fwd_flag                   object
trip_duration                         int64
trip_duration_log                   float64
pickup_weekday                        int32
pickup_weekofyear                   float64
pickup_hour                           int32
pickup_minute                         int32
pickup_secs                         float64
pickup_week_hours                     int32
distance                            float64
haversine _distance                 float64
direction                           float64
pickup_latitude_rounded             float64
pickup_longitude_rounded        

In [31]:
#df['trip_duration_for_hours_binning']=df['trip_duration_for_hours_binning'].astype('int32')
df['hourly_rides_count']=df['hourly_rides_count'].astype('int32')

In [32]:
df.dtypes

id                                   object
vendor_id                             int64
pickup_datetime              datetime64[ns]
dropoff_datetime             datetime64[ns]
passenger_count                       int64
pickup_longitude                    float64
pickup_latitude                     float64
dropoff_longitude                   float64
dropoff_latitude                    float64
store_and_fwd_flag                   object
trip_duration                         int64
trip_duration_log                   float64
pickup_weekday                        int32
pickup_weekofyear                   float64
pickup_hour                           int32
pickup_minute                         int32
pickup_secs                         float64
pickup_week_hours                     int32
distance                            float64
haversine _distance                 float64
direction                           float64
pickup_latitude_rounded             float64
pickup_longitude_rounded        

In [33]:
df_y=df['trip_duration_log']
df=df.drop(['id','pickup_datetime','dropoff_datetime','trip_duration','store_and_fwd_flag','hourly_bins','trip_duration_log',],axis=1)


In [34]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df,df_y,test_size=1/3,random_state=0)

#### Mean Prediction

In [36]:
mean_pred=np.repeat(y_train.mean(),len(y_test))
np.sqrt(mse(y_test,mean_pred))

0.7986672307875061

In [37]:
df.columns


Index(['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'pickup_weekday',
       'pickup_weekofyear', 'pickup_hour', 'pickup_minute', 'pickup_secs',
       'pickup_week_hours', 'distance', 'haversine _distance', 'direction',
       'pickup_latitude_rounded', 'pickup_longitude_rounded',
       'dropoff_latitude_rounded', 'dropoff_longitude_rounded',
       'hourly_rides_count'],
      dtype='object')

In [38]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from math import sqrt

def cv_score(ml_model, rstate=11, cols=df.columns):
    i = 1
    cv_scores = []
    df1 = df[cols].copy()

    kf = KFold(n_splits=5, random_state=rstate, shuffle=True)
    for train_index, test_index in kf.split(df1):
        print('\n{} of kfold {}'.format(i, kf.n_splits))
        xtr, xvl = df1.iloc[train_index], df1.iloc[test_index]
        ytr, yvl = df_y.iloc[train_index], df_y.iloc[test_index]

        model = ml_model
        model.fit(xtr, ytr)
        y_pred_train = model.predict(xtr)
        y_pred_val = model.predict(xvl)
        
        rmse_score_train = sqrt(mse(ytr, y_pred_train))
        rmse_score_val = sqrt(mse(yvl, y_pred_val))
        
        msg = ''
        msg += 'Train RMSE: {:.5f}'.format(rmse_score_train)
        msg += '\nValidation RMSE: {:.5f}'.format(rmse_score_val)
        print(msg)
        
        cv_scores.append(rmse_score_val)
        i += 1
    return cv_scores





In [39]:
linreg_scores = cv_score(LinearRegression())


1 of kfold 5
Train RMSE: 0.64602
Validation RMSE: 0.62958

2 of kfold 5
Train RMSE: 0.64643
Validation RMSE: 0.62839

3 of kfold 5
Train RMSE: 0.61886
Validation RMSE: 0.79825

4 of kfold 5
Train RMSE: 0.64548
Validation RMSE: 0.63807

5 of kfold 5
Train RMSE: 0.64442
Validation RMSE: 0.63664


In [40]:
d_tree=cv_score(DecisionTreeRegressor(min_samples_leaf=25,min_samples_split=25))


1 of kfold 5
Train RMSE: 0.37193
Validation RMSE: 0.43453

2 of kfold 5
Train RMSE: 0.37194
Validation RMSE: 0.43747

3 of kfold 5
Train RMSE: 0.37380
Validation RMSE: 0.42781

4 of kfold 5
Train RMSE: 0.37302
Validation RMSE: 0.43078

5 of kfold 5
Train RMSE: 0.37340
Validation RMSE: 0.43178
