Import the files from google drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Import the required modules

In [2]:
import numpy as np
import pandas as pd
from geopy import distance
import os
import seaborn

Changing the directory to get the files

In [3]:
%cd /content/gdrive/MyDrive/

/content/gdrive/MyDrive


**Data preprocessing the train data**

In [4]:
df=pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [5]:
df.shape

(1458644, 11)

Feature creation using the datetime functions of pandas

In [6]:
df['pickup_datetime']= pd.to_datetime(df.pickup_datetime, format='%Y-%m-%d %H:%M:%S')
df['day_of_the_date']=df.pickup_datetime.dt.dayofweek
df['month'] = df.pickup_datetime.dt.month
df['day'] = df.pickup_datetime.dt.day
df['hour'] = df.pickup_datetime.dt.hour
df['minute'] = df.pickup_datetime.dt.minute
df.head(7)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,day_of_the_date,month,day,hour,minute
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,0,3,14,17,24
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,6,6,12,0,43
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,1,1,19,11,35
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,2,4,6,19,32
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,5,3,26,13,30
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,443,5,1,30,22,1
6,id1813257,1,2016-06-17 22:34:59,2016-06-17 22:40:40,4,-73.969017,40.757839,-73.957405,40.765896,N,341,4,6,17,22,34


Creating a new column for the log values of trip duration

In [7]:
df['trip_duration_log']=np.log(df['trip_duration'].values)

Using the geopy module to calculate the distance between 2 given latitude and longitude

In [8]:
def req_distance(row):
    c1 = (row['pickup_latitude'], row['pickup_longitude'])
    c2 = (row['dropoff_latitude'], row['dropoff_longitude'])
    return distance.distance(c1, c2).km
df['distance'] = df.apply(req_distance, axis=1)

**Test Data**

In [9]:
test_df=pd.read_csv('test.csv')

Again creating new features with datetime functions of pandas

In [10]:
test_df['pickup_datetime']= pd.to_datetime(test_df.pickup_datetime, format='%Y-%m-%d %H:%M:%S')
test_df['day_of_the_date']=test_df.pickup_datetime.dt.dayofweek
test_df['month'] = test_df.pickup_datetime.dt.month
test_df['day'] = test_df.pickup_datetime.dt.day
test_df['hour'] = test_df.pickup_datetime.dt.hour
test_df['minute'] = test_df.pickup_datetime.dt.minute
test_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,day_of_the_date,month,day,hour,minute
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,3,6,30,23,59
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,3,6,30,23,59
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,3,6,30,23,59
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,3,6,30,23,59
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,3,6,30,23,59


In [11]:
test_df['distance']=test_df.apply(req_distance,axis=1)

Dropping the unnecessary features or we can create an empty list and take the values of required features in the empty list and initialize x to the required featured list

In [12]:
X=df.drop(['id','vendor_id','passenger_count','store_and_fwd_flag','minute','trip_duration','pickup_datetime','dropoff_datetime','trip_duration_log'],axis=1)
y=df['trip_duration_log']
X_test=test_df.drop(['id','vendor_id','minute','passenger_count','pickup_datetime','store_and_fwd_flag','minute','distance'],axis=1)

splitting the data

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size=0.2, random_state=42)

an important step to check the shape of train(both xtrain and ytrain) and test data(both xtest and ytest)

In [14]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape


((1166915, 9), (291729, 9), (1166915,), (291729,))

Using the random Forest regressor module

In [15]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=5,min_samples_leaf=3, min_samples_split=5, n_jobs=1, max_features="sqrt", criterion = 'poisson')

In [16]:
model.fit(X_train, y_train)

Checking the model score

In [17]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, X_valid, y_valid, cv=5)
# for i in range (len (cv_scores)):
#     cv_scores[i]=np.sqrt(abs(cv_scores[i]))
# print(np.mean(cv_scores))
cv_scores

array([0.69107766, 0.68400073, 0.68760568, 0.689006  , 0.68223908])

In [18]:
yhat = model.score(X_valid, y_valid)
yhat

0.7141546150161318

Calculating the mean squared error

In [19]:
from sklearn.metrics import accuracy_score, mean_squared_error
from math import sqrt

Calculating the root mean squared error

In [22]:
mse = mean_squared_error(yhat, y_valid)
rmse = sqrt(mse)
rmse

TypeError: ignored

Now using linear regression

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
model1 = LinearRegression()
model1.fit(X_train, y_train)

Predicting the score of the linear regression model

In [25]:
model1.score(X_valid, y_valid)

0.3643155106258783