# 001 - Intro: Trip Duration Prediction

Dataset: Yellow Taxi Trip Records - January 2023 (Train) <br>
Yellow Taxi Trip Records - February 2023 (Validation)

In [2]:
!pip install pyarrow



In [3]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction import DictVectorizer

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the January 2023 (training) and February 2023 (validation) trip records
# straight from the public parquet files.
january_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
february_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

trip_df = pd.read_parquet(january_url)
trip_df_val = pd.read_parquet(february_url)

# Number of columns in the January frame.
print(trip_df.shape[1])

19


In [5]:
# Report how many columns the January frame has.
n_columns = trip_df.shape[1]
print(f"There are {n_columns} columns in Yellow Taxi Trip Records for - January")

There are 19 columns in Yellow Taxi Trip Records for - January


## Data Assessment & Cleaning

In [6]:
# Show the first five January trips to get a feel for the columns.
trip_df.head(5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [7]:
# Print a concise summary of the January frame: index range, column names
# and the dtype of each column.
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee            float64       

In [8]:
# Count missing values per column.
trip_df.isna().sum()

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          71743
trip_distance                0
RatecodeID               71743
store_and_fwd_flag       71743
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71743
airport_fee              71743
dtype: int64

In [9]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
trip_df.duplicated().sum()

0

In [10]:
# Show the dtype of every column; the pickup/dropoff timestamps are subtracted
# below, so they need to be datetime64 rather than strings.
trip_df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [11]:
# Trip duration in minutes: elapsed time between pickup and dropoff.
elapsed_seconds = (trip_df['tpep_dropoff_datetime'] - trip_df['tpep_pickup_datetime']).dt.total_seconds()
trip_df['duration'] = elapsed_seconds / 60

# Standard deviation of the duration column, rounded to two decimals.
duration_std = round(trip_df['duration'].std(), 2)
print(f"The standard duration of trips in January is: {duration_std} minutes")

The standard duration of trips in January is: 42.59 minutes


In [12]:
# Keep only trips lasting between 1 and 60 minutes (inclusive); anything
# outside that window is treated as an outlier.
mask = (trip_df.duration >= 1) & (trip_df.duration <= 60)

# .copy() makes trip_df_clean an independent frame, so assigning columns on it
# later does not trigger pandas' SettingWithCopyWarning (which the global
# warnings filter would otherwise silently hide).
trip_df_clean = trip_df[mask].copy()

fraction = len(trip_df_clean) / len(trip_df)

# Scale to a percentage *before* rounding: round(fraction, 2) * 100 can print
# float artefacts such as 56.99999999999999 for some fractions.
print(f"Fraction of the records left after dropping outliers is: {round(fraction * 100, 0)}%")

Fraction of the records left after dropping outliers is: 98.0%


In [13]:
# Check how the pickup location ID is stored before encoding it.
trip_df.dtypes['PULocationID']

dtype('int64')

In [14]:
# Location IDs are categorical labels, not quantities: cast them to strings so
# DictVectorizer one-hot encodes each ID instead of treating it as a number.
# astype() with a column mapping returns a new frame, so this stays safe even
# when trip_df_clean is a filtered slice of trip_df (column-wise assignment on
# such a slice raises SettingWithCopyWarning).
trip_df_clean = trip_df_clean.astype({'PULocationID': str, 'DOLocationID': str})

# One {'PULocationID': ..., 'DOLocationID': ...} record per trip.
dicts = trip_df_clean[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Fit the vectorizer on the training records; the fitted `dv` is reused to
# transform the validation records so both matrices share the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(dicts)

print(f"The dimensionality of the matrix is: {X_train.shape[1]}")

The dimensionality of the matrix is: 515


In [15]:
# Trip duration in minutes for the February data, computed the same way as
# the training target.
trip_df_val['duration'] = (trip_df_val['tpep_dropoff_datetime'] - trip_df_val['tpep_pickup_datetime']).dt.total_seconds() / 60

# Apply the same 1-60 minute (inclusive) outlier filter used for training.
val_mask = (trip_df_val.duration >= 1) & (trip_df_val.duration <= 60)

# Cast the location IDs to strings in one non-mutating step: astype() returns
# a new frame, so nothing is assigned into a filtered slice of trip_df_val
# (which would raise SettingWithCopyWarning).
val_clean = trip_df_val[val_mask].astype({'PULocationID': str, 'DOLocationID': str})

In [16]:
# Target: trip duration in minutes for the cleaned January trips.
y_train = trip_df_clean['duration'].to_numpy()

# Fit an ordinary least-squares model on the one-hot encoded location IDs.
lr_model = LinearRegression().fit(X_train, y_train)

# Evaluate on the data the model was trained on.
y_pred = lr_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred)
train_rmse = np.sqrt(train_mse)
print(f"The RMSE on the training set is: {round(train_rmse, 2)}")


The RMSE on the training set is: 7.65


In [None]:
# Encode the validation trips with the vectorizer fitted on the training data
# (transform only, no refit) and pull out the matching target vector.
feature_columns = ['PULocationID', 'DOLocationID']
val_dicts = val_clean[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = val_clean['duration'].to_numpy()

In [None]:
# Score the trained model on the held-out February trips.
y_val_pred = lr_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
rmse_val = np.sqrt(val_mse)
print(f"The RMSE on the validation set is: {round(rmse_val, 2)}")