In [2]:
# Check which interpreter environment or dependencies are being utilised.
# `which python` only reports the first `python` found on the shell PATH, which
# is not necessarily the interpreter running this kernel; sys.executable always
# is, and it also works outside IPython.
# To list the packages installed in the kernel's environment, run: %pip freeze
import sys

print(sys.executable)

/workspaces/mlopsZoomCamp/.venv/bin/python


In [10]:
# Packages
import os, sys
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Show the directory the kernel is running in.
workingDir = os.getcwd()
print(workingDir)

/workspaces/mlopsZoomCamp/01-intro


In [2]:
# Reading parquet data files (yellow trip data for January and February 2023).
# To download a file, run 'wget <url_link>' on the command line from the
# directory the file should be saved into (data_path below).
data_path = '/workspaces/mlopsZoomCamp/data/'
# Both months are read the same way, January first and then February.
janYellow23, febYellow23 = (
    pd.read_parquet(data_path + fileName)
    for fileName in ('yellow_tripdata_2023-01.parquet', 'yellow_tripdata_2023-02.parquet')
)
# Preview the first rows of the January data.
display(janYellow23.head())

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [8]:
# Q1) How many columns are in the January data?
# NOTE: the count reflects janYellow23 as it is when this cell runs, so it
# includes any column that has been added to the frame by then.
print(f'There are {len(janYellow23.columns)} columns in the January data.')

There are 19 columns in the January data.


In [3]:
# Q2) What is the standard deviation for duration in the January data?

# Add a duration column in minutes: drop-off minus pick-up gives a timedelta,
# which is converted to seconds and then divided by 60.
janYellow23['duration'] = (
    (janYellow23['tpep_dropoff_datetime'] - janYellow23['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# np.std uses ddof=0 by default, i.e. the population standard deviation.
durationStd = np.std(janYellow23['duration'])

# The value is computed outside the f-string on purpose: reusing the enclosing
# quote character inside an f-string expression is a SyntaxError before
# Python 3.12.
print(f'The standard deviation for duration of trips in January is {durationStd} minutes.')

The standard deviation for duration of trips in January is 42.59434429744777 minutes.


In [4]:
# Q3) What percentage of records is left after dropping outliers
#     (trips whose duration is not between 1 and 60 minutes inclusive)?

# NOTE: this cell overwrites janYellow23 with the filtered rows, so running it
# a second time without re-running the cells above would report 100.00%.
initJanRows = janYellow23.shape[0]
janYellow23 = janYellow23[janYellow23['duration'].between(1, 60, inclusive = 'both')]

# The ratio is multiplied by 100, so the printed value is a percentage and is
# labelled as such (it is not a 0-1 fraction).
print('The percentage of records kept is {:.2f}%'.format(janYellow23.shape[0]*100/initJanRows))

The fraction of records kept are 98.12


In [17]:
# Q4) What is the dimensionality of the feature matrix?

# One-hot encoding method using DictVectorizer
#  - for more info see https://dev.to/victor_isaac_king/one-hot-encoding-with-dictvectorizer-1317
# The location IDs are cast to str so that DictVectorizer one-hot encodes them
# as categories instead of passing them through as numeric values.
featDict = janYellow23[['PULocationID', 'DOLocationID']].astype(str).to_dict(orient = 'records')

# Default (sparse) output is kept; sparse=False is only an option for a small dataset.
dv = DictVectorizer()

# fit_transform learns the feature mapping and encodes the records in a single
# pass, instead of iterating over all records twice with fit() then transform().
# dv stays fitted, so it can still be used to transform other data later.
featMat = dv.fit_transform(featDict)
print(featMat.shape)

(3009173, 515)


In [11]:
# Q5) RMSE on training a plain linear regression model?

# Fit on the January feature matrix with trip duration (minutes) as the target.
# fit() returns the fitted estimator, so construction and fitting are chained.
regressor = LinearRegression().fit(featMat, janYellow23['duration'])

# Predictions are made on the same rows the model was fitted on, so this RMSE
# is the training error.
predictJan = regressor.predict(featMat)
rmse = root_mean_squared_error(janYellow23['duration'], predictJan)
print(f'RMSE of the plain linear regression model: {rmse:.2f}')

RMSE of the plain linear regression model: 7.65


In [18]:
# Q6) RMSE on validation?

# Setting features and label for the February (validation) data, using the
# same steps as for January.
# Label: trip duration in minutes.
febYellow23['duration'] = (
    (febYellow23['tpep_dropoff_datetime'] - febYellow23['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)
# Keep only trips lasting between 1 and 60 minutes (both bounds included).
febYellow23 = febYellow23.loc[febYellow23['duration'].between(1, 60, inclusive='both')]

# Features: pick-up / drop-off location IDs as strings, one dict per trip.
featDictFeb = febYellow23[['PULocationID', 'DOLocationID']].astype(str).to_dict('records')
# Only transform (no re-fit): the vectorizer fitted on January is reused so the
# February matrix has the same columns as the training matrix.
featMatFeb = dv.transform(featDictFeb)
print(featMatFeb.shape)

(2855951, 515)


In [19]:
# Running predictions on the validation data: the model fitted on January is
# applied to the February feature matrix and scored against the February
# trip durations.
predictFeb = regressor.predict(featMatFeb)
rmseVal = root_mean_squared_error(febYellow23['duration'], predictFeb)
print(f'RMSE on validation data: {rmseVal:.3f}')

RMSE on validation data: 7.812
