In [1]:
!python -V

Python 3.9.12


In [32]:
import pandas as pd
import os

In [3]:
import pickle

In [4]:
!pip install pyarrow



In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [15]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [16]:
df_train = read_dataframe('./data/yellow_tripdata_2023-01.parquet')
df_val = read_dataframe('./data/yellow_tripdata_2023-02.parquet')

In [17]:
len(df_train), len(df_val)

(3009173, 2855951)

In [18]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [19]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [20]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.811472580390592

In [21]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [5]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [6]:
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [13]:
#df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [7]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333


In [9]:
with open('models/lin_reg.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [10]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [11]:
df['prediction']=y_pred

In [12]:
df['prediction'].std()

6.2466790198669395

In [None]:
#Question 1: What's the standard deviation of the predicted duration for this dataset?
#Answer: 6.24

In [14]:
import datetime

In [28]:
year=2023
month=3
taxi_type='yellow'

In [17]:
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [21]:
df.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,prediction,ride_id
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,...,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0,16.238474,2023/03_0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,...,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333,26.133216,2023/03_1
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,...,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667,11.883914,2023/03_2


In [37]:
!mkdir output

In [40]:
output_file = f'output/{year:04d}-{month:02d}.parquet'

In [45]:
df_result=pd.DataFrame()

In [47]:
df_result['prediction']=df['prediction']
df_result['ride_id']=df['ride_id']

In [48]:
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [49]:
file_name = "./output/2023-03.parquet"

file_stats = os.stat(file_name)

print(file_stats)
print(f'File Size in Bytes is {file_stats.st_size}')
print(f'File Size in MegaBytes is {file_stats.st_size / (1024 * 1024)}')

os.stat_result(st_mode=33206, st_ino=1474759, st_dev=1795, st_nlink=1, st_uid=1000, st_gid=1000, st_size=68641758, st_atime=1718822403, st_mtime=1718822384, st_ctime=1718822384)
File Size in Bytes is 68641758
File Size in MegaBytes is 65.46188163757324


In [None]:
#Q2. What is size of output file?

#Answer: Size of output file is 65.46 MB (approx.66 MB)

In [None]:
#Q3: Q3. Creating the scoring script from this notebook
#Answer: jupyter nbconvert --to script Home_work_Module_4.ipynb

In [None]:
#Q4:Virtual environment

Now let's put everything into a virtual environment. We'll use pipenv for that.

Install all the required libraries. Pay attention to the Scikit-Learn version: it should be the same as in the starter notebook.

After installing the libraries, pipenv creates two files: Pipfile and Pipfile.lock. The Pipfile.lock file keeps the hashes of the dependencies we use for the virtual env.

What's the first hash for the Scikit-Learn dependency?

In [None]:
#Answer:First hash for the Scikit-Learn dependency is : "sha256:08ef968f6b72033c16c479c966bf37ccd49b06ea91b765e1cc27afefe723920b",

In [1]:
pip freeze | grep scikit-learn

scikit-learn @ file:///tmp/build/80754af9/scikit-learn_1642617106979/work
scikit-learn-intelex==2021.20220215.212715
Note: you may need to restart the kernel to use updated packages.


In [None]:
#Q5. Parametrize the script

Let's now make the script configurable via CLI. We'll create two parameters: year and month.

Run the script for April 2023.

What's the mean predicted duration?

7.29
14.29
21.29
28.29

In [None]:
#Answer:14.29

In [1]:
!ls

Home_work_Module_4.ipynb  data	models	output	score.ipynb  score.py


In [4]:
cd output/

[Errno 2] No such file or directory: 'output/'
/workspaces/MLOpsZoomcamp2024/04-deployment/output


In [5]:
ls

2023-03.parquet  [0m[34;42myellow[0m/


In [6]:
cd yellow/

/workspaces/MLOpsZoomcamp2024/04-deployment/output/yellow


In [7]:
ls

2023-04.parquet  2023-05.parquet


In [8]:
import pandas as pd


In [9]:
!pip install pyarrow



In [11]:
df=pd.read_parquet('2023-04.parquet')

In [12]:
df.head()

Unnamed: 0,tpep_pickup_datetime,PULocationID,DOLocationID,actual_duration,predicted_duration,diff
0,2023-04-01 00:14:49,48,223,30.2,16.123378,14.076622
1,2023-04-01 00:00:24,132,43,55.916667,32.265513,23.651154
2,2023-04-01 00:03:50,148,113,10.866667,12.254964,-1.388297
3,2023-04-01 00:53:18,249,79,8.166667,12.131951,-3.965285
4,2023-04-01 00:07:00,158,246,10.266667,13.071496,-2.804829


In [13]:
mean_predicted_duration=df['predicted_duration'].mean()

In [5]:
mean_predicted_duration

NameError: name 'mean_predicted_duration' is not defined

In [2]:
import pandas as pd

In [3]:
!pip install pyarrow



In [7]:
cd output/

[Errno 2] No such file or directory: 'output/'
/workspaces/MLOpsZoomcamp2024/04-deployment/output


In [9]:
ls

2023-03.parquet  [0m[34;42myellow[0m/


In [10]:
cd yellow/

/workspaces/MLOpsZoomcamp2024/04-deployment/output/yellow


In [11]:
ls

2023-04.parquet  2023-05.parquet


In [12]:
df1=pd.read_parquet('2023-05.parquet')

In [13]:
mean_predicted_duration1=df1['predicted_duration'].mean()

In [14]:
mean_predicted_duration1

14.24265663004003