In [1]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==1.5.0
!pip install pyarrow
!pip install wget


Found existing installation: scikit-learn 1.5.0
Uninstalling scikit-learn-1.5.0:
  Successfully uninstalled scikit-learn-1.5.0
Collecting scikit-learn==1.5.0
  Using cached scikit_learn-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Using cached scikit_learn-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.5.0


# 04 - Deployment

In [1]:
import json
import os
import pickle
import platform
import subprocess as sp
from argparse import ArgumentParser
from glob import glob

import pandas as pd
import wget

In [2]:
# Which monthly trip file to score.
trip_category = 'yellow'
year = 2023
month = 3

# Local directories for the downloaded input and the written predictions.
input_dir = 'input'
output_dir = 'output'

# Columns that read_data casts to strings and that are fed to the vectorizer.
categorical = ['PULocationID', 'DOLocationID']

# Names derived from the settings above: the monthly file name, the URL it is
# downloaded from, and the path the predictions are written to.
filename = f'{trip_category}_tripdata_{year}-{month:02d}.parquet'
input_file_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{filename}'
output_file = f'{output_dir}/{filename}'

# Version of the interpreter running this kernel (passed to pipenv in the Q4 cell).
python_version = platform.python_version()

In [3]:
def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    trips lasting between 1 and 60 minutes inclusive, and converts the columns
    named in the module-level ``categorical`` list to strings, with missing
    values encoded as ``'-1'``.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        A new DataFrame with the filtered rows; the original row index is kept.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta first, then expressed in minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    in_range = trips.duration.between(1, 60)
    trips = trips[in_range].copy()

    # Missing location IDs become -1 before the int -> str conversion.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

## Loading the model and predicting

In [4]:
# Download this month's trip file only if it is not already cached locally.
# The exact expected path is checked (instead of testing for the directory and
# globbing it) so that an existing-but-empty directory still triggers a
# download and stray parquet files from other months are never picked up.
os.makedirs(input_dir, exist_ok=True)
input_file = os.path.join(input_dir, filename)
if not os.path.exists(input_file):
    # wget.download returns the path of the file it wrote.
    input_file = wget.download(input_file_url, out=input_dir)

df = read_data(input_file)

# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load a model.bin that comes from a trusted source.
# model.bin holds a pair: an object with .transform and an object with .predict.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

# Turn the categorical columns into per-row dicts, vectorize them, and predict.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

## Q1. Standard deviation

In [5]:
# Spread of the predicted durations, shown to three decimals.
print(f'Standard Deviation = {y_pred.std():.3f}')

Standard Deviation = 6.247


## Q2. Preparing the output

In [6]:
# Build the result frame in one step: a ride_id of the form
# "<year>/<month>_<row index of df>" next to the predicted duration.
df_result = pd.DataFrame({
    'ride_id': f'{year:04d}/{month:02d}_' + df.index.astype('str'),
    'predicted_duration': y_pred,
})

# to_parquet overwrites an existing file, so only the output directory needs
# to be ensured; exist_ok avoids the separate existence checks.
os.makedirs(output_dir, exist_ok=True)
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False,
)

# Report the size of the written file in MiB.
print(f'{filename} - {(os.path.getsize(output_file)/1024**2):.2f}M')

yellow_tripdata_2023-03.parquet - 65.46M


## Q3. Creating the scoring script.

In [17]:
# Convert the notebook to a plain Python script. The command is passed as an
# argument list (no shell needed), and check=True raises on a non-zero exit
# instead of letting a failed conversion pass silently.
sp_out = sp.run(
    ["jupyter", "nbconvert", "--to", "script", "homework.ipynb"],
    capture_output=True,
    check=True,
)
# nbconvert writes its log messages to stderr, so show both streams.
print(sp_out.stdout.decode(), sp_out.stderr.decode(), sep='')




## Q4. Creating a virtual environment and fetching the first hash of scikit-learn.

In [14]:
# Write only the packages the scoring script needs to requirements.txt.
# The pipe and redirect require a shell, so the command is a single string.
# check=True fails fast if grep matches nothing, rather than continuing with
# an empty requirements file.
freeze_cmd = "pip freeze | grep -e 'scikit' -e 'wget' -e 'pandas' -e 'pyarrow' > requirements.txt"
sp.run(freeze_cmd, shell=True, check=True)

# Create the pipenv environment for the kernel's Python version. No shell is
# needed here, so the command is passed as an argument list.
sp_out = sp.run(
    ["pipenv", "install", "-r", "requirements.txt", f"--python={python_version}"],
    capture_output=True,
)
if sp_out.returncode != 0:
    # Surface pipenv's error and stop, so a stale Pipfile.lock is never read.
    print(sp_out.stderr.decode())
    sp_out.check_returncode()

with open('Pipfile.lock', 'r') as f:
    lock_data = json.load(f)

# Pulled out of the f-string: reusing the same quote character inside an
# f-string expression is a SyntaxError on Python versions before 3.12.
first_hash = lock_data['default']['scikit-learn']['hashes'][0]
print(f'First hash of Scikit-learn dependency = "{first_hash}"')


First hash of Scikit-learn dependency = "sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c"
