In [1]:
# Show the installed scikit-learn version by filtering `pip freeze` output
# (shell command executed through IPython's `!` escape).
!pip freeze | grep scikit-learn

scikit-learn==1.7.2


In [2]:
# Show the version of the `python` executable found on the shell's PATH.
# NOTE(review): this may differ from the kernel's interpreter — confirm if it matters.
!python -V

Python 3.10.18


In [3]:
import pickle
import pandas as pd
import numpy as np
import pyarrow
import os

In [4]:
# Restore the (dv, model) pair stored in model.bin; `dv` is later used through
# .transform() and `model` through .predict().
# NOTE(review): pickle.load can run arbitrary code during deserialization, so
# model.bin must come from a trusted source. Pickled scikit-learn objects are
# also only supported when loaded with the library version that wrote them —
# confirm against the version printed earlier in this notebook.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Columns treated as categorical features; read_data casts them to strings.
categorical = [
    'PULocationID',
    'DOLocationID',
]


def read_data(filename):
    """Load trip records from a parquet file and prepare them for scoring.

    Steps performed:
      1. Read ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: drop-off time minus pick-up time, in minutes.
      3. Keep only rows whose duration lies in the closed interval [1, 60].
      4. Fill missing values in the ``categorical`` columns with -1, then cast
         those columns to int and finally to str.

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered copy of the data, keeping the original index labels.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends by default: 1 <= duration <= 60.
    within_bounds = trips['duration'].between(1, 60)
    trips = trips[within_bounds].copy()

    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [6]:
# Year and month of the yellow-taxi trip file to score.
year, month = 2023, 4

# Download and preprocess that month's trips via read_data().
df = read_data(
    f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
)

In [7]:
# Convert the categorical columns of every row into a dict, vectorize the
# dicts with the loaded `dv`, and predict with the loaded `model`.
dicts = df.loc[:, categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [8]:
# Standard deviation of the predictions; ddof=0 is NumPy's default, spelled
# out here to make clear this is the population (not sample) statistic.
std_pred = np.std(y_pred, ddof=0)
print(std_pred)

6.353996941249663


In [9]:
# Per-row identifier of the form "<year>/<month>_<index label>", built from the
# frame's current index.
ride_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_prefix + df.index.astype('str')

In [10]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id
0,1,2023-04-01 00:14:49,2023-04-01 00:45:01,2.0,4.9,1.0,N,48,223,1,...,3.5,0.5,6.0,0.0,1.0,39.9,2.5,0.0,30.2,2023/04_0
1,2,2023-04-01 00:00:24,2023-04-01 00:56:19,1.0,21.89,2.0,N,132,43,2,...,0.0,0.5,0.0,6.55,1.0,81.8,2.5,1.25,55.916667,2023/04_1
2,1,2023-04-01 00:03:50,2023-04-01 00:14:42,2.0,1.3,1.0,N,148,113,1,...,3.5,0.5,2.0,0.0,1.0,18.4,2.5,0.0,10.866667,2023/04_2
3,1,2023-04-01 00:53:18,2023-04-01 01:01:28,1.0,1.5,1.0,N,249,79,1,...,3.5,0.5,1.0,0.0,1.0,16.0,2.5,0.0,8.166667,2023/04_3
4,2,2023-04-01 00:07:00,2023-04-01 00:17:16,2.0,1.49,1.0,N,158,246,1,...,1.0,0.5,1.0,0.0,1.0,17.4,2.5,0.0,10.266667,2023/04_4


In [11]:
# Result frame: the ride_id column of df plus the predictions as a second
# column, so y_pred needs one value per row of df.
df_result = df[['ride_id']].assign(predicted_duration=y_pred)

# Make sure the destination directory exists before writing.
os.makedirs("output/yellow", exist_ok=True)
output_file = f'output/yellow/yellow_tripdata_{year:04d}-{month:02d}.parquet'

# Write with the pyarrow engine, no compression, and without the index.
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

In [12]:
# Size of the written parquet file on disk, in bytes.
size_bytes = os.stat(output_file).st_size
size_bytes

66210815

In [13]:
# Mean of the predicted_duration column of the result frame.
predicted_durations = df_result['predicted_duration']
predicted_duration_mean = predicted_durations.mean()
predicted_duration_mean

np.float64(14.292282936862449)