In [5]:
import os
import pickle

import pandas as pd

In [6]:
# Load the persisted artifacts. The pickle is unpacked into exactly two
# objects: `dv` (used later via .transform) and `model` (used later via
# .predict).
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# model.bin must come from a trusted source.
with open("model.bin", "rb") as f_in:
    dv, model = pickle.load(f_in)

Note: pickled models should only be loaded from trusted sources. See the scikit-learn documentation on the security and maintainability limitations of model persistence: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
categorical = ["PULocationID", "DOLocationID"]


def read_data(filename):
    """Load a trip-record parquet file and prepare it for prediction.

    Adds a ``duration`` column (drop-off time minus pick-up time, expressed
    in minutes), keeps only the rows whose duration lies between 1 and 60
    minutes inclusive, and converts the ``categorical`` columns to strings,
    with missing values encoded as ``"-1"``.

    Args:
        filename: Location of the parquet file; passed unchanged to
            ``pd.read_parquet``.

    Returns:
        A new DataFrame holding the filtered rows. Row labels of the
        surviving rows are kept as they were in the loaded frame.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips["tpep_dropoff_datetime"] - trips["tpep_pickup_datetime"]
    trips["duration"] = elapsed.dt.total_seconds() / 60

    # Series.between includes both endpoints by default.
    keep = trips["duration"].between(1, 60)
    trips = trips[keep].copy()

    # Missing IDs become -1 before the int -> str conversion.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype("int").astype("str")

    return trips

In [8]:
# Period to score; these two values are reused later for ride IDs and the
# output file name.
year = 2023
month = 3

# Source file for the selected period (zero-padded year and month in the name).
trip_data_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet"
df = read_data(trip_data_url)

In [9]:
# Turn the categorical columns into one {column: value} dict per row, which is
# the input passed to the loaded `dv` transformer.
dicts = df[categorical].to_dict(orient="records")
# Encode the dicts with the already-fitted `dv` (transform only, no refit).
X_val = dv.transform(dicts)
# Predictions for every row of df; stored later as the "duration" column.
y_pred = model.predict(X_val)

In [10]:
# Report the standard deviation of the predicted durations.
print(f"The standard deviation of the prediction is {y_pred.std()}")

6.24751524449896

In [11]:
# Ride identifier: "<YYYY>/<MM>_" followed by the row's index label in df.
ride_id = f"{year:04d}/{month:02d}_" + df.index.astype("str")

# Keep the identifier on the source frame as well.
df["ride_id"] = ride_id

# Result table: one row per ride, pairing its identifier with its prediction.
df_result = pd.DataFrame({"ride_id": ride_id, "duration": y_pred})

In [12]:
# Preview the first rows of the result table (ride_id, duration).
df_result.head()

Unnamed: 0,ride_id,duration
0,2023/03_0,16.246032
1,2023/03_1,26.135071
2,2023/03_2,11.884346
3,2023/03_3,11.997632
4,2023/03_4,10.234435


In [13]:
# Output location: <pred_dir>/prediction_<year>_<month>.parquet
pred_dir = "predictions"
prediction_name = f"prediction_{year}_{month}.parquet"
prediction_path = os.path.join(pred_dir, prediction_name)

# Create the target directory on first run; a no-op when it already exists.
os.makedirs(pred_dir, exist_ok=True)

# Written with the pyarrow engine, uncompressed and without the index.
df_result.to_parquet(prediction_path, engine="pyarrow", compression=None, index=False)

In [15]:
# Report the on-disk size of the prediction file. The byte count from
# os.path.getsize is divided by 1024**2, so the figure labelled "MB" is
# measured in mebibytes.
print(
    f"The size of the prediction file is {os.path.getsize(os.path.join(pred_dir, prediction_name))/(1024**2)} MB"
)

The size of the prediction file is 65.46185111999512 MB


In [16]:
# Load the watermark IPython extension and print its default report
# (timestamp, Python/IPython versions, and machine details).
%load_ext watermark
%watermark

Last updated: 2024-06-10T17:57:57.615931+02:00

Python implementation: CPython
Python version       : 3.10.14
IPython version      : 8.24.0

Compiler    : GCC 12.3.0
OS          : Linux
Release     : 6.0.0-6mx-amd64
Machine     : x86_64
Processor   : 
CPU cores   : 4
Architecture: 64bit



In [17]:
# Record the installed versions of the listed packages.
%watermark --packages pandas,sklearn

pandas : 2.2.2
sklearn: 1.4.2

