In [1]:
# Show the installed scikit-learn version. The pickled artifact loaded further
# down presumably needs a matching scikit-learn release to unpickle cleanly
# (TODO confirm which version produced model.bin).
!pip freeze | grep scikit-learn

scikit-learn==1.0.2


In [2]:
import pickle
import pandas as pd

In [3]:
# model.bin holds a pickled (dv, model) pair: `dv` is later used to transform
# feature dicts and `model` to predict from the transformed features.
# NOTE(security): pickle.load can execute arbitrary code while loading, so
# model.bin must only ever come from a trusted source.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

In [7]:
# Columns that are treated as categorical features (converted to strings).
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip-record parquet file and prepare it for prediction.

    Steps performed:
      * adds a ``duration`` column: drop-off time minus pick-up time,
        expressed in minutes;
      * keeps only the rows whose duration lies in the closed range
        [1, 60] minutes;
      * converts every column listed in ``categorical`` to a string,
        replacing missing values with -1 first (so they become ``'-1'``).

    Parameters
    ----------
    filename
        Anything ``pd.read_parquet`` accepts (local path, URL, ...).

    Returns
    -------
    pandas.DataFrame
        A filtered copy of the loaded data; the surviving rows keep their
        original index labels (the index is not reset).
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta, then converted to fractional minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Both bounds are inclusive; rows with a missing duration are dropped too.
    within_range = trips.duration.between(1, 60)
    trips = trips.loc[within_range].copy()

    # NaN -> -1 -> integer -> string, so IDs read '42' rather than '42.0'.
    trips[categorical] = trips[categorical].fillna(-1).astype('int').astype('str')

    return trips

In [8]:
# Source parquet file for the run. The taxi type and year-month are embedded in
# the file name, so they must be kept in sync with the taxi_type/year/month
# values set further down in this notebook.
input_file = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet"
df = read_data(input_file)

In [7]:
# Build one {column: value} dict per ride from the categorical columns, turn
# those dicts into a feature matrix with the loaded `dv`, and run the loaded
# `model` on it. The predictions are presumably trip durations in minutes
# (the target the model was trained on is not shown here — TODO confirm).
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)
print(y_pred)

[18.52778307 23.06578208 33.68635854 ... 11.89045938 15.10268128
  9.46059157]


In [8]:
# Spread of the predictions, reported with two decimals.
prediction_std = y_pred.std()
print("The standard deviation of our prediction is: %.2f" % prediction_std)

The standard deviation of our prediction is: 5.28


In [17]:
# Run parameters: which taxi dataset and which month the predictions belong to.
taxi_type = 'yellow'
year = 2022
month = 2

# Where the predictions are written: output/<taxi_type>/<YYYY>-<MM>.parquet
output_file = f'output/{taxi_type}/{year:04d}-{month:02d}.parquet'

In [10]:
# Synthetic ride identifier: "<YYYY>/<MM>_" followed by the row's index label.
ride_id_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_id_prefix + df.index.astype('str')

In [11]:
# Start from an empty frame; the result columns are attached to it afterwards.
df_result = pd.DataFrame()

In [12]:
# Order matters: assigning the ride_id Series first gives the (empty) result
# frame the same index as df; y_pred is then attached positionally, so it must
# have exactly one prediction per row of df.
df_result["ride_id"] = df["ride_id"]
df_result["predictions"] = y_pred

In [13]:
# Quick visual check of the first few ride_id / prediction pairs.
df_result.head()

Unnamed: 0,ride_id,predictions
0,2022/02_0,18.527783
1,2022/02_1,23.065782
2,2022/02_2,33.686359
3,2022/02_3,23.757436
4,2022/02_4,21.492904


In [20]:
import os

# Make sure the destination folder for the predictions file exists.
# exist_ok=True makes this cell idempotent (safe to re-run) and avoids the
# race between a separate "does it exist?" check and the actual creation.
path_directory = f'output/{taxi_type}'
os.makedirs(path_directory, exist_ok=True)

In [33]:
# Write the predictions with the pyarrow engine, uncompressed, and without
# storing the DataFrame index as a column.
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)


In [34]:
# Report the on-disk size of the predictions file that was just written.
import os

file_name = output_file
file_stats = os.stat(file_name)
size_in_bytes = file_stats.st_size

# Full stat record first, then the size in bytes and in MiB (1024 * 1024 bytes).
print(file_stats)
print(f'File Size in Bytes is {size_in_bytes:.2f}')
print(f'File Size in MegaBytes is {size_in_bytes / (1024 * 1024):.2f}')

os.stat_result(st_mode=33188, st_ino=32465, st_dev=2080, st_nlink=1, st_uid=1000, st_gid=1000, st_size=59994880, st_atime=1686977945, st_mtime=1686978339, st_ctime=1686978339)
File Size in Bytes is 59994880.00
File Size in MegaBytes is 57.22


### Converting to a Script

In [9]:
# Export this notebook (starter.ipynb) as a plain Python script; nbconvert
# writes the result next to the notebook as starter.py.
!jupyter nbconvert --to script starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 2072 bytes to starter.py
