In [27]:
# Show the installed scikit-learn version (pip freeze output filtered by grep).
# NOTE(review): presumably this should match the version model.bin was pickled with — confirm.
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [28]:
# Print the version of the `python` executable that the shell resolves.
!python -V

Python 3.10.13


In [3]:
import pickle
import pandas as pd

In [4]:
# Unpickle the two objects stored in model.bin and bind them to dv and model.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

In [5]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    """Load trip records and prepare them for prediction.

    Reads the parquet file at ``filename``, adds a ``duration`` column
    (drop-off minus pick-up, in minutes), keeps only trips lasting between
    1 and 60 minutes inclusive, and converts the ``categorical`` columns to
    strings (missing values become ``'-1'``).

    Returns the filtered DataFrame; the original index labels are kept.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Series.between is inclusive on both ends by default.
    within_range = trips.duration.between(1, 60)
    trips = trips[within_range].copy()

    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [9]:
# Source parquet file (yellow_tripdata_2023-03), read directly over HTTPS.
trip_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet'
df = read_data(trip_data_url)

In [10]:
# Turn the categorical columns into one dict per row, pass them through the
# loaded dv object's transform(), and run the loaded model's predict() on the result.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

## Q1: Standard deviation of the predicted durations

In [11]:
# Standard deviation of the predicted values.
y_pred.std()

6.247488852238703

## Q2: Preparing the output file and checking its size

In [13]:
# Year and month used as the ride_id prefix; they should agree with the
# data file loaded above (yellow_tripdata_2023-03).
year = 2023
month = 3

In [14]:
# Build an ID of the form 'YYYY/MM_<index label>' for every row.
# read_data filters rows without resetting the index, so the labels used here
# are the original ones and are not necessarily consecutive.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [15]:
# Preview the first rows, including the added duration and ride_id columns.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,ride_id
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,...,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0,2023/03_0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,...,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333,2023/03_1
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,...,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667,2023/03_2
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,...,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667,2023/03_3
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,...,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333,2023/03_4


In [19]:
# Destination path for the predictions.
# NOTE(review): the file is written with DataFrame.to_parquet, so its content is
# Parquet even though the name ends in .csv. Renaming it to .parquet would be
# clearer, but the later `!stat df_result.csv` cell must be updated at the same time.
output_file = 'df_result.csv'

In [17]:
# Pair every ride_id with its prediction in a new two-column frame.
df_result = df[['ride_id']].assign(Result=y_pred)
df_result.head()

Unnamed: 0,ride_id,Result
0,2023/03_0,16.245906
1,2023/03_1,26.134796
2,2023/03_2,11.884264
3,2023/03_3,11.99772
4,2023/03_4,10.234486


In [20]:
# Write the predictions to output_file in Parquet format using the pyarrow
# engine, with compression disabled and without the DataFrame index.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [25]:
!stat df_result.csv

  File: df_result.csv
  Size: 68641704  	Blocks: 134080     IO Block: 4096   regular file
Device: 703h/1795d	Inode: 1573140     Links: 1
Access: (0666/-rw-rw-rw-)  Uid: ( 1000/codespace)   Gid: ( 1000/codespace)
Access: 2024-06-10 09:10:45.683080197 +0000
Modify: 2024-06-10 09:10:45.947080203 +0000
Change: 2024-06-10 09:10:45.947080203 +0000
 Birth: -


## Q3: Converting the notebook to a script

In [26]:
# Export starter.ipynb as a plain Python script (nbconvert writes starter.py).
!jupyter nbconvert --to script starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 1535 bytes to starter.py


## Q4

## Q5: Running the script with command-line arguments

In [1]:
# Run the converted script with two command-line arguments.
# NOTE(review): presumably these are year and month (April 2023) — confirm against starter.py.
!python starter.py 2023 04

14.292282936862449


## Q6: Building and running the Docker image

In [15]:
# Build a Docker image tagged homework4:v1 from the Dockerfile in the current directory.
!docker build -t homework4:v1 .

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[?25h[1A[0G[?25l[+] Building 0.2s (2/3)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.1s
[0m[34m => => transferring dockerfile: 314B                                       0.0s
[0m => [internal] load metadata for docker.io/agrigorev/zoomcamp-model:mlops  0.2s
[34m => [auth] agrigorev/zoomcamp-model:pull token for registry-1.docker.io    0.0s
[0m[?25h[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.4s (2/3)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.1s
[0m[34m => => transferring dockerfile: 314B                                       0.0s
[0m => [internal] load metadata for docker.io/agrigorev/zoomcamp-model:mlops  0.3s
[34m => [auth] agrigorev/zoomcamp-model:pull token for registry-1.docker.io    0

In [16]:
# Start a container from the homework4:v1 image and show what it prints.
!docker run homework4:v1 

0.19174419265916945
