In [12]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import pickle 
import pandas as pd
import mlflow

In [13]:
from sklearn.pipeline import make_pipeline

In [14]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("green-taxi-ride-duration")

<Experiment: artifact_location='./mlflow-artifacts/1', creation_time=1672742415681, experiment_id='1', last_update_time=1672742415681, lifecycle_stage='active', name='green-taxi-ride-duration', tags={}>

In [15]:
df = pd.read_parquet('../../data/green_tripdata_2021-01.parquet')

In [16]:
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,0.5,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,0.5,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,0.5,0.5,1.0,0.0,,0.3,8.3,1.0,1.0,0.0
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0
4,2,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2.0,265,265,3.0,0.0,-52.0,0.0,-0.5,0.0,0.0,,-0.3,-52.8,3.0,1.0,0.0


In [17]:
def read_data(filename:str):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime 
    df.duration = df.duration.dt.total_seconds()
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    categorical = ['PULocationID','DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [18]:
def prepare_dictionaries(df:pd.DataFrame):
    df['PICKUP_DROPOFF'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PICKUP_DROPOFF']
    numerical = ['trip_distance']
    dict_dataset = df[categorical + numerical].to_dict(orient='records')
    return dict_dataset

In [19]:
df_train = read_data('../../data/green_tripdata_2021-01.parquet')
df_val = read_data('../../data/green_tripdata_2021-02.parquet')

df_train_dict = prepare_dictionaries(df_train)
y_train = df_train['duration'].values

df_val_dict = prepare_dictionaries(df_val)
y_val = df_val['duration'].values

In [20]:
with mlflow.start_run():
#     params = {
#         'max_depth':20,
#         'n_estimators' : 100,
#         'min_samples_leaf':10,
#         'random_state':0
#     }
    
#     mlflow.log_params(params)
    
#     ## initialise vectorizer
#     dv = DictVectorizer()
#     model = RandomForestRegressor(**params, n_jobs=1)
    
#     ### fit & transform vectorizer
#     X_train = dv.fit_transform(df_train_dict)
    
#     ## transform validation
#     X_val = dv.transform(df_val_dict)
    
#     ### apply modle and predict
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_val)

    params = dict(max_depth=20,
                  n_estimators= 100,
                  min_samples_leaf=10,
                   random_state=0)
    
    ## log parameters in mlflow
    mlflow.log_params(params)

    ## above steps are replaced with sklearn pipeline
    pipeline = make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(**params, n_jobs=1)
    )
    
    
    
    ### fit the model
    pipeline.fit(df_train_dict, y_train)
    y_pred = pipeline.predict(df_val_dict)
    
    ### metrics
    rmse = mean_squared_error(y_pred, y_val, squared=False)
    print(f'rmse: {rmse}')
    
    ### log metrics
    mlflow.log_metric('rmse', rmse)
    
    ### log model in mlflow
    mlflow.sklearn.log_model(sk_model=pipeline, artifact_path="model")
    
#     ### create vectosrizer artifact and log artifacts 
#     with open('dict_vectorizer.bin','wb') as f_out:
#         pickle.dump(dv, f_out)
    
#     mlflow.log_artifact('dict_vectorizer.bin')

rmse: 16.53467808612593


