### Model Management 

In [18]:
import os
import pickle
import warnings

import mlflow
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Point MLflow at the local SQLite tracking store and select the experiment
# that all runs in this notebook are recorded under.
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
mlflow.set_experiment(experiment_name='experiment_1')

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='experiment_1', tags={}>

In [4]:
# 1. Helper functions for loading the data and building the feature frame
def prepare_df(filename):
    """Load a trip-record parquet file and derive the `duration` target.

    Parameters
    ----------
    filename
        Path of the parquet file; passed straight to `pd.read_parquet`.
        The file must provide `lpep_pickup_datetime` and
        `lpep_dropoff_datetime` columns whose difference is a timedelta
        (i.e. both are datetime-typed).

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies in the closed interval [1, 60]
        minutes, with an added float `duration` column expressed in minutes.
        Surviving rows keep their original index labels.
    """
    data = pd.read_parquet(filename)

    # Target creation: trip length in minutes. The vectorised `.dt` accessor
    # replaces a per-row Python lambda and always yields a float column, so
    # the numeric comparison below also works when the frame has no rows.
    trip_time = data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']
    data['duration'] = trip_time.dt.total_seconds() / 60

    # Filtering: keep trips lasting between 1 and 60 minutes (inclusive).
    within_range = (data['duration'] >= 1) & (data['duration'] <= 60)
    return data[within_range]

def transform_df(df, cat_columns, num_columns):
    """Assemble the modelling frame from numeric, categorical and target columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain every name in `cat_columns` and
        `num_columns` plus a `duration` column. It is not modified.
    cat_columns : list
        Columns to one-hot encode. Each dummy column is named
        `<column>_is_<value>`.
    num_columns : list
        Columns copied through unchanged.

    Returns
    -------
    pandas.DataFrame
        `num_columns` first, then the dummy columns, then `duration`.
    """
    encoded = pd.get_dummies(df[cat_columns], columns=cat_columns, prefix_sep='_is_')
    features = pd.concat([df[num_columns], encoded], axis=1)
    # Attach the target last so it ends up as the final column.
    return features.assign(duration=df['duration'])

In [5]:
# Load and filter the raw trip records: January for training, February for validation.
df_train = prepare_df('../data/green_tripdata_2021-01.parquet')
df_val = prepare_df('../data/green_tripdata_2021-02.parquet')

cat_columns = ['PULocationID', 'DOLocationID']
num_columns = ['trip_distance']

df_train = transform_df(df_train, cat_columns=cat_columns, num_columns=num_columns)
df_val = transform_df(df_val, cat_columns=cat_columns, num_columns=num_columns)

# Each month is one-hot encoded on its own, so either frame may contain dummy
# columns the other lacks. Keep only the shared columns, in the training
# frame's order. An ordered list is used instead of a set: recent pandas
# versions reject set indexers, and set iteration order differs between
# kernel sessions, which would make the feature order non-reproducible.
common_features = [col for col in df_train.columns if col in df_val.columns]
df_train = df_train[common_features]
df_val = df_val[common_features]

print('Train Shape: ', df_train.shape)
print('Validation Shape: ', df_val.shape)

# Train/Validation Data
X_train = df_train.drop(columns=['duration'])
y_train = df_train['duration']

# NOTE: the `*_test` names hold the validation (February) split.
X_test = df_val.drop(columns=['duration'])
y_test = df_val['duration']

Train Shape:  (73908, 496)
Validation Shape:  (61921, 496)


### Model Logging 
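A model can be logged in two ways:
- As a plain artifact: `mlflow.log_artifact('model_name', artifact_path='...')`
- With a flavor-specific `log_model` call, e.g. `mlflow.sklearn.log_model(model, artifact_path='...')`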

In [17]:
# Train a Lasso model inside a single MLflow run and record its parameters,
# metrics, the fitted model, and a pickled copy of the training frame.
with mlflow.start_run():
    mlflow.set_tag('MLE', 'vlad')  # tag info
    # Keep track of used data
    mlflow.log_param('train_data_path', '../data/green_tripdata_2021-01.parquet')
    mlflow.log_param('valid_data_path', '../data/green_tripdata_2021-02.parquet')

    alpha = 0.01
    mlflow.log_param('alpha', alpha)  # log hyperparameters

    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    preds_train = model.predict(X_train)
    preds_val = model.predict(X_test)

    # RMSE is computed as sqrt(MSE) explicitly: the `squared=False` shortcut
    # is deprecated and absent from recent scikit-learn releases, while this
    # form works on every version.
    rmse_train = mean_squared_error(y_train, preds_train) ** 0.5
    rmse_val = mean_squared_error(y_test, preds_val) ** 0.5

    # Log the metrics
    mlflow.log_metric('rmse_train', rmse_train)
    mlflow.log_metric('rmse_val', rmse_val)

    # Save the train data. `open` does not create missing directories, so
    # make sure the local staging folder exists first.
    os.makedirs('artifacts', exist_ok=True)
    with open('artifacts/train_data.b', 'wb') as f_out:
        pickle.dump(df_train, f_out)

    # Log the model
    mlflow.sklearn.log_model(model, artifact_path='models_mlflow')

    # Log the model data
    mlflow.log_artifact('artifacts/train_data.b', artifact_path='model_features')

### Model Loading from MLflow

In [20]:
# Option 1: load through the generic pyfunc flavor.
# The URI has the form runs:/<run_id>/<artifact_path>. The run ID is
# hard-coded, so that run must already exist in the configured tracking
# store; replace it with the ID of your own run when re-executing.
logged_model = 'runs:/a8429de221ce4154923a9ad2c8dce145/models_mlflow'

# Wrap the logged model in a PyFuncModel and display its summary.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.sklearn
  run_id: a8429de221ce4154923a9ad2c8dce145

In [21]:
# Option 2: load through the scikit-learn flavor, which returns the
# estimator object itself (reuses the `logged_model` URI defined earlier).
loaded_model = mlflow.sklearn.load_model(model_uri=logged_model)
loaded_model

Lasso(alpha=0.01)

In [22]:
# Score the validation features with the model loaded back from MLflow and
# display the resulting predictions.
model_preds = loaded_model.predict(X_test)
model_preds

array([16.8474768 , 15.80225383, 21.1961692 , ..., 10.66641814,
       10.27469823,  9.10592686])