In [3]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import warnings
import mlflow

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.metrics import mean_squared_error

# Suppress all Python warnings so the cell outputs stay readable.
warnings.filterwarnings(action='ignore')

# Point MLflow at the SQLite tracking store and select the experiment that
# every run in this notebook is recorded under.
TRACKING_URI = 'sqlite:///mlflow.db'
EXPERIMENT_NAME = 'experiment_1'
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='experiment_1', tags={}>

In [4]:
# 1. Helper functions for loading the data and building the features
def prepare_df(filename):
    """Load a trip parquet file and attach the ``duration`` target.

    Parameters
    ----------
    filename : str or path-like
        Parquet file containing ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded rows plus a ``duration`` column (trip length in minutes),
        restricted to trips lasting between 1 and 60 minutes inclusive.
    """
    data = pd.read_parquet(filename)

    # Target creation: use the vectorised .dt accessor instead of a
    # per-row Python lambda to convert the timedelta to minutes.
    trip_time = data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']
    data['duration'] = trip_time.dt.total_seconds() / 60

    # Filtering: `between` is inclusive on both ends by default, i.e.
    # 1 <= duration <= 60.
    data = data[data['duration'].between(1, 60)]
    return data

def transform_df(df, cat_columns, num_columns):
    """Build the modelling frame: numeric columns, one-hot categoricals, target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the requested columns and a ``duration`` column.
    cat_columns : list of str
        Columns to one-hot encode; dummy names use the ``_is_`` separator.
    num_columns : list of str
        Columns copied through unchanged.

    Returns
    -------
    pandas.DataFrame
        ``num_columns`` first, then the dummy columns, then ``duration``.
    """
    encoded = pd.get_dummies(df[cat_columns], columns=cat_columns, prefix_sep='_is_')
    features = pd.concat([df[num_columns], encoded], axis=1)
    return features.assign(duration=df['duration'])

In [5]:
df_train = prepare_df('../data/green_tripdata_2021-01.parquet')
df_val = prepare_df('../data/green_tripdata_2021-02.parquet')

cat_columns = ['PULocationID', 'DOLocationID']
num_columns = ['trip_distance']

df_train = transform_df(df_train, cat_columns=cat_columns, num_columns=num_columns)
df_val = transform_df(df_val, cat_columns=cat_columns, num_columns=num_columns)

# Each month is one-hot encoded on its own, so the two frames can end up with
# different dummy columns. Keep only the columns present in both.
# Use an ordered list rather than a set: pandas >= 2.0 rejects a set as a
# column indexer, and a list keeps the column order identical on every run.
val_columns = set(df_val.columns)
common_features = [col for col in df_train.columns if col in val_columns]
df_train = df_train[common_features]
df_val = df_val[common_features]

print('Train Shape: ', df_train.shape)
print('Validation Shape: ', df_val.shape)

# Train/Validation Data
X_train = df_train.drop(columns=['duration'])
y_train = df_train['duration']

# NOTE: despite the "test" suffix these hold the validation month; the names
# are kept because later cells reference them.
X_test = df_val.drop(columns=['duration'])
y_test = df_val['duration']

Train Shape:  (73908, 496)
Validation Shape:  (61921, 496)


In [None]:
# Wrap the feature frames and their targets in XGBoost DMatrix objects.
train_data = xgb.DMatrix(data=X_train, label=y_train)
valid_data = xgb.DMatrix(data=X_test, label=y_test)

In [19]:
def objective(params):
    """Hyperopt objective: train one XGBoost booster and score it on validation data.

    Every call opens its own MLflow run, tags it with ``model=xgboost`` and
    logs both ``params`` and the resulting ``rmse_val`` metric.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        # NOTE(review): early_stopping_rounds (50) is larger than
        # num_boost_round (25), so early stopping can never trigger here.
        # Raise num_boost_round or lower the patience if it is meant to.
        xgb_tree = xgb.train(params=params,
                             dtrain=train_data,
                             num_boost_round=25,
                             evals=[(valid_data, 'valid_data')],
                             early_stopping_rounds=50,
                             verbose_eval=False)
        y_pred = xgb_tree.predict(valid_data)
        # Take the square root explicitly: the `squared=False` keyword was
        # deprecated in scikit-learn 1.4 and removed in 1.6, while this form
        # works on every version.
        rmse_val = mean_squared_error(y_test, y_pred) ** 0.5
        mlflow.log_metric('rmse_val', rmse_val)
    return {'loss': rmse_val, 'status': STATUS_OK}

In [14]:
# Hyperopt search space for the XGBoost parameters.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are given in log space.
searc_hspace = {
    # Integer tree depth between 4 and 100.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),        # exp(-3) .. 1
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),               # exp(-5) .. exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),             # exp(-6) .. exp(-1)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),  # exp(-1) .. exp(3)
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the same
    # squared-error regression objective under its current name.
    'objective': 'reg:squarederror',
    'seed': 42
}

In [18]:
# Run the TPE search: each evaluation calls `objective` once with a sample
# drawn from the search space and minimises the returned loss.
best_result = fmin(
    fn=objective,
    space=searc_hspace,
    algo=tpe.suggest,
    max_evals=5,
    trials=Trials(),
)

100%|██████████| 5/5 [09:42<00:00, 116.42s/trial, best loss: 39.501860764902304]


**MLflow also supports automatic logging:** https://mlflow.org/docs/latest/tracking.html#automatic-logging

In [21]:
# Best hyperparameter combination (hard-coded)
optimal_params = {
    'learning_rate': 0.5652421328091554,
    'max_depth': 86,
    'min_child_weight': 19.718184879586808,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the same
    # squared-error regression objective under its current name.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.009935323656046538,
    'reg_lambda': 0.16021002681628055,
    'seed': 42
}

# Enable automatic logging: MLflow creates a run for the xgb.train call below
# and records its hyperparameters, metrics and model artifacts without
# explicit log_* calls.
mlflow.xgboost.autolog()

# Train the model with the optimal parameters.
# NOTE(review): early_stopping_rounds (50) is larger than num_boost_round
# (25), so early stopping can never trigger here.
xgb_tree = xgb.train(params=optimal_params,
                     dtrain=train_data,
                     num_boost_round=25,
                     evals=[(valid_data, 'valid_data')],
                     early_stopping_rounds=50,
                     verbose_eval=False)

2022/06/22 06:54:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9a82df152df847b88532925d0ccefed3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow




