In [3]:
import mlflow
import os
from utils import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from mlflow.models import infer_signature
from xgboost import XGBClassifier

In [19]:
# Model hyperparameters; `params` is a second name bound to the same loaded object.
param_box = params = read_yaml_file(Path("artifacts/ConfigFiles/params.yaml"))

# Pipeline configuration (file locations and similar settings).
config_box = read_yaml_file(Path("artifacts/ConfigFiles/config.yaml"))

yaml file: artifacts\ConfigFiles\params.yaml loaded successfully
yaml file: artifacts\ConfigFiles\config.yaml loaded successfully


In [10]:
# Read in transformed data: the CSV locations come from the
# data_transformation section of the loaded config.
config = config_box.data_transformation

# Load as DataFrame
income_train = pd.read_csv(config.transformed_train_data)
income_test = pd.read_csv(config.transformed_test_data)

# Select the features by name: every training column except the target.
# A positional slice (columns[:-1]) silently leaks 'label' into X, and drops a
# real feature, whenever 'label' is not the last column of the CSV.
train_columns = income_train.columns
feature_columns = train_columns.drop('label')

# Get the features and y label for train set
X_train = income_train[feature_columns]
y_train = income_train['label']

# Get the features and y label for test set, using the training feature
# columns so both matrices share the same columns in the same order
X_test = income_test[feature_columns]
y_test = income_test['label']

In [32]:
# Train the model with the hyperparameters read from params.LogisticRegression
logistic_regression = LogisticRegression(penalty=params.LogisticRegression.penalty,
                                         max_iter=params.LogisticRegression.max_iter,
                                         l1_ratio=params.LogisticRegression.l1_ratio,
                                         solver=params.LogisticRegression.solver)
logistic_regression.fit(X_train, y_train)

# Hard class predictions on the test set (inputs to accuracy and F1)
y_pred = logistic_regression.predict(X_test)

# Predicted probability of the positive class, i.e. classes_[1].
# Assumes a binary target, which f1_score's default average='binary' also requires.
y_score = logistic_regression.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f_score = f1_score(y_test, y_pred)
# ROC AUC measures how well the model ranks positives above negatives, so it
# needs continuous scores; feeding it 0/1 predictions collapses the curve to a
# single threshold and misreports the model's ranking quality.
roc_auc = roc_auc_score(y_test, y_score)
y_pred

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [20]:
# Train the gradient-boosted classifier with the hyperparameters read from params.XGBoost
xgb = XGBClassifier(n_estimators=params.XGBoost.n_estimators,
                    learning_rate=params.XGBoost.learning_rate,
                    n_jobs=params.XGBoost.n_jobs)

xgb.fit(X_train, y_train)

# Hard class predictions on the test set (inputs to accuracy and F1)
y_pred = xgb.predict(X_test)

# Predicted probability of the positive class (second predict_proba column).
# Assumes a binary target, which f1_score's default average='binary' also requires.
y_score = xgb.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
f_score = f1_score(y_test, y_pred)
# ROC AUC measures how well the model ranks positives above negatives, so it
# needs continuous scores; feeding it 0/1 predictions collapses the curve to a
# single threshold and misreports the model's ranking quality.
roc_auc = roc_auc_score(y_test, y_score)
y_pred

array([0, 0, 1, ..., 1, 0, 1])

In [21]:
# Point the MLflow client at the local tracking server
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment (or reuse it if it already exists)
mlflow.set_experiment("MLflow XGBoost")

# Recompute the test metrics from `xgb` inside this cell. The logistic-regression
# cell writes to the same `accuracy` / `f_score` / `roc_auc` globals, so reading
# those here could log another model's scores depending on which cell ran last.
xgb_test_pred = xgb.predict(X_test)
# Positive-class probability: ROC AUC needs continuous scores, not 0/1 labels.
xgb_test_score = xgb.predict_proba(X_test)[:, 1]
xgb_metrics = {
    "accuracy": accuracy_score(y_test, xgb_test_pred),
    "f1_score": f1_score(y_test, xgb_test_pred),
    "roc_auc_score": roc_auc_score(y_test, xgb_test_score),
}

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params.XGBoost)

    # Log the evaluation metrics under the same keys as before
    for metric_name, metric_value in xgb_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Set a tag that we can use to remind ourselves what this run was for.
    # The run logs the XGBoost model, so the tag names it as such.
    mlflow.set_tag("Training Info", "XGBoost model")

    # Infer the model signature from the training features and the model's predictions
    signature = infer_signature(X_train, xgb.predict(X_train))

    # Log the model and register it as a new version.
    # NOTE(review): the artifact path still says "logistic_model" although the
    # model is XGBoost; it is left unchanged in case downstream code loads
    # runs:/<run_id>/logistic_model. Rename it once that has been confirmed.
    model_info = mlflow.sklearn.log_model(
        sk_model=xgb,
        artifact_path="logistic_model",
        signature=signature,
        # A handful of rows is enough for an input example; passing X_train
        # would store the whole training set as an artifact with every run.
        input_example=X_train.head(5),
        registered_model_name="tracking-gradient_boosting",
    )

Registered model 'tracking-gradient_boosting' already exists. Creating a new version of this model...
2024/03/05 12:42:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-gradient_boosting, version 4
Created version '4' of model 'tracking-gradient_boosting'.
