In [1]:
import os
import sys

from pathlib import Path

PACKAGE_ROOT = Path(os.getcwd()).parent
sys.path.append(str(PACKAGE_ROOT))

print(PACKAGE_ROOT)

/home/moon/project/mlops-pm


In [2]:
from src.config import config
import src.pipeline as pipe

In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
import optuna
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
train = pd.read_csv(os.path.join(config.DATAPATH, config.TRAIN_FILE))
test = pd.read_csv(os.path.join(config.DATAPATH, config.TEST_FILE))
origin = pd.read_csv(os.path.join(config.DATAPATH, config.ORIGIN_FILE))

In [5]:
X_train, y_train = train.drop(config.TARGET, axis=1), train[config.TARGET]
X_origin, y_origin = origin.drop(config.TARGET, axis=1), origin[config.TARGET]
X_test = test.copy()

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

input_processor = pipe.input_pipeline
# xgb_clf = xgb.XGBClassifier(random_state=42)
lin_clf = LogisticRegression(max_iter=2000)

lin_pipe = make_pipeline(input_processor, lin_clf)

In [7]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# scores = cross_validate(
#     lin_pipe, X_train, y_train, cv=skf, scoring = "roc_auc", 
# )

# base_roc = scores["test_score"].mean()
# print(f"Base ROCAUC: {base_roc:.5f}")

In [8]:
import mlflow
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)
    
def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """

    winner = study.user_attrs.get("winner", None)

    if study.best_value and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner:
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
            
def plot_residuals(model, dvalid, valid_y, save_path=None):
    """
    Plots the residuals of the model predictions against the true values.

    Args:
    - model: The trained XGBoost model.
    - dvalid (xgb.DMatrix): The validation data in XGBoost DMatrix format.
    - valid_y (pd.Series): The true values for the validation set.
    - save_path (str, optional): Path to save the generated plot. If not specified, plot won't be saved.

    Returns:
    - None (Displays the residuals plot on a Jupyter window)
    """

    # Predict using the model
    preds = model.predict(dvalid)

    # Calculate residuals
    residuals = valid_y - preds

    # Set Seaborn style
    sns.set_style("whitegrid", {"axes.facecolor": "#c2c4c2", "grid.linewidth": 1.5})

    # Create scatter plot
    fig = plt.figure(figsize=(12, 8))
    plt.scatter(valid_y, residuals, color="blue", alpha=0.5)
    plt.axhline(y=0, color="r", linestyle="-")

    # Set labels, title and other plot properties
    plt.title("Residuals vs True Values", fontsize=18)
    plt.xlabel("True Values", fontsize=16)
    plt.ylabel("Residuals", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis="y")

    plt.tight_layout()

    # Save the plot if save_path is specified
    if save_path:
        plt.savefig(save_path, format="png", dpi=600)

    # Show the plot
    plt.close(fig)

    return fig

In [10]:
experiment_id = get_or_create_experiment("Experiment - 1")

In [11]:
# Set the current active MLflow experiment
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location='file:///home/moon/project/mlops-pm/notebooks/mlruns/524055895374574115', creation_time=1719833102358, experiment_id='524055895374574115', last_update_time=1719833102358, lifecycle_stage='active', name='Experiment - 1', tags={}>

In [13]:
# def objective(trial, X, y, cv, scoring):
    
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1100),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        
#     }
    
#     # Perform CV
#     xgb_clf = xgb.XGBClassifier(**params, random_state=42, eval_metric="auc", objective="binary:logistic")
#     scores = cross_validate(xgb_clf, X, y, cv=cv, scoring=scoring, n_jobs=-1)
#     # Compute ROC
#     roc = scores["test_score"].mean()

#     return roc

In [19]:
def objective(trial, X, y, cv, scoring):
    
    params = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
        'C' : trial.suggest_loguniform("C", 1e-2, 1),
        "n_jobs" : -1
    }
    
    # model = LogisticRegression(**params, random_state=42)
    model = make_pipeline(input_processor, LogisticRegression(**params, random_state=42))
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    roc = scores["test_score"].mean()
    return roc

In [22]:
%%time
with mlflow.start_run(experiment_id=experiment_id, run_name="first_attempt", nested=True):
    # Create study that minimizes
    study = optuna.create_study(direction="maximize")
    
    # Wrap the objective inside a lambda with the relevant arguments
    # Pass additional arguments inside another function
    func = lambda trial: objective(trial, X_train, y_train, cv=skf, scoring="roc_auc")

    # Start optimizing with 100 trials
    study.optimize(func, n_trials=5, callbacks=[champion_callback])
    
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_roc", study.best_value)
    
    # Log tags
    mlflow.set_tags(
        tags={
            "project": "PM Project",
            "optimizer_engine": "optuna",
            "model_family": "Logistic",
            "feature_set_version": 1,
        }
    )
    
    print("log_tag-----")
    # Log a fit model instance
    model = make_pipeline(input_processor, LogisticRegression(**study.best_params, random_state=42))
    # model = xgb.XGBClassifier(**study.best_params, random_state=42, eval_metric="auc", objective="binary:logistic")
    model.fit(X_train, y_train)
    
    # Log the correlation plot
    # mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")

    # Log the feature importances plot
    # importances = plot_feature_importance(model, booster=study.best_params.get("booster"))
    # mlflow.log_figure(figure=importances, artifact_file="feature_importances.png")

    # Log the residuals plot
    residuals = plot_residuals(model,X_train, y_train)
    mlflow.log_figure(figure=residuals, artifact_file="residuals.png")
    
    artifact_path = "model"
    
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=artifact_path,
        input_example=X_train.iloc[[0]],
        metadata={"model_data_version": 2},
    )
    
    # mlflow.model.log_model(
    #     xgb_model=model,
    #     artifact_path=artifact_path,
    #     input_example=X.iloc[[0]],
    #     model_format="ubj",
    #     metadata={"model_data_version": 1},
    # )

    # Get the logged model uri so that we can load it from the artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)
    
    # print(f"Base ROCAUC: {base_roc:.5f}")
    print(f"Optimized ROCAUC: {study.best_value:.5f}")
    
    print("Best params:")
    for key, value in study.best_params.items():
        print(f"\t{key}: {value}")
    
    

[I 2024-07-01 20:33:46,198] A new study created in memory with name: no-name-e8504a7c-67ec-4f06-8b4e-ddd50485b40c


  'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
  'C' : trial.suggest_loguniform("C", 1e-2, 1),


UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


[I 2024-07-01 20:33:48,827] Trial 0 finished with value: 0.9043946701958777 and parameters: {'tol': 0.00027589120920798975, 'C': 0.1683822095778228}. Best is trial 0 with value: 0.9043946701958777.
  'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
  'C' : trial.suggest_loguniform("C", 1e-2, 1),


Initial trial 0 achieved value: 0.9043946701958777
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


[I 2024-07-01 20:33:51,483] Trial 1 finished with value: 0.9044220692037557 and parameters: {'tol': 0.000238146311265913, 'C': 0.01738600180308847}. Best is trial 1 with value: 0.9044220692037557.
  'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
  'C' : trial.suggest_loguniform("C", 1e-2, 1),


Trial 1 achieved value: 0.9044220692037557 with  0.0030% improvement
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


[I 2024-07-01 20:33:54,123] Trial 2 finished with value: 0.9020323867205404 and parameters: {'tol': 0.00023113256527589114, 'C': 0.01688353754520258}. Best is trial 1 with value: 0.9044220692037557.
  'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
  'C' : trial.suggest_loguniform("C", 1e-2, 1),


UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[I 2024-07-01 20:33:56,873] Trial 3 finished with value: 0.9058246043093202 and parameters: {'tol': 0.00016952924045551623, 'C': 0.04817790704888467}. Best is trial 3 with value: 0.9058246043093202.
  'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
  'C' : trial.suggest_loguniform("C", 1e-2, 1),


Trial 3 achieved value: 0.9058246043093202 with  0.1548% improvement
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
UDI 는 포함되지 않은 인덱스입니다.
log_tag-----
UDI 는 포함되지 않은 인덱스입니다.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


UDI 는 포함되지 않은 인덱스입니다.




UDI 는 포함되지 않은 인덱스입니다.
Optimized ROCAUC: 0.90582
Best params:
	tol: 0.00016952924045551623
	C: 0.04817790704888467
CPU times: user 31.3 s, sys: 23.8 s, total: 55.1 s
Wall time: 20.5 s


