In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler



In [2]:
def create_feature(df: pd.DataFrame) -> None:
    df["air_process_diff"] = abs(df["Air temperature [K]"] - df["Process temperature [K]"])
    df["speed_power"] = df['Rotational speed [rpm]'] * (2 * np.pi / 60) / (df['Rotational speed [rpm]'] * (2 * np.pi / 60) * df['Torque [Nm]'])
    df['torque_power'] = df['Torque [Nm]'] / (df['Rotational speed [rpm]'] * (2 * np.pi / 60) * df['Torque [Nm]'])
    df["tool_process"] = df["Tool wear [min]"] * df["Process temperature [K]"]
    df["temp_ratio"] = df["Process temperature [K]"] / df["Air temperature [K]"]
    df["product_id_num"] = pd.to_numeric(df["Product ID"].str.slice(start=1))
    
    df.drop(columns="Product ID", inplace=True)
    try:
        df.drop(columns="id", inplace=True)
    except:
        df.drop(columns="UDI", inplace=True)
    
def identify_column_types(df: pd.DataFrame) -> tuple[list, list, list]:
    num_cols = df.select_dtypes("float").columns.tolist()
    int_cols = df.select_dtypes("integer").columns.tolist()
    cat_cols = df.select_dtypes("object").columns.tolist()
    
    return num_cols, int_cols, cat_cols

def load_data(path):
    
    df = pd.read_csv(path)
    create_feature(df)
    return df
    

In [3]:
train_path = "../data/raw/train.csv"
test_path = "../data/raw/test.csv"
origin_path = "../data/raw/machine failure.csv"

In [4]:
target_col = 'Machine failure'

num_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]'
]

binary_cols = [
    'TWF',
    'HDF',
    'PWF',
    'OSF',
    'RNF'
]

# cat_cols = 'Type'
cat_cols = ['Type']

In [5]:
df_train = load_data(train_path)
df_test = load_data(test_path)
df_origin = load_data(origin_path)

In [6]:
df = pd.concat([df_train, df_origin], axis=0)



In [7]:
le = LabelEncoder()
for col in cat_cols:
    df["encoded_" + col] = le.fit_transform(df[col])
    
df.drop(cat_cols, axis=1, inplace=True)

In [19]:
import xgboost as xgb
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
import re
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split

X, y = df.drop("Machine failure", axis=1), df["Machine failure"]
X.columns = [re.sub(r"[^a-zA-Z0-9_]+", "_", col) for col in X.columns]
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)


In [67]:
dtrain = xgb.DMatrix(data=X, label=y)
param = {"objective": "binary:logistic"}
num_round = 10
res = xgb.cv(
    param,
    dtrain,
    num_round,
    nfold=5,
    metrics={"error"},
    seed=42,
    callbacks=[xgb.callback.EvaluationMonitor(show_stdv=True)],
    early_stopping_rounds=30)

[0]	train-error:0.00362+0.00009	test-error:0.00367+0.00033
[1]	train-error:0.00362+0.00009	test-error:0.00367+0.00033
[2]	train-error:0.00362+0.00009	test-error:0.00367+0.00033
[3]	train-error:0.00362+0.00009	test-error:0.00367+0.00033
[4]	train-error:0.00361+0.00009	test-error:0.00365+0.00033
[5]	train-error:0.00361+0.00009	test-error:0.00365+0.00033
[6]	train-error:0.00361+0.00009	test-error:0.00364+0.00034
[7]	train-error:0.00361+0.00009	test-error:0.00364+0.00033
[8]	train-error:0.00361+0.00008	test-error:0.00365+0.00034
[9]	train-error:0.00360+0.00008	test-error:0.00365+0.00035


In [68]:
res

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.003625,9.1e-05,0.003674,0.000327
1,0.003625,9.1e-05,0.003674,0.000327
2,0.003625,9.1e-05,0.003674,0.000327
3,0.003621,8.8e-05,0.003667,0.000332
4,0.003614,8.7e-05,0.003654,0.000332
5,0.003613,8.6e-05,0.003654,0.000332
6,0.003614,8.7e-05,0.00364,0.000343
7,0.003609,9.1e-05,0.00364,0.000331
8,0.003606,8.3e-05,0.003647,0.000344
9,0.003597,8e-05,0.003647,0.00035


In [71]:
xgb_clf = xgb.XGBClassifier(random_state=42)

scores = cross_validate(
    xgb_clf, X, y, cv=skf, scoring = "roc_auc", n_jobs=-1
)

In [73]:
base_roc = scores["test_score"].mean()
print(f"Base ROCAUC: {base_roc:.5f}")

Base ROCAUC: 0.96686


In [20]:
import mlflow

In [21]:
def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)

In [22]:
experiment_id = get_or_create_experiment("Test Experiment")

In [23]:
experiment_id

'747810045357265472'

In [24]:
# Set the current active MLflow experiment
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location='file:///home/moon/project/mlops-pm/notebooks/mlruns/747810045357265472', creation_time=1717573153708, experiment_id='747810045357265472', last_update_time=1717573153708, lifecycle_stage='active', name='Test Experiment', tags={}>

In [25]:
def objective(trial, X, y, cv, scoring):
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        
    }
    
    # Perform CV
    xgb_clf = xgb.XGBClassifier(**params, random_state=42, eval_metric="auc", objective="binary:logistic")
    scores = cross_validate(xgb_clf, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    # Compute ROC
    roc = scores["test_score"].mean()

    return roc

In [33]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_correlation_with_demand(df, save_path=None):
    """
    Plots the correlation of each variable in the dataframe with the 'demand' column.

    Args:
    - df (pd.DataFrame): DataFrame containing the data, including a 'demand' column.
    - save_path (str, optional): Path to save the generated plot. If not specified, plot won't be saved.

    Returns:
    - None (Displays the plot on a Jupyter window)
    """

    # Compute correlations between all variables and 'demand'
    correlations = df.corr()["Machine failure"].drop("Machine failure").sort_values()

    # Generate a color palette from red to green
    colors = sns.diverging_palette(10, 130, as_cmap=True)
    color_mapped = correlations.map(colors)

    # Set Seaborn style
    sns.set_style(
        "whitegrid", {"axes.facecolor": "#c2c4c2", "grid.linewidth": 1.5}
    )  # Light grey background and thicker grid lines

    # Create bar plot
    fig = plt.figure(figsize=(12, 8))
    plt.barh(correlations.index, correlations.values, color=color_mapped)

    # Set labels and title with increased font size
    plt.title("Correlation with Machine failure", fontsize=18)
    plt.xlabel("Correlation Coefficient", fontsize=16)
    plt.ylabel("Variable", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis="x")

    plt.tight_layout()

    # Save the plot if save_path is specified
    if save_path:
        plt.savefig(save_path, format="png", dpi=600)

    # prevent matplotlib from displaying the chart every time we call this function
    plt.close(fig)

    return fig


# Test the function
correlation_plot = plot_correlation_with_demand(df, save_path="correlation_plot.png")


In [34]:

def plot_feature_importance(model, booster):
    """
    Plots feature importance for an XGBoost model.

    Args:
    - model: A trained XGBoost model

    Returns:
    - fig: The matplotlib figure object
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    importance_type = "weight" if booster == "gblinear" else "gain"
    xgb.plot_importance(
        model,
        importance_type=importance_type,
        ax=ax,
        title=f"Feature Importance based on {importance_type}",
    )
    plt.tight_layout()
    plt.close(fig)

    return fig

In [35]:
def plot_residuals(model, dvalid, valid_y, save_path=None):
    """
    Plots the residuals of the model predictions against the true values.

    Args:
    - model: The trained XGBoost model.
    - dvalid (xgb.DMatrix): The validation data in XGBoost DMatrix format.
    - valid_y (pd.Series): The true values for the validation set.
    - save_path (str, optional): Path to save the generated plot. If not specified, plot won't be saved.

    Returns:
    - None (Displays the residuals plot on a Jupyter window)
    """

    # Predict using the model
    preds = model.predict(dvalid)

    # Calculate residuals
    residuals = valid_y - preds

    # Set Seaborn style
    sns.set_style("whitegrid", {"axes.facecolor": "#c2c4c2", "grid.linewidth": 1.5})

    # Create scatter plot
    fig = plt.figure(figsize=(12, 8))
    plt.scatter(valid_y, residuals, color="blue", alpha=0.5)
    plt.axhline(y=0, color="r", linestyle="-")

    # Set labels, title and other plot properties
    plt.title("Residuals vs True Values", fontsize=18)
    plt.xlabel("True Values", fontsize=16)
    plt.ylabel("Residuals", fontsize=16)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis="y")

    plt.tight_layout()

    # Save the plot if save_path is specified
    if save_path:
        plt.savefig(save_path, format="png", dpi=600)

    # Show the plot
    plt.close(fig)

    return fig


In [26]:
def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """

    winner = study.user_attrs.get("winner", None)

    if study.best_value and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner:
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")

In [37]:
%%time
with mlflow.start_run(experiment_id=experiment_id, run_name="first_attempt", nested=True):
    # Create study that minimizes
    study = optuna.create_study(direction="maximize")
    
    # Wrap the objective inside a lambda with the relevant arguments
    # Pass additional arguments inside another function
    func = lambda trial: objective(trial, X, y, cv=skf, scoring="roc_auc")

    # Start optimizing with 100 trials
    study.optimize(func, n_trials=5, callbacks=[champion_callback])
    
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_roc", study.best_value)
    
    # Log tags
    mlflow.set_tags(
        tags={
            "project": "PM Project",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )
    
    print("log_tag-----")
    # Log a fit model instance
    model = xgb.XGBClassifier(**study.best_params, random_state=42, eval_metric="auc", objective="binary:logistic")
    model.fit(X, y)
    
    # Log the correlation plot
    mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")

    # Log the feature importances plot
    importances = plot_feature_importance(model, booster=study.best_params.get("booster"))
    mlflow.log_figure(figure=importances, artifact_file="feature_importances.png")

    # Log the residuals plot
    residuals = plot_residuals(model, X, y)
    mlflow.log_figure(figure=residuals, artifact_file="residuals.png")
    
    artifact_path = "model"
    
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path=artifact_path,
        input_example=X.iloc[[0]],
        model_format="ubj",
        metadata={"model_data_version": 1},
    )

    # Get the logged model uri so that we can load it from the artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)
    
    

[I 2024-06-10 19:13:19,948] A new study created in memory with name: no-name-b67e0067-7184-435e-a517-6d44601004aa
[I 2024-06-10 19:13:27,899] Trial 0 finished with value: 0.9653722857917234 and parameters: {'n_estimators': 346, 'learning_rate': 0.0017369959842948907, 'max_depth': 9, 'subsample': 0.8433621584275277, 'colsample_bytree': 0.7245640879872735}. Best is trial 0 with value: 0.9653722857917234.


Initial trial 0 achieved value: 0.9653722857917234


[I 2024-06-10 19:13:36,151] Trial 1 finished with value: 0.9657935139488096 and parameters: {'n_estimators': 465, 'learning_rate': 0.003928436411782914, 'max_depth': 7, 'subsample': 0.7918667481293671, 'colsample_bytree': 0.6783794308138725}. Best is trial 1 with value: 0.9657935139488096.


Trial 1 achieved value: 0.9657935139488096 with  0.0436% improvement


[I 2024-06-10 19:13:58,293] Trial 2 finished with value: 0.9633657113089076 and parameters: {'n_estimators': 1058, 'learning_rate': 0.15143468690321665, 'max_depth': 7, 'subsample': 0.7264839851454977, 'colsample_bytree': 0.5693424637866555}. Best is trial 1 with value: 0.9657935139488096.
[I 2024-06-10 19:14:14,739] Trial 3 finished with value: 0.9645521222205723 and parameters: {'n_estimators': 770, 'learning_rate': 0.12851533764798612, 'max_depth': 7, 'subsample': 0.7383468663229358, 'colsample_bytree': 0.6454056929679579}. Best is trial 1 with value: 0.9657935139488096.
[I 2024-06-10 19:14:21,170] Trial 4 finished with value: 0.9667030519123025 and parameters: {'n_estimators': 452, 'learning_rate': 0.009261204614790566, 'max_depth': 6, 'subsample': 0.7169852437128189, 'colsample_bytree': 0.6263143410744128}. Best is trial 4 with value: 0.9667030519123025.


Trial 4 achieved value: 0.9667030519123025 with  0.0941% improvement
log_tag-----




CPU times: user 1min 41s, sys: 2.49 s, total: 1min 43s
Wall time: 1min 11s




[I 2024-06-05 16:29:22,810] A new study created in memory with name: no-name-95f7cafb-3d6b-4588-ba31-47e182bae1fb
[I 2024-06-05 16:29:32,903] Trial 0 finished with value: 0.9652396862289561 and parameters: {'n_estimators': 617, 'learning_rate': 0.001399771218293194, 'max_depth': 5, 'subsample': 0.5392642859683296, 'colsample_bytree': 0.5448109404935677}. Best is trial 0 with value: 0.9652396862289561.
[I 2024-06-05 16:29:37,592] Trial 1 finished with value: 0.9586865578546299 and parameters: {'n_estimators': 230, 'learning_rate': 0.0012947107883542277, 'max_depth': 6, 'subsample': 0.7140267115610769, 'colsample_bytree': 0.8811867895750225}. Best is trial 0 with value: 0.9652396862289561.
[I 2024-06-05 16:30:00,890] Trial 2 finished with value: 0.9680068354092315 and parameters: {'n_estimators': 651, 'learning_rate': 0.009250240924955157, 'max_depth': 13, 'subsample': 0.9692860161962089, 'colsample_bytree': 0.9972574494334845}. Best is trial 2 with value: 0.9680068354092315.
[I 2024-06-

CPU times: user 2.18 s, sys: 1.31 s, total: 3.49 s
Wall time: 2min 21s


In [82]:
print(f"Base ROCAUC: {base_roc:.5f}")
print(f"Optimized ROCAUC: {study.best_value:.5f}")

Base ROCAUC: 0.96686
Optimized ROCAUC: 0.97082


In [83]:
print("Best params:")
for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

Best params:
	n_estimators: 959
	learning_rate: 0.01253278960281304
	max_depth: 7
	subsample: 0.7544060353752393
	colsample_bytree: 0.9822523112702799


In [86]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.