In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import  train_test_split
import copy
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier 
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import mlflow
import optuna
import warnings
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
import os
sys.path.append(os.path.abspath('/Users/supriyasindigerekumaraswmamy/Desktop/Thesis/wind_Turbine'))

from utils.helper import *
failures = load_failures_data('../data/model_data/failures.csv')
components = failures['component'].unique()
component_data = load_all_component_data(components)
data_splits = prepare_all_data_for_training(component_data, "target_class")

1. data_splits[component][0] : Component_X_train
2. data_splits[component][1] : Component_X_test
3. data_splits[component][2] : Component_y_train
4. data_splits[component][3] : Component_y_test

In [3]:
# NOTE: review the links mentioned above for guidance on connecting to a managed tracking server, such as the free Databricks Community Edition

mlflow.set_tracking_uri("http://localhost:80")


In [5]:
experiment_name = "XAI"
experiment_id = mlflow.create_experiment(experiment_name)
print(f"Created experiment with ID: {experiment_id}")

Created experiment with ID: 5


In [6]:
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'


def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """

    winner = study.user_attrs.get("winner", None)

    if study.best_value and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner:
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")


In [7]:
# create a objective funtion for the Optuna study for each component
def objective(trial, component):
    """
    Objective function for Optuna study to optimize hyperparameters for the XGBoost classifier
    """

    # define the search space for the hyperparameters
    search_space = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200]),
        'max_depth': trial.suggest_categorical('max_depth', [10, 20, 40, 80]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'subsample': 1.0,
        'colsample_bytree': 1.0,
        'lambda': 1.0,
        'alpha': 1.0,
        'max_features': trial.suggest_int('max_features', 5, 20)
    }

    # create the XGBoost classifier with the hyperparameters
    model = XGBClassifier(**search_space)
    model.fit(data_splits[component][0], data_splits[component][2])
    # apply feature selection
    selector = SelectFromModel(model, threshold=-np.inf, max_features=search_space['max_features'])
    X_train_selected = selector.transform(data_splits[component][0])
    X_test_selected = selector.transform(data_splits[component][1])
    model.fit(X_train_selected, data_splits[component][2])
    
    pipeline = Pipeline([("model", model)])

    # evaluate the model with cross-validation
    score = cross_val_score(
        pipeline, data_splits[component][0], data_splits[component][2], cv=5
    ).mean()

    return score

In [8]:
# create a study for each component
mlflow.set_experiment(experiment_name)
trials= 5
for component in components:
    globals()[f"{component}_study"] = optuna.create_study(direction='maximize')
    globals()[f"{component}_study"].optimize(lambda trial: objective(trial, component), n_trials=trials, callbacks=[champion_callback])

    # get the best hyperparameters
    best_params = globals()[f"{component}_study"].best_params
    best_score = globals()[f"{component}_study"].best_value
    print(f"Best score for {component} component: {best_score}")
    print(f"Best parameters for {component} component: {best_params}")



Initial trial 0 achieved value: 0.9540603852160332
Trial 1 achieved value: 0.95760194343224 with  0.3698% improvement
Best score for GEARBOX component: 0.95760194343224
Best parameters for GEARBOX component: {'n_estimators': 100, 'max_depth': 80, 'learning_rate': 0.07735647047260169, 'max_features': 13}
Initial trial 0 achieved value: 0.9104771820232518
Trial 1 achieved value: 0.9116571230262016 with  0.1294% improvement
Trial 4 achieved value: 0.9128405344438659 with  0.1296% improvement
Best score for GENERATOR component: 0.9128405344438659
Best parameters for GENERATOR component: {'n_estimators': 100, 'max_depth': 40, 'learning_rate': 0.06382497465228437, 'max_features': 20}
Initial trial 0 achieved value: 0.8616050668054832
Trial 2 achieved value: 0.8627798021863613 with  0.1362% improvement
Trial 4 achieved value: 0.865139684192261 with  0.2728% improvement
Best score for HYDRAULIC_GROUP component: 0.865139684192261
Best parameters for HYDRAULIC_GROUP component: {'n_estimators': 5

In [9]:
# log the best model and parameters and the best score to ml flow for each component
for component in components:
    with mlflow.start_run(experiment_id=experiment_id, run_name=component):
        mlflow.log_params( globals()[f"{component}_study"].best_params)
        mlflow.log_metrics({'score': globals()[f"{component}_study"].best_value})
    
        mlflow.set_tags(
            tags={
            "project": "Thesis",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
            "component": component
         }
        )
        globals()[f"{component}_best_model"]= XGBClassifier(**globals()[f"{component}_study"].best_params)
        globals()[f"{component}_best_model"].fit(data_splits[component][0], data_splits[component][2])
   

        artifact_path = "model"

        mlflow.xgboost.log_model(
            xgb_model= globals()[f"{component}_best_model"],
            artifact_path=artifact_path,
        #input_example= GEARBOX_X_train.iloc[[0]],
            model_format='pickle',
            metadata={"model_data_version": 1},
        )

        model_uri = mlflow.get_artifact_uri(artifact_path)
      

In [19]:
import pickle
import os

In [20]:
# save the models to disk
if not os.path.exists('../models'):
    os.makedirs('../models')

In [21]:
model_name="xgb"

In [22]:
import pickle
for component in components:
    with open('./models/selected-{}.pickle'.format(component), 'wb') as handle:
        pickle.dump(globals()[f"{component}_best_model"], handle, protocol=pickle.HIGHEST_PROTOCOL)

  