In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import  train_test_split
import copy
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier 
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import mlflow
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
failures = pd.read_csv('../data/model_data/failures.csv',sep=',')
components = failures['Component'].unique()

In [5]:
encoder = LabelEncoder()
for component in components:
    globals()[f"{component}_df"] = pd.read_csv(f'../data/model_data/labelled_data_{component}.csv',sep=',')
    globals()[f"{component}_df"]['Turbine_ID'] = encoder.fit_transform(['Turbine_ID']*globals()[f"{component}_df"].shape[0])
    # set the date as the index
    globals()[f"{component}_df"] = globals()[f"{component}_df"].set_index('Timestamp')

In [6]:
class_target_name = "Failure (Target)"
for component in components:
    X = globals()[f"{component}_df"].drop(columns=['Component',class_target_name])
    y = globals()[f"{component}_df"][class_target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    globals()[f"{component}_X_train"] = X_train
    globals()[f"{component}_X_test"] = X_test
    globals()[f"{component}_y_train"] = y_train
    globals()[f"{component}_y_test"] = y_test

In [7]:
# NOTE: review the links mentioned above for guidance on connecting to a managed tracking server, such as the free Databricks Community Edition

mlflow.set_tracking_uri("http://localhost:8080")


In [8]:
experiment_name = "Predictive Maintenance"
experiment_id = mlflow.create_experiment(experiment_name)
print(f"Created experiment with ID: {experiment_id}")

Created experiment with ID: 374465840103549388


In [9]:
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'


def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """

    winner = study.user_attrs.get("winner", None)

    if study.best_value and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner:
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")


In [10]:
# create a objective funtion for the Optuna study for each component
def objective(trial, component):
    """
    Objective function for Optuna study to optimize hyperparameters for the XGBoost classifier
    """

    # define the search space for the hyperparameters
    search_space = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200]),
        'max_depth': trial.suggest_categorical('max_depth', [10, 20, 40, 80]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'subsample': 1.0,
        'colsample_bytree': 1.0,
        'lambda': 1.0,
        'alpha': 1.0,
        'max_features': trial.suggest_int('max_features', 5, 50)
    }

    # create the XGBoost classifier with the hyperparameters
    model = XGBClassifier(**search_space)
    model.fit(globals()[f"{component}_X_train"], globals()[f"{component}_y_train"])
    # apply feature selection
    selector = SelectFromModel(model, threshold=-np.inf, max_features=search_space['max_features'])
    X_train_selected = selector.transform(globals()[f"{component}_X_train"])
    X_test_selected = selector.transform(globals()[f"{component}_X_test"])
    model.fit(X_train_selected, y_train)
    
    pipeline = Pipeline([("model", model)])

    # evaluate the model with cross-validation
    score = cross_val_score(
        pipeline, globals()[f"{component}_X_train"], globals()[f"{component}_y_train"], cv=5
    ).mean()

    return score

In [11]:
# create a study for each component
mlflow.set_experiment(experiment_name)
for component in components:
    globals()[f"{component}_study"] = optuna.create_study(direction='maximize')
    globals()[f"{component}_study"].optimize(lambda trial: objective(trial, component), n_trials=5, callbacks=[champion_callback])

    # get the best hyperparameters
    best_params = globals()[f"{component}_study"].best_params
    best_score = globals()[f"{component}_study"].best_value
    print(f"Best score for {component} component: {best_score}")
    print(f"Best parameters for {component} component: {best_params}")



Initial trial 0 achieved value: 0.9530791788856305
Trial 2 achieved value: 0.9583577712609971 with  0.5508% improvement
Best score for GEARBOX component: 0.9583577712609971
Best parameters for GEARBOX component: {'n_estimators': 200, 'max_depth': 80, 'learning_rate': 0.05439579553639034, 'max_features': 8}
Initial trial 0 achieved value: 0.901466275659824
Trial 1 achieved value: 0.9061583577712609 with  0.5178% improvement
Trial 2 achieved value: 0.9073313782991201 with  0.1293% improvement
Best score for GENERATOR component: 0.9073313782991201
Best parameters for GENERATOR component: {'n_estimators': 50, 'max_depth': 10, 'learning_rate': 0.10845720027446801, 'max_features': 31}
Initial trial 0 achieved value: 0.8627565982404691
Best score for HYDRAULIC_GROUP component: 0.8627565982404691
Best parameters for HYDRAULIC_GROUP component: {'n_estimators': 50, 'max_depth': 40, 'learning_rate': 0.07098441477305667, 'max_features': 49}
Initial trial 0 achieved value: 0.9624633431085045
Trial 

In [12]:
# log the best model and parameters and the best score to ml flow for each component
for component in components:
    with mlflow.start_run(experiment_id=experiment_id, run_name=component):
        mlflow.log_params( globals()[f"{component}_study"].best_params)
        mlflow.log_metrics({'score': globals()[f"{component}_study"].best_value})
    
        mlflow.set_tags(
            tags={
            "project": "Thesis",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
            "component": component
         }
        )
        globals()[f"{component}_best_model"]= XGBClassifier(**globals()[f"{component}_study"].best_params)
        globals()[f"{component}_best_model"].fit(globals()[f"{component}_X_train"], globals()[f"{component}_y_train"])
   

        artifact_path = "model"

        mlflow.xgboost.log_model(
            xgb_model= globals()[f"{component}_best_model"],
            artifact_path=artifact_path,
        #input_example= GEARBOX_X_train.iloc[[0]],
            model_format='pickle',
            metadata={"model_data_version": 1},
        )

        model_uri = mlflow.get_artifact_uri(artifact_path)
      

In [49]:
import pickle
import os

In [50]:
# save the models to disk
if not os.path.exists('../models_MLFLOW'):
    os.makedirs('../models_MLFLOW')

In [None]:
model_name="xgb"

In [51]:
import pickle
for component in components:
    with open('./models_MLFLOW/selected-{}.pickle'.format(component), 'wb') as handle:
        pickle.dump(globals()[f"{component}_best_model"], handle, protocol=pickle.HIGHEST_PROTOCOL)

  