In [54]:
import copy
import pickle

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import learning_curve, cross_val_score
from sklearn.model_selection import  train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from xgboost import XGBClassifier
from xgboost import XGBClassifier as xgb


In [55]:
# Read the failure log and collect the distinct component names it mentions;
# the cells below load one labelled dataset per component.
failures = pd.read_csv(
    './data/model_data/failures.csv',
    sep=',',
)
components = pd.unique(failures['Component'])

In [56]:
encoder = LabelEncoder()
for component in components:
    # Load the labelled dataset for this component.
    component_frame = pd.read_csv(f'./data/model_data/labelled_data_{component}.csv', sep=',')
    # Encode the turbine identifiers found in the column as integers. The
    # previous code encoded the literal string 'Turbine_ID' repeated once per
    # row, which collapsed every row to 0 and discarded the turbine identity.
    # The encoder is refit for each file, so the integer codes only line up
    # across components when the files contain the same set of turbine IDs.
    component_frame['Turbine_ID'] = encoder.fit_transform(component_frame['Turbine_ID'])
    # set the date as the index
    # Later cells look the frame up under the module-level name
    # "<component>_df" (e.g. GEARBOX_df), so it is still published that way.
    globals()[f"{component}_df"] = component_frame.set_index('Timestamp')

In [57]:
# Gearbox dataset without the 'Component' column.
df = GEARBOX_df.drop('Component', axis='columns')

In [58]:
# Name of the label column in every labelled dataset.
class_target_name = "Failure (Target)"

# For each component: separate features from the label, hold out 30% of the
# rows with a fixed seed, and publish the four pieces as module-level names
# "<component>_X_train", "<component>_X_test", "<component>_y_train" and
# "<component>_y_test".
for component in components:
    labelled = globals()[f"{component}_df"]
    y = labelled[class_target_name]
    X = labelled.drop(columns=['Component', class_target_name])
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=42,
    )
    globals().update(
        {
            f"{component}_X_train": X_train,
            f"{component}_X_test": X_test,
            f"{component}_y_train": y_train,
            f"{component}_y_test": y_test,
        }
    )

In [63]:
def get_or_create_experiment(experiment_name):
    """
    Return the ID of the MLflow experiment called `experiment_name`.

    The experiment is looked up by name first. When the lookup yields an
    experiment, its ID is returned; otherwise a new experiment with that name
    is created and the ID of the new experiment is returned.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if not existing:
        # Nothing registered under this name yet: create it.
        return mlflow.create_experiment(experiment_name)
    return existing.experiment_id

In [64]:
# Select the tracking server BEFORE resolving the experiment. An experiment ID
# is only meaningful in the store it was looked up or created in; resolving it
# against the default store and then logging runs to http://localhost:8080
# would point the runs at an ID the server does not know about.
mlflow.set_tracking_uri("http://localhost:8080")
experiment_id = get_or_create_experiment("Wind Turbine")

In [65]:
# Send all MLflow tracking calls to the tracking server at http://localhost:8080.
# This has to take effect before any experiment or run is created; otherwise they
# are recorded in whatever store was active at that moment instead of on this server.
# NOTE: review the links mentioned above for guidance on connecting to a managed
# tracking server, such as the free Databricks Community Edition.

mlflow.set_tracking_uri("http://localhost:8080")


In [66]:
# Override Optuna's default logging so that only ERROR-level messages are emitted.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# The logging callback defined below reports only when a trial changes the study's
# best value, i.e. when a new challenger parameter configuration has usurped the
# state of 'best conditions'.


def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    The best value seen so far is remembered in the study's user attribute "winner".
    Nothing is printed when the study's best value is unchanged, or when the study has
    no best value yet.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.

    Parameters:
    - study: the Optuna study being optimized (read: `best_value`, `user_attrs`;
      written: user attribute "winner").
    - frozen_trial: the trial that just finished (read: `number`, `value`).
    """
    try:
        best_value = study.best_value
    except ValueError:
        # Optuna raises ValueError while no trial has completed yet (for example
        # when the first trials were pruned); there is nothing to report.
        return

    previous_best = study.user_attrs.get("winner", None)

    # Compare against None explicitly: 0.0 is a legitimate objective value and
    # must not be mistaken for "no value".
    if best_value is None or previous_best == best_value:
        return

    study.set_user_attr("winner", best_value)

    if previous_best is None:
        print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
        return

    if best_value == 0:
        # A relative change with respect to a best value of zero is undefined.
        improvement_percent = float("inf")
    else:
        # Divide by the magnitude so the reported improvement is never negative.
        improvement_percent = (abs(previous_best - best_value) / abs(best_value)) * 100
    print(
        f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
        f"{improvement_percent: .4f}% improvement"
    )


In [67]:
def objective(trial):
    """
    Optuna objective for the gearbox failure classifier.

    Each call runs inside its own nested MLflow run and:
    1. samples XGBoost hyperparameters and the number of features to keep,
    2. fits an XGBoost model on all gearbox training features to rank them,
    3. keeps the `max_features` highest-ranked features,
    4. fits a fresh model on the reduced training matrix,
    5. logs the sampled parameters and the weighted F1 on the gearbox test split.

    NOTE(review): the score comes from the hold-out test split, so the value
    reported for the best trial is optimistic as a generalization estimate.

    Parameters:
    - trial: the Optuna trial used to sample hyperparameters.

    Returns:
    - float: weighted F1 score on `GEARBOX_X_test` / `GEARBOX_y_test`.
    """
    with mlflow.start_run(nested=True):
        # Settings forwarded to XGBoost.
        xgb_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'use_label_encoder': False,
            'n_estimators': trial.suggest_categorical('n_estimators', [50, 100, 200]),
            'max_depth': trial.suggest_categorical('max_depth', [10, 20, 40, 80]),
            # suggest_float(log=True) replaces the deprecated suggest_loguniform.
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'subsample': 1.0,
            'colsample_bytree': 1.0,
            'lambda': 1.0,
            'alpha': 1.0,
        }
        # Feature-selection setting: it belongs to SelectFromModel, not to the
        # XGBoost constructor, so it is kept out of xgb_params.
        # Assumes the training matrix has at least 104 columns - TODO confirm.
        max_features = trial.suggest_int('max_features', 5, 104)

        # Rank the features with a model trained on the full feature set.
        ranking_model = XGBClassifier(**xgb_params)
        ranking_model.fit(GEARBOX_X_train, GEARBOX_y_train)

        # threshold=-inf disables the importance cut-off, so exactly the
        # max_features highest-ranked features are kept.
        selector = SelectFromModel(
            ranking_model, threshold=-np.inf, prefit=True, max_features=max_features
        )
        # Use the gearbox splits throughout. The bare X_train / X_test / y_train /
        # y_test names hold whichever component the split loop processed last.
        X_train_selected = selector.transform(GEARBOX_X_train)
        X_test_selected = selector.transform(GEARBOX_X_test)

        # Fit a separate model on the reduced matrix so the selector's
        # estimator is not overwritten.
        model = XGBClassifier(**xgb_params)
        model.fit(X_train_selected, GEARBOX_y_train)
        y_pred = model.predict(X_test_selected)
        f1 = f1_score(GEARBOX_y_test, y_pred, average='weighted')

        mlflow.log_params({**xgb_params, 'max_features': max_features})
        mlflow.log_metric('f1_weighted', f1)

    return f1
   


In [68]:
# Name given to the parent MLflow run started for the Optuna study below.
run_name = "first_attempt"

In [None]:
# Parent MLflow run: tune with Optuna, then retrain and log the best model.
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Initialize the Optuna study
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=500, callbacks=[champion_callback])
    mlflow.log_params(study.best_params)
    mlflow.log_metrics({'f1_score': study.best_trial.value})

    mlflow.set_tags(
        tags={
            "project": "Wind Turbine",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Parameters of the best trial. (`trial` only exists inside `objective`,
    # so it cannot be referenced here.)
    best_params = study.best_params

    # Rebuild the XGBoost configuration the best trial was scored with: the
    # fixed settings from `objective` plus the tuned values. 'max_features' is
    # a feature-selection setting and is not passed to XGBoost.
    best_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'subsample': 1.0,
        'colsample_bytree': 1.0,
        'lambda': 1.0,
        'alpha': 1.0,
        **{name: value for name, value in best_params.items() if name != 'max_features'},
    }

    # Rank features on the full gearbox training set, then keep the best ones.
    ranking_model = XGBClassifier(**best_xgb_params)
    ranking_model.fit(GEARBOX_X_train, GEARBOX_y_train)
    selector = SelectFromModel(
        ranking_model, threshold=-np.inf, prefit=True, max_features=best_params['max_features']
    )
    # Use the gearbox splits, not the bare X_train / X_test / y_train names
    # left behind by the last iteration of the split loop.
    X_train_selected = selector.transform(GEARBOX_X_train)
    X_test_selected = selector.transform(GEARBOX_X_test)

    # Final model, trained on the selected features only.
    best_model = XGBClassifier(**best_xgb_params)
    best_model.fit(X_train_selected, GEARBOX_y_train)

    artifact_path = "model"

    mlflow.xgboost.log_model(
        xgb_model=best_model,
        artifact_path=artifact_path,
        # The example must have the same columns the model was trained on,
        # i.e. the selected features rather than the full feature set.
        input_example=X_train_selected[:1],
        model_format="ubj",
        metadata={"model_data_version": 1},
    )

    model_uri = mlflow.get_artifact_uri(artifact_path)
