**Installing the Required Modules**

In [104]:
!pip install optuna
!pip install scikit-lego

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**Importing the Required Modules**

In [105]:
from sklearn.metrics import f1_score,  precision_score, recall_score, auc, roc_curve
from sklearn.ensemble import IsolationForest

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer

from sklearn.compose import make_column_transformer

from sklego.preprocessing import ColumnCapper

from sklearn.model_selection import train_test_split

import optuna

pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 20)
pd.set_option("display.precision", 4)
pd.set_option("plotting.backend", "matplotlib")

from google.colab import drive
drive.mount('/drive')

import warnings
warnings.filterwarnings("ignore")

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


**Freeze Random State**

In [106]:
np.random.seed(35)

**Utility Functions**

In [107]:
def outlier_metirc_detection_report(y, y_pred):
    precision = precision_score(y, y_pred, pos_label=-1, zero_division=0)
    recall = recall_score(y, y_pred, pos_label=-1, zero_division=0)
    f1 = f1_score(y, y_pred, pos_label=-1, zero_division=0)
    print(f"precision: {precision: .2f}")
    print(f"recall: {recall: .2f}")
    print(f"f1_score: {f1: .2f}")


# **I. Goal**

Цель нашей использовать построить детектор сбоя работы оборудования по  телеметрическим данным: https://www.kaggle.com/datasets/tiagotgoz/predictive-useful-life-based-into-telemetry

# **II. Approaches and Limitations**

В качестве детектора сбоев работы оборудования речерез детекцию аномалий выбран IsolationForest


# **1. Data Transformers**

## **1.1 Datatime Transformers**

In [108]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

## **1.2 Utility Transformers**

In [109]:
def identy():
  return FunctionTransformer(lambda x: x)

## **1.3 Full Dataset Transformer**

In [110]:
transform = make_column_transformer(
   (OrdinalEncoder(), ["age", "model"]),
   (sin_transformer(365), ["time_in_cycles"]),
   (cos_transformer(365), ["time_in_cycles"]),
   (ColumnCapper((.1, .95)),
     ["voltmean_24h","rotatemean_24h","pressuremean_24h",
      "vibrationmean_24h", "voltsd_24h", "rotatesd_24h",
      "pressuresd_24h", "vibrationsd_24h", "voltmean_5d", "rotatemean_5d",
      "pressuremean_5d", "vibrationmean_5d", "voltsd_5d", "rotatesd_5d",
      "pressuresd_5d","vibrationsd_5d", "volts_entropy", "volts_benford",
      "volts_cidce", "volts_lzc", "press_max", "press_below", "press_above",
      "press_entropy", "press_benford", "press_cidce", "press_lzc",
      "vibra_max", "vibra_below", "vibra_above", "vibra_entropy",
      "vibra_benford", "vibra_cidce", "vibra_lzc", "rotate_max", "rotate_below",
      "rotate_above", "rotate_entropy", "rotate_benford", "rotate_cidce",
       "rotate_lzc","volts_max", "volts_below", "volts_above", "DI",
      ]),
   (identy(), ["error1", "error2", "error3", "error4", "error5",
               "comp1",  "comp2",  "comp3", "comp4"]),
)

# **2. Detector Fitting**

## **2.1 Objective Function**

Не будем  крутить “ручку настройки” параметров алгоритма детекции аномалий, а доверим все это Optuna. Так как по сути мы имеем дело с бинарным классификатором аномалия/нормальный пример в качестве score функции будем использовать площадь под ROC кривой

In [111]:
class IsolationForestSupervised:

    def __init__(self, X_train, Y_train, X_val, Y_val):
        self.X = X_train
        self.Y = Y_train
        self.X_val = X_val
        self.Y_val = Y_val
        self.best_score = -1e53
        self.best_model = None

    def evaluate_score(self, y_true, score):
        fpr, tpr, thresholds = roc_curve(y_true, score, pos_label=-1)
        return auc(fpr, tpr)

    def __call__(self, trial):

        params ={ "n_estimators" :trial.suggest_int('n_estimators', 10, 100),
                  "max_samples": trial.suggest_float('max_samples', 0.0, .995),
                  "contamination": trial.suggest_float('contamination', 1e-7, .5),
                  "max_features" :trial.suggest_float('max_features', 0.0, 1.0),
                  "bootstrap": trial.suggest_categorical('bootstrap', [True, False])
                 }
        forest = IsolationForest(**params, warm_start=False, n_jobs=-1)
        forest.fit(self.X)
        y_val = forest.decision_function(self.X_val)
        score = self.evaluate_score(self.Y_val, -y_val)

        if self.best_score < score:
            self.best_score = score
            self.best_model = forest

        return score

## **2.2 Detector Fitting and Data Anonymization**

Обозначим через -1 - вышедшее из строя оборудования, а за 1 - нормально функционирующее

In [112]:
train_df = pd.read_csv('/drive/My Drive/rul/train_telemetry.csv')
test_df = pd.read_csv('/drive/My Drive/rul/test_telemetry.csv')

train_df["failed"] = train_df["failed"].astype(int)
test_df["failed"] = test_df["failed"].astype(int)

train_df[["failed"]] = train_df[["failed"]].applymap(lambda x: -1 if x == 1 else 1)
test_df[["failed"]] = test_df[["failed"]].applymap(lambda x: -1 if x == 1 else 1)

Y_train = train_df["failed"].values.ravel()
Y_test = test_df["failed"].values.ravel()

n_outliers = train_df["failed"].value_counts()[-1]
print(f"outliers number: {n_outliers:.2f}")
print(f"outliers percent: {(n_outliers/train_df.shape[0])*100:.2f}")

outliers number: 319.00
outliers percent: 1.62


Как видно примеров для честного классификатора у нас очень мало поэтому детектирования нерабочих состояний как аномалий вполне законна. Для этого пользуясь Optuna поищем оптимальные по AUC параметры и будем помнить, что может потребоваться несколько попыток так как IsolationForest unsupervised алгоритм и он легко переобучается

In [113]:
dropped_lst = ["machineID", 'datetime', 'RULWeek', 'failure', 'failed', 'RUL', 'RUL_I' ]

train_df, X_val, Y_train, Y_val = \
      train_test_split(
          train_df, Y_train, test_size=.3, stratify=train_df[["failed"]])

train_df.drop(dropped_lst, axis=1, inplace=True)
test_df.drop(dropped_lst, axis=1, inplace=True)

X_train = transform.fit_transform(train_df)

X_val = transform.transform(X_val)
X_test = transform.transform(test_df)

In [114]:
objective = IsolationForestSupervised(X_train, Y_train, X_val, Y_val)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)
params = study.best_trial.params

[I 2023-06-19 19:43:54,730] A new study created in memory with name: no-name-e11a6201-19b0-4a49-8595-56d0ec365313
[I 2023-06-19 19:43:55,083] Trial 0 finished with value: 0.9201451791465378 and parameters: {'n_estimators': 41, 'max_samples': 0.5573561033728717, 'contamination': 0.49357448208541166, 'max_features': 0.33860046157629653, 'bootstrap': False}. Best is trial 0 with value: 0.9201451791465378.
[I 2023-06-19 19:43:55,378] Trial 1 finished with value: 0.944278201920865 and parameters: {'n_estimators': 20, 'max_samples': 0.4468076600142463, 'contamination': 0.18424964220808257, 'max_features': 0.7253508598981848, 'bootstrap': False}. Best is trial 1 with value: 0.944278201920865.
[I 2023-06-19 19:43:56,078] Trial 2 finished with value: 0.9497507260754543 and parameters: {'n_estimators': 89, 'max_samples': 0.018396319885359138, 'contamination': 0.16919194338413562, 'max_features': 0.9598043551002519, 'bootstrap': False}. Best is trial 2 with value: 0.9497507260754543.
[I 2023-06-1

## **2.3 Test Classifier on Test Data**

In [115]:
outliers_d = objective.best_model

Y_pred = outliers_d.predict(X_test)

outlier_metirc_detection_report(Y_test, Y_pred)


precision:  0.62
recall:  0.03
f1_score:  0.06


Хотя метрики и неособо хорошие они вполнее соответствуют тому малому числу примеров аномалий в данных