In [30]:
import shap
import pandas as pd
import pickle
import copy
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from alibi.explainers import AnchorTabular
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import  train_test_split

In [31]:
import warnings
warnings.filterwarnings("ignore")

In [32]:
failures = pd.read_csv('../data/model_data/failures.csv',sep=',')
components = failures['Component'].unique()

In [33]:
encoder = LabelEncoder()
for component in components:
    globals()[f"{component}_df"] = pd.read_csv(f'../data/model_data/labelled_data_{component}.csv',sep=',')
    globals()[f"{component}_df"]['Turbine_ID'] = encoder.fit_transform(['Turbine_ID']*globals()[f"{component}_df"].shape[0])
    # set the date as the index
    globals()[f"{component}_df"] = globals()[f"{component}_df"].set_index('Timestamp')

In [34]:
class_target_name = "Failure (Target)"
for component in components:
    X = globals()[f"{component}_df"].drop(columns=['Component',class_target_name])
    y = globals()[f"{component}_df"][class_target_name]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    globals()[f"{component}_X_train"] = X_train
    globals()[f"{component}_X_test"] = X_test
    globals()[f"{component}_y_train"] = y_train
    globals()[f"{component}_y_test"] = y_test

In [35]:
model_name = "xgb"

In [36]:
for component in components:

    with open("../main_pipeline/model/selected-{}_{}.pickle".format(model_name, component), "rb") as f:
      globals()[f"{model_name}_{component}"] = pickle.load(f)

In [13]:
feature_selector = xgb_GEARBOX.named_steps['selector']
selected_features = GEARBOX_X_train.columns[feature_selector.get_support()]
# create a df with the selected features using the training set
selected_features_df = GEARBOX_X_train[selected_features]
selected_features_df.reset_index(drop=True, inplace=True)
selected_features_df

Unnamed: 0,Hyd_Oil_Temp_Avg,Nac_Temp_Avg,Amb_WindSpeed_Max,Amb_WindSpeed_Min,Amb_WindSpeed_Std,Amb_WindDir_Relative_Avg,Amb_WindDir_Abs_Avg,Amb_Temp_Avg,HVTrafo_Phase2_Temp_Avg,Cont_Hub_Temp_Avg,Cont_VCP_Temp_Avg,Blds_PitchAngle_Std,Cont_VCP_ChokcoilTemp_Avg,Grd_RtrInvPhase2_Temp_Avg,Grd_RtrInvPhase3_Temp_Avg,Rtr_RPM_Std,Grd_Prod_ReactPwr_Std,Grd_Prod_PsbleCap_Std,Nac_Direction_Avg,Avg_AmbientTemp
0,4374,3531,1332.4,97.3,111.5,1570.8,27459.6,2116,10181,3852,4828,275.5,8299,4729,4913,95.8,5036.2,13616.0,29489.4,13.958333
1,4127,3660,1052.7,186.0,96.2,-636.7,38995.6,1873,6110,3234,4838,331.4,10764,4624,4639,108.1,5011.1,13031.8,42152.1,12.111111
2,4716,4302,1904.7,199.9,159.4,-333.0,37834.5,3034,10407,4475,5959,167.3,11632,5400,5364,99.3,1349.5,10142.9,38168.8,19.972222
3,4514,4108,1020.5,258.6,90.1,-245.4,16780.4,2603,6979,4049,5174,284.6,12093,5266,5271,92.6,7529.7,13574.1,17025.6,17.319444
4,5127,4549,841.3,129.6,85.0,289.0,21577.5,3366,10757,4709,5621,267.7,7324,5029,5044,87.0,5859.9,8097.6,20569.8,22.381944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1700,6591,4942,1173.0,168.6,107.9,422.1,34738.2,3764,10092,5239,5837,190.1,9887,5305,5577,67.6,4553.3,16652.1,34677.0,25.000000
1701,5542,4840,1034.9,232.7,104.2,-70.9,15141.1,3684,7762,4901,5763,302.1,10602,5282,5298,126.4,5101.0,14348.3,16291.3,24.805556
1702,6116,5120,2943.1,611.8,272.6,-3.2,12567.7,3428,12527,4600,6573,282.0,18197,6727,6580,39.3,609.4,14344.3,12568.8,22.979167
1703,4342,3467,1407.8,81.6,117.1,-2855.7,32383.2,2063,10028,3589,4665,220.9,7057,4650,4776,78.8,5228.0,12443.6,39559.3,13.645833


In [14]:
selected_features_df_test = GEARBOX_X_test[selected_features]

In [15]:
xgb_GEARBOX.fit(selected_features_df, GEARBOX_y_train)

In [21]:
def predict_fn(X):
    # Convert numpy array to pandas DataFrame
    #print(f"Input shape to predict_fn: {X.shape}")
    #print(f"Input dtype to predict_fn: {X.dtype}")
    X_df = pd.DataFrame(X, columns=selected_features_df.columns)
    predictions = xgb_GEARBOX.predict_proba(X_df)
    #print(f"Predictions: {predictions}")
    return predictions


In [18]:
test_input = selected_features_df_test.to_numpy()
print(f"Test input shape: {test_input.shape}")
#print(f"Test input dtype: {test_input.dtype}")
test_prediction = predict_fn(test_input)
print(f"Test prediction: {test_prediction}")
#print(f"Test prediction dtype: {test_prediction.dtype}")

Test input shape: (731, 20)
Input shape to predict_fn: (731, 20)
Input dtype to predict_fn: float64
Predictions: [[9.9943227e-01 5.6775549e-04]
 [9.9990219e-01 9.7837539e-05]
 [9.9978876e-01 2.1126261e-04]
 ...
 [9.9671155e-01 3.2884236e-03]
 [9.9945468e-01 5.4531824e-04]
 [9.9881309e-01 1.1869107e-03]]
Test prediction: [[9.9943227e-01 5.6775549e-04]
 [9.9990219e-01 9.7837539e-05]
 [9.9978876e-01 2.1126261e-04]
 ...
 [9.9671155e-01 3.2884236e-03]
 [9.9945468e-01 5.4531824e-04]
 [9.9881309e-01 1.1869107e-03]]


In [29]:
# Train the model with the features selected during feature sleection only then anchors or predic_fn works
feature_names = selected_features_df.columns.to_list()
explainer = AnchorTabular(predict_fn, feature_names)
explainer.fit(selected_features_df.to_numpy())


AnchorTabular(meta={
  'name': 'AnchorTabular',
  'type': ['blackbox'],
  'explanations': ['local'],
  'params': {'seed': None, 'disc_perc': (25, 50, 75)},
  'version': '0.9.7.dev0'}
)

In [24]:
X_test_np = selected_features_df_test.to_numpy()
test_length = X_test_np.shape[0]

In [25]:
def time_convert(sec):
  mins = sec // 60
  sec = sec % 60
  hours = mins // 60
  mins = mins % 60
  return "Time Lapsed = {0}:{1}:{2}".format(int(hours),int(mins),sec)

In [26]:
X_test_np = selected_features_df_test.to_numpy()
X_test_np.shape
start_time = time.time()

anchors = []
for i, dp in enumerate(X_test_np):
    anchors.append(explainer.explain(dp, threshold=0.95))
    current_time = time.time()
    time_lapsed = current_time - start_time
    print("{} von {} Schritten abgeschlossen. Zeit: {}".format(i, test_length, time_convert(time_lapsed)), end="\r", flush=True)

688 von 731 Schritten abgeschlossen. Zeit: Time Lapsed = 0:7:0.84504008293151862

Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


718 von 731 Schritten abgeschlossen. Zeit: Time Lapsed = 0:7:29.878673076629648

Could not find an anchor satisfying the 0.95 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.


730 von 731 Schritten abgeschlossen. Zeit: Time Lapsed = 0:8:21.050113916397095

In [27]:
for index, elem in enumerate(anchors):
    print(index)
    print('Anchor: %s' % (' AND '.join(elem["data"]["anchor"])))
    print('Precision: %.2f' % elem["data"]["precision"])
    print('Coverage: %.2f' % elem["data"]["coverage"])


0
Anchor: Blds_PitchAngle_Std <= 153.40
Precision: 0.97
Coverage: 0.25
1
Anchor: Avg_AmbientTemp <= 20.06
Precision: 0.99
Coverage: 0.50
2
Anchor: Avg_AmbientTemp <= 15.39
Precision: 1.00
Coverage: 0.25
3
Anchor: Cont_VCP_ChokcoilTemp_Avg <= 9099.00
Precision: 0.98
Coverage: 0.25
4
Anchor: Blds_PitchAngle_Std <= 153.40
Precision: 1.00
Coverage: 0.25
5
Anchor: Blds_PitchAngle_Std > 227.70
Precision: 0.98
Coverage: 0.51
6
Anchor: Blds_PitchAngle_Std > 153.40
Precision: 0.98
Coverage: 0.75
7
Anchor: Cont_VCP_Temp_Avg <= 5844.00
Precision: 0.98
Coverage: 0.50
8
Anchor: Avg_AmbientTemp <= 23.74
Precision: 0.99
Coverage: 0.76
9
Anchor: Avg_AmbientTemp <= 15.39
Precision: 1.00
Coverage: 0.26
10
Anchor: Amb_WindDir_Relative_Avg > -188.90
Precision: 0.99
Coverage: 0.75
11
Anchor: Avg_AmbientTemp <= 20.06
Precision: 0.99
Coverage: 0.51
12
Anchor: 
Precision: 0.99
Coverage: 1.00
13
Anchor: Cont_Hub_Temp_Avg <= 4842.00
Precision: 0.97
Coverage: 0.75
14
Anchor: Avg_AmbientTemp <= 20.06
Precision: 0