In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, plot_roc_curve, classification_report
from sklearn import model_selection


In [2]:
# Folder (relative to the notebook) that holds one CSV export per component.
rooth_path = '../rawdata/outro/'

# Read the five component tables in a single pass; the order of the targets
# matches the order of the file names below.
generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df = (
    pd.read_csv(rooth_path + csv_name)
    for csv_name in (
        'generator_df.csv',
        'gen_bear_df.csv',
        'hyd_df.csv',
        'gearbox_df.csv',
        'transf_df.csv',
    )
)

In [2]:
# Convert the 'Timestamp' column of every component frame to pandas datetimes,
# overwriting the column in place on each frame.
for _component_df in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    _component_df['Timestamp'] = pd.to_datetime(_component_df['Timestamp'])

KeyboardInterrupt: 

In [None]:
# Columns to discard, chosen from a Pearson correlation analysis.
# The order of the names is kept exactly as originally listed.
features_drop = [
    'Gen_RPM_Max', 'Gen_RPM_Min', 'Gen_Phase1_Temp_Avg',
    'Gen_Phase3_Temp_Avg', 'Amb_WindSpeed_Est_Avg',
    'Grd_RtrInvPhase1_Temp_Avg', 'Grd_RtrInvPhase3_Temp_Avg',
    'Rtr_RPM_Max', 'Rtr_RPM_Min', 'Grd_Prod_VoltPhse2_Avg',
    'Blds_PitchAngle_Max', 'Blds_PitchAngle_Min',
    'Prod_LatestAvg_ReactPwrGen1', 'Cont_Hub_Temp_Avg',
    'Spin_Temp_Avg', 'Rtr_RPM_Std', 'Rtr_RPM_Avg',
    'Cont_VCP_Temp_Avg', 'Grd_Prod_CurPhse1_Avg', 'Prod_LatestAvg_TotActPwr',
    'Grd_Prod_CurPhse3_Avg', 'Grd_Prod_Pwr_Max', 'Grd_Prod_Pwr_Min',
    'HVTrafo_Phase1_Temp_Avg', 'Grd_Prod_CurPhse2_Avg',
    'HVTrafo_Phase3_Temp_Avg', 'Grd_Prod_PsblePwr_Max',
    'Grd_Prod_PsblePwr_Min', 'Grd_Prod_ReactPwr_Avg',
    'Grd_Prod_PsbleInd_Max', 'Grd_Prod_PsbleInd_Min',
    'Prod_LatestAvg_ActPwrGen1', 'Prod_LatestAvg_TotReactPwr',
    'Grd_Prod_PsbleInd_Avg', 'Blds_PitchAngle_Avg',
    'Grd_Prod_ReactPwr_Max', 'Grd_Prod_ReactPwr_Min',
    'Nac_Direction_Avg', 'Amb_WindDir_Abs_Avg', 'Grd_Prod_PsbleCap_Min',
    'Gear_Oil_Temp_Avg', 'Grd_Prod_VoltPhse1_Avg',
]

In [None]:
## Remove columns with strong correlations
# Every component frame loses the same set of columns (features_drop).
# NOTE: the frames are rebound to their reduced versions, so running this cell
# a second time raises KeyError because the columns are already gone.
generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df = (
    component_df.drop(columns=features_drop)
    for component_df in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df)
)

In [None]:
def prepare_train_test(df):
    """Split a frame chronologically, holding out the final three months as test data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like 'Timestamp' column.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Rows with 'Timestamp' strictly before the cut-off, and rows at or after
        it. The cut-off is the latest timestamp in ``df`` minus three calendar
        months. Both parts are independent copies of the selected rows.
    """
    # Take the maximum rather than the last row: the rows are not guaranteed to
    # be in chronological order, and the last row is only the latest date if
    # they are.
    last_date = df['Timestamp'].max()
    split = last_date - pd.DateOffset(months=3)

    # Copy the selections so later column assignments on the parts do not
    # operate on a slice of ``df`` (which triggers SettingWithCopyWarning).
    df_train = df[df['Timestamp'] < split].copy()
    df_test = df[df['Timestamp'] >= split].copy()

    return df_train, df_test

In [None]:
# Apply the chronological train/test split to every component frame.
(
    (generator_df_train, generator_df_test),
    (gen_bear_df_train, gen_bear_df_test),
    (hyd_df_train, hyd_df_test),
    (gearbox_df_train, gearbox_df_test),
    (transf_df_train, transf_df_test),
) = map(
    prepare_train_test,
    (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df),
)

In [None]:
#Group by day per turbine
def group_per_frequency(df, strategy='mean'):
    """Aggregate a frame to one row per turbine and calendar day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Turbine_ID' and a datetime 'Timestamp' column. The frame
        is not modified.
    strategy : str, default 'mean'
        'max' aggregates each group with ``max()`` and drops the 'Timestamp'
        column from the result; any other value aggregates with ``mean()``.

    Returns
    -------
    pandas.DataFrame
        One row per ('Turbine_ID', 'Date') pair with both keys as regular
        columns, where 'Date' is the calendar date of 'Timestamp'.

    NOTE(review): the mean branch does not drop 'Timestamp' explicitly; whether
    that column appears in the result depends on how the installed pandas
    version averages datetime columns - confirm against the environment.
    """
    # assign() returns a new frame, so the caller's frame does not silently
    # gain a 'Date' column (and no assignment happens on a slice).
    with_date = df.assign(Date=df['Timestamp'].dt.date)
    grouped = with_date.groupby(by=['Turbine_ID', 'Date'])

    if strategy == 'max':
        return grouped.max().reset_index().drop(columns='Timestamp')

    return grouped.mean().reset_index()

In [None]:
# Daily mean per turbine for every train and test split (default strategy).
(
    df_train_gearbox_day,
    df_train_gen_day,
    df_train_gen_bear_day,
    df_train_hyd_day,
    df_train_transf_day,
    df_test_gearbox_day,
    df_test_gen_day,
    df_test_gen_bear_day,
    df_test_hyd_day,
    df_test_transf_day,
) = (
    group_per_frequency(split_df)
    for split_df in (
        gearbox_df_train,
        generator_df_train,
        gen_bear_df_train,
        hyd_df_train,
        transf_df_train,
        gearbox_df_test,
        generator_df_test,
        gen_bear_df_test,
        hyd_df_test,
        transf_df_test,
    )
)

In [None]:
dfs = [
    df_train_gearbox_day, df_train_gen_day, df_train_gen_bear_day,
    df_train_hyd_day, df_train_transf_day,
    df_test_gearbox_day, df_test_gen_day, df_test_gen_bear_day,
    df_test_hyd_day, df_test_transf_day,
]

# Round every horizon column to zero decimals, in place on each daily frame.
# Presumably needed because the daily mean can leave fractional flag values -
# confirm against the label definition.
for df in dfs:
    for horizon_col in ('60_days', '50_days', '40_days', '30_days', '20_days', '10_days'):
        df[horizon_col] = df[horizon_col].round(decimals=0)

In [None]:
def add_features(df_in, rolling_win_size, sensor_cols=None):
    """Append per-turbine rolling mean and standard deviation of the sensor columns.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain a 'Turbine_ID' column. Rows are used in their existing
        order within each turbine, so they should already be in time order.
    rolling_win_size : int
        Number of rows in the rolling window. Windows are evaluated with
        ``min_periods=1``, so the first rows of a turbine use shorter windows.
    sensor_cols : sequence of str, optional
        Columns to compute rolling statistics for. When omitted, the columns
        from position 2 up to (but excluding) the last 8 are used.
        NOTE(review): that default presumably skips two leading key columns and
        eight trailing target/metadata columns - verify against the frame layout.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_in`` grouped turbine by turbine (in order of first
        appearance), with a '<col>_av' and a '<col>_sd' column added for every
        sensor column. An input without rows yields an empty frame that still
        carries the full set of output columns.
    """
    if sensor_cols is None:
        sensor_cols = list(df_in.keys()[2:-8])
    else:
        sensor_cols = list(sensor_cols)
    sensor_av_cols = [nm + '_av' for nm in sensor_cols]
    sensor_sd_cols = [nm + '_sd' for nm in sensor_cols]

    # Collect the per-turbine results and concatenate once at the end; growing
    # a DataFrame with concat inside the loop copies all earlier rows each time.
    per_turbine = []
    for m_id in pd.unique(df_in['Turbine_ID']):
        df_turbine = df_in[df_in['Turbine_ID'] == m_id]
        windows = df_turbine[sensor_cols].rolling(rolling_win_size, min_periods=1)

        av = windows.mean()
        av.columns = sensor_av_cols

        # The standard deviation is NaN where it cannot be computed (e.g. the
        # first row of a turbine, a one-observation window); report it as 0.
        sd = windows.std().fillna(0)
        sd.columns = sensor_sd_cols

        per_turbine.append(pd.concat([df_turbine, av, sd], axis=1))

    if not per_turbine:
        # pd.concat rejects an empty list; keep the output schema instead.
        return pd.DataFrame(columns=list(df_in.columns) + sensor_av_cols + sensor_sd_cols)

    return pd.concat(per_turbine)

In [None]:
# Add 7-row rolling mean / standard deviation features to every daily frame.
(
    df_train_gearbox_extra,
    df_train_gen_extra,
    df_train_gen_bear_extra,
    df_train_hyd_extra,
    df_train_transf_extra,
    df_test_gearbox_extra,
    df_test_gen_extra,
    df_test_gen_bear_extra,
    df_test_hyd_extra,
    df_test_transf_extra,
) = (
    add_features(daily_df, 7)
    for daily_df in (
        df_train_gearbox_day,
        df_train_gen_day,
        df_train_gen_bear_day,
        df_train_hyd_day,
        df_train_transf_day,
        df_test_gearbox_day,
        df_test_gen_day,
        df_test_gen_bear_day,
        df_test_hyd_day,
        df_test_transf_day,
    )
)

In [None]:
#Failures Generator in train data - T06 and T11
#Failures Hydraulic Group in train data - T06 and T11
#Failures Gen_bear in train data - T07 and T09
#Failures Transformer in train data - T07
# Gearbox -> change the train/test split so that there is 1 failure in the test data

In [None]:
# Training data keeps only the turbines selected for each component;
# the test data keeps every turbine.
df_train_gen = df_train_gen_extra.loc[df_train_gen_extra['Turbine_ID'].isin(['T06', 'T11'])]
df_train_gen_bear = df_train_gen_bear_extra.loc[df_train_gen_bear_extra['Turbine_ID'].isin(['T07', 'T09'])]
df_train_hyd = df_train_hyd_extra.loc[df_train_hyd_extra['Turbine_ID'].isin(['T06', 'T11'])]
df_train_transf = df_train_transf_extra.loc[df_train_transf_extra['Turbine_ID'].eq('T07')]

df_test_gen, df_test_gen_bear, df_test_hyd, df_test_transf = (
    extra_df.copy()
    for extra_df in (df_test_gen_extra, df_test_gen_bear_extra, df_test_hyd_extra, df_test_transf_extra)
)

In [None]:
#Scaling
def scale(df_train, df_test, scaler='StandardScaler'):
    """Scale the feature columns of a train/test pair with a scaler fitted on train.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing the feature columns plus the identifier/target
        columns listed in ``non_feature_cols`` below, which are removed before
        scaling.
    scaler : str, default 'StandardScaler'
        'MinMaxScaler' selects ``MinMaxScaler``; any other value selects
        ``StandardScaler``.

    Returns
    -------
    (X_train_scale, X_test_scale)
        The outputs of ``fit_transform`` on the training features and of
        ``transform`` on the test features, using the same fitted scaler.
    """
    non_feature_cols = ['Date', 'TTF', '60_days', '50_days', '40_days', '30_days', '20_days', '10_days', 'Component', 'Turbine_ID']
    X_train = df_train.drop(columns=non_feature_cols)
    X_test = df_test.drop(columns=non_feature_cols)

    # Fit once on the whole training frame. The previous per-turbine loop never
    # used the turbine id, so it repeated this identical fit for every turbine
    # and left the result undefined when the training frame had no rows.
    sc = MinMaxScaler() if scaler == 'MinMaxScaler' else StandardScaler()
    X_train_scale = sc.fit_transform(X_train)
    X_test_scale = sc.transform(X_test)

    return X_train_scale, X_test_scale

In [None]:
def bin_classify(model, clf, X_train, X_test, y_train, y_test, params=None, score=None):
    """Tune a classifier with 5-fold grid search and predict on the test features.

    Parameters
    ----------
    model : str
        Display name of the model. Accepted for the callers' convenience; not
        used inside this function.
    clf : estimator
        Unfitted scikit-learn classifier to tune.
    X_train, y_train
        Training features and labels used for the grid search.
    X_test
        Features to predict on with the refitted best estimator.
    y_test
        Accepted for signature symmetry; not used inside this function.
    params : dict or list of dict, optional
        Parameter grid. ``None`` is treated as an empty grid, i.e. the
        estimator is cross-validated with its current settings only.
    score : str or callable, optional
        Passed to ``GridSearchCV`` as ``scoring``.

    Returns
    -------
    (best_estimator, df_predictions)
        The refitted best estimator and a DataFrame with columns 'y_pred'
        (predicted labels) and 'y_score' (column 1 of ``predict_proba`` when
        available, else ``decision_function`` output, else the predicted labels).
    """
    # GridSearchCV rejects param_grid=None; an empty dict means "defaults only".
    param_grid = {} if params is None else params
    grid_search = model_selection.GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring=score, n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)

    # Prefer a continuous score for ranking-based metrics such as ROC.
    if hasattr(grid_search, 'predict_proba'):
        y_score = grid_search.predict_proba(X_test)[:,1]
    elif hasattr(grid_search, 'decision_function'):
        y_score = grid_search.decision_function(X_test)
    else:
        y_score = y_pred

    predictions = {'y_pred' : y_pred, 'y_score' : y_score}
    df_predictions = pd.DataFrame.from_dict(predictions)

    return grid_search.best_estimator_, df_predictions

In [None]:
def metrics(estimator, X_test, y_test, y_pred, label):
    """Print the classification report and confusion matrix, then plot the ROC curve.

    Parameters
    ----------
    estimator : fitted classifier handed to ``plot_roc_curve``.
    X_test, y_test : test features and true labels.
    y_pred : predicted labels compared against ``y_test`` in the printed reports.
    label : text appended to the estimator's repr to name the ROC curve.
    """
    report_text = classification_report(y_test, y_pred)
    print(report_text)

    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # NOTE(review): plot_roc_curve was removed in scikit-learn 1.2; newer
    # versions provide RocCurveDisplay.from_estimator instead.
    curve_name = f'{estimator} {label}'
    plot_roc_curve(estimator, X_test, y_test, name=curve_name)

In [None]:
def logreg(X_train, X_test, y_train, y_test, label):
    """Grid-search a logistic regression on F1, report test metrics and return the fit.

    Returns the best estimator found by ``bin_classify`` together with its
    DataFrame of test predictions ('y_pred', 'y_score'). ``label`` is only used
    to name the ROC curve.
    """
    base_estimator = LogisticRegression(random_state=42, max_iter=1000)
    search_space = {
        'C': [.01, 0.1, 1.0, 10],
        'solver': ['liblinear', 'lbfgs'],
    }

    best_logreg, predictions = bin_classify(
        'Logistic Regression', base_estimator,
        X_train, X_test, y_train, y_test,
        params=search_space, score='f1',
    )
    print('\nBest Parameters:\n', best_logreg)

    metrics(best_logreg, X_test, y_test, predictions['y_pred'], label)

    return best_logreg, predictions

In [None]:
def rfc(X_train, X_test, y_train, y_test, label):
    """Grid-search a random forest on F1, report test metrics and return the fit.

    Returns the best estimator found by ``bin_classify`` together with its
    DataFrame of test predictions ('y_pred', 'y_score'). ``label`` is only used
    to name the ROC curve.
    """
    base_estimator = RandomForestClassifier(random_state=42)
    search_space = {
        'n_estimators': [800, 900, 1000, 1300, 1400, 1500],
        'criterion': ['gini', 'entropy'],
        'class_weight': ['balanced', None],
    }

    best_rfc, predictions = bin_classify(
        'Random Forest Classifier', base_estimator,
        X_train, X_test, y_train, y_test,
        params=search_space, score='f1',
    )
    print('\nBest Parameters:\n', best_rfc)

    metrics(best_rfc, X_test, y_test, predictions['y_pred'], label)

    return best_rfc, predictions

In [None]:
def gbc(X_train, X_test, y_train, y_test, label):
    """Grid-search a gradient boosting classifier on F1, report test metrics and return the fit.

    Returns the best estimator found by ``bin_classify`` together with its
    DataFrame of test predictions ('y_pred', 'y_score'). ``label`` is only used
    to name the ROC curve.
    """
    base_estimator = GradientBoostingClassifier(random_state=42)
    search_space = {
        'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
        'n_estimators': [100, 200, 500, 700],
    }

    best_gbc, predictions = bin_classify(
        'Gradient Boosting Classifier', base_estimator,
        X_train, X_test, y_train, y_test,
        params=search_space, score='f1',
    )
    print('\nBest Parameters:\n', best_gbc)

    metrics(best_gbc, X_test, y_test, predictions['y_pred'], label)

    return best_gbc, predictions

In [None]:
def knn(X_train, X_test, y_train, y_test, label):
    """Grid-search a k-nearest-neighbours classifier on F1, report test metrics and return the fit.

    Returns the best estimator found by ``bin_classify`` together with its
    DataFrame of test predictions ('y_pred', 'y_score'). ``label`` is only used
    to name the ROC curve.
    """
    base_estimator = KNeighborsClassifier()
    search_space = {'n_neighbors': [5, 7, 10, 15]}

    best_knn, predictions = bin_classify(
        'KNN', base_estimator,
        X_train, X_test, y_train, y_test,
        params=search_space, score='f1',
    )
    print('\nBest Parameters:\n', best_knn)

    metrics(best_knn, X_test, y_test, predictions['y_pred'], label)

    return best_knn, predictions

In [None]:
def abc(X_train, X_test, y_train, y_test, label):
    """Grid-search an AdaBoost classifier on F1, report test metrics and return the fit.

    Returns the best estimator found by ``bin_classify`` together with its
    DataFrame of test predictions ('y_pred', 'y_score'). ``label`` is only used
    to name the ROC curve.
    """
    base_estimator = AdaBoostClassifier(random_state=42)
    search_space = {'n_estimators': [50, 100, 200, 300, 500]}

    best_abc, predictions = bin_classify(
        'AdaBoostClassifier', base_estimator,
        X_train, X_test, y_train, y_test,
        params=search_space, score='f1',
    )
    print('\nBest Parameters:\n', best_abc)

    metrics(best_abc, X_test, y_test, predictions['y_pred'], label)

    return best_abc, predictions

In [None]:
def svc(X_train, X_test, y_train, y_test, label):
    """Grid-search a support vector classifier on F1, report test metrics and return the fit.

    Returns the best estimator found by ``bin_classify`` together with its
    DataFrame of test predictions ('y_pred', 'y_score'). ``label`` is only used
    to name the ROC curve.
    """
    base_estimator = SVC(random_state=42)
    search_space = {
        'C': [0.01, 0.1, 1, 1.2],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2,3],
        'class_weight': ['balanced', None],
        'gamma': ['auto', 'scale'],
    }

    best_svc, predictions = bin_classify(
        'SVC', base_estimator,
        X_train, X_test, y_train, y_test,
        params=search_space, score='f1',
    )
    print('\nBest Parameters:\n', best_svc)

    metrics(best_svc, X_test, y_test, predictions['y_pred'], label)

    return best_svc, predictions

# GENERATOR

In [None]:
# Identifier and target columns removed from a frame to obtain model inputs.
feat_drop = [
    'Date', 'TTF',
    '60_days', '50_days', '40_days', '30_days', '20_days', '10_days',
    'Component', 'Turbine_ID',
]

# Target columns modelled below, one full round of classifiers per label.
labels = ['60_days', '50_days', '40_days', '30_days']

In [None]:
# Unscaled features go to the logistic / tree-based models; the scaled arrays
# go to KNN and SVC.
X_train_gen = df_train_gen.drop(labels=feat_drop, axis='columns')
X_test_gen = df_test_gen.drop(labels=feat_drop, axis='columns')
X_train_scale_gen, X_test_scale_gen = scale(df_train_gen, df_test_gen)

# One round of all six classifiers per label. The clf_*/pred_* names are
# rebound on every pass, so after the loop they hold the last label's results.
for label in labels:
    y_train_gen, y_test_gen = df_train_gen[label], df_test_gen[label]
    print(f'Generator {label}')
    clf_logreg_gen, pred_logreg_gen = logreg(
        X_train_gen, X_test_gen, y_train_gen, y_test_gen, label)
    clf_rfc_gen, pred_rfc_gen = rfc(
        X_train_gen, X_test_gen, y_train_gen, y_test_gen, label)
    clf_gbc_gen, pred_gbc_gen = gbc(
        X_train_gen, X_test_gen, y_train_gen, y_test_gen, label)
    clf_knn_gen, pred_knn_gen = knn(
        X_train_scale_gen, X_test_scale_gen, y_train_gen, y_test_gen, label)
    clf_abc_gen, pred_abc_gen = abc(
        X_train_gen, X_test_gen, y_train_gen, y_test_gen, label)
    clf_svc_gen, pred_svc_gen = svc(
        X_train_scale_gen, X_test_scale_gen, y_train_gen, y_test_gen, label)

# Hydraulic System

In [None]:
# Unscaled features go to the logistic / tree-based models; the scaled arrays
# go to KNN and SVC.
X_train_hyd = df_train_hyd.drop(labels=feat_drop, axis='columns')
X_test_hyd = df_test_hyd.drop(labels=feat_drop, axis='columns')
X_train_scale_hyd, X_test_scale_hyd = scale(df_train_hyd, df_test_hyd)

# One round of all six classifiers per label. The clf_*/pred_* names are
# rebound on every pass, so after the loop they hold the last label's results.
for label in labels:
    y_train_hyd, y_test_hyd = df_train_hyd[label], df_test_hyd[label]
    print(f'Hydraulic {label}')
    clf_logreg_hyd, pred_logreg_hyd = logreg(
        X_train_hyd, X_test_hyd, y_train_hyd, y_test_hyd, label)
    clf_rfc_hyd, pred_rfc_hyd = rfc(
        X_train_hyd, X_test_hyd, y_train_hyd, y_test_hyd, label)
    clf_gbc_hyd, pred_gbc_hyd = gbc(
        X_train_hyd, X_test_hyd, y_train_hyd, y_test_hyd, label)
    clf_knn_hyd, pred_knn_hyd = knn(
        X_train_scale_hyd, X_test_scale_hyd, y_train_hyd, y_test_hyd, label)
    clf_abc_hyd, pred_abc_hyd = abc(
        X_train_hyd, X_test_hyd, y_train_hyd, y_test_hyd, label)
    clf_svc_hyd, pred_svc_hyd = svc(
        X_train_scale_hyd, X_test_scale_hyd, y_train_hyd, y_test_hyd, label)

# Transformer

In [None]:
# Unscaled features go to the logistic / tree-based models; the scaled arrays
# go to KNN and SVC.
X_train_transf = df_train_transf.drop(labels=feat_drop, axis='columns')
X_test_transf = df_test_transf.drop(labels=feat_drop, axis='columns')
X_train_scale_transf, X_test_scale_transf = scale(df_train_transf, df_test_transf)

# One round of all six classifiers per label. The clf_*/pred_* names are
# rebound on every pass, so after the loop they hold the last label's results.
for label in labels:
    y_train_transf, y_test_transf = df_train_transf[label], df_test_transf[label]
    print(f'Transformer {label}')
    clf_logreg_transf, pred_logreg_transf = logreg(
        X_train_transf, X_test_transf, y_train_transf, y_test_transf, label)
    clf_rfc_transf, pred_rfc_transf = rfc(
        X_train_transf, X_test_transf, y_train_transf, y_test_transf, label)
    clf_gbc_transf, pred_gbc_transf = gbc(
        X_train_transf, X_test_transf, y_train_transf, y_test_transf, label)
    clf_knn_transf, pred_knn_transf = knn(
        X_train_scale_transf, X_test_scale_transf, y_train_transf, y_test_transf, label)
    clf_abc_transf, pred_abc_transf = abc(
        X_train_transf, X_test_transf, y_train_transf, y_test_transf, label)
    clf_svc_transf, pred_svc_transf = svc(
        X_train_scale_transf, X_test_scale_transf, y_train_transf, y_test_transf, label)

# Generator Bearing

In [None]:
# Unscaled features go to the logistic / tree-based models; the scaled arrays
# go to KNN and SVC.
X_train_gen_bear = df_train_gen_bear.drop(labels=feat_drop, axis='columns')
X_test_gen_bear = df_test_gen_bear.drop(labels=feat_drop, axis='columns')
X_train_scale_gen_bear, X_test_scale_gen_bear = scale(df_train_gen_bear, df_test_gen_bear)

# One round of all six classifiers per label. The clf_*/pred_* names are
# rebound on every pass, so after the loop they hold the last label's results.
for label in labels:
    y_train_gen_bear, y_test_gen_bear = df_train_gen_bear[label], df_test_gen_bear[label]
    print(f'Generator Bearing {label}')
    clf_logreg_gen_bear, pred_logreg_gen_bear = logreg(
        X_train_gen_bear, X_test_gen_bear, y_train_gen_bear, y_test_gen_bear, label)
    clf_rfc_gen_bear, pred_rfc_gen_bear = rfc(
        X_train_gen_bear, X_test_gen_bear, y_train_gen_bear, y_test_gen_bear, label)
    clf_gbc_gen_bear, pred_gbc_gen_bear = gbc(
        X_train_gen_bear, X_test_gen_bear, y_train_gen_bear, y_test_gen_bear, label)
    clf_knn_gen_bear, pred_knn_gen_bear = knn(
        X_train_scale_gen_bear, X_test_scale_gen_bear, y_train_gen_bear, y_test_gen_bear, label)
    clf_abc_gen_bear, pred_abc_gen_bear = abc(
        X_train_gen_bear, X_test_gen_bear, y_train_gen_bear, y_test_gen_bear, label)
    clf_svc_gen_bear, pred_svc_gen_bear = svc(
        X_train_scale_gen_bear, X_test_scale_gen_bear, y_train_gen_bear, y_test_gen_bear, label)