In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [21]:
def get_data_and_first_cleaning(root_path='../rawdata/training/'):
    """Load the wind-farm-1 training CSV files and apply a first cleaning pass.

    Args:
        root_path (str): Directory that holds the raw CSV files. Defaults to
            the training folder used by this notebook.

    Returns:
        tuple: ``(failures_df, locations_df, logs_df, metmast_df, signals_df,
        power_df)`` where

        * the ``Timestamp`` columns of failures, met-mast and signals are
          parsed to datetime,
        * the logs' ``TimeDetected`` column is exposed as a datetime
          ``Timestamp`` column and ``UnitTitleDestination`` is removed,
        * the listed met-mast / signal columns and the failures' ``Remarks``
          column are dropped,
        * missing signal values are back-filled.
    """
    import os  # local import: the notebook's import cell does not provide it

    def _read(file_name, **read_kwargs):
        # Single place where a file name is resolved against root_path.
        return pd.read_csv(os.path.join(root_path, file_name), **read_kwargs)

    failures_df = _read('wind-farm-1-failures-training.csv')
    locations_df = _read('wind-farm-1-locations.csv')
    logs_df = _read('wind-farm-1-logs-training.csv')
    metmast_df = _read('wind-farm-1-metmast-training.csv', sep=';')
    signals_df = _read('wind-farm-1-signals-training.csv', sep=';')
    power_df = _read('Power_curve.csv', sep=',')
    # NOTE: 'HTW_Costs.csv' is not loaded (its read was commented out).

    # Parse every timestamp column to datetime
    failures_df['Timestamp'] = pd.to_datetime(failures_df['Timestamp'])
    logs_df['Timestamp'] = pd.to_datetime(logs_df['TimeDetected'])
    logs_df = logs_df.drop(columns=['TimeDetected', 'UnitTitleDestination'])
    metmast_df['Timestamp'] = pd.to_datetime(metmast_df['Timestamp'])
    signals_df['Timestamp'] = pd.to_datetime(signals_df['Timestamp'])

    # Columns reported as constant (std = 0) by the earlier data analysis
    drop_features_metmast = ['Anemometer1_Offset', 'Anemometer2_Freq', 'Anemometer2_Offset', 'Min_Raindetection',
                             'Avg_Raindetection', 'Anemometer1_CorrGain', 'Anemometer1_CorrOffset',
                             'Anemometer2_CorrGain', 'Anemometer2_CorrOffset', 'DistanceAirPress',
                             'AirRessureSensorZeroOffset']
    metmast_df = metmast_df.drop(columns=drop_features_metmast)

    drop_features_signals = ['Prod_LatestAvg_ActPwrGen2', 'Prod_LatestAvg_ReactPwrGen2']
    signals_df = signals_df.drop(columns=drop_features_signals)

    failures_df = failures_df.drop(columns='Remarks')

    # Back-fill (not interpolation): each NaN takes the next valid value further
    # down the frame, in file row order and without grouping by turbine.
    signals_df = signals_df.bfill()

    return failures_df, locations_df, logs_df, metmast_df, signals_df, power_df

In [22]:
# Load and pre-clean every raw training table once; the cells below reuse these frames.
failures_df, locations_df, logs_df, metmast_df, signals_df, power_df = get_data_and_first_cleaning()

In [54]:
# Classification label for failures: one frame per component, holding only that
# component's failure records with the component name recoded to the label 1.
def _component_failures(failures, component):
    """Return the failure rows of ``component`` with its name replaced by 1.

    ``replace`` is called without ``inplace=True`` so it builds a new frame
    instead of writing into a filtered slice of ``failures`` (the in-place
    form raises SettingWithCopyWarning).
    """
    return failures[failures['Component'] == component].replace(component, 1)


failures_generator = _component_failures(failures_df, 'GENERATOR')
failures_gen_bear = _component_failures(failures_df, 'GENERATOR_BEARING')
failures_hyd = _component_failures(failures_df, 'HYDRAULIC_GROUP')
failures_gearbox = _component_failures(failures_df, 'GEARBOX')
failures_transf = _component_failures(failures_df, 'TRANSFORMER')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [55]:
# Inspect the generator failure records (Component has been recoded to 1).
failures_generator

Unnamed: 0,Turbine_ID,Component,Timestamp
0,T11,1,2016-03-03 19:00:00+00:00
5,T06,1,2016-07-11 19:48:00+00:00
7,T06,1,2016-07-24 17:01:00+00:00
10,T06,1,2016-09-04 08:08:00+00:00
11,T06,1,2016-10-02 17:08:00+00:00
15,T06,1,2016-10-27 16:26:00+00:00
22,T07,1,2017-08-21 14:47:00+00:00


In [56]:
# Outer-join the signals with each component's failure records on
# (Timestamp, Turbine_ID): every failure is placed on the signal timeline, and
# failure timestamps that have no signal row become additional rows.
generator_final, gen_bear_final, hyd_final, gearbox_final, transf_final = (
    signals_df.merge(component_failures, on=['Timestamp', 'Turbine_ID'], how='outer')
    for component_failures in (
        failures_generator,
        failures_gen_bear,
        failures_hyd,
        failures_gearbox,
        failures_transf,
    )
)

In [57]:
# Fill NaN in the Component column: rows without a matching failure get label 0.
# The result is assigned back to the frame; calling fillna(inplace=True) on the
# selected column is chained assignment and does not update the frame when
# pandas Copy-on-Write is active.
for merged_frame in (generator_final, gen_bear_final, hyd_final, gearbox_final, transf_final):
    merged_frame['Component'] = merged_frame['Component'].fillna(0)

In [58]:
# Move Timestamp into the index of every merged frame (in place).
for merged_frame in (generator_final, gen_bear_final, hyd_final, gearbox_final, transf_final):
    merged_frame.set_index('Timestamp', inplace=True)

In [59]:
# Order every merged frame chronologically by its Timestamp index (in place).
for merged_frame in (generator_final, gen_bear_final, hyd_final, gearbox_final, transf_final):
    merged_frame.sort_index(inplace=True)

KeyboardInterrupt: 

In [None]:
# Turn Timestamp back into an ordinary column so the frames can be grouped by
# Timestamp and Turbine_ID.
for merged_frame in (generator_final, gen_bear_final, hyd_final, gearbox_final, transf_final):
    merged_frame.reset_index(inplace=True)

In [None]:
def prepare_data(df, strategy='mean'):
    """Collapse duplicate (Turbine_ID, Timestamp) rows and back-fill gaps.

    Args:
        df (pd.DataFrame): Frame with ``Turbine_ID`` and ``Timestamp`` columns
            plus the value columns to aggregate.
        strategy (str): ``'mean'`` averages rows that share the same keys;
            any other value takes the per-column maximum instead.

    Returns:
        pd.DataFrame: One row per (Turbine_ID, Timestamp), ordered by those
        keys (groupby default), with remaining NaNs back-filled and the keys
        restored as ordinary columns.
    """
    grouped = df.groupby(by=['Turbine_ID', 'Timestamp'])
    aggregated = grouped.mean() if strategy == 'mean' else grouped.max()

    # Back-fill (not interpolation): each NaN takes the next valid value below it.
    return aggregated.bfill().reset_index()

In [None]:
# Aggregate every merged frame to one row per (Turbine_ID, Timestamp) using the
# default 'mean' strategy.
(
    df_generator_final,
    df_gen_bear_final,
    df_hyd_final,
    df_gearbox_final,
    df_transf_final,
) = map(prepare_data, (generator_final, gen_bear_final, hyd_final, gearbox_final, transf_final))

In [None]:
# Sanity check: prints one boolean per component frame, True when NaNs remain.
for prepared_frame in (
    df_generator_final,
    df_gen_bear_final,
    df_hyd_final,
    df_gearbox_final,
    df_transf_final,
):
    print(prepared_frame.isnull().values.any())

In [None]:
def fill_na_by_turbine(df, turbines=('T01', 'T06', 'T07', 'T09', 'T11')):
    """Add a ``TTF`` (time to next failure, in whole days) column per turbine.

    For a turbine that has failure rows (``Component == 1``), every row gets
    the rounded number of days until the next failure of that turbine; rows
    after the last failure get 0. Turbines without any failure get 0 on every
    row.

    Args:
        df (pd.DataFrame): Frame with ``Turbine_ID``, ``Timestamp`` (datetime)
            and ``Component`` (0/1 failure flag) columns. It is not modified.
        turbines (iterable of str): Turbine IDs to process, in output order.
            Rows of turbines that are not listed are left out of the result.

    Returns:
        pd.DataFrame: The rows of the requested turbines, turbine by turbine,
        with the extra ``TTF`` column.
    """
    labelled_parts = []
    for turbine in turbines:
        # Work on a copy so helper columns never touch (a slice of) ``df``.
        turbine_rows = df.loc[df['Turbine_ID'] == turbine].copy()
        if turbine_rows.empty:
            continue

        if turbine_rows['Component'].nunique() > 1:
            # Failure rows carry their own timestamp as the failure date.
            failure_dates = turbine_rows.loc[
                turbine_rows['Component'] == 1, ['Timestamp', 'Turbine_ID']
            ].assign(date=lambda rows: rows['Timestamp'])
            merged = turbine_rows.merge(failure_dates, how='left', on=['Turbine_ID', 'Timestamp'])
            # Back-fill so every row sees the date of the next failure below it.
            merged = merged.bfill()

            # If there is no later failure, hold the row's own date (TTF = 0).
            merged['date'] = merged['date'].fillna(merged['Timestamp'])

            merged['TTF'] = ((merged['date'] - merged['Timestamp']) / np.timedelta64(1, 'D')).round(0)
        else:
            merged = turbine_rows
            merged['date'] = merged['Timestamp']
            merged['TTF'] = 0

        # 'date' is only a helper for computing TTF.
        labelled_parts.append(merged.drop(columns='date'))

    if not labelled_parts:
        return pd.DataFrame(columns=df.columns)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(labelled_parts)

In [None]:
# Add the TTF column to each aggregated component frame.
generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df = map(
    fill_na_by_turbine,
    (df_generator_final, df_gen_bear_final, df_hyd_final, df_gearbox_final, df_transf_final),
)

In [None]:
def Failure_Time_Horizon(days, period):
    """Return 1 when ``days`` lies in the closed interval [2, ``period``], else 0.

    Args:
        days: Time to failure, in days.
        period: Upper bound of the warning horizon, in days.

    Returns:
        int: 1 inside the horizon, 0 otherwise (including ``days`` below 2).
    """
    inside_horizon = 2 <= days <= period
    return 1 if inside_horizon else 0

In [None]:
# 60-day horizon flag: 1 when 2 <= TTF <= 60 (see Failure_Time_Horizon).
# The helper only needs the TTF value, so it is applied to that column rather
# than building a row Series for every record with DataFrame.apply(axis=1).
for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    labelled_frame['60_days'] = labelled_frame['TTF'].apply(Failure_Time_Horizon, args=(60,))

In [None]:
# 50-day horizon flag: 1 when 2 <= TTF <= 50 (see Failure_Time_Horizon).
# The helper only needs the TTF value, so it is applied to that column rather
# than building a row Series for every record with DataFrame.apply(axis=1).
for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    labelled_frame['50_days'] = labelled_frame['TTF'].apply(Failure_Time_Horizon, args=(50,))

In [None]:
# 40-day horizon flag: 1 when 2 <= TTF <= 40 (see Failure_Time_Horizon).
# The helper only needs the TTF value, so it is applied to that column rather
# than building a row Series for every record with DataFrame.apply(axis=1).
for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    labelled_frame['40_days'] = labelled_frame['TTF'].apply(Failure_Time_Horizon, args=(40,))

In [None]:
# 30-day horizon flag: 1 when 2 <= TTF <= 30 (see Failure_Time_Horizon).
# The helper only needs the TTF value, so it is applied to that column rather
# than building a row Series for every record with DataFrame.apply(axis=1).
for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    labelled_frame['30_days'] = labelled_frame['TTF'].apply(Failure_Time_Horizon, args=(30,))

In [None]:
# 20-day horizon flag: 1 when 2 <= TTF <= 20 (see Failure_Time_Horizon).
# The helper only needs the TTF value, so it is applied to that column rather
# than building a row Series for every record with DataFrame.apply(axis=1).
for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    labelled_frame['20_days'] = labelled_frame['TTF'].apply(Failure_Time_Horizon, args=(20,))

In [None]:
# 10-day horizon flag: 1 when 2 <= TTF <= 10 (see Failure_Time_Horizon).
# The helper only needs the TTF value, so it is applied to that column rather
# than building a row Series for every record with DataFrame.apply(axis=1).
for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    labelled_frame['10_days'] = labelled_frame['TTF'].apply(Failure_Time_Horizon, args=(10,))

In [None]:
## Persist the labelled frames as CSV (index not written) so the modelling part
## of the notebook can start from these files.
for labelled_frame, csv_path in (
    (generator_df, '../rawdata/outro/generator_df.csv'),
    (gen_bear_df, '../rawdata/outro/gen_bear_df.csv'),
    (hyd_df, '../rawdata/outro/hyd_df.csv'),
    (gearbox_df, '../rawdata/outro/gearbox_df.csv'),
    (transf_df, '../rawdata/outro/transf_df.csv'),
):
    labelled_frame.to_csv(csv_path, index=False)

In [2]:
# Reload the labelled frames written by the save cell.
rooth_path = '../rawdata/outro/'
generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df = (
    pd.read_csv(rooth_path + csv_name)
    for csv_name in (
        'generator_df.csv',
        'gen_bear_df.csv',
        'hyd_df.csv',
        'gearbox_df.csv',
        'transf_df.csv',
    )
)

In [3]:
# The CSV files are read without date parsing, so convert Timestamp back to datetime.
for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df):
    labelled_frame['Timestamp'] = pd.to_datetime(labelled_frame['Timestamp'])

In [194]:
# Signal columns removed from every component frame before modelling.
features_drop = [
    'Gen_RPM_Max',
    'Gen_RPM_Min',
    'Gen_Phase1_Temp_Avg',
    'Gen_Phase3_Temp_Avg',
    'Amb_WindSpeed_Est_Avg',
    'Grd_RtrInvPhase1_Temp_Avg',
    'Grd_RtrInvPhase3_Temp_Avg',
    'Rtr_RPM_Max',
    'Rtr_RPM_Min',
    'Grd_Prod_VoltPhse2_Avg',
    'Blds_PitchAngle_Max',
    'Blds_PitchAngle_Min',
    'Prod_LatestAvg_ReactPwrGen1',
    'Cont_Hub_Temp_Avg',
    'Spin_Temp_Avg',
    'Rtr_RPM_Std',
    'Rtr_RPM_Avg',
    'Cont_VCP_Temp_Avg',
    'Grd_Prod_CurPhse1_Avg',
    'Prod_LatestAvg_TotActPwr',
    'Grd_Prod_CurPhse3_Avg',
    'Grd_Prod_Pwr_Max',
    'Grd_Prod_Pwr_Min',
    'HVTrafo_Phase1_Temp_Avg',
    'Grd_Prod_CurPhse2_Avg',
    'HVTrafo_Phase3_Temp_Avg',
    'Grd_Prod_PsblePwr_Max',
    'Grd_Prod_PsblePwr_Min',
    'Grd_Prod_ReactPwr_Avg',
    'Grd_Prod_PsbleInd_Max',
    'Grd_Prod_PsbleInd_Min',
    'Prod_LatestAvg_ActPwrGen1',
    'Prod_LatestAvg_TotReactPwr',
    'Grd_Prod_PsbleInd_Avg',
    'Blds_PitchAngle_Avg',
    'Grd_Prod_ReactPwr_Max',
    'Grd_Prod_ReactPwr_Min',
    'Nac_Direction_Avg',
    'Amb_WindDir_Abs_Avg',
    'Grd_Prod_PsbleCap_Min',
    'Gear_Oil_Temp_Avg',
    'Grd_Prod_VoltPhse1_Avg',
]

In [195]:
## Drop the strongly correlated columns listed in features_drop from every frame.
generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df = (
    labelled_frame.drop(columns=features_drop)
    for labelled_frame in (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df)
)

In [196]:
def prepare_train_test(df, months=3):
    """Split a frame chronologically, holding out the final ``months`` months.

    Args:
        df (pd.DataFrame): Frame with a datetime ``Timestamp`` column.
        months (int): Length of the hold-out period, counted back from the
            latest timestamp in ``df``. Defaults to 3.

    Returns:
        tuple: ``(df_train, df_test)`` as independent copies; ``df_test`` holds
        the rows whose timestamp is on or after the split date, ``df_train``
        the rows strictly before it.
    """
    # Take the latest timestamp, not the last row: the row order is not
    # guaranteed to be chronological (the frames are stacked turbine by turbine).
    last_date = df['Timestamp'].max()
    split = last_date - pd.DateOffset(months=months)

    # Copies let callers add columns without SettingWithCopyWarning.
    df_train = df[df['Timestamp'] < split].copy()
    df_test = df[df['Timestamp'] >= split].copy()

    return df_train, df_test

In [197]:
# Chronological train / test split for every component frame.
(
    (generator_df_train, generator_df_test),
    (gen_bear_df_train, gen_bear_df_test),
    (hyd_df_train, hyd_df_test),
    (gearbox_df_train, gearbox_df_test),
    (transf_df_train, transf_df_test),
) = map(prepare_train_test, (generator_df, gen_bear_df, hyd_df, gearbox_df, transf_df))

In [198]:
#Group by day per turbine
def group_per_frequency(df, strategy='mean'):
    """Aggregate the records to one row per turbine and calendar day.

    Args:
        df (pd.DataFrame): Frame with ``Turbine_ID`` and a datetime
            ``Timestamp`` column. It is not modified.
        strategy (str): ``'max'`` takes the daily maximum of every column;
            any other value takes the daily mean.

    Returns:
        pd.DataFrame: ``Turbine_ID`` and ``Date`` followed by the aggregated
        columns. ``Timestamp`` is not part of the result for either strategy.
    """
    # assign() builds a new frame, so the caller's frame does not gain a 'Date'
    # column. Timestamp is dropped before aggregating so both strategies return
    # the same columns regardless of how the pandas version averages datetimes.
    daily = df.assign(Date=df['Timestamp'].dt.date).drop(columns='Timestamp')
    grouped = daily.groupby(by=['Turbine_ID', 'Date'])
    aggregated = grouped.max() if strategy == 'max' else grouped.mean()

    return aggregated.reset_index()

In [199]:
# Daily means per turbine for the training frames ...
(
    df_train_gearbox_day,
    df_train_gen_day,
    df_train_gen_bear_day,
    df_train_hyd_day,
    df_train_transf_day,
) = map(
    group_per_frequency,
    (gearbox_df_train, generator_df_train, gen_bear_df_train, hyd_df_train, transf_df_train),
)
# ... and for the test frames.
(
    df_test_gearbox_day,
    df_test_gen_day,
    df_test_gen_bear_day,
    df_test_hyd_day,
    df_test_transf_day,
) = map(
    group_per_frequency,
    (gearbox_df_test, generator_df_test, gen_bear_df_test, hyd_df_test, transf_df_test),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [200]:
# Every daily frame (train first, then test) for the label clean-up loop.
dfs = [
    df_train_gearbox_day,
    df_train_gen_day,
    df_train_gen_bear_day,
    df_train_hyd_day,
    df_train_transf_day,
    df_test_gearbox_day,
    df_test_gen_day,
    df_test_gen_bear_day,
    df_test_hyd_day,
    df_test_transf_day,
]

In [201]:
# The daily mean turns the 0/1 horizon flags into fractions; round them back to
# whole numbers.
for df in dfs:
    for horizon in ('60_days', '50_days', '40_days', '30_days', '20_days', '10_days'):
        df[horizon] = df[horizon].round(decimals=0)

In [249]:
# Class balance of the rounded 30-day flag in the hydraulic test frame.
df_test_hyd_day['30_days'].value_counts()

0.0    421
1.0     44
Name: 30_days, dtype: int64

In [203]:
def add_features(df_in, rolling_win_size, sensor_cols=None):
    """Append rolling mean / standard-deviation features for the sensor columns.

    The statistics are computed separately for every turbine, over a trailing
    window of ``rolling_win_size`` rows in the frame's existing row order
    (windows shorter than that at the start of a turbine are allowed).

    Args:
        df_in (pd.DataFrame): Frame with a ``Turbine_ID`` column.
        rolling_win_size (int): Window length, in rows.
        sensor_cols (list of str, optional): Columns to derive features from.
            When omitted, the positional default ``df_in.keys()[2:-8]`` is
            used, i.e. everything except the first two and the last eight
            columns (presumably the Turbine_ID/Date keys and the Component,
            TTF and horizon-flag columns; verify against the column order).

    Returns:
        pd.DataFrame: The input rows, turbine by turbine, with one ``<name>_av``
        (rolling mean) and one ``<name>_sd`` (rolling standard deviation, NaN
        replaced by 0) column per sensor column.
    """
    if sensor_cols is None:
        sensor_cols = list(df_in.keys()[2:-8])
    else:
        sensor_cols = list(sensor_cols)
    sensor_av_cols = [name + '_av' for name in sensor_cols]
    sensor_sd_cols = [name + '_sd' for name in sensor_cols]

    per_turbine = []
    # Rolling statistics must not run across turbine boundaries.
    for turbine_id in pd.unique(df_in.Turbine_ID):
        turbine_rows = df_in[df_in['Turbine_ID'] == turbine_id]
        rolling = turbine_rows[sensor_cols].rolling(rolling_win_size, min_periods=1)

        rolling_mean = rolling.mean()
        rolling_mean.columns = sensor_av_cols

        # The standard deviation of a single observation is NaN; store it as 0.
        rolling_std = rolling.std().fillna(0)
        rolling_std.columns = sensor_sd_cols

        per_turbine.append(pd.concat([turbine_rows, rolling_mean, rolling_std], axis=1))

    if not per_turbine:
        return pd.DataFrame()
    # Concatenate once instead of growing the output frame inside the loop.
    return pd.concat(per_turbine)

In [250]:
# Add 30-row rolling mean / std features to the daily training frames ...
(
    df_train_gearbox_extra,
    df_train_gen_extra,
    df_train_gen_bear_extra,
    df_train_hyd_extra,
    df_train_transf_extra,
) = (
    add_features(daily_frame, 30)
    for daily_frame in (
        df_train_gearbox_day,
        df_train_gen_day,
        df_train_gen_bear_day,
        df_train_hyd_day,
        df_train_transf_day,
    )
)
# ... and to the daily test frames.
(
    df_test_gearbox_extra,
    df_test_gen_extra,
    df_test_gen_bear_extra,
    df_test_hyd_extra,
    df_test_transf_extra,
) = (
    add_features(daily_frame, 30)
    for daily_frame in (
        df_test_gearbox_day,
        df_test_gen_day,
        df_test_gen_bear_day,
        df_test_hyd_day,
        df_test_transf_day,
    )
)

In [205]:
# Columns kept for modelling: the rolling-mean (_av) and rolling-std (_sd)
# features, followed by the non-feature columns (Date, TTF, the horizon flags,
# Component, Turbine_ID) that bin_classify removes again before fitting.
# The raw (un-suffixed) sensor columns are not part of this selection.
# NOTE(review): 'Gen_RPM_Avg_sd' is listed but 'Gen_RPM_Avg_av' is not; confirm
# whether that omission is intentional.
features = ['Gen_RPM_Std_av', 'Gen_Bear_Temp_Avg_av',
       'Gen_Phase2_Temp_Avg_av', 'Hyd_Oil_Temp_Avg_av',
       'Gear_Bear_Temp_Avg_av', 'Nac_Temp_Avg_av', 'Amb_WindSpeed_Max_av',
       'Amb_WindSpeed_Min_av', 'Amb_WindSpeed_Avg_av', 'Amb_WindSpeed_Std_av',
       'Amb_WindDir_Relative_Avg_av', 'Amb_Temp_Avg_av',
       'Prod_LatestAvg_ActPwrGen0_av', 'Prod_LatestAvg_ReactPwrGen0_av',
       'HVTrafo_Phase2_Temp_Avg_av', 'Grd_InverterPhase1_Temp_Avg_av',
       'Cont_Top_Temp_Avg_av', 'Gen_SlipRing_Temp_Avg_av',
       'Blds_PitchAngle_Std_av', 'Cont_VCP_ChokcoilTemp_Avg_av',
       'Grd_RtrInvPhase2_Temp_Avg_av', 'Cont_VCP_WtrTemp_Avg_av',
       'Grd_Prod_Pwr_Avg_av', 'Grd_Prod_CosPhi_Avg_av', 'Grd_Prod_Freq_Avg_av',
       'Grd_Prod_VoltPhse3_Avg_av', 'Grd_Busbar_Temp_Avg_av',
       'Grd_Prod_Pwr_Std_av', 'Grd_Prod_ReactPwr_Std_av',
       'Grd_Prod_PsblePwr_Avg_av', 'Grd_Prod_PsblePwr_Std_av',
       'Grd_Prod_PsbleInd_Std_av', 'Grd_Prod_PsbleCap_Avg_av',
       'Grd_Prod_PsbleCap_Max_av', 'Grd_Prod_PsbleCap_Std_av',
       'Gen_Bear2_Temp_Avg_av', 'Gen_RPM_Avg_sd', 'Gen_RPM_Std_sd',
       'Gen_Bear_Temp_Avg_sd', 'Gen_Phase2_Temp_Avg_sd', 'Hyd_Oil_Temp_Avg_sd',
       'Gear_Bear_Temp_Avg_sd', 'Nac_Temp_Avg_sd', 'Amb_WindSpeed_Max_sd',
       'Amb_WindSpeed_Min_sd', 'Amb_WindSpeed_Avg_sd', 'Amb_WindSpeed_Std_sd',
       'Amb_WindDir_Relative_Avg_sd', 'Amb_Temp_Avg_sd',
       'Prod_LatestAvg_ActPwrGen0_sd', 'Prod_LatestAvg_ReactPwrGen0_sd',
       'HVTrafo_Phase2_Temp_Avg_sd', 'Grd_InverterPhase1_Temp_Avg_sd',
       'Cont_Top_Temp_Avg_sd', 'Gen_SlipRing_Temp_Avg_sd',
       'Blds_PitchAngle_Std_sd', 'Cont_VCP_ChokcoilTemp_Avg_sd',
       'Grd_RtrInvPhase2_Temp_Avg_sd', 'Cont_VCP_WtrTemp_Avg_sd',
       'Grd_Prod_Pwr_Avg_sd', 'Grd_Prod_CosPhi_Avg_sd', 'Grd_Prod_Freq_Avg_sd',
       'Grd_Prod_VoltPhse3_Avg_sd', 'Grd_Busbar_Temp_Avg_sd',
       'Grd_Prod_Pwr_Std_sd', 'Grd_Prod_ReactPwr_Std_sd',
       'Grd_Prod_PsblePwr_Avg_sd', 'Grd_Prod_PsblePwr_Std_sd',
       'Grd_Prod_PsbleInd_Std_sd', 'Grd_Prod_PsbleCap_Avg_sd',
       'Grd_Prod_PsbleCap_Max_sd', 'Grd_Prod_PsbleCap_Std_sd',
       'Gen_Bear2_Temp_Avg_sd', 'Date', 'TTF', '60_days', '50_days', '40_days', '30_days', 
            '20_days', '10_days', 'Component', 'Turbine_ID']

In [223]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score

from sklearn import model_selection
from sklearn import metrics

In [207]:
# Every feature-enriched frame: the five training frames, then the five test frames.
dfs_final = [
    df_train_gearbox_extra, df_train_gen_extra, df_train_gen_bear_extra,
    df_train_hyd_extra, df_train_transf_extra,
    df_test_gearbox_extra, df_test_gen_extra, df_test_gen_bear_extra,
    df_test_hyd_extra, df_test_transf_extra,
]

In [208]:
def bin_classify(model, clf, df_train, df_test, label, params=None, score=None):
    """Tune a binary classifier with grid search and predict the test set.

    Args:
        model (str): Model name identifier (kept for the caller's bookkeeping;
            it is not used inside the function).
        clf (classifier object): Unfitted scikit-learn classifier to tune.
        df_train (pd.DataFrame): Training rows; must contain the non-feature
            columns listed in ``non_feature_cols`` below, including ``label``.
        df_test (pd.DataFrame): Test rows with the same columns.
        label (str): Name of the target column, e.g. ``'30_days'``.
        params (dict, optional): Grid-search parameter grid. ``None`` searches
            nothing and cross-validates ``clf`` with its current settings.
        score (str, optional): Scoring name handed to ``GridSearchCV``.

    Returns:
        tuple: ``(best_estimator, df_predictions, y_test)`` where
        ``df_predictions`` has the columns ``y_pred`` (predicted class) and
        ``y_score`` (positive-class probability, decision-function value, or
        the predicted class when neither is available) and shares its index
        with ``y_test``.
    """
    # Everything that is not a model input: keys, TTF and all horizon labels.
    non_feature_cols = ['Date', 'TTF', '60_days', '50_days', '40_days', '30_days',
                        '20_days', '10_days', 'Component', 'Turbine_ID']

    X_train = df_train.drop(columns=non_feature_cols)
    X_test = df_test.drop(columns=non_feature_cols)

    y_train = df_train[label]
    y_test = df_test[label]

    grid_search = model_selection.GridSearchCV(
        estimator=clf,
        # An empty grid evaluates a single candidate: the estimator as given.
        param_grid=params if params is not None else {},
        cv=5,
        scoring=score,
        n_jobs=-1,
    )

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)

    if hasattr(grid_search, 'predict_proba'):
        y_score = grid_search.predict_proba(X_test)[:, 1]
    elif hasattr(grid_search, 'decision_function'):
        y_score = grid_search.decision_function(X_test)
    else:
        y_score = y_pred

    # Reuse the test index so predictions stay row-aligned with y_test in
    # index-aware operations such as pd.crosstab.
    df_predictions = pd.DataFrame({'y_pred': y_pred, 'y_score': y_score}, index=X_test.index)

    return grid_search.best_estimator_, df_predictions, y_test

In [224]:
def metrics(y_test, y_test_pred, y_test_score=None):
    """Compute binary-classification metrics for test-set predictions.

    Note: defining this function rebinds the name ``metrics``, which the import
    cell also uses for the ``sklearn.metrics`` module.

    Args:
        y_test (pd.Series): True labels.
        y_test_pred (array-like): Predicted class labels.
        y_test_score (array-like, optional): Continuous scores (for example the
            positive-class probability). When given, the AUC is computed from
            these scores; otherwise it is computed from ``y_test_pred``.

    Returns:
        dict: ``AUC_Test``, ``Accuracy``, ``Recall``, ``Specificity``,
        ``Precision`` and ``F1 Score``. Every value is NaN when ``y_test``
        contains fewer than two classes; a ratio whose denominator is zero is
        NaN as well.
    """
    metric_names = ('AUC_Test', 'Accuracy', 'Recall', 'Specificity', 'Precision', 'F1 Score')
    if len(y_test.value_counts()) <= 1:
        # With a single class in the test labels none of the metrics is defined.
        return {name: np.nan for name in metric_names}

    cm = confusion_matrix(y_test.values, y_test_pred)
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    false_neg, true_pos = cm[1, 0], cm[1, 1]

    def _ratio(numerator, denominator):
        # Explicit NaN instead of a NumPy divide warning (e.g. no positive predictions).
        return numerator / denominator if denominator else np.nan

    auc_input = y_test_pred if y_test_score is None else y_test_score

    return {
        'AUC_Test': roc_auc_score(y_test, auc_input),
        'Accuracy': _ratio(true_neg + true_pos, cm.sum()),
        'Recall': _ratio(true_pos, false_neg + true_pos),
        'Specificity': _ratio(true_neg, true_neg + false_pos),
        'Precision': _ratio(true_pos, false_pos + true_pos),
        'F1 Score': f1_score(y_test, y_test_pred),
    }

def conf_matrix(y_test, y_test_pred):
    """Cross-tabulate actual against predicted classes.

    Args:
        y_test (pd.Series): True labels.
        y_test_pred (pd.Series): Predicted labels; pandas objects are matched
            to ``y_test`` on their index.

    Returns:
        pd.DataFrame: Counts with the actual classes as rows
        (``'Actual Class'``) and the predicted classes as columns
        (``'Predicted Class'``).
    """
    # The ROC-plotting statements that used to follow the first return could
    # never run (and used undefined names), so they have been removed.
    return pd.crosstab(y_test, y_test_pred, rownames=['Actual Class'], colnames=['Predicted Class'])

# Random Forest

In [254]:
# Keep only the modelling columns listed in `features` for the generator frames.
df_train_gen_extra = df_train_gen_extra.loc[:, features]
df_test_gen_extra = df_test_gen_extra.loc[:, features]

In [256]:
# Generator failures, 30-day horizon: grid-search a random forest, selecting on recall.
model = 'Random Forest Classifier'
gs_score = 'recall'
gs_params = {
    'n_estimators': [750, 800, 1000, 2000],
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None],
}
clf_rfc = RandomForestClassifier(random_state=42)

# bin_classify returns the best estimator, so clf_rfc is rebound to the tuned model.
clf_rfc, pred_rfc, y_test = bin_classify(
    model,
    clf_rfc,
    df_train_gen_extra,
    df_test_gen_extra,
    '30_days',
    params=gs_params,
    score=gs_score,
)
print('\nBest Parameters:\n', clf_rfc)


Best Parameters:
 RandomForestClassifier(criterion='entropy', n_estimators=750, random_state=42)


In [257]:
# Test rows whose true 30-day label is 1.
temp = y_test.to_frame()
temp.loc[temp['30_days'] == 1]

Unnamed: 0,30_days
237,1.0
238,1.0
239,1.0
240,1.0
241,1.0
242,1.0
243,1.0
244,1.0
245,1.0
246,1.0


In [261]:
# Predictions whose positive-class score is at least 0.5.
pred_rfc.loc[pred_rfc['y_score'] >= 0.5]

Unnamed: 0,y_pred,y_score
154,1.0,0.514667
155,1.0,0.525333
156,0.0,0.5
157,1.0,0.504
158,1.0,0.522667
159,1.0,0.536
160,1.0,0.513333
161,1.0,0.529333
162,1.0,0.521333
163,1.0,0.525333


In [259]:
# Score the random forest's predicted classes against the test labels.
metrics_dict = metrics (y_test, pred_rfc['y_pred'])

In [260]:
# Display the metrics computed for the random forest.
metrics_dict

{'AUC_Test': 0.4782110091743119,
 'Accuracy': 0.896774193548387,
 'Recall': 0.0,
 'Specificity': 0.9564220183486238,
 'Precision': 0.0,
 'F1 Score': 0.0}

# Gradient Boosting Classifier

In [227]:
# Generator failures, 30-day horizon: grid-search gradient boosting, selecting on recall.
model = 'Gradient Boosting Classifier'
gs_score = 'recall'
gs_params = {
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
    'n_estimators': [100, 200, 300, 500, 800],
}
clf_gbc = GradientBoostingClassifier(random_state=42)

# bin_classify returns the best estimator, so clf_gbc is rebound to the tuned model.
clf_gbc, pred_gbc, y_test = bin_classify(
    model,
    clf_gbc,
    df_train_gen_extra,
    df_test_gen_extra,
    '30_days',
    params=gs_params,
    score=gs_score,
)
print('\nBest Parameters:\n', clf_gbc)


Best Parameters:
 GradientBoostingClassifier(learning_rate=0.5, n_estimators=500, random_state=42)


In [233]:
# Test rows whose true 30-day label is 1.
temp = y_test.to_frame()
temp.loc[temp['30_days'] == 1]

Unnamed: 0,30_days
237,1.0
238,1.0
239,1.0
240,1.0
241,1.0
242,1.0
243,1.0
244,1.0
245,1.0
246,1.0


In [234]:
# Test rows that gradient boosting predicts as positive.
pred_gbc.loc[pred_gbc['y_pred'] == 1]

Unnamed: 0,y_pred,y_score
63,1.0,0.843211
64,1.0,1.0
65,1.0,0.999998
66,1.0,0.999999
67,1.0,0.999983
68,1.0,0.999977
69,1.0,0.999964
98,1.0,0.767419
141,1.0,1.0
142,1.0,0.999786


In [235]:
# Score the gradient-boosting predicted classes against the test labels.
metrics_dict = metrics (y_test, pred_gbc['y_pred'])

In [236]:
# Display the metrics computed for gradient boosting.
metrics_dict

{'AUC_Test': 0.5449620373299588,
 'Accuracy': 0.8408602150537634,
 'Recall': 0.20689655172413793,
 'Specificity': 0.8830275229357798,
 'Precision': 0.10526315789473684,
 'F1 Score': 0.13953488372093023}

In [237]:
# Generator failures, 30-day horizon: grid-search extra trees, selecting on recall.
model = 'Extra Trees Classifier'
gs_score = 'recall'
gs_params = {
    'n_estimators': [750, 800, 1000, 2000],
    'criterion': ['gini', 'entropy'],
}
clf_etc = ExtraTreesClassifier(random_state=42)

# bin_classify returns the best estimator, so clf_etc is rebound to the tuned model.
clf_etc, pred_etc, y_test = bin_classify(
    model,
    clf_etc,
    df_train_gen_extra,
    df_test_gen_extra,
    '30_days',
    params=gs_params,
    score=gs_score,
)
print('\nBest Parameters:\n', clf_etc)


Best Parameters:
 ExtraTreesClassifier(n_estimators=1000, random_state=42)


In [238]:
# Test rows that extra trees predicts as positive.
pred_etc.loc[pred_etc['y_pred'] == 1]

Unnamed: 0,y_pred,y_score
145,1.0,0.549
146,1.0,0.584
147,1.0,0.593
148,1.0,0.574
149,1.0,0.564
150,1.0,0.53
151,1.0,0.51
153,1.0,0.559
154,1.0,0.554
164,1.0,0.56


In [239]:
# Score the extra-trees predicted classes against the test labels and display the result.
metrics_dict = metrics (y_test, pred_etc['y_pred'])
metrics_dict

{'AUC_Test': 0.48509174311926606,
 'Accuracy': 0.9096774193548387,
 'Recall': 0.0,
 'Specificity': 0.9701834862385321,
 'Precision': 0.0,
 'F1 Score': 0.0}

In [251]:
# Keep only the modelling columns listed in `features` for the hydraulic frames.
df_train_hyd_extra = df_train_hyd_extra.loc[:, features]
df_test_hyd_extra = df_test_hyd_extra.loc[:, features]

In [252]:
# Hydraulic-group failures, 30-day horizon: grid-search a random forest, selecting on recall.
model = 'Random Forest Classifier'
gs_score = 'recall'
gs_params = {
    'n_estimators': [900, 1000, 2000, 2050],
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None],
}
clf_rfc = RandomForestClassifier(random_state=42)

# bin_classify returns the best estimator, so clf_rfc is rebound to the tuned model.
clf_rfc, pred_rfc, y_test = bin_classify(
    model,
    clf_rfc,
    df_train_hyd_extra,
    df_test_hyd_extra,
    '30_days',
    params=gs_params,
    score=gs_score,
)
print('\nBest Parameters:\n', clf_rfc)


Best Parameters:
 RandomForestClassifier(criterion='entropy', n_estimators=2000, random_state=42)


In [253]:
# Test rows whose true 30-day label is 1.
temp = y_test.to_frame()
temp.loc[temp['30_days'] == 1]

Unnamed: 0,30_days
142,1.0
143,1.0
144,1.0
145,1.0
146,1.0
147,1.0
148,1.0
149,1.0
150,1.0
151,1.0


In [271]:
# Predictions whose positive-class score exceeds 0.5.
# NOTE(review): the saved output of this cell may be stale. Its execution count
# (271) is later than that of the generator random-forest cell (256), which
# rebinds pred_rfc after the hydraulic model cell (252) ran. Re-run the
# notebook top to bottom to confirm these are the hydraulic predictions.
pred_rfc[pred_rfc['y_score']>0.5]

Unnamed: 0,y_pred,y_score
154,1.0,0.514667
155,1.0,0.525333
157,1.0,0.504
158,1.0,0.522667
159,1.0,0.536
160,1.0,0.513333
161,1.0,0.529333
162,1.0,0.521333
163,1.0,0.525333
165,1.0,0.544


In [272]:
# Keep only the modelling columns listed in `features` for the transformer frames.
df_train_transf_extra = df_train_transf_extra.loc[:, features]
df_test_transf_extra = df_test_transf_extra.loc[:, features]

In [273]:
# Transformer failures, 30-day horizon: grid-search gradient boosting, selecting on recall.
model = 'Gradient Boosting Classifier'
gs_score = 'recall'
gs_params = {
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
    'n_estimators': [100, 200, 300, 500, 800],
}
clf_gbc = GradientBoostingClassifier(random_state=42)

# bin_classify returns the best estimator, so clf_gbc is rebound to the tuned model.
clf_gbc, pred_gbc, y_test = bin_classify(
    model,
    clf_gbc,
    df_train_transf_extra,
    df_test_transf_extra,
    '30_days',
    params=gs_params,
    score=gs_score,
)
print('\nBest Parameters:\n', clf_gbc)


Best Parameters:
 GradientBoostingClassifier(random_state=42)


In [277]:
# Predictions whose positive-class score exceeds 0.2.
pred_gbc.loc[pred_gbc['y_score'] > 0.2]

Unnamed: 0,y_pred,y_score
248,0.0,0.204297
249,0.0,0.215049


In [None]:
# def several_models_results():

#     for key,target_ in dict_mod.items():

#         target=target_
#         df_train[target]=df_train[target].round().astype(int)#.value_counts()
#         df_test[target]=df_test[target].round().astype(int)#.value_counts()
#         y_test=df_test[target]
    

#         if 'LogReg' in key:
#             model = 'Logistic Regression'
#             clf_LogReg = LogisticRegression(random_state=42)
#             gs_params = {'C': [.01, 0.1, 1.0, 10], 'solver': ['liblinear', 'lbfgs'], 'class_weight': ['balanced',None]}
#             #gs_score = 'roc_auc' 68
#             gs_score = 'f1'

#             clf_LogReg, pred_LogReg_test, pred_LogReg_train = classifier(model, clf_LogReg, features,target, df_train,df_test, params=gs_params, score=gs_score)
#             dict_results[key]=metrics(df_train[target], pred_LogReg_train['y_train_prob'], df_test[target], pred_LogReg_test['y_test_pred'], pred_LogReg_test['y_test_prob'])


#         if  'RF' in key:  
#             model = 'Random Forest'
#             clf_rfc = RandomForestClassifier(n_estimators=50, random_state=42)
#             gs_params = {'max_depth': [3,4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy'], 'class_weight': ['balanced',None]}
#             gs_score = 'f1'

#             clf_rfc, pred_RandForest_test, pred_RandForest_train = classifier(model, clf_rfc, features,target, df_train,df_test,  params=gs_params, score=gs_score)
#             dict_results[key]=metrics(df_train[target], pred_RandForest_train['y_train_prob'], df_test[target], pred_RandForest_test['y_test_pred'], pred_RandForest_test['y_test_prob'])


#         if  'KNN' in key:
#             model = 'KNN'
#             clf_knn = KNeighborsClassifier(n_jobs=-1)
#             gs_params = {'n_neighbors': [5, 9 , 10, 11, 12]}
#             gs_score = 'f1'

#             clf_knn, pred_KNN_test, pred_KNN_train = classifier(model, clf_knn, features,target, df_train,df_test,  params=gs_params, score=gs_score)
#             dict_results[key]=metrics(df_train[target], pred_KNN_train['y_train_prob'], df_test[target], pred_KNN_test['y_test_pred'], pred_KNN_test['y_test_prob'])
    

#     return dict_results