<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [4]:
def get_data_and_first_cleaning():
    rooth_path = '../rawdata/'
    failures_df = pd.read_csv(rooth_path + 'wind-farm-1-failures-training.csv', sep=';')
    locations_df = pd.read_csv(rooth_path + "wind-farm-1-locations.csv", sep=';')
    logs_df = pd.read_csv(rooth_path + 'wind-farm-1-logs-training.csv', sep=';')
    metmast_df = pd.read_csv(rooth_path + 'wind-farm-1-metmast-training.csv', sep=';')
    signals_df = pd.read_csv(rooth_path + 'wind-farm-1-signals-training.csv', sep=';')
    power_df = pd.read_csv(rooth_path + 'Power_curve.csv', sep=',')
#     costs_df = pd.read_csv(rooth_path + 'HTW_Costs.csv')
    
    # pass all the Timestamps to date-time format
    failures_df['Timestamp'] = pd.to_datetime(failures_df['Timestamp'])
    logs_df['Timestamp'] = pd.to_datetime(logs_df['TimeDetected'])
    logs_df = logs_df.drop(columns=['TimeDetected','UnitTitleDestination'])
    metmast_df['Timestamp'] = pd.to_datetime(metmast_df['Timestamp'])
    signals_df['Timestamp'] = pd.to_datetime(signals_df['Timestamp'])
     
    # Drop Columns with std=0 by DA
    drop_features_metmast = ['Anemometer1_Offset', 'Anemometer2_Freq', 'Anemometer2_Offset', 'Min_Raindetection',
                             'Avg_Raindetection', 'Anemometer1_CorrGain', 'Anemometer1_CorrOffset',
                             'Anemometer2_CorrGain', 'Anemometer2_CorrOffset', 'DistanceAirPress', 
                             'AirRessureSensorZeroOffset']
    metmast_df = metmast_df.drop(columns=drop_features_metmast)
    
    drop_features_signals = ['Prod_LatestAvg_ActPwrGen2', 'Prod_LatestAvg_ReactPwrGen2']
    signals_df = signals_df.drop(columns=drop_features_signals)
    
    failures_df = failures_df.drop(columns='Remarks')
    
    #Fill na by interpolate
    signals_df = signals_df.bfill()
    
    return failures_df, locations_df, logs_df, metmast_df, signals_df, power_df

In [5]:
failures_df, locations_df, logs_df, metmast_df, signals_df, power_df = get_data_and_first_cleaning()

In [6]:
# Function to find str in columns of df
def component(component, col):
    pair_comp_col=[]
    for i in col:
        if component in i:
            pair_comp_col.append(i)
    return pair_comp_col

In [7]:
# Classification label for failures
failures_generator = failures_df[failures_df['Component'] == 'GENERATOR']
failures_generator.replace('GENERATOR', 1, inplace=True)
failures_gen_bear = failures_df[failures_df['Component'] == 'GENERATOR_BEARING']
failures_gen_bear.replace('GENERATOR_BEARING', 1, inplace=True)
failures_hyd = failures_df[failures_df['Component'] == 'HYDRAULIC_GROUP']
failures_hyd.replace('HYDRAULIC_GROUP', 1, inplace=True)
failures_gearbox = failures_df[failures_df['Component'] == 'GEARBOX']
failures_gearbox.replace('GEARBOX', 1, inplace=True)
failures_transf = failures_df[failures_df['Component'] == 'TRANSFORMER']
failures_transf.replace('TRANSFORMER', 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


In [8]:
#Features to create each DF
def create_features(df):
    time_id = ['Timestamp', 'Turbine_ID']
    pair_hyd = component('Hyd', df.columns)
    pair_trafo = component('Trafo', df.columns)
    pair_gear = component('Gear', df.columns)
    pair_gen = component('Gen', df.columns)
    pair_rot = component('Rtr', df.columns)
    pair_amb = component('Amb', df.columns)
    pair_blds = component('Blds', df.columns)
    pair_cont = component('Cont', df.columns)
    pair_nac = component('Nac', df.columns)
    pair_spin = component('Spin', df.columns)
    pair_bus = component('Busbar', df.columns)
    pair_vol = component('Volt', df.columns)
    
    #Create DF for each component
    df_generator = df[time_id + pair_gen + pair_rot + pair_amb + pair_blds + pair_cont + pair_nac + pair_spin + pair_bus + pair_hyd]
    df_gen_bear = df[time_id + pair_gen + pair_rot + pair_amb + pair_blds + pair_cont + pair_nac + pair_spin + pair_bus + pair_hyd]
    df_transformer = df[time_id + pair_trafo + pair_rot + pair_amb + pair_blds + pair_cont + pair_nac + pair_spin + pair_bus + pair_vol] 
    df_hydraulic = df[time_id + pair_hyd + pair_rot + pair_amb + pair_blds + pair_cont + pair_nac + pair_spin + pair_bus] 
    df_gearbox = df[time_id + pair_gear + pair_rot + pair_amb + pair_blds + pair_cont + pair_nac + pair_spin + pair_bus + pair_hyd]
    
    return df_generator, df_gen_bear, df_transformer, df_hydraulic, df_gearbox

In [9]:
df_generator, df_gen_bear, df_transformer, df_hydraulic, df_gearbox = create_features(signals_df)

In [10]:
# Merge DF Failures with DF of each component to know the moment of each failure
generator_final = df_generator.merge(failures_generator, on=['Timestamp','Turbine_ID'], how='outer')
gen_bear_final = df_gen_bear.merge(failures_gen_bear, on=['Timestamp', 'Turbine_ID'], how='outer')
hyd_final = df_hydraulic.merge(failures_hyd, on=['Timestamp', 'Turbine_ID'], how='outer')
gearbox_final = df_gearbox.merge(failures_gearbox, on=['Timestamp', 'Turbine_ID'], how='outer')
transf_final = df_transformer.merge(failures_transf, on=['Timestamp', 'Turbine_ID'], how='outer')

In [11]:
#Fill na on component column
generator_final['Component'].fillna(0, inplace=True)
gen_bear_final['Component'].fillna(0, inplace=True)
hyd_final['Component'].fillna(0, inplace=True)
gearbox_final['Component'].fillna(0, inplace=True)
transf_final['Component'].fillna(0, inplace=True)

In [12]:
# Set Timestamp as Index
generator_final.set_index('Timestamp', inplace=True)
gen_bear_final.set_index('Timestamp', inplace=True)
hyd_final.set_index('Timestamp', inplace=True)
gearbox_final.set_index('Timestamp', inplace=True)
transf_final.set_index('Timestamp', inplace=True)

In [13]:
# Sort Index
generator_final.sort_index(inplace=True)
gen_bear_final.sort_index(inplace=True)                                      
hyd_final.sort_index(inplace=True)
gearbox_final.sort_index(inplace=True)
transf_final.sort_index(inplace=True)

In [14]:
# Reset Index to group by Timestamp and Turbine_ID
generator_final.reset_index(inplace=True)
gen_bear_final.reset_index(inplace=True)                                      
hyd_final.reset_index(inplace=True)
gearbox_final.reset_index(inplace=True)
transf_final.reset_index(inplace=True)

In [15]:
def prepare_data(df, strategy='mean'):
    if strategy == 'mean':
        df = df.groupby(by=['Turbine_ID','Timestamp']).mean()
    else:
        df = df.grouby(by=['Turbine_ID', 'Timestamp']).max()
    
    # Fill na by interpolate
    df = df.bfill().reset_index()
        
    return df

In [16]:
df_generator_final=prepare_data(generator_final)
df_gen_bear_final=prepare_data(gen_bear_final)
df_hyd_final=prepare_data(hyd_final)
df_gearbox_final=prepare_data(gearbox_final)
df_transf_final=prepare_data(transf_final)

In [17]:
print(df_generator_final.isnull().values.any())
print(df_gen_bear_final.isnull().values.any())
print(df_hyd_final.isnull().values.any())
print(df_gearbox_final.isnull().values.any())
print(df_transf_final.isnull().values.any())

False
False
False
False
False


In [18]:
df_generator_final[df_generator_final['Turbine_ID']=='T01']['Timestamp'].value_counts()

2017-05-05 15:40:00+00:00    1
2017-08-31 16:30:00+00:00    1
2017-02-26 14:00:00+00:00    1
2017-01-21 08:10:00+00:00    1
2017-03-30 08:20:00+00:00    1
2017-04-26 15:10:00+00:00    1
2016-12-03 14:10:00+00:00    1
2017-05-28 09:30:00+00:00    1
2017-08-04 09:40:00+00:00    1
2016-01-20 10:20:00+00:00    1
2016-02-16 17:10:00+00:00    1
2016-04-06 19:20:00+00:00    1
2016-04-24 17:20:00+00:00    1
2016-03-19 11:30:00+00:00    1
2016-05-26 11:40:00+00:00    1
2016-01-02 12:20:00+00:00    1
2016-06-22 18:30:00+00:00    1
2016-08-29 18:40:00+00:00    1
2016-07-24 12:50:00+00:00    1
2016-09-30 13:00:00+00:00    1
2016-12-20 13:50:00+00:00    1
2017-08-22 07:40:00+00:00    1
2017-06-15 07:30:00+00:00    1
2017-07-21 13:20:00+00:00    1
2016-09-02 10:30:00+00:00    1
2016-11-09 10:40:00+00:00    1
2016-10-04 04:50:00+00:00    1
2016-12-11 05:00:00+00:00    1
2016-04-10 11:10:00+00:00    1
2016-06-17 11:20:00+00:00    1
                            ..
2016-10-03 06:20:00+00:00    1
2016-01-

In [19]:
def fill_na_by_turbine(df):
    turbines = ['T01', 'T06', 'T07', 'T09', 'T11']
    df_ = pd.DataFrame(columns=df.columns)
    for turbine in turbines:
        df1 = df.loc[df['Turbine_ID']==turbine]
        if df1['Component'].nunique()>1:
            index = df1[df1['Component']==1]
            index['date'] = index['Timestamp']
            index = index[['date','Timestamp', 'Turbine_ID']]
            df_merged = df1.merge(index, how='left', on=['Turbine_ID','Timestamp'])
            df_merged = df_merged.fillna(method='bfill')
            
            #If there is not a failure after, hold present date
            df_merged['date'] = df_merged['date'].fillna(df_merged['Timestamp'])
            
            df_merged['TTF'] = round((df_merged['date'] - 
                                      df_merged['Timestamp']) / np.timedelta64(1, 'D'),0)
        else:
            df_merged = df1
            df_merged['date'] = df_merged['Timestamp']
#             df1 = df1[['date','Timestamp', 'Turbine_ID']]
#             df_merged = df.merge(df1, how='left', on=['Turbine_ID','Timestamp'])
            
#             df_merged['date'] = df_merged['date'].fillna(df_merged['Timestamp'])
            df_merged['TTF'] = 0 # df_merged['date'] - df_merged['Timestamp'] 

        #Drop Column Date
        df_final = df_merged.drop(columns='date')

        #df_final['TTF'] = df_final['TTF'].dt.days

        df_ = pd.concat([df_, df_final])

    return df_
        


In [20]:
generator_df = fill_na_by_turbine(df_generator_final)
generator_df
# gen_bear_df = fill_na_by_turbine(df_gen_bear_final)
# hyd_df = fill_na_by_turbine(df_hyd_final)
# gearbox_df = fill_na_by_turbine(df_gearbox_final)
# transf_df = fill_na_by_turbine(df_transf_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Amb_Temp_Avg,Amb_WindDir_Abs_Avg,Amb_WindDir_Relative_Avg,Amb_WindSpeed_Avg,Amb_WindSpeed_Est_Avg,Amb_WindSpeed_Max,Amb_WindSpeed_Min,Amb_WindSpeed_Std,Blds_PitchAngle_Avg,Blds_PitchAngle_Max,...,Prod_LatestAvg_ReactPwrGen0,Prod_LatestAvg_ReactPwrGen1,Rtr_RPM_Avg,Rtr_RPM_Max,Rtr_RPM_Min,Rtr_RPM_Std,Spin_Temp_Avg,TTF,Timestamp,Turbine_ID
0,18.0,206.1,-12.4,3.3,3.6,11.6,0.5,0.9,0.6,4.5,...,-99.0,-5636.0,11.1,11.3,10.9,0.1,20.0,0.0,2016-01-01 00:00:00+00:00,T01
1,18.0,217.4,-1.1,3.2,3.3,12.5,0.4,0.9,8.1,59.6,...,-867.0,-3871.0,8.8,11.2,0.0,4.0,20.0,0.0,2016-01-01 00:10:00+00:00,T01
2,18.0,207.4,-5.8,4.3,4.1,8.8,0.7,1.0,10.4,24.0,...,-410.0,-1865.0,6.8,12.3,1.9,4.3,20.0,0.0,2016-01-01 00:20:00+00:00,T01
3,18.0,232.0,9.6,4.4,4.6,9.9,0.7,1.1,-0.5,0.9,...,0.0,-15822.0,11.1,11.5,10.7,0.1,20.0,0.0,2016-01-01 00:30:00+00:00,T01
4,18.0,245.6,23.3,4.1,4.3,9.9,0.8,1.2,-0.1,4.4,...,-79.0,-13314.0,11.1,11.7,10.8,0.2,20.0,0.0,2016-01-01 00:40:00+00:00,T01
5,18.0,213.7,-7.6,6.0,5.8,8.9,2.8,0.8,-1.8,-0.5,...,0.0,-16571.0,11.8,13.0,11.1,0.4,20.0,0.0,2016-01-01 00:50:00+00:00,T01
6,18.0,228.7,7.4,5.1,4.9,8.9,1.7,0.8,-1.2,0.7,...,0.0,-13635.0,11.2,12.0,10.9,0.2,20.0,0.0,2016-01-01 01:00:00+00:00,T01
7,18.0,234.8,13.5,5.3,5.1,8.7,1.9,0.7,-1.3,0.2,...,0.0,-13073.0,11.2,11.8,10.9,0.2,20.0,0.0,2016-01-01 01:10:00+00:00,T01
8,18.0,206.0,-15.3,5.4,5.2,9.2,2.0,0.8,-1.5,0.5,...,0.0,-13543.0,11.3,12.1,10.9,0.2,20.0,0.0,2016-01-01 01:20:00+00:00,T01
9,18.0,219.1,-2.2,4.6,4.6,8.7,0.4,1.5,-0.8,1.8,...,-79.0,-10222.0,11.3,12.2,10.7,0.3,20.0,0.0,2016-01-01 01:30:00+00:00,T01


In [103]:
def Failure_Time_Horizon(days, period):
    if 2 <= days <= period:
        Flag=1
    else:
        Flag=0
    return Flag

In [104]:
generator_df['60_days'] = generator_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 60),axis=1)
gen_bear_df['60_days'] = gen_bear_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 60),axis=1)
hyd_df['60_days'] = hyd_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 60),axis=1)
gearbox_df['60_days'] = gearbox_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 60),axis=1)
transf_df['60_days'] = transf_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 60),axis=1)

In [105]:
generator_df['30_days'] = generator_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 30),axis=1)
gen_bear_df['30_days'] = gen_bear_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 30),axis=1)
hyd_df['30_days'] = hyd_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 30),axis=1)
gearbox_df['30_days'] = gearbox_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 30),axis=1)
transf_df['30_days'] = transf_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 30),axis=1)

In [106]:
generator_df['10_days'] = generator_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 10),axis=1)
gen_bear_df['10_days'] = gen_bear_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 10),axis=1)
hyd_df['10_days'] = hyd_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 10),axis=1)
gearbox_df['10_days'] = gearbox_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 10),axis=1)
transf_df['10_days'] = transf_df.apply(lambda x: Failure_Time_Horizon(x['TTF'], 10),axis=1)

In [107]:
## Save DF's to CSV
generator_df.to_csv('../wmillfailprev/data/generator_df.csv', index=False)
gen_bear_df.to_csv('../wmillfailprev/data/gen_bear_df.csv', index=False)
hyd_df.to_csv('../wmillfailprev/data/hyd_df.csv', index=False)
gearbox_df.to_csv('../wmillfailprev/data/gearbox_df.csv', index=False)
transf_df.to_csv('../wmillfailprev/data/transf_df.csv', index=False)