In [1]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [53]:
# Folder that every CSV in this notebook is read from.
rooth_path = '../rawdata/proc_data/'

# Training frames, one CSV file each.
df_train_gearbox = pd.read_csv(os.path.join(rooth_path, 'df_train_gearbox.csv'))
df_train_gen = pd.read_csv(os.path.join(rooth_path, 'df_train_gen.csv'))
df_train_gen_bear = pd.read_csv(os.path.join(rooth_path, 'df_train_gen_bear.csv'))
df_train_hyd = pd.read_csv(os.path.join(rooth_path, 'df_train_hyd.csv'))
df_train_transf = pd.read_csv(os.path.join(rooth_path, 'df_train_transf.csv'))

In [54]:
# Test frames, one CSV file each, read from the same folder as the training data.
df_test_gen = pd.read_csv(os.path.join(rooth_path, 'df_test_gen.csv'))
df_test_gearbox = pd.read_csv(os.path.join(rooth_path, 'df_test_gearbox.csv'))
df_test_gen_bear = pd.read_csv(os.path.join(rooth_path, 'df_test_gen_bear.csv'))
df_test_hyd = pd.read_csv(os.path.join(rooth_path, 'df_test_hyd.csv'))
df_test_transf = pd.read_csv(os.path.join(rooth_path, 'df_test_transf.csv'))

In [55]:
# Convert the 'Timestamp' column of every loaded frame to datetimes, in place.
for _frame in (
    df_train_gearbox,
    df_train_gen,
    df_train_gen_bear,
    df_train_hyd,
    df_train_transf,
    df_test_gearbox,
    df_test_gen,
    df_test_gen_bear,
    df_test_hyd,
    df_test_transf,
):
    _frame['Timestamp'] = pd.to_datetime(_frame['Timestamp'])
# Do not leave the loop variable behind in the notebook namespace.
del _frame

In [56]:
def add_features(df_in, rolling_win_size, sensor_cols=None):
    """Append per-turbine rolling mean and rolling standard deviation columns.

    The rows of each turbine (each distinct ``Turbine_ID`` value) are processed
    separately, so a rolling window never mixes readings of different turbines.
    Windows are trailing and use ``min_periods=1``: the first rows of a turbine
    are averaged over however many rows are available, and a standard deviation
    that is undefined (a single observation) is reported as 0.

    Args:
            df_in (dataframe)     : The input dataframe to be processed (training or test).
                                    Must contain a ``Turbine_ID`` column.
            rolling_win_size (int): The window size, i.e. the number of consecutive rows
                                    fed to each rolling computation.
            sensor_cols (list)    : Optional names of the columns to compute statistics for.
                                    When omitted, every column except the first two and the
                                    last five is used (the original positional convention).
    Returns:
            dataframe: the rows of ``df_in`` regrouped by turbine (turbines in order of
            first appearance, original index labels kept) with one ``<name>_av`` and one
            ``<name>_sd`` column appended for every sensor column. An input without rows
            yields an empty frame that still carries all of these columns.
    """
    if sensor_cols is None:
        sensor_cols = list(df_in.columns[2:-5])
    else:
        sensor_cols = list(sensor_cols)
    sensor_av_cols = [nm + '_av' for nm in sensor_cols]
    sensor_sd_cols = [nm + '_sd' for nm in sensor_cols]

    # Collect the per-turbine results and concatenate once at the end; growing
    # a DataFrame with concat inside the loop copies all previous rows each time.
    per_turbine = []
    for m_id in pd.unique(df_in['Turbine_ID']):
        df_turbine = df_in[df_in['Turbine_ID'] == m_id]
        windows = df_turbine[sensor_cols].rolling(rolling_win_size, min_periods=1)
        # rolling mean of every sensor column
        av = windows.mean()
        av.columns = sensor_av_cols
        # rolling standard deviation; NaN (fewer than two observations) becomes 0
        sd = windows.std().fillna(0)
        sd.columns = sensor_sd_cols
        # put the new statistics next to the turbine's original columns
        per_turbine.append(pd.concat([df_turbine, av, sd], axis=1))

    if not per_turbine:
        # No rows at all: keep the output schema instead of returning a bare frame.
        return df_in.reindex(columns=list(df_in.columns) + sensor_av_cols + sensor_sd_cols)
    return pd.concat(per_turbine)

In [57]:
# Add rolling mean / std columns to every frame, using a window of 15 rows.
df_train_gearbox_extra = add_features(df_train_gearbox, rolling_win_size=15)
df_train_gen_extra = add_features(df_train_gen, rolling_win_size=15)
df_train_gen_bear_extra = add_features(df_train_gen_bear, rolling_win_size=15)
df_train_hyd_extra = add_features(df_train_hyd, rolling_win_size=15)
df_train_transf_extra = add_features(df_train_transf, rolling_win_size=15)
df_test_gearbox_extra = add_features(df_test_gearbox, rolling_win_size=15)
df_test_gen_extra = add_features(df_test_gen, rolling_win_size=15)
df_test_gen_bear_extra = add_features(df_test_gen_bear, rolling_win_size=15)
df_test_hyd_extra = add_features(df_test_hyd, rolling_win_size=15)
df_test_transf_extra = add_features(df_test_transf, rolling_win_size=15)

In [58]:
#Group by day per turbine
def group_per_frequency(df, strategy='mean'):
    """Aggregate the rows of ``df`` to one row per turbine per calendar day.

    The calendar day is taken from the ``Timestamp`` column and exposed as a
    ``Date`` column in the result. The input frame itself is left unchanged.

    Args:
            df (dataframe): Frame with a ``Turbine_ID`` column and a datetime
                            ``Timestamp`` column.
            strategy (str): ``'max'`` takes the per-group maximum of every column and
                            drops ``Timestamp`` from the result; any other value
                            takes the per-group mean.
    Returns:
            dataframe: one row per (``Turbine_ID``, ``Date``) pair, with both keys
            as regular columns.
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a 'Date' column every time this function is called.
    dated = df.assign(Date=df['Timestamp'].dt.date)
    per_day = dated.groupby(by=['Turbine_ID', 'Date'])

    if strategy == 'max':
        return per_day.max().reset_index().drop(columns='Timestamp')
    # NOTE(review): how mean() treats non-numeric columns depends on the pandas
    # version in use - confirm against the real data.
    return per_day.mean().reset_index()

In [104]:
# Collapse every feature-augmented frame to daily means per turbine.
df_train_gearbox_day = group_per_frequency(df_train_gearbox_extra, strategy='mean')
df_train_gen_day = group_per_frequency(df_train_gen_extra, strategy='mean')
df_train_gen_bear_day = group_per_frequency(df_train_gen_bear_extra, strategy='mean')
df_train_hyd_day = group_per_frequency(df_train_hyd_extra, strategy='mean')
df_train_transf_day = group_per_frequency(df_train_transf_extra, strategy='mean')
df_test_gearbox_day = group_per_frequency(df_test_gearbox_extra, strategy='mean')
df_test_gen_day = group_per_frequency(df_test_gen_extra, strategy='mean')
df_test_gen_bear_day = group_per_frequency(df_test_gen_bear_extra, strategy='mean')
df_test_hyd_day = group_per_frequency(df_test_hyd_extra, strategy='mean')
df_test_transf_day = group_per_frequency(df_test_transf_extra, strategy='mean')

In [92]:
#Standard scaler per Turbine
def scale(df_train, df_test, scaler='StandardScaler',
          turbines=('T01', 'T06', 'T07', 'T09', 'T11')):
    """Scale the feature columns separately for every turbine.

    For each turbine a fresh scaler is fitted on that turbine's training rows
    and then applied to the same turbine's test rows, so no statistic is
    learned from the test data or shared between turbines.

    Args:
            df_train (dataframe): Training frame. Must contain ``Turbine_ID`` and the
                                  columns ``Timestamp``, ``TTF``, ``60_days``, ``30_days``,
                                  ``10_days`` and ``Component``, which are dropped before
                                  scaling.
            df_test (dataframe) : Test frame with the same columns as ``df_train``.
            scaler (str)        : ``'MinMaxScaler'`` selects ``MinMaxScaler``; any other
                                  value selects ``StandardScaler``.
            turbines (sequence) : ``Turbine_ID`` values to process, in the order their
                                  rows appear in the output. Defaults to the five
                                  turbines this function was originally written for.
    Returns:
            tuple: ``(X_train, X_test)`` as numpy arrays. Rows are grouped by turbine
            in the order given by ``turbines``, NOT in the row order of the inputs;
            rows of turbines that are not listed are left out.
    """
    label_cols = ['Timestamp', 'TTF', '60_days', '30_days', '10_days', 'Component']
    X_train = df_train.drop(columns=label_cols)
    X_test = df_test.drop(columns=label_cols)

    # Only the chosen class name is evaluated, so the default path does not
    # depend on MinMaxScaler being available.
    scaler_cls = MinMaxScaler if scaler == 'MinMaxScaler' else StandardScaler

    train_parts = []
    test_parts = []
    for turbine in turbines:
        train_rows = X_train.loc[X_train['Turbine_ID'] == turbine].drop(columns='Turbine_ID')
        test_rows = X_test.loc[X_test['Turbine_ID'] == turbine].drop(columns='Turbine_ID')

        # Fit on this turbine's training rows only, then reuse the fitted
        # parameters for its test rows.
        sc = scaler_cls()
        train_parts.append(sc.fit_transform(train_rows))
        test_parts.append(sc.transform(test_rows))

    return np.concatenate(train_parts), np.concatenate(test_parts)

In [93]:
# NOTE(review): temp_train / temp_test are not defined in any cell shown above -
# confirm they exist before this cell runs on a fresh kernel.
X_train, X_test = scale(df_train=temp_train, df_test=temp_test)