# Libraries and functions

In [1]:
import helper
from helper import *

%matplotlib inline

  from pandas.core import datetools
Using TensorFlow backend.
  return f(*args, **kwds)


In [4]:
def create_RUL_label():
    """Add a 'RUL' column to the module-level training and test frames.

    Reads the globals ``df_train``, ``df_test`` and ``df_RUL``.  For every
    (dataset_id, unit_id) pair a reference value is computed:

    * training set: the largest ``cycle`` recorded for the unit;
    * test set: the largest recorded ``cycle`` plus the largest ``rul``
      listed for the unit in ``df_RUL``.

    ``get_RUL`` is not defined in this cell (presumably it comes from
    ``helper``); it is called once per row with the row and the matching
    reference series, and its result becomes the row's 'RUL' value.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` -- the same global objects, which now carry
        the extra 'RUL' column (they are modified in place).
    """
    print('Creating RUL label....')

    unit_key = ['dataset_id', 'unit_id']

    train_reference = df_train.groupby(unit_key)['cycle'].max()
    test_reference = (
        df_test.groupby(unit_key)['cycle'].max()
        + df_RUL.groupby(unit_key)['rul'].max()
    )

    def _label_rows(frame, reference):
        # Row-wise on purpose: get_RUL receives a whole row at a time.
        return frame.apply(lambda row: get_RUL(row, reference), axis=1)

    df_train['RUL'] = _label_rows(df_train, train_reference)
    df_test['RUL'] = _label_rows(df_test, test_reference)

    print('Done!!')

    return df_train, df_test

def split_dataset_by_type():
    """Split the global train / test / RUL frames into two dataset families.

    "Type 1" holds the rows whose ``dataset_id`` is FD001 or FD003, "type 2"
    the rows whose ``dataset_id`` is FD002 or FD004.  Every returned frame
    is re-indexed from zero.

    Returns
    -------
    tuple
        ``(df_train_type1, df_train_type2, df_test_type1, df_test_type2,
        df_RUL_type1, df_RUL_type2)``
    """
    print('Splitting datasets...')

    families = (['FD001', 'FD003'], ['FD002', 'FD004'])

    def _rows_of(frame, dataset_ids):
        belongs = frame['dataset_id'].isin(dataset_ids)
        return frame[belongs].reset_index(drop=True)

    train_parts = [_rows_of(df_train, ids) for ids in families]
    test_parts = [_rows_of(df_test, ids) for ids in families]
    rul_parts = [_rows_of(df_RUL, ids) for ids in families]

    print('Done!!')

    return (*train_parts, *test_parts, *rul_parts)

def normalize_dataset():
    """Standardise the sensor columns of the type-1 and type-2 frames.

    Reads the globals ``df_train_type1``, ``df_test_type1``,
    ``df_train_type2``, ``df_test_type2`` and ``sensor_columns``; the inputs
    are left untouched and normalised copies are returned.

    * Type 1: one ``StandardScaler`` is fitted on the training sensors and
      applied to both the training and the test copy.
    * Type 2: one ``StandardScaler`` per value of the training 'HDBScan'
      column is fitted on that cluster's training sensors and applied to the
      training and test rows carrying the same 'HDBScan' value.

    NOTE(review): test rows whose 'HDBScan' value never occurs in the
    training frame are not touched by any scaler and keep their raw
    readings -- confirm that this cannot happen with the clustering used.

    Returns
    -------
    tuple
        ``(df_train_type1_normalize, df_train_type2_normalize,
        df_test_type1_normalize, df_test_type2_normalize)``
    """
    print('Normalizing datasets...')

    # --- type 1: a single scaler for the whole family -------------------
    train1_scaled = df_train_type1.copy()
    test1_scaled = df_test_type1.copy()

    family_scaler = StandardScaler().fit(df_train_type1[sensor_columns])
    train1_scaled[sensor_columns] = family_scaler.transform(df_train_type1[sensor_columns])
    test1_scaled[sensor_columns] = family_scaler.transform(df_test_type1[sensor_columns])

    # --- type 2: one scaler per training cluster ------------------------
    train2_scaled = df_train_type2.copy()
    test2_scaled = df_test_type2.copy()

    train_by_cluster = df_train_type2.groupby('HDBScan')[sensor_columns]

    for cluster in train_by_cluster.groups:
        cluster_scaler = StandardScaler().fit(train_by_cluster.get_group(cluster))

        train_rows = df_train_type2['HDBScan'] == cluster
        train2_scaled.loc[train_rows, sensor_columns] = cluster_scaler.transform(
            df_train_type2.loc[train_rows, sensor_columns]
        )

        test_rows = df_test_type2['HDBScan'] == cluster
        test2_scaled.loc[test_rows, sensor_columns] = cluster_scaler.transform(
            df_test_type2.loc[test_rows, sensor_columns]
        )

    print('Done!!')

    return train1_scaled, train2_scaled, test1_scaled, test2_scaled

def clustering_dataset():
    """Run ``HDBScan_clustering`` on the type-1 and type-2 frames.

    Reads the globals ``df_train_type1``, ``df_test_type1``,
    ``df_train_type2`` and ``df_test_type2`` and clusters each train/test
    pair on the three operational-setting columns.

    ``HDBScan_clustering`` is not defined in this cell (presumably it comes
    from ``helper``).  Its return value is ignored here, so it is assumed to
    annotate the frames it receives in place -- later steps group on an
    'HDBScan' column.  TODO confirm against the helper.

    Returns
    -------
    tuple
        ``(df_train_type1, df_train_type2, df_test_type1, df_test_type2)``
        -- the same global objects that were passed to the helper.
    """
    print('Clustering...')

    # Single source of truth for the clustering features (previously an
    # unused placeholder list plus the same literal repeated in each call).
    setting_columns = ['setting 1', 'setting 2', 'setting 3']

    # Each call gets its own list so the helper cannot couple the two runs
    # by mutating its argument.
    HDBScan_clustering(df_train_type1, df_test_type1, columns=list(setting_columns))
    HDBScan_clustering(df_train_type2, df_test_type2, columns=list(setting_columns))

    print('Done!!')

    return df_train_type1, df_train_type2, df_test_type1, df_test_type2

def join_datasets():
    """Stack the type-1 and type-2 frames back into single frames.

    Reads the raw and normalised type-1 / type-2 globals.  In every result
    the type-1 rows come first, followed by the type-2 rows, and the index
    is renumbered from zero.

    Returns
    -------
    tuple
        ``(df_train_all, df_test_all, df_train_all_normalize,
        df_test_all_normalize)``
    """
    print('Joining datasets...')

    def _stack(type1_frame, type2_frame):
        stacked = pd.concat([type1_frame, type2_frame])
        return stacked.reset_index(drop=True)

    train_raw = _stack(df_train_type1, df_train_type2)
    test_raw = _stack(df_test_type1, df_test_type2)

    train_scaled = _stack(df_train_type1_normalize, df_train_type2_normalize)
    test_scaled = _stack(df_test_type1_normalize, df_test_type2_normalize)

    print('Done!!')

    return train_raw, test_raw, train_scaled, test_scaled

def _differentiate_by_unit(source, diff_lag=10):
    """Return a copy of ``source`` with every sensor column differenced per unit.

    For each dataset id in FD001..FD004 and each ``unit_id`` found in it, every
    column listed in the global ``sensor_columns`` is passed (as the unit's
    own series) to ``timeseries_difference`` and the result is written back
    to the same rows of the copy.

    ``timeseries_difference`` is not defined in this cell (presumably it
    comes from ``helper``); its result is assumed to be assignable to the
    unit's rows, exactly as the original code relied on.  TODO confirm.
    """
    differentiated = source.copy()

    for dataset_id in ['FD001', 'FD002', 'FD003', 'FD004']:
        in_dataset = source['dataset_id'] == dataset_id

        for unit_number in source.loc[in_dataset, 'unit_id'].unique():
            # The copy shares index, dataset_id and unit_id with ``source``,
            # so one mask selects the unit in both frames.  It is built once
            # per unit instead of twice per sensor.
            unit_rows = in_dataset & (source['unit_id'] == unit_number)
            unit_readings = source.loc[unit_rows, sensor_columns]

            for column in sensor_columns:
                differentiated.loc[unit_rows, column] = timeseries_difference(
                    unit_readings[column], diff_lag=diff_lag
                ).copy()

    return differentiated


def differentiate_sensors():
    """Difference the normalised sensor readings unit by unit (lag 10).

    Reads the globals ``df_train_all_normalize`` and
    ``df_test_all_normalize``; neither is modified.

    Returns
    -------
    tuple
        ``(df_train_differentiated, df_test_differentiated)``
    """
    print('Differentiating sensors readings...')

    train_differentiated = _differentiate_by_unit(df_train_all_normalize, diff_lag=10)
    test_differentiated = _differentiate_by_unit(df_test_all_normalize, diff_lag=10)

    print('Done!!')

    return train_differentiated, test_differentiated

def _add_rolling_sensor_features(frame,
                                 rolling_feature=('min', 'max', 'mean', 'std', 'median'),
                                 rolling_size=(5, 10, 20)):
    """Return a copy of ``frame`` extended with per-unit rolling sensor statistics.

    For every statistic in ``rolling_feature`` and every window length in
    ``rolling_size`` the columns listed in the global ``sensor_columns`` are
    aggregated over a trailing window inside each (dataset_id, unit_id)
    group.  The leading rows of a unit, for which the window is not yet
    full, are back-filled from the unit's first complete window.  The new
    columns are named ``'<sensor>_rolling_<statistic>_<size>'``.

    As in the original implementation the copy is ``reset_index``-ed first,
    so the result carries the previous row labels in an extra 'index'
    column.

    The per-unit results keep the row labels of the frame they were computed
    from, so the join pairs every rolling value with its own row no matter
    in which order the datasets appear in ``frame``.  (Renumbering the
    concatenated pieces would pair rows purely by position and silently
    shift features between datasets whenever the frame is not sorted by
    dataset id.)

    NOTE(review): a unit with fewer rows than the window length has no
    complete window, so its rolling columns stay NaN for that size.
    """
    result = frame.copy()
    result.reset_index(inplace=True)

    # The raw readings of each unit never change while features are added,
    # so the grouping is done once instead of once per statistic and size.
    unit_readings = [
        unit_rows[sensor_columns]
        for _, unit_rows in result.groupby(['dataset_id', 'unit_id'])
    ]

    for feature in rolling_feature:
        for size in rolling_size:
            rolled = pd.concat(
                [readings.rolling(size).aggregate(feature).bfill()
                 for readings in unit_readings]
            )
            result = result.join(
                rolled, rsuffix='_rolling_{0}_{1}'.format(feature, size)
            )

    return result


def get_rolling_features():
    """Add rolling statistics of the normalised sensors to train and test.

    Reads the globals ``df_train_all_normalize`` and
    ``df_test_all_normalize``; neither is modified.

    Returns
    -------
    tuple
        ``(df_train_final, df_test_final)``
    """
    print('Calculating rolling features...')

    train_features = _add_rolling_sensor_features(df_train_all_normalize)
    test_features = _add_rolling_sensor_features(df_test_all_normalize)

    print('Done!!')

    return train_features, test_features


def get_rolling_features_diff():
    """Add rolling statistics of the differentiated sensors to train and test.

    Reads the globals ``df_train_differentiated`` and
    ``df_test_differentiated``; neither is modified.  After the rolling
    columns are added, the setting / identifier / label columns are dropped
    so that only sensor-derived columns (plus the 'index' column) remain.

    Returns
    -------
    tuple
        ``(df_train_final_diff, df_test_final_diff)``
    """
    print('Calculating rolling features for differentiated sensors readings...')

    # Columns removed from the result; a local name is used so the
    # module-level ``settings`` list is not shadowed.
    non_sensor_columns = ['setting 1', 'setting 2', 'setting 3', 'dataset_id',
                          'RUL', 'cycle', 'unit_id', 'HDBScan']

    train_features = _add_rolling_sensor_features(df_train_differentiated)
    train_features = train_features.drop(non_sensor_columns, axis=1)

    test_features = _add_rolling_sensor_features(df_test_differentiated)
    test_features = test_features.drop(non_sensor_columns, axis=1)

    # Previously placed after the return statement and therefore never run.
    print('Done!!')

    return train_features, test_features

def join_final_datasets_and_save_to_df():
    """Combine the plain and differentiated feature frames and pickle them.

    Reads the globals ``df_train_final``, ``df_train_final_diff``,
    ``df_test_final``, ``df_test_final_diff`` and ``path``.  The frames are
    joined on their index; columns present on both sides get the suffix
    '_diff' on the differentiated side.  Both joins are performed before
    anything is written, then the results are pickled below ``path``
    (which is concatenated as a plain string prefix).

    Returns
    -------
    tuple
        ``(train_final, test_final)``
    """
    print('Joining final datasets and saving to dataframes...')

    pairs = (
        (df_train_final, df_train_final_diff),
        (df_test_final, df_test_final_diff),
    )
    combined_train, combined_test = [
        plain.join(differentiated, rsuffix='_diff').copy()
        for plain, differentiated in pairs
    ]

    combined_train.to_pickle(path + 'dataframes/train__1_diff_10')
    combined_test.to_pickle(path + 'dataframes/test__1_diff_10')

    print('Done!!')

    return combined_train, combined_test

# Load Data

In [None]:
datasets = []

# NOTE(review): absolute, machine-specific directories.  ``path`` is later
# used as a plain string prefix, so it must keep its trailing '/'.
data = '/datadrive/Turbofan_Engine/' #path to .txt files
path = '/home/rneves/thesis/Turbofan_Engine/' # path to numpy arrays to plot t-sne

# Every .txt file in the data directory except those starting with a
# lower-case 'r'.
text_files = [f for f in os.listdir(data) if f.endswith('.txt') and not f.startswith('r')]
dataframe = [os.path.splitext(f)[0] for f in text_files]

sensor_columns = ["sensor {}".format(s) for s in range(1,22)]
sensor_columns_rolling_mean = ["sensor {}_rolling_mean".format(s) for s in range(1,22)]
sensor_columns_rolling_std = ["sensor {}_rolling_std".format(s) for s in range(1,22)]

info_columns = ['dataset_id', 'unit_id','cycle','setting 1', 'setting 2', 'setting 3']

label_columns = ['dataset_id', 'unit_id', 'rul']

settings = ['setting 1', 'setting 2', 'setting 3']

test_data = []
train_data = []
RUL_data = []

for file in text_files:
    print(file)

    # Plain prefix tests: the former patterns ('RUL*', 'test*', 'train*')
    # were glob-style strings used as regular expressions, where e.g.
    # 'RUL*' means "RU" followed by any number of "L".
    if file.startswith('RUL'):
        subset_df = pd.read_csv(os.path.join(data, file), delimiter=r"\s+", header=None)
        # The RUL files carry one row per unit and no id column: number the
        # units 1..n in file order.
        unit_id = range(1, subset_df.shape[0] + 1)
        subset_df.insert(0, 'unit_id', unit_id)
        # The dataset id is the five characters after the first underscore
        # of the file name (e.g. 'FD003').
        dataset_id = basename(file).split("_")[1][:5]
        subset_df.insert(0, 'dataset_id', dataset_id)
        RUL_data.append(subset_df)

    elif file.startswith('test'):
        # Only the first 26 whitespace-separated columns are read.
        subset_df = pd.read_csv(os.path.join(data, file), delimiter=r"\s+", header=None, usecols=range(26))
        dataset_id = basename(file).split("_")[1][:5]
        subset_df.insert(0, 'dataset_id', dataset_id)
        test_data.append(subset_df)

    elif file.startswith('train'):
        subset_df = pd.read_csv(os.path.join(data, file), delimiter=r"\s+", header=None, usecols=range(26))
        dataset_id = basename(file).split("_")[1][:5]
        subset_df.insert(0, 'dataset_id', dataset_id)
        train_data.append(subset_df)


df_train = pd.concat(train_data, ignore_index=True)
df_train.columns = info_columns + sensor_columns
df_train.sort_values(by=['dataset_id', 'unit_id', 'cycle'], inplace=True)

df_test = pd.concat(test_data, ignore_index=True)
df_test.columns = info_columns + sensor_columns
df_test.sort_values(by=['dataset_id', 'unit_id', 'cycle'], inplace=True)

df_RUL = pd.concat(RUL_data, ignore_index=True)
df_RUL.columns = label_columns
df_RUL.sort_values(by=['dataset_id', 'unit_id'], inplace=True)

test_FD003.txt
train_FD002.txt
test_FD004.txt
RUL_FD003.txt
test_FD001.txt
train_FD001.txt
RUL_FD001.txt
RUL_FD002.txt
train_FD003.txt
train_FD004.txt
test_FD002.txt
RUL_FD004.txt


In [None]:
# Preprocessing pipeline.  None of these functions takes arguments: each one
# reads the module-level frames bound by the preceding lines, so the calls
# must be executed in exactly this order.

# Adds the 'RUL' column to df_train / df_test (uses df_RUL for the test set).
df_train, df_test = create_RUL_label()
# Type 1 = FD001 + FD003, type 2 = FD002 + FD004.
df_train_type1, df_train_type2, df_test_type1, df_test_type2, df_RUL_type1, df_RUL_type2 = split_dataset_by_type()
# Must precede normalisation, which groups the type-2 frames on an 'HDBScan'
# column (presumably added by the clustering helper -- confirm).
df_train_type1, df_train_type2, df_test_type1, df_test_type2 = clustering_dataset()
df_train_type1_normalize, df_train_type2_normalize, df_test_type1_normalize, df_test_type2_normalize = normalize_dataset()
# Stacks type 1 on top of type 2 for both the raw and the normalised frames.
df_train_all, df_test_all, df_train_all_normalize, df_test_all_normalize = join_datasets()
# Works on the normalised frames.
df_train_differentiated, df_test_differentiated = differentiate_sensors()
# Rolling statistics of the normalised readings ...
df_train_final, df_test_final = get_rolling_features()
# ... and of the differentiated readings.
df_train_final_diff, df_test_final_diff = get_rolling_features_diff()
# Joins both feature sets and pickles them below `path`.
train_final, test_final = join_final_datasets_and_save_to_df()

Creating RUL label....
Done!!
Splitting datasets...
Done!!
Clustering...


  warn('Clusterer does not have any defined clusters, new data'


Number of clusters in training set using HDBScan: 1
Number of clusters in test set using HDBScan: 1
Number of clusters in training set using HDBScan: 6
Number of clusters in test set using HDBScan: 6
Done!!
Normalizing datasets...
Done!!
Joining datasets...
Done!!
Differentiating sensors readings...
