In [141]:
import pandas as pd
import numpy as np
import os.path
import pickle
from tensorflow import keras
from keras.models import Sequential, load_model
from csv import reader
import tensorflow_addons as tfa

from sklearn.preprocessing import MinMaxScaler

In [142]:
# defining the parameters of the data (MUST FIT TO THE MODEL ELSE PREDICTION FAILURE)
REGRESSION = False
SEQUENCE_LENGTH = 50 
Y_THRESHOLD = 70 
FILENAME_IDS = './simulation_data/ids.txt'

if REGRESSION:
    FILENAME = './simulation_data/data_regr.npy'
    model_reg = load_model('./models/reg_model.h5')
    DAYS_MEAN = 6 
    DAYS_VAR = 12 
else:
    FILENAME = './simulation_data/data_class.npy'
    model_class = load_model('./models/class_model.h5')
    # model_class = pickle.load(open('./models/class_model.sav', 'rb'))
    DAYS_MEAN = 6 
    DAYS_VAR = 12 


In [143]:
def add_mean_var(df, days_mean=5, days_var=5):
    # create a df with the mean of the parameter-days with the same index as the original df
    no_days_mean = days_mean
    names = ['mean_' + feature for feature in FEATURES]
    df_mean = df.groupby('id')[FEATURES].rolling(no_days_mean).mean()
    df_mean.columns = names
    df_mean.reset_index(inplace=True)
    df_mean.drop(columns=['level_1', 'id'], inplace=True)
    
    # create a df with the var of the parameter-days with the same index as the original df
    no_days_var = days_var
    names = ['var_' + feature for feature in FEATURES]
    df_var = df.groupby('id')[FEATURES].rolling(no_days_var).var()
    df_var.columns = names
    df_var.reset_index(inplace=True)
    df_var.drop(columns=['level_1', 'id'], inplace=True)
    return df_mean, df_var

In [144]:
def create_sequences(df_eng, seq_len, features, labels):
    X_samples = []
    y_samples = []
    for i in range(df_eng.shape[0]):
        if i + seq_len - 1 >= df_eng.shape[0]:
            break
        X_samples.append(df_eng[features].iloc[i : i + seq_len]) # appending a list of all feature elements to the list
        y_samples.extend(df_eng[labels[0]].iloc[i + seq_len - 1]) # just add the y-value to a list (1D) - appending would create 2D
    print(df_eng['id'].unique(), np.array(X_samples).shape, np.array(y_samples).shape)
    return np.array(X_samples), np.array(y_samples)

In [145]:
# Load data
df_train = pd.read_csv('./reworked_data/train_cat_set1.csv', header=0)
df_test = pd.read_csv('./reworked_data/test_cat_set1.csv', header=0)
df_train.drop(columns=['Unnamed: 0'], inplace=True)
df_test.drop(columns=['Unnamed: 0'], inplace=True)
print(df_test.shape)

(29692, 19)


In [146]:
# copy the original RUL data to display it on scree
df_train['rul_orig'] = df_train['rul']
df_test['rul_orig'] = df_test['rul']

In [147]:
#change the relu to 100 if not within the last 60 days
# change rul to 100 for everything bigger. not important to predict that right. Only the last 100 days are relevant
df_train['rul'].loc[df_train['rul'] > Y_THRESHOLD] = Y_THRESHOLD
df_test['rul'].loc[df_test['rul'] > Y_THRESHOLD] = Y_THRESHOLD

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['rul'].loc[df_train['rul'] > Y_THRESHOLD] = Y_THRESHOLD
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['rul'].loc[df_test['rul'] > Y_THRESHOLD] = Y_THRESHOLD


In [148]:
# create min max scaler for features
INFO = ['id', 'cycle', 'rul', 'label', 'rul_orig']
FEATURES = df_train.drop(columns=INFO).columns
scaler = MinMaxScaler()
df_feat = pd.DataFrame(data=scaler.fit_transform(df_train[FEATURES]), columns=scaler.get_feature_names_out(), index=df_train.index)
df_train = pd.merge(left=df_train[INFO], right=df_feat, left_index=True, right_index=True)
df_feat_test = pd.DataFrame(data=scaler.transform(df_test[FEATURES]), columns=scaler.get_feature_names_out(), index=df_test.index)
df_test = pd.merge(left=df_test[INFO], right=df_feat_test, left_index=True, right_index=True)


In [149]:
# add rolling variance and mean
df_train_mean, df_train_var = add_mean_var(df_train, DAYS_MEAN, DAYS_VAR)
df_test_mean, df_test_var = add_mean_var(df_test, DAYS_MEAN, DAYS_VAR)
print(df_test_mean.shape)

(29692, 15)


In [150]:
# concattenate the dataframes
df_train = df_train.merge(df_train_mean, left_index=True, right_index=True)
df_train = df_train.merge(df_train_var, left_index=True, right_index=True)
df_test = df_test.merge(df_test_mean, left_index=True, right_index=True)
df_test = df_test.merge(df_test_var, left_index=True, right_index=True)

# drop the na which have occured due to the rolling mean/var
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)
print(df_test.shape)

(27492, 50)


In [151]:
# creates a dicionary with key: id value: list of x_test arrays and y_test rul and label for the simulation
LABELS = [['rul_orig']]
FEATURES_NEW = df_train.drop(columns=INFO).columns
arrays = dict()
id_list = df_test['id'].unique()
# save all ids of the data in a txt to be used in the selector of the page
file_ids = open(FILENAME_IDS,'w')
# iterate over the ids. When timeseries is at least 1 Sequence create and add x and y sequences
check_list = []
for id in id_list:
    df = df_test.loc[df_test['id'] == id]
    if df.shape[0] >= SEQUENCE_LENGTH:
        X_test, y_test = create_sequences(df, SEQUENCE_LENGTH, FEATURES_NEW, LABELS)
        arrays[str(id) + '-x'] = X_test
        arrays[str(id) + '-y_rul_orig'] = y_test
        if REGRESSION:
            arrays[str(id) + '-y_regr_pred'] = np.array(model_reg.predict(X_test))
        else:
            arrays[str(id) + '-y_class_pred'] = np.array(model_class.predict(X_test))
        file_ids.write(f"{id} - {y_test[0]} - {y_test[-1]},")
        check_list.append(id)
np.save(FILENAME, arrays)
file_ids.close() 

[1003] (66, 50, 45) (66,)
[1004] (46, 50, 45) (46,)
[1005] (38, 50, 45) (38,)
[1006] (45, 50, 45) (45,)
[1007] (100, 50, 45) (100,)
[1008] (106, 50, 45) (106,)
[1010] (132, 50, 45) (132,)
[1011] (23, 50, 45) (23,)
[1012] (157, 50, 45) (157,)
[1013] (135, 50, 45) (135,)
[1015] (16, 50, 45) (16,)
[1016] (53, 50, 45) (53,)
[1017] (105, 50, 45) (105,)
[1018] (73, 50, 45) (73,)
[1019] (75, 50, 45) (75,)
[1020] (124, 50, 45) (124,)
[1021] (88, 50, 45) (88,)
[1023] (70, 50, 45) (70,)
[1024] (126, 50, 45) (126,)
[1026] (16, 50, 45) (16,)
[1027] (80, 50, 45) (80,)
[1028] (98, 50, 45) (98,)
[1029] (111, 50, 45) (111,)
[1030] (83, 50, 45) (83,)
[1031] (136, 50, 45) (136,)
[1032] (85, 50, 45) (85,)
[1034] (143, 50, 45) (143,)
[1035] (138, 50, 45) (138,)
[1036] (66, 50, 45) (66,)
[1037] (61, 50, 45) (61,)
[1038] (65, 50, 45) (65,)
[1040] (73, 50, 45) (73,)
[1041] (63, 50, 45) (63,)
[1042] (96, 50, 45) (96,)
[1043] (112, 50, 45) (112,)
[1045] (92, 50, 45) (92,)
[1046] (86, 50, 45) (86,)
[1047] (13, 

In [152]:
# array: 
# axis 0: contains all sequences of an ID: from 0 to shape[0] -1 
# axis 1: are all datasets in one sequence --> [-1] is the last data-row of the sequence (leading to the end of all series)
# axis 2: are all datapoints in the dataset --> first 15 are the sensors to display
# print(arrays['1002-x'][0][-1][:15])
# print(arrays['1002-x'].shape[0])
print(arrays['1003-x'].shape)


(66, 50, 45)


In [153]:
# create a dataframe out of it
df = pd.DataFrame(arrays['1003-x'][1])
SHOW_FEATURES = 12
FEATURES_PLOTS = [i for i in range(SHOW_FEATURES, df.shape[1])]
df_plot = df.drop(columns=FEATURES_PLOTS)
df_plot[-20:]
if REGRESSION:
    print(arrays['1004-x'].shape, arrays['1004-y_regr_pred'].shape, arrays['1004-y_rul_orig'].shape)
else:
    print(arrays['1004-x'].shape, arrays['1004-y_class_pred'].shape, arrays['1004-y_rul_orig'].shape)


(46, 50, 45) (46, 3) (46,)


In [155]:
if REGRESSION:
    # Check the predictions and get indication which engines to use for presentation
    SE = 0
    N = 0
    SE60 = 0
    N60 = 0
    SE30 = 0
    N30 = 0
    SE15 = 0
    N15 = 0
    for id in check_list:
        SE60 = 0
        N60 = 0
        SE30 = 0
        N30 = 0
        SE15 = 0
        N15 = 0
        for y_pred, y_true in zip(arrays[str(id) + '-y_regr_pred'], arrays[str(id) + '-y_rul_orig']):
            # y_pred_round = round({y_pred[0]},2)
            # delta = y_true - round({y_pred[0]},2)
            if y_true >= 70:
                y_true = 70
            if 30 < y_true <=60:
                SE60 += np.square(y_true - y_pred[0])    
                N60 += 1
            if 15 < y_true <=30:
                SE30 += np.square(y_true - y_pred[0])    
                N30 += 1
            if -1 < y_true <=15:
                SE15 += np.square(y_true - y_pred[0])    
                N15 += 1
                
                #print(f"ID: {id} y_true: {y_true} y_pred: round({y_pred[0]},2), delta = ")
            SE += np.square(y_true - y_pred[0])
            N += 1
        if N60 > 0: 
            MSE60 = SE60 / N60
            RMSE60 = np.sqrt(MSE60)
        else: 
            MSE60 = None
            RMSE60 = None
        if N30 > 0: 
            MSE30 = SE30 / N30
            RMSE30 = np.sqrt(MSE30)
        else: 
            MSE30 = None
            RMSE30 = None
        if N15 > 0: 
            MSE15 = SE15 / N15
            RMSE15 = np.sqrt(MSE15)
        else: 
            MSE15 = None
            RMSE15 = None
        print(f"ID:, {id}, RMSE60:, {RMSE60}, RMSE30:, {RMSE30}, RMSE15:, {RMSE15},")
    MSE = SE / N
    RMSE = np.sqrt(MSE)
    print(f"RMSE: {RMSE} MSE: {MSE}")
else:
    # Check the predictions and get indication which engines to use for presentation
    for id in check_list:
        CLASS60 = 0
        N60 = 0
        CLASS30 = 0
        N30 = 0
        CLASS15 = 0
        N15 = 0
        for y_pred, y_true in zip(arrays[str(id) + '-y_class_pred'], arrays[str(id) + '-y_rul_orig']):
            y_pred = np.argmax(y_pred)
            if 30 < y_true <=60:
                if y_pred == 0:
                    CLASS60 += 1
                N60 += 1
            if 15 < y_true <=30:
                if y_pred == 1:
                    CLASS30 += 1
                N30 += 1
            if -1 < y_true <=15:
                if y_pred == 2:
                    CLASS15 += 1
                N15 += 1
                #print(f"ID: {id} y_true: {y_true} y_pred: round({y_pred[0]},2), delta = ")
        if N60 > 0: 
            ACC60 = CLASS60 / N60
        else: 
            ACC60 = None
        if N30 > 0: 
            ACC30 = CLASS30 / N30
        else: 
            ACC30 = None
        if N15 > 0: 
            ACC15 = CLASS15 / N15
        else: 
            ACC15 = None
        print(f"ID:, {id}, ACC60:, {ACC60}, ACC30:, {ACC30}, ACC15:, {ACC15},")

ID:, 1003, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1004, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1005, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1006, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1007, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1008, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1010, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1011, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1012, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1013, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1015, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1016, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1017, ACC60:, 1.0, ACC30:, None, ACC15:, None,
ID:, 1018, ACC60:, 1.0, ACC30:, 0.6666666666666666, ACC15:, None,
ID:, 1019, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1020, ACC60:, 1.0, ACC30:, 0.8666666666666667, ACC15:, None,
ID:, 1021, ACC60:, 1.0, ACC30:, None, ACC15:, None,
ID:, 1023, ACC60:, None, ACC30:, None, ACC15:, None,
ID:, 1024, ACC60:, 1.0

In [None]:
file_id = './simulation_data/ids.txt'
with open(file_id, mode='r') as file:
    csv_read = reader(file, delimiter=',')
    id_list = [row[:-1] for row in csv_read][0]
id_list[1][:4]

'1004'

In [None]:
print(arrays['1004-y_rul_orig'][0]//10)

12
