In [24]:
import keras

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Setting seed for reproducability
np.random.seed(1234)  
PYTHONHASHSEED = 0
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation,Conv1D
%matplotlib inline

In [39]:
# read training data 
train_df = pd.read_excel('dataset2-train.xlsx')
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0


In [40]:
train_df.head(10)
train_df.count()

water_ID           15539
cycle              15539
clor               15539
conductivity       15539
dissolvedoxygen    15539
pH                 15539
pressure           15539
turbidity          15539
temperature        15539
corr-DO2           15539
corr-PH            15539
suspend-Turb       15539
corr-TempPh        15539
crack-Pressure     15539
corr-Cond          15539
Fault              15539
dtype: int64

In [41]:
print(train_df.dtypes)

water_ID             int64
cycle                int64
clor               float64
conductivity       float64
dissolvedoxygen    float64
pH                 float64
pressure           float64
turbidity          float64
temperature        float64
corr-DO2             int64
corr-PH              int64
suspend-Turb         int64
corr-TempPh          int64
crack-Pressure       int64
corr-Cond            int64
Fault                int64
dtype: object


In [42]:
# read training data 
test_df = pd.read_excel('dataset2-test.xlsx')
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,4.0522,847.819426,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0
1,1,2,4.0783,816.487198,19.176,6.7011,5.029,7.0839,21.741,0,0,0,0,0,0,0
2,1,3,4.1044,785.15497,18.764,6.6584,5.0,7.0,20.988,0,0,0,0,0,0,0
3,1,4,4.1305,816.487198,19.176,6.7011,5.029,7.0839,20.235,0,0,0,0,0,0,0
4,1,5,4.1566,785.15497,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0


In [43]:
test_df.head()
test_df.count()

water_ID           2947
cycle              2947
clor               2947
conductivity       2947
dissolvedoxygen    2947
pH                 2947
pressure           2947
turbidity          2947
temperature        2947
corr-DO2           2947
corr-PH            2947
suspend-Turb       2947
corr-TempPh        2947
crack-Pressure     2947
corr-Cond          2947
Fault              2947
dtype: int64

In [44]:
# read ground truth data
truth_df = pd.read_excel('dataset2-truth.xlsx', header=None)
truth_df.head()

Unnamed: 0,0
0,19
1,69
2,12
3,487
4,5


In [45]:
truth_df.count()

0    36
dtype: int64

In [46]:
train_df = train_df.sort_values(['water_ID','cycle'])
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0


In [47]:
# Data Labeling - generate column RUL
rul = pd.DataFrame(train_df.groupby('water_ID')['cycle'].max()).reset_index()
rul.columns = ['water_ID', 'max']
train_df = train_df.merge(rul, on=['water_ID'], how='left')
train_df['RUL'] = train_df['max'] - train_df['cycle']
train_df.drop('max', axis=1, inplace=True)
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0,323
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0,322
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0,321
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0,320
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0,319


In [48]:
# generate label columns for training data
w1 = 40
w0 = 20
train_df['label1'] = np.where(train_df['RUL'] <= w1, 1, 0 )
train_df['label2'] = train_df['label1']
train_df.loc[train_df['RUL'] <= w0, 'label2'] = 2
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL,label1,label2
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0,323,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0,322,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0,321,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0,320,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0,319,0,0


In [49]:
# MinMax normalization
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(['water_ID','cycle','RUL','label1','label2'])
min_max_scaler = preprocessing.MinMaxScaler()
norm_train_df = pd.DataFrame(min_max_scaler.fit_transform(train_df[cols_normalize]), 
                             columns=cols_normalize, 
                             index=train_df.index)
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
train_df = join_df.reindex(columns = train_df.columns)
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL,label1,label2,cycle_norm
0,1,1,0.397923,0.298017,0.336151,0.53831,0.579542,0.48098,0.377917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,323,0,0,0.0
1,1,2,0.400519,0.310529,0.355935,0.530648,0.587946,0.48661,0.36606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,322,0,0,0.000775
2,1,3,0.397923,0.298017,0.336151,0.53831,0.596349,0.49224,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,321,0,0,0.001549
3,1,4,0.400519,0.310529,0.316366,0.530648,0.604752,0.497869,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,320,0,0,0.002324
4,1,5,0.397923,0.298017,0.336151,0.53831,0.596349,0.503499,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,319,0,0,0.003098


In [50]:
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = pd.DataFrame(min_max_scaler.transform(test_df[cols_normalize]), 
                            columns=cols_normalize, 
                            index=test_df.index)
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
test_df = test_join_df.reindex(columns = test_df.columns)
test_df = test_df.reset_index(drop=True)
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,cycle_norm
0,1,1,0.405712,0.335554,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.408309,0.323041,0.940645,0.492338,0.596349,0.48098,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000775
2,1,3,0.410905,0.310529,0.920861,0.484676,0.587946,0.47535,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549
3,1,4,0.413502,0.323041,0.940645,0.492338,0.596349,0.48098,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002324
4,1,5,0.416098,0.310529,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003098


In [51]:
# generate column max for test data
rul = pd.DataFrame(test_df.groupby('water_ID')['cycle'].max()).reset_index()
rul.columns = ['water_ID', 'max']
truth_df.columns = ['more']
truth_df['water_ID'] = truth_df.index + 1
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop('more', axis=1, inplace=True)

In [52]:
truth_df.head()

Unnamed: 0,water_ID,max
0,1,52
1,2,132
2,3,98
3,4,597
4,5,63


In [53]:
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,cycle_norm
0,1,1,0.405712,0.335554,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.408309,0.323041,0.940645,0.492338,0.596349,0.48098,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000775
2,1,3,0.410905,0.310529,0.920861,0.484676,0.587946,0.47535,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549
3,1,4,0.413502,0.323041,0.940645,0.492338,0.596349,0.48098,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002324
4,1,5,0.416098,0.310529,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003098


In [54]:
rul.head()

Unnamed: 0,water_ID,max
0,1,33
1,2,63
2,3,86
3,4,110
4,5,58


In [55]:
# generate RUL for test data
test_df = test_df.merge(truth_df, on=['water_ID'], how='left')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df.drop('max', axis=1, inplace=True)

In [56]:
# generate label columns w0 and w1 for test data
test_df['label1'] = np.where(test_df['RUL'] <= w1, 1, 0 )
test_df['label2'] = test_df['label1']
test_df.loc[test_df['RUL'] <= w0, 'label2'] = 2
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,cycle_norm,RUL,label1,label2
0,1,1,0.405712,0.335554,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51,0,0
1,1,2,0.408309,0.323041,0.940645,0.492338,0.596349,0.48098,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000775,50,0,0
2,1,3,0.410905,0.310529,0.920861,0.484676,0.587946,0.47535,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549,49,0,0
3,1,4,0.413502,0.323041,0.940645,0.492338,0.596349,0.48098,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002324,48,0,0
4,1,5,0.416098,0.310529,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003098,47,0,0


In [57]:
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL,label1,label2,cycle_norm
0,1,1,0.397923,0.298017,0.336151,0.53831,0.579542,0.48098,0.377917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,323,0,0,0.0
1,1,2,0.400519,0.310529,0.355935,0.530648,0.587946,0.48661,0.36606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,322,0,0,0.000775
2,1,3,0.397923,0.298017,0.336151,0.53831,0.596349,0.49224,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,321,0,0,0.001549
3,1,4,0.400519,0.310529,0.316366,0.530648,0.604752,0.497869,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,320,0,0,0.002324
4,1,5,0.397923,0.298017,0.336151,0.53831,0.596349,0.503499,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,319,0,0,0.003098


In [58]:
# function to reshape features into (samples, time steps, features) 
def gen_sequence(id_df, seq_length, seq_cols):
    """ Only sequences that meet the window-length are considered, no padding is used. This means for testing
    we need to drop those which are below the window-length. An alternative would be to pad sequences so that
    we can use shorter ones """
    data_array = id_df[seq_cols].values
    num_elements = data_array.shape[0]
    for start, stop in zip(range(0, num_elements-seq_length), range(seq_length, num_elements)):
        yield data_array[start:stop, :]

In [59]:
# pick the feature columns 
sensor_cols = ['clor', 'conductivity', 'dissolvedoxygen', 'pH', 'pressure', 'turbidity', 'temperature']
sequence_cols = ['cycle_norm']
sequence_cols.extend(sensor_cols)

In [65]:
# generator for the sequences
seq_gen = (list(gen_sequence(train_df[train_df['water_ID']==id], sequence_length, sequence_cols)) 
           for id in train_df['water_ID'].unique())

In [66]:
# pick a large window size of 5 cycles
sequence_length = 8

In [67]:
# generate sequences and convert to numpy array
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
seq_array.shape

(14675, 8, 8)

In [68]:
# function to generate labels
def gen_labels(id_df, seq_length, label):
    data_array = id_df[label].values
    num_elements = data_array.shape[0]
    return data_array[seq_length:num_elements, :]

In [69]:
# generate labels
label_gen = [gen_labels(train_df[train_df['water_ID']==id], sequence_length, ['label1']) 
             for id in train_df['water_ID'].unique()]
label_array = np.concatenate(label_gen).astype(np.float32)
label_array.shape

(14675, 1)

In [70]:
from keras import backend as K

def f1_score(y_true, y_pred):
    precision = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) / (K.sum(K.round(K.clip(y_pred, 0, 1))) + K.epsilon())
    recall = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) / (K.sum(K.round(K.clip(y_true, 0, 1))) + K.epsilon())
    f1_score = 2 * precision * recall / (precision + recall + K.epsilon())
    return f1_score

In [71]:
from keras.regularizers import L1
from keras.layers import MaxPooling1D
from keras.optimizers import Adam
from keras.layers import GRU
from keras import metrics

nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

model = Sequential()
model.add(Conv1D(32, 3, input_shape=(sequence_length, nb_features)))
model.add(MaxPooling1D(pool_size=2))
model.add(GRU(units=512, return_sequences=True))
model.add(LSTM(units=512, return_sequences=True))
model.add(Dropout(0.05))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.05))
model.add(LSTM(units=40, return_sequences=False, kernel_regularizer=L1(0.01)))
model.add(Dropout(0.05))
model.add(Dense(units=nb_out, activation='relu'))

lr = 0.0001
optimizer = Adam(lr=lr)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', metrics.Precision(), metrics.Recall(), f1_score])


  super().__init__(name, **kwargs)


In [72]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 6, 32)             800       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 3, 32)            0         
 )                                                               
                                                                 
 gru (GRU)                   (None, 3, 512)            838656    
                                                                 
 lstm (LSTM)                 (None, 3, 512)            2099200   
                                                                 
 dropout (Dropout)           (None, 3, 512)            0         
                                                                 
 lstm_1 (LSTM)               (None, 3, 128)            328192    
                                                        

In [73]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

STAMP = 'predictive_binary'
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=10)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

predictive_binary


In [74]:
%%time
import time

# fit the network
start = time.time()
model.fit(seq_array, label_array, epochs=200, batch_size=32, validation_split=0.1, verbose=1,callbacks=[early_stopping, model_checkpoint])
end_train = time.time()

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Wall time: 14min 10s


In [34]:
# training metrics
scores = model.evaluate(seq_array, label_array, verbose=1, batch_size=32)
print('Accuracy: {}'.format(scores[1]))


Accuracy: 0.7813287973403931


In [35]:
pred_test_start = time.time()
y_pred = model.predict(seq_array,verbose=1, batch_size=32)
pred_test_stop = time.time()
y_true = label_array
print('Confusion matrix\n- x-axis is true labels.\n- y-axis is predicted labels')
cm = confusion_matrix(y_true, y_pred.round())
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[9384,  863],
       [2346, 2082]], dtype=int64)

In [36]:
# compute precision and recall
precision = precision_score(y_true, y_pred.round())
recall = recall_score(y_true, y_pred.round())
f1_test = 2 * (precision * recall) / (precision + recall)
print( 'precision = ', precision, '\n', 'recall = ', recall, '\n', 'F1-score: = ', f1_test)

precision =  0.7069609507640068 
 recall =  0.470189701897019 
 F1-score: =  0.5647633256476332


In [37]:
seq_array_test_last = [test_df[test_df['id']==id][sequence_cols].values[-sequence_length:] 
                       for id in test_df['id'].unique() if len(test_df[test_df['id']==id]) >= sequence_length]

seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
seq_array_test_last.shape

(36, 8, 8)

In [38]:
y_mask = [len(test_df[test_df['id']==id]) >= sequence_length for id in test_df['id'].unique()]


In [39]:
label_array_test_last = test_df.groupby('id')['label1'].nth(-1)[y_mask].values
label_array_test_last = label_array_test_last.reshape(label_array_test_last.shape[0],1).astype(np.float32)
label_array_test_last.shape

(36, 1)

In [40]:
print(seq_array_test_last.shape)
print(label_array_test_last.shape)

(36, 8, 8)
(36, 1)


In [41]:
# test metrics
scores_test = model.evaluate(seq_array_test_last, label_array_test_last, verbose=2)
print('Accuracy: {}'.format(scores_test[1]))

2/2 - 0s - loss: 3.0770 - accuracy: 0.6111 - precision: 0.8333 - recall: 0.2778 - f1_score: 0.2174 - 72ms/epoch - 36ms/step
Accuracy: 0.6111111044883728


In [42]:
# make predictions and compute confusion matrix
pred_last_start = time.time()
y_pred_test = model.predict(seq_array_test_last)
pred_last_stop = time.time()
y_true_test = label_array_test_last
print('Confusion matrix\n- x-axis is true labels.\n- y-axis is predicted labels')
cm = confusion_matrix(y_true_test, y_pred_test.round())
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[17,  1],
       [13,  5]], dtype=int64)

In [43]:
# precision_test = precision_score(y_true_test, y_pred_test.round())
#recall_test = recall_score(y_true_test, y_pred_test.round())
#f1_test = 2 * (precision_test * recall_test) / (precision_test + recall_test)
#print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

In [44]:
results_df = pd.DataFrame([[scores_test[1], scores_test[2], scores_test[3], scores_test[4], end_train-start, pred_test_stop-pred_test_start, pred_last_stop-pred_last_start],
                          ],
                         columns = ['Accuracy', 'Precision', 'Recall', 'F1Score','Training time',
                                          'Prediction time','Prediction Last Row'],
                         index = ['CNNLSTM'])
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1Score,Training time,Prediction time,Prediction Last Row
CNNLSTM,0.611111,0.833333,0.277778,0.217391,873.604495,16.573858,0.137636
