In [199]:
import keras

In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Setting seed for reproducibility
import os

np.random.seed(1234)
# A bare `PYTHONHASHSEED = 0` only binds a Python name; the interpreter reads
# the setting from the process environment, so export it there as well.
# NOTE: the hash seed of the already-running kernel is fixed at start-up, so
# this only reaches child processes; set the variable before launching
# Jupyter to affect this kernel too.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation,Conv1D
%matplotlib inline

In [201]:
# Load the training workbook into a DataFrame and preview its first five rows.
train_df = pd.read_excel(io='dataset2-train.xlsx')
train_df.head(n=5)

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0


In [202]:
# read test data (the file loaded here is the test workbook, not the training one)
test_df = pd.read_excel('dataset2-test.xlsx')
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,4.0522,847.819426,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0
1,1,2,4.0783,816.487198,19.176,6.7011,5.029,7.0839,21.741,0,0,0,0,0,0,0
2,1,3,4.1044,785.15497,18.764,6.6584,5.0,7.0,20.988,0,0,0,0,0,0,0
3,1,4,4.1305,816.487198,19.176,6.7011,5.029,7.0839,20.235,0,0,0,0,0,0,0
4,1,5,4.1566,785.15497,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0


In [203]:
# Load the ground-truth workbook; header=None makes pandas number the columns
# instead of consuming the first row as a header.
truth_df = pd.read_excel(io='dataset2-truth.xlsx', header=None)
truth_df.head(n=5)

Unnamed: 0,0
0,19
1,69
2,12
3,487
4,5


In [204]:
# Order the training rows by unit and then by cycle within each unit.
train_df = train_df.sort_values(by=['water_ID', 'cycle'])
train_df.head(n=5)

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0


In [205]:
# Data Labeling - generate column RUL
# RUL = (largest cycle recorded for the unit) - (cycle of the current row).
rul = (
    train_df.groupby('water_ID')['cycle']
    .max()
    .rename('max')
    .reset_index()
)
train_df = train_df.merge(rul, how='left', on='water_ID')
train_df['RUL'] = train_df['max'] - train_df['cycle']
# 'max' was only a helper column for the subtraction above.
train_df = train_df.drop(columns='max')
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0,323
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0,322
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0,321
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0,320
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0,319


In [206]:
# generate label columns for training data
# label1: 1 when RUL <= w1, else 0.
# label2: same as label1, except rows with RUL <= w0 are marked 2.
w1 = 40
w0 = 20
train_df['label1'] = np.where(train_df['RUL'] <= w1, 1, 0)
train_df['label2'] = np.where(train_df['RUL'] <= w0, 2, train_df['label1'])
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL,label1,label2
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0,323,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0,322,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0,321,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0,320,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0,319,0,0


In [207]:
# MinMax normalization
# Scale every column except the identifiers and targets; 'cycle' is copied to
# 'cycle_norm' first so a scaled copy exists next to the raw counter.
train_df['cycle_norm'] = train_df['cycle']
cols_normalize = train_df.columns.difference(
    ['water_ID', 'cycle', 'RUL', 'label1', 'label2']
)
min_max_scaler = preprocessing.MinMaxScaler()
norm_train_df = pd.DataFrame(
    min_max_scaler.fit_transform(train_df[cols_normalize]),
    index=train_df.index,
    columns=cols_normalize,
)
# Re-attach the scaled columns to the unscaled ones, then restore the
# original column order.
join_df = train_df[train_df.columns.difference(cols_normalize)].join(norm_train_df)
train_df = join_df.reindex(columns=train_df.columns)
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL,label1,label2,cycle_norm
0,1,1,0.397923,0.298017,0.336151,0.53831,0.579542,0.48098,0.377917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,323,0,0,0.0
1,1,2,0.400519,0.310529,0.355935,0.530648,0.587946,0.48661,0.36606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,322,0,0,0.000775
2,1,3,0.397923,0.298017,0.336151,0.53831,0.596349,0.49224,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,321,0,0,0.001549
3,1,4,0.400519,0.310529,0.316366,0.530648,0.604752,0.497869,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,320,0,0,0.002324
4,1,5,0.397923,0.298017,0.336151,0.53831,0.596349,0.503499,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,319,0,0,0.003098


In [208]:
# Apply the scaler fitted on the training data to the test data (transform
# only, no re-fit), mirroring the training normalization step.
test_df['cycle_norm'] = test_df['cycle']
norm_test_df = pd.DataFrame(
    min_max_scaler.transform(test_df[cols_normalize]),
    index=test_df.index,
    columns=cols_normalize,
)
test_join_df = test_df[test_df.columns.difference(cols_normalize)].join(norm_test_df)
# Restore the original column order and renumber the rows from zero.
test_df = test_join_df.reindex(columns=test_df.columns).reset_index(drop=True)
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,cycle_norm
0,1,1,0.405712,0.335554,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.408309,0.323041,0.940645,0.492338,0.596349,0.48098,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000775
2,1,3,0.410905,0.310529,0.920861,0.484676,0.587946,0.47535,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549
3,1,4,0.413502,0.323041,0.940645,0.492338,0.596349,0.48098,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002324
4,1,5,0.416098,0.310529,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003098


In [209]:
# generate column max for test data
# rul: largest observed cycle per unit in the test set.
rul = (
    test_df.groupby('water_ID')['cycle']
    .max()
    .rename('max')
    .reset_index()
)
# The single ground-truth column is renamed 'more' and added to the observed
# maximum. The addition aligns rul and truth_df on their row index.
# NOTE(review): assumes water_ID values run 1..N in the same order as the
# ground-truth rows - confirm against the data files.
truth_df.columns = ['more']
truth_df['water_ID'] = truth_df.index + 1
truth_df['max'] = rul['max'] + truth_df['more']
truth_df.drop(columns='more', inplace=True)

In [210]:
# Preview the per-unit maximum cycle derived from the ground truth.
truth_df.head(n=5)

Unnamed: 0,water_ID,max
0,1,52
1,2,132
2,3,98
3,4,597
4,5,63


In [211]:
# Preview the normalized test frame.
test_df.head(n=5)

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,cycle_norm
0,1,1,0.405712,0.335554,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,0.408309,0.323041,0.940645,0.492338,0.596349,0.48098,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000775
2,1,3,0.410905,0.310529,0.920861,0.484676,0.587946,0.47535,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549
3,1,4,0.413502,0.323041,0.940645,0.492338,0.596349,0.48098,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002324
4,1,5,0.416098,0.310529,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003098


In [212]:
# Preview the largest observed cycle per unit in the test set.
rul.head(n=5)

Unnamed: 0,water_ID,max
0,1,33
1,2,63
2,3,86
3,4,110
4,5,58


In [213]:
# generate RUL for test data
# Bring in each unit's 'max' from truth_df, subtract the current cycle,
# then discard the helper column.
test_df = test_df.merge(truth_df, how='left', on='water_ID')
test_df['RUL'] = test_df['max'] - test_df['cycle']
test_df = test_df.drop(columns='max')

In [214]:
# generate label columns w0 and w1 for test data
# Same thresholds as the training labels: label1 flags RUL <= w1, and label2
# additionally marks RUL <= w0 with 2.
test_df['label1'] = np.where(test_df['RUL'] <= w1, 1, 0)
test_df['label2'] = np.where(test_df['RUL'] <= w0, 2, test_df['label1'])
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,cycle_norm,RUL,label1,label2
0,1,1,0.405712,0.335554,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51,0,0
1,1,2,0.408309,0.323041,0.940645,0.492338,0.596349,0.48098,0.354203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000775,50,0,0
2,1,3,0.410905,0.310529,0.920861,0.484676,0.587946,0.47535,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549,49,0,0
3,1,4,0.413502,0.323041,0.940645,0.492338,0.596349,0.48098,0.330488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002324,48,0,0
4,1,5,0.416098,0.310529,0.96043,0.5,0.587946,0.48661,0.342346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003098,47,0,0


In [215]:
# pick the feature columns
# sequence_cols is the model input: the normalized cycle counter followed by
# the sensor readings.
sensor_cols = [
    'clor',
    'conductivity',
    'dissolvedoxygen',
    'pH',
    'pressure',
    'turbidity',
    'temperature',
]
sequence_cols = ['cycle_norm', *sensor_cols]

In [185]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Split features / binary label into train and validation parts. With
# shuffle=False the rows keep their order, so the validation part is the
# trailing 5% of train_df.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_df[sequence_cols],
    train_df['label1'],
    test_size=0.05,
    shuffle=False,
    random_state=42,
)

# Report the split sizes and the number of positive labels in each part.
for caption, value in (
    ("Train_shape: ", X_train.shape),
    ("Val_shape: ", X_val.shape),
    ("No of positives in train: ", Y_train.sum()),
    ("No of positives in val: ", Y_val.sum()),
):
    print(caption + str(value))

Train_shape: (14762, 8)
Val_shape: (777, 8)
No of positives in train: 4223
No of positives in val: 205


In [186]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import time



# Fit a depth-limited decision tree on the training split and time the fit.
print('Start training...')
start = time.time()
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# For DecisionTreeClassifier it meant sqrt(n_features), so 'sqrt' keeps the
# same behaviour while also running on current releases.
clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=8,
                             max_features='sqrt', random_state=42)
clf.fit(X_train, Y_train)
end_train = time.time()

# Accuracy on the held-out validation rows.
print ("Validation Accuracy: "+str(accuracy_score(Y_val, clf.predict(X_val))))

# evaluate the model
#log_scores("Decision Tree", y_test, y_predictions)

Start training...
Validation Accuracy: 0.6061776061776062


In [187]:
#print("Decision Tree\n" + classification_report(y_test, y_predictions))

In [188]:
# training metrics
# Scores the fitted tree on every row of train_df, which includes the rows
# that were held out as the validation split.
train_pred_raw = clf.predict(train_df[sequence_cols])
pred_train = np.where(train_pred_raw > 0.5, 1, 0)
train_accuracy = accuracy_score(train_df['label1'], pred_train)
print('Accurracy: {}'.format(train_accuracy))

Accurracy: 0.8247634982946136


In [189]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predicted labels on the columns; the previous legend stated the opposite.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(train_df['label1'], pred_train)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[10149,   962],
       [ 1761,  2667]], dtype=int64)

In [190]:
# Predict label1 for every test row, timing only the predict() call.
pred_test_start = time.time()
test_pred_raw = clf.predict(test_df[sequence_cols])
pred_test_stop = time.time()
pred_test = np.where(test_pred_raw > 0.5, 1, 0)

test_accuracy = accuracy_score(test_df['label1'], pred_test)
print('Accuracy: {}'.format(test_accuracy))

Accuracy: 0.8069222938581608


In [191]:
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predicted labels on the columns; the previous legend stated the opposite.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(test_df['label1'], pred_test)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[2237,  198],
       [ 371,  141]], dtype=int64)

In [192]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall and F1 of the per-row test predictions against label1.
# assuming pred_test and test_df['label1'] are defined
y_true_test = test_df['label1']
precision = precision_score(y_true_test, pred_test)
recall = recall_score(y_true_test, pred_test)
f1 = f1_score(y_true_test, pred_test)

for template, score in (
    ('Precision: {:.2f}', precision),
    ('Recall: {:.2f}', recall),
    ('F1 score: {:.2f}', f1),
):
    print(template.format(score))


Precision: 0.42
Recall: 0.28
F1 score: 0.33


In [193]:
# label1 of the last recorded row of every test unit, as an (n_units, 1)
# float32 column. The frame has no 'id' column (the unit key is 'water_ID'),
# so grouping by 'id' raised KeyError on a fresh run.
# sort=False keeps units in first-appearance order, the same order that
# test_df['water_ID'].unique() yields.
label_array_test_last = (
    test_df.groupby('water_ID', sort=False)['label1'].last().values
)
label_array_test_last = label_array_test_last.reshape(-1, 1).astype(np.float32)
label_array_test_last.shape

(36, 1)

In [194]:
# Feature vector of the last recorded row of every test unit, in
# first-appearance order of the units. The unit key is 'water_ID'; the frame
# has no 'id' column, so the previous lookup raised KeyError on a fresh run.
# The loop variable is also renamed so it no longer shadows the builtin `id`.
seq_array_test_last = [
    test_df.loc[test_df['water_ID'] == unit_id, sequence_cols].values[-1]
    for unit_id in test_df['water_ID'].unique()
]

seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)
seq_array_test_last.shape

(36, 8)

In [195]:
# Predict on the last row of each test unit, timing only the predict() call,
# then score the predictions against the matching last-row labels.
pred_last_start = time.time()
last_pred_raw = clf.predict(seq_array_test_last)
pred_last_stop = time.time()
pred_test_last = np.where(last_pred_raw > 0.5, 1, 0)
acc = accuracy_score(label_array_test_last, pred_test_last)
print('Accuracy: {}'.format(acc))

Accuracy: 0.6944444444444444




In [196]:
# compute the confusion matrix for the last-row predictions
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predicted labels on the columns; the previous legend stated the opposite.
print('Confusion matrix\n- rows are true labels.\n- columns are predicted labels')
cm = confusion_matrix(label_array_test_last, pred_test_last)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[18,  0],
       [11,  7]], dtype=int64)

In [197]:
# compute precision and recall
precision_test = precision_score(label_array_test_last, pred_test_last)
recall_test = recall_score(label_array_test_last, pred_test_last)
# F1 is the harmonic mean of precision and recall. When both are zero the
# denominator vanishes, so report 0.0 instead of dividing by zero.
pr_sum = precision_test + recall_test
f1_test = 2 * (precision_test * recall_test) / pr_sum if pr_sum else 0.0
print( 'Precision: ', precision_test, '\n', 'Recall: ', recall_test,'\n', 'F1-score:', f1_test )

Precision:  1.0 
 Recall:  0.3888888888888889 
 F1-score: 0.56


In [198]:
# One-row summary of the last-row metrics and the wall-clock timings
# (differences of the time.time() stamps taken around fit and predict).
results_df = pd.DataFrame(
    {
        'Accuracy': [acc],
        'Precision': [precision_test],
        'Recall': [recall_test],
        'F1-score': [f1_test],
        'Training time': [end_train - start],
        'Prediction time': [pred_test_stop - pred_test_start],
        'Prediction Last Row': [pred_last_stop - pred_last_start],
    },
    index=['DecisionTree'],
)
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score,Training time,Prediction time,Prediction Last Row
DecisionTree,0.694444,1.0,0.388889,0.56,0.020518,0.001392,0.000996
