In [48]:
import keras

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator for reproducibility.
np.random.seed(1234)  
# NOTE(review): this binds an ordinary Python variable named PYTHONHASHSEED; it
# does not set the PYTHONHASHSEED environment variable, and nothing visible in
# this notebook reads the variable — confirm whether hash-seed control was intended.
PYTHONHASHSEED = 0
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation,Conv1D
%matplotlib inline

In [77]:
# Read the training data from an Excel workbook (path is relative to the working directory).
train_df = pd.read_excel('dataset2-train.xlsx')

In [78]:
train_df.head(10)

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0
5,1,6,4.0,722.490515,6.176,7.0,5.0,7.3356,19.482,0,0,0,0,0,0,0
6,1,7,4.0261,691.158287,5.764,7.0427,5.029,7.2517,20.235,0,0,0,0,0,0,0
7,1,8,4.0,722.490515,6.176,7.0854,5.058,7.1678,19.482,0,0,0,0,0,0,0
8,1,9,4.0261,691.158287,5.764,7.0427,5.029,7.0839,18.729,0,0,0,0,0,0,0
9,1,10,4.0,722.490515,5.352,7.0854,5.058,7.1678,17.976,0,0,0,0,0,0,0


In [79]:
# Read the test data from its own Excel workbook (path is relative to the working directory).
test_df = pd.read_excel('dataset2-test.xlsx')

In [80]:
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,4.0522,847.819426,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0
1,1,2,4.0783,816.487198,19.176,6.7011,5.029,7.0839,21.741,0,0,0,0,0,0,0
2,1,3,4.1044,785.15497,18.764,6.6584,5.0,7.0,20.988,0,0,0,0,0,0,0
3,1,4,4.1305,816.487198,19.176,6.7011,5.029,7.0839,20.235,0,0,0,0,0,0,0
4,1,5,4.1566,785.15497,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0


In [81]:
# Read the ground-truth workbook. header=None is passed, so the sheet is read
# without a header row and its columns get integer names (0, 1, ...).
truth_df = pd.read_excel('dataset2-truth.xlsx', header=None)
# Preview the first rows.
truth_df.head()

Unnamed: 0,0
0,19
1,69
2,12
3,487
4,5


In [82]:
# Order the training rows by unit first and by cycle within each unit.
train_df = train_df.sort_values(by=['water_ID', 'cycle'])
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0


In [83]:
# Data labeling - derive the RUL column for the training frame.
# `rul` holds, per water_ID, the largest cycle number observed for that unit.
rul = (
    train_df.groupby('water_ID')['cycle']
    .max()
    .rename('max')
    .reset_index()
)
# Attach each unit's maximum cycle to all of its rows, compute
# RUL = max cycle - current cycle, then discard the helper column.
train_df = (
    train_df.merge(rul, how='left', on='water_ID')
    .assign(RUL=lambda frame: frame['max'] - frame['cycle'])
    .drop(columns='max')
)
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0,323
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0,322
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0,321
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0,320
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0,319


In [84]:
# Build the label columns for the training frame from two RUL thresholds.
#   label1: 1 when RUL <= w1, otherwise 0
#   label2: starts as label1, then becomes 2 wherever RUL <= w0
w1, w0 = 40, 20
train_df['label1'] = np.where(train_df['RUL'].le(w1), 1, 0)
train_df['label2'] = train_df['label1']
train_df.loc[train_df['RUL'].le(w0), 'label2'] = 2
train_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL,label1,label2
0,1,1,3.9739,753.822743,6.588,6.9573,4.971,7.0839,23.247,0,0,0,0,0,0,0,323,0,0
1,1,2,4.0,785.15497,7.0,6.9146,5.0,7.1678,22.494,0,0,0,0,0,0,0,322,0,0
2,1,3,3.9739,753.822743,6.588,6.9573,5.029,7.2517,21.741,0,0,0,0,0,0,0,321,0,0
3,1,4,4.0,785.15497,6.176,6.9146,5.058,7.3356,20.988,0,0,0,0,0,0,0,320,0,0
4,1,5,3.9739,753.822743,6.588,6.9573,5.029,7.4195,20.235,0,0,0,0,0,0,0,319,0,0


In [85]:
# Build the per-unit final cycle ("max") for the test data.
# `rul` holds the largest cycle observed in the test frame for each water_ID.
rul = (
    test_df.groupby('water_ID')['cycle']
    .max()
    .rename('max')
    .reset_index()
)
# The single ground-truth column is named 'more' and added to the observed maximum.
truth_df.columns = ['more']
# NOTE(review): water_ID is derived from the row position (index + 1) and the
# sum below aligns `rul` and `truth_df` by index, so this assumes row i of the
# truth sheet belongs to water_ID i + 1 — confirm against the data source.
truth_df = truth_df.assign(
    water_ID=truth_df.index + 1,
    max=rul['max'] + truth_df['more'],
).drop(columns='more')

In [86]:
# Derive RUL for the test frame: attach each unit's final cycle from truth_df,
# subtract the current cycle, and discard the helper column.
test_df = (
    pd.merge(test_df, truth_df, how='left', on='water_ID')
    .assign(RUL=lambda frame: frame['max'] - frame['cycle'])
    .drop(columns='max')
)
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL
0,1,1,4.0522,847.819426,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0,51
1,1,2,4.0783,816.487198,19.176,6.7011,5.029,7.0839,21.741,0,0,0,0,0,0,0,50
2,1,3,4.1044,785.15497,18.764,6.6584,5.0,7.0,20.988,0,0,0,0,0,0,0,49
3,1,4,4.1305,816.487198,19.176,6.7011,5.029,7.0839,20.235,0,0,0,0,0,0,0,48
4,1,5,4.1566,785.15497,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0,47


In [87]:
# Build the label columns for the test frame using the thresholds w1 and w0.
#   label1: 1 when RUL <= w1, otherwise 0
#   label2: starts as label1, then becomes 2 wherever RUL <= w0
test_df['label1'] = np.where(test_df['RUL'].le(w1), 1, 0)
test_df['label2'] = test_df['label1']
test_df.loc[test_df['RUL'].le(w0), 'label2'] = 2
test_df.head()

Unnamed: 0,water_ID,cycle,clor,conductivity,dissolvedoxygen,pH,pressure,turbidity,temperature,corr-DO2,corr-PH,suspend-Turb,corr-TempPh,crack-Pressure,corr-Cond,Fault,RUL,label1,label2
0,1,1,4.0522,847.819426,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0,51,0,0
1,1,2,4.0783,816.487198,19.176,6.7011,5.029,7.0839,21.741,0,0,0,0,0,0,0,50,0,0
2,1,3,4.1044,785.15497,18.764,6.6584,5.0,7.0,20.988,0,0,0,0,0,0,0,49,0,0
3,1,4,4.1305,816.487198,19.176,6.7011,5.029,7.0839,20.235,0,0,0,0,0,0,0,48,0,0
4,1,5,4.1566,785.15497,19.588,6.7438,5.0,7.1678,20.988,0,0,0,0,0,0,0,47,0,0


In [88]:
# Feature columns: the seven sensor readings, preceded by the cycle counter.
sensor_cols = [
    'clor',
    'conductivity',
    'dissolvedoxygen',
    'pH',
    'pressure',
    'turbidity',
    'temperature',
]
sequence_cols = ['cycle', *sensor_cols]

In [89]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 10% of the training rows for validation, using label1 as the target.
# NOTE(review): shuffle=False keeps the rows in their existing order, so
# random_state presumably has no effect here — confirm against scikit-learn docs.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_df[sequence_cols],
    train_df['label1'],
    test_size=0.1,
    shuffle=False,
    random_state=42,
)

# Report the split sizes and how many positive (label1 == 1) rows each part has.
print(f"Train_shape: {X_train.shape}")
print(f"Val_shape: {X_val.shape}")
print(f"No of positives in train: {Y_train.sum()}")
print(f"No of positives in val: {Y_val.sum()}")

Train_shape: (13985, 8)
Val_shape: (1554, 8)
No of positives in train: 4100
No of positives in val: 328


In [90]:
import xgboost as xgb
import time

# Booster configuration: tree booster, binary logistic objective, AUC as the
# evaluation metric.
# NOTE(review): the training log of this cell shows train-auc near 1.0 while
# valid-auc stays around 0.66-0.68; eta and max_depth may deserve tuning.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.000001,
    'eval_metric': 'auc',
    'max_depth': 30,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    # 'min_child_weight': 5,
    # The former `silent` flag is rejected by XGBoost ("Parameters: { "silent" }
    # are not used."); `verbosity` is its replacement, with 0 meaning silent.
    'verbosity': 0,
}

d_train = xgb.DMatrix(X_train, label=Y_train)
d_valid = xgb.DMatrix(X_val, label=Y_val)
# Both sets are evaluated every round; progress is printed every 25 rounds.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Wall-clock timing of the training call only (reused in the results table).
start = time.time()
gbm = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=25, verbose_eval=25)
end_train = time.time()

Parameters: { "silent" } are not used.

[0]	train-auc:0.97608	valid-auc:0.63005




[25]	train-auc:0.99984	valid-auc:0.66419
[50]	train-auc:0.99993	valid-auc:0.68001
[60]	train-auc:0.99995	valid-auc:0.67577


In [64]:
from operator import itemgetter

def create_feature_map(features):
    """Write a feature-map file named ``xgb.fmap`` in the working directory.

    One line is written per feature, made of three tab-separated fields:
    the zero-based position of the feature, its name, and the literal ``q``.
    An existing file of that name is overwritten.

    Args:
        features: iterable of feature names, in model column order.
    """
    # The context manager closes the file even if a write raises.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))

def get_importance(gbm, features):
    """Return ``(feature, score)`` pairs sorted by score, highest first.

    Writes the feature map for ``features`` to ``xgb.fmap`` and then asks the
    booster for its f-scores using that map.

    Args:
        gbm: trained booster exposing ``get_fscore(fmap=...)``.
        features: feature names, in model column order.

    Returns:
        List of ``(name, score)`` tuples in descending score order.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# Show the feature ranking for the trained booster.
print(get_importance(gbm, X_train.columns.tolist()))

[('cycle', 1484.0), ('clor', 1210.0), ('pH', 911.0), ('conductivity', 871.0), ('temperature', 856.0), ('pressure', 695.0), ('turbidity', 694.0), ('dissolvedoxygen', 494.0)]


In [65]:
from sklearn.metrics import accuracy_score
# Training metrics.
# Scores every row of train_df, which includes the rows held out as X_val.
d_trn = xgb.DMatrix(data=train_df[sequence_cols])
pred_train = gbm.predict(d_trn)
# Turn predicted probabilities into hard 0/1 labels at a 0.5 cut-off.
pred_train = np.where(pred_train > 0.5, 1, 0)
print(f"Accuracy: {accuracy_score(train_df['label1'], pred_train)}")

Accuracy: 0.9721989832035524


In [66]:
# Precision, recall and F1 (harmonic mean of the two) on the training frame.
precision_train, recall_train = (
    metric(train_df['label1'], pred_train)
    for metric in (precision_score, recall_score)
)
f1_train = (
    2 * (precision_train * recall_train)
    / (precision_train + recall_train)
)
print('Precision: ', precision_train, '\n', 'Recall: ', recall_train, '\n', 'F1-score:', f1_train)

Precision:  0.9653004191895669 
 Recall:  0.9360885275519422 
 F1-score: 0.9504700756707178


In [67]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score

# confusion_matrix(y_true, y_pred) returns C where C[i, j] counts samples whose
# true label is i and predicted label is j: rows are true labels, columns are
# predictions. The legend below states that orientation.
print('Confusion matrix\n- rows (y-axis) are true labels.\n- columns (x-axis) are predicted labels')
cm = confusion_matrix(train_df['label1'], pred_train)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[10962,   149],
       [  283,  4145]], dtype=int64)

In [68]:
# Score every row of the test frame.
d_test = xgb.DMatrix(data=test_df[sequence_cols])

# Time only the predict call; the elapsed time is reused in the results table.
pred_test_start = time.time()
pred_test = gbm.predict(d_test)
pred_test_stop = time.time()

# Turn predicted probabilities into hard 0/1 labels at a 0.5 cut-off.
pred_test = np.where(pred_test > 0.5, 1, 0)
print(f"Accuracy: {accuracy_score(test_df['label1'], pred_test)}")

Accuracy: 0.8103155751611809


In [69]:
# confusion_matrix(y_true, y_pred) returns C where C[i, j] counts samples whose
# true label is i and predicted label is j: rows are true labels, columns are
# predictions. The legend below states that orientation.
print('Confusion matrix\n- rows (y-axis) are true labels.\n- columns (x-axis) are predicted labels')
cm = confusion_matrix(test_df['label1'], pred_test)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[2265,  170],
       [ 389,  123]], dtype=int64)

In [70]:
# Precision, recall and F1 (harmonic mean of the two) over all test rows.
precision_test, recall_test = (
    metric(test_df['label1'], pred_test)
    for metric in (precision_score, recall_score)
)
f1_test = (
    2 * (precision_test * recall_test)
    / (precision_test + recall_test)
)
print('Precision: ', precision_test, '\n', 'Recall: ', recall_test, '\n', 'F1-score:', f1_test)

Precision:  0.4197952218430034 
 Recall:  0.240234375 
 F1-score: 0.30559006211180123


In [71]:
# label1 of the last row of every water_ID, as a float32 column vector (n_units, 1).
label_array_test_last = (
    test_df.groupby('water_ID')['label1']
    .nth(-1)
    .to_numpy()
    .reshape(-1, 1)
    .astype(np.float32)
)
label_array_test_last.shape

(36, 1)

In [72]:
# Feature values of the last row of every water_ID, as a float32 matrix
# (n_units, n_features).
# A single groupby pass replaces the former per-ID boolean scan of the whole
# frame (which also shadowed the builtin `id`). It uses the same
# groupby('water_ID').nth(-1) selector as label_array_test_last, so feature
# rows and label rows are picked and ordered identically.
seq_array_test_last = (
    test_df.groupby('water_ID')[sequence_cols]
    .nth(-1)
    .to_numpy()
    .astype(np.float32)
)
seq_array_test_last.shape

(36, 8)

In [73]:
# Wrap the last-row features for prediction; the raw array carries no column
# names, so the names of the training DMatrix are reused.
d_test_last = xgb.DMatrix(seq_array_test_last, feature_names=d_trn.feature_names)

# Time only the predict call; the elapsed time is reused in the results table.
pred_last_start = time.time()
pred_test_last = gbm.predict(d_test_last)
pred_last_stop = time.time()

# Turn predicted probabilities into hard 0/1 labels at a 0.5 cut-off.
pred_test_last = np.where(pred_test_last > 0.5, 1, 0)
acc = accuracy_score(label_array_test_last, pred_test_last)
print(f'Accuracy: {acc}')

Accuracy: 0.6666666666666666


In [74]:
# Confusion matrix for the last-row predictions computed earlier.
# confusion_matrix(y_true, y_pred) returns C where C[i, j] counts samples whose
# true label is i and predicted label is j: rows are true labels, columns are
# predictions. The legend below states that orientation.
print('Confusion matrix\n- rows (y-axis) are true labels.\n- columns (x-axis) are predicted labels')
cm = confusion_matrix(label_array_test_last, pred_test_last)
cm

Confusion matrix
- x-axis is true labels.
- y-axis is predicted labels


array([[17,  1],
       [11,  7]], dtype=int64)

In [75]:
# Precision, recall and F1 for the last-row predictions.
# These rebind precision_test / recall_test / f1_test, replacing any values
# computed earlier under the same names.
precision_test, recall_test = (
    metric(label_array_test_last, pred_test_last)
    for metric in (precision_score, recall_score)
)
f1_test = (
    2 * (precision_test * recall_test)
    / (precision_test + recall_test)
)
print('Precision: ', precision_test, '\n', 'Recall: ', recall_test, '\n', 'F1-score:', f1_test)

Precision:  0.875 
 Recall:  0.3888888888888889 
 F1-score: 0.5384615384615385


In [76]:
# One-row summary table for the XGBoost model: quality metrics plus wall-clock
# durations (seconds) of training, full-test prediction and last-row prediction.
results_df = pd.DataFrame(
    {
        'Accuracy': acc,
        'Precision': precision_test,
        'Recall': recall_test,
        'F1-score': f1_test,
        'Training time': end_train - start,
        'Prediction time': pred_test_stop - pred_test_start,
        'Prediction Last Row': pred_last_stop - pred_last_start,
    },
    index=['XGBoost'],
)
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-score,Training time,Prediction time,Prediction Last Row
XGBoost,0.666667,0.875,0.388889,0.538462,0.528304,0.00108,0.0
