## Importing Required Modules

In [1]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

In [2]:
# Samples per training batch; passed to train_data_gen as batch_size.
BATCH_SIZE = 64
# Number of consecutive raw rows collapsed into one feature row;
# passed to read_wave_data as merge_size for both the train and test files.
MERGE_SIZE = 400

## Loading and preprocessing our Parquet Data

In [3]:
# Per-signal metadata; the 'signal_id' and 'target' columns are used by later cells.
# NOTE(review): absolute, machine-specific path. A later cell reads
# 'Dataset/metadata_train.csv' relative to the working directory — consider unifying.
metadata_train = pd.read_csv('/Users/sshubam/Desktop/fsm_project/Dataset/metadata_train.csv')

In [4]:
def read_wave_data(parquet_path,col_nums,end_col_num, merge_size=800):
    """Read a wide parquet file in column chunks and reduce it to peak-to-peak features.

    The columns are read chunk by chunk. Within each chunk, every run of
    ``merge_size`` consecutive rows is replaced by ``(max - min) / 256`` of
    that run, computed per column.

    Args:
        parquet_path: Path of the parquet file; its columns are named by the
            string form of consecutive integers.
        col_nums: Sequence of chunk start columns. Chunk ``i`` covers
            ``[col_nums[i], col_nums[i + 1])``.
        end_col_num: Exclusive end column of the last chunk.
        merge_size: Number of consecutive rows merged into one output row.

    Returns:
        A float16 DataFrame with one column per signal and one row per full
        group of ``merge_size`` input rows.

    Raises:
        ValueError: If ``col_nums`` is empty (previously this surfaced as an
            AttributeError on ``None``).
    """
    if len(col_nums) == 0:
        raise ValueError("col_nums must contain at least one start column")

    # Pair every chunk start with its exclusive end.
    chunk_ends = list(col_nums[1:]) + [end_col_num]
    chunk_diffs = []
    # total= lets the progress bar show real progress instead of "0it".
    for start, end in tqdm(zip(col_nums, chunk_ends), total=len(col_nums)):
        columns = [str(j) for j in range(start,end)]
        tmp_df = pq.read_pandas(parquet_path, columns=columns).to_pandas()
        # Rows past the last full group receive no group id (NaN after index
        # alignment, assuming the default integer index) and are left out by groupby.
        group_id = np.repeat(range(len(tmp_df) // merge_size), merge_size)
        tmp_df['group_id'] = pd.Series(group_id)
        grouped = tmp_df.groupby('group_id')
        chunk_diffs.append((grouped.max() - grouped.min()) / 256)

    # Concatenate once instead of re-copying the accumulated frame on every chunk.
    return pd.concat(chunk_diffs, axis=1).astype('float16')

In [5]:
# Build the training feature table, reading the parquet file in chunks of 500 signals.
train_parquet_path = '/Users/sshubam/Desktop/fsm_project/train.parquet'
# Chunk starts are every 500th signal id; the exclusive end is one past the last id.
col_nums = metadata_train['signal_id'].to_numpy()[::500].tolist()
end_col_num = metadata_train['signal_id'].to_numpy()[-1] + 1
train_diff = read_wave_data(
    train_parquet_path,
    col_nums,
    end_col_num,
    merge_size=MERGE_SIZE,
)
print(train_diff.shape)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, col_num in tqdm(enumerate(col_nums)):


0it [00:00, ?it/s]

(2000, 8712)


In [10]:
def train_data_gen(metadata_train, train_diff, batch_size=128, is_reverse=False):
    """Yield an endless stream of class-balanced training batches.

    Every batch holds ``batch_size // 2`` rows sampled from ``target == 1``
    and ``batch_size // 2`` rows from ``target == 0``, in shuffled order
    (so an odd ``batch_size`` yields ``batch_size - 1`` samples).

    Args:
        metadata_train: DataFrame with ``signal_id`` and ``target`` columns.
        train_diff: DataFrame whose column ``str(signal_id)`` holds that
            signal's feature sequence.
        batch_size: Nominal number of samples per batch.
        is_reverse: When True, every sequence is reversed.

    Yields:
        ``(x_train, y_train)`` where ``x_train`` has shape
        ``(samples, sequence_length, 1)`` and ``y_train`` has shape ``(samples,)``.

    Raises:
        ValueError: From ``DataFrame.sample`` when a class has fewer than
            ``batch_size // 2`` rows (sampling is without replacement).
    """
    # Seeds NumPy's *global* RNG on first iteration. DataFrame.sample (called
    # without random_state) and np.random.shuffle below both draw from it.
    np.random.seed(1)

    # Loop-invariant work, hoisted out of the batch loop: the per-class
    # subsets and a signal_id -> target map (replaces an O(n) boolean-mask
    # lookup per sample). Filtering consumes no random numbers, so the
    # sampled batches are unchanged.
    positives = metadata_train[metadata_train['target']==1]
    negatives = metadata_train[metadata_train['target']==0]
    ids = metadata_train['signal_id'].values
    targets = metadata_train['target'].values
    # Built in reverse so that, should a signal_id repeat, its first row wins —
    # the same row the original ``.values[0]`` lookup returned.
    label_of = dict(zip(ids[::-1], targets[::-1]))

    while True:
        true_sample = positives.sample(batch_size // 2)
        neg_sample = negatives.sample(batch_size // 2)

        sample_signal_id = np.concatenate([true_sample['signal_id'].values,neg_sample['signal_id'].values])
        np.random.shuffle(sample_signal_id)

        x_train = []
        y_train = []
        for signal_id in sample_signal_id:
            diffs = train_diff[str(signal_id)].values.T

            if is_reverse:
                diffs = diffs[::-1]

            # Add a trailing channel axis: (sequence_length,) -> (sequence_length, 1).
            x_train.append(diffs[:, np.newaxis])
            y_train.append(label_of[signal_id])

        yield np.array(x_train), np.array(y_train)

In [7]:
# Hold out 20% of the signals for validation (fixed seed for reproducibility).
# NOTE(review): this rebinds metadata_train to the 80% split, so re-running the
# cell shrinks the training set again; the split is also not stratified on 'target'.
metadata_train, metadata_val = train_test_split(metadata_train, test_size=0.2, random_state=42)
print(metadata_train.shape)
print(metadata_val.shape)

(6969, 4)
(1743, 4)


In [8]:
# Validation tensors: one (sequence_length, 1) array per validation signal.
x_val = np.array([
    train_diff[str(signal_id)].values.T[:, np.newaxis]
    for signal_id in metadata_val['signal_id'].values
])
# The labels are simply the 'target' column in the same row order. This replaces
# a per-signal boolean-mask lookup that made the cell O(n^2).
# NOTE(review): identical as long as signal_id is unique in metadata_val.
y_val = metadata_val['target'].values.copy()
print(x_val.shape)
print(y_val.shape)

(1743, 2000, 1)
(1743,)


In [10]:
from keras.models import Sequential
from keras import layers
import keras.models as models
import keras.backend as K
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

Init Plugin
Init Graph Optimizer
Init Kernel


In [47]:
from sklearn.metrics import confusion_matrix

def mcc(y_true, y_pred):
    """Matthews correlation coefficient for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1), same length as ``y_true``.

    Returns:
        The MCC as a float in [-1, 1]. Returns 0.0 when the coefficient is
        undefined because a row or column of the confusion matrix is empty
        (previously this produced nan).
    """
    # labels=[0, 1] pins a 2x2 matrix even if only one class occurs in the
    # inputs (otherwise the indexing below would fail). scikit-learn lays it
    # out as [[TN, FP], [FN, TP]]. The previous code read cm[0][0] as TP and
    # cm[1][1] as TN; the formula is symmetric under that swap, so valid
    # results are unchanged.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # Python floats avoid int64 overflow in the four-way product.
    tn, fp, fn, tp = (float(count) for count in cm.ravel())
    denominator = ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) ** 0.5
    if denominator == 0:
        return 0.0
    return (tp * tn - fp * fn) / denominator

In [14]:
def matthews_corr_coeff(y_true, y_pred):
    """Batch-wise Matthews correlation coefficient built from Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded, so predictions are
    binarised before counting. ``K.epsilon()`` is added to the denominator
    to avoid division by zero.
    """
    predicted_pos = K.round(K.clip(y_pred, 0, 1))
    actual_pos = K.round(K.clip(y_true, 0, 1))

    predicted_neg = 1 - predicted_pos
    actual_neg = 1 - actual_pos

    # Confusion-matrix counts as sums of elementwise products of the 0/1 masks.
    tp = K.sum(actual_pos * predicted_pos)
    tn = K.sum(actual_neg * predicted_neg)
    fp = K.sum(actual_neg * predicted_pos)
    fn = K.sum(actual_pos * predicted_neg)

    numerator = tp * tn - fp * fn
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + K.epsilon())

In [10]:
# Time steps per input sequence; used as the model's input length.
length_of_sequence = 2000
# Input dropout and recurrent-state dropout passed to the LSTM layer.
drop_out_rate = 0.2
recurrent_dropout = 0.5
# Generator batches drawn per epoch, and the maximum number of epochs.
STEPS_PER_EPOCH = 100
EPOCHS = 50

## Building our CNN-LSTM

In [11]:
# Conv1D feature extractor (three conv + pool stages, then a fourth conv)
# feeding an LSTM and a single sigmoid output unit.
model = Sequential([
    layers.Conv1D(32, 8, padding='same', input_shape=(length_of_sequence, 1), activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(64, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(128, 8, padding='same', activation='relu'),
    layers.MaxPooling1D(2, padding='same'),
    layers.Conv1D(256, 8, padding='same', activation='relu'),
    layers.LSTM(64, dropout=drop_out_rate, recurrent_dropout=recurrent_dropout),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy', matthews_corr_coeff],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 2000, 32)          288       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 1000, 32)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 64)          16448     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 500, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 500, 128)          65664     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 250, 128)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 250, 256)         

In [24]:
# Checkpoint file; resolves to 'lstm_model_weights.best.hdf5', the name loaded
# again by model.load_weights further down.
weight_path="{}_weights.best.hdf5".format('lstm_model')
# Stop once val_loss has not improved for 10 consecutive epochs.
early = EarlyStopping(monitor="val_loss",
                      mode="min",
                      patience=10)
# min_lr was 0.001, which equals the default learning rate Keras uses for the
# string optimizer 'rmsprop' (0.001 per the Keras docs). With the floor at the
# starting value the rate could never drop, so the callback was a no-op.
# A lower floor lets the factor-0.2 reductions take effect.
lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-5)
# Keep only the weights of the epoch with the lowest val_loss.
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min', save_weights_only = True)

callbacks_list = [checkpoint, early, lr]

In [25]:
# Endless generator of balanced batches over the training split; consumed by the training call.
train_gen = train_data_gen(metadata_train, train_diff, batch_size=BATCH_SIZE)

In [None]:
# Model.fit accepts Python generators directly in TensorFlow 2.1+.
# fit_generator is deprecated there and removed in newer Keras releases.
# steps_per_epoch is required because train_gen never terminates.
history = model.fit(
    train_gen,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
    callbacks=callbacks_list,
)

In [42]:
# Restore the weights written to the ModelCheckpoint path during training.
model.load_weights('lstm_model_weights.best.hdf5')

In [15]:
import tensorflow as tf
# Rebinds `model` to a model loaded from the local 'model' path. custom_objects
# maps the custom metric's name to its function so load_model can resolve it.
# NOTE(review): no visible cell saves a model to 'model' — confirm its origin.
model = tf.keras.models.load_model('model', custom_objects={'matthews_corr_coeff':matthews_corr_coeff})

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-08-02 09:12:16.488691: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-02 09:12:16.489323: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [24]:
# Raw model outputs for the validation set; binarised at 0.5 in a later cell.
y_val_pred = model.predict(x_val)

2022-07-15 20:37:39.539906: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-07-15 20:37:39.542992: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-07-15 20:37:39.654784: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [68]:
y_val_pred

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [69]:
# Flatten the predictions to 1-D, then binarise in place at the 0.5 threshold.
# The two masks are disjoint, so the order of the assignments does not matter.
y_val_pred = y_val_pred.flatten()
y_val_pred[y_val_pred < 0.5] = 0
y_val_pred[y_val_pred >= 0.5] = 1
# Number of signals predicted as faulty.
y_val_pred.sum()

136.0

In [70]:
y_val.sum()

102

In [252]:
x_val.dtype

dtype('float16')

In [None]:
# np.array must be called, not subscripted: `np.array[x_val]` raises TypeError.
x_val_csv = np.array(x_val)

In [71]:
# Validation MCC. y_val_pred must already be flattened and binarised
# (see the thresholding cell), since mcc builds a confusion matrix from it.
mcc(y_val,y_val_pred)

0.5927272380944404

In [5]:
import tensorflow as tf

In [73]:
# Paths here are relative to the working directory, unlike the absolute path
# used by the first metadata load.
metadata_test = pd.read_csv('Dataset/metadata_test.csv')
# NOTE(review): this rebinds metadata_train to the full, unsplit CSV, replacing
# the training split produced by train_test_split earlier in the notebook.
metadata_train = pd.read_csv('Dataset/metadata_train.csv')

In [40]:
# Build the test feature table the same way as the training one:
# chunks of 500 signals, MERGE_SIZE rows merged per output row.
test_parquet_path = 'test.parquet'
col_nums = metadata_test['signal_id'].to_numpy()[::500].tolist()
end_col_num = metadata_test['signal_id'].to_numpy()[-1] + 1
test_diff = read_wave_data(
    test_parquet_path,
    col_nums,
    end_col_num,
    merge_size=MERGE_SIZE,
)
print(test_diff.shape)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, col_num in tqdm(enumerate(col_nums)):


0it [00:00, ?it/s]

(2000, 20337)


In [157]:
# One (sequence_length, 1) array per column of test_diff, stacked in column order.
x_test = np.array([
    test_diff[c].values.T[:, np.newaxis]
    for c in test_diff.columns
])
print(x_test.shape)

(20337, 2000, 1)


In [128]:
# Predict on the test set, flatten to 1-D and binarise in place at 0.5.
# Despite its name, y_test holds predictions, not ground-truth labels.
y_test = model.predict(x_test).flatten()
y_test[y_test < 0.5] = 0
y_test[y_test >= 0.5] = 1
print(y_test.sum())

1220.0


In [129]:
x_test.shape

(20337, 2000, 1)

In [130]:
y_test.sum()

1220.0

In [131]:
y_test[0]

0.0

In [141]:
# NOTE(review): x_test is already an ndarray from the cell that builds it,
# so this only makes a redundant copy.
x_test = np.array(x_test)

In [142]:
x_test[0].shape

(2000, 1)

In [247]:
x_test.shape

(20337, 2000, 1)

## Testing a Sample

In [20]:
import random
# random.randint(0, 1743) included 1743 itself, one past the last valid index
# of the 1743-row x_val. randrange excludes the upper bound and tracks the
# actual array length.
n = random.randrange(len(x_val))
print(f"Predicting on Signal number {n} from x_val")
test = x_val[n]
# Add the batch axis the model expects: (2000, 1) -> (1, 2000, 1).
test = np.expand_dims(test, axis = 0)
print(f"Fault: {y_val[n]}")
# Same >= 0.5 rule as the batch thresholding cells (was a strict > here).
prediction = 1 if model.predict(test).item() >= 0.5 else 0
print(f"Prediction: {prediction}")

Predicting on Signal number 1336 from x_val
Fault: 0
Prediction: 0


In [227]:
x_val[0].squeeze().shape

(2000,)

In [216]:
print(np.where(y_val == 1))

(array([  35,   54,  115,  123,  133,  148,  175,  187,  228,  232,  234,
        235,  246,  255,  269,  283,  289,  304,  351,  361,  365,  368,
        376,  393,  407,  434,  438,  447,  459,  497,  514,  530,  542,
        546,  558,  563,  583,  632,  635,  656,  662,  702,  717,  719,
        727,  766,  770,  786,  806,  837,  849,  855,  860,  866,  870,
        896,  905,  912,  939,  958,  965, 1030, 1069, 1100, 1118, 1129,
       1130, 1134, 1138, 1207, 1234, 1237, 1241, 1245, 1263, 1266, 1270,
       1283, 1292, 1301, 1317, 1324, 1331, 1371, 1393, 1423, 1432, 1449,
       1494, 1499, 1500, 1501, 1555, 1594, 1618, 1659, 1667, 1675, 1704,
       1713, 1721, 1736]),)


In [159]:
model.predict(test)

array([[0.09625411]], dtype=float32)

In [76]:
y_test

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [9]:
x_val.dtype

dtype('float16')

In [10]:
x_val

array([[[0.01953],
        [0.02734],
        [0.02344],
        ...,
        [0.01953],
        [0.01953],
        [0.02344]],

       [[0.01953],
        [0.02344],
        [0.01953],
        ...,
        [0.02344],
        [0.01953],
        [0.02344]],

       [[0.01953],
        [0.01953],
        [0.01953],
        ...,
        [0.01953],
        [0.02344],
        [0.02344]],

       ...,

       [[0.02344],
        [0.01953],
        [0.02344],
        ...,
        [0.01953],
        [0.02344],
        [0.01563]],

       [[0.01953],
        [0.02344],
        [0.02734],
        ...,
        [0.02344],
        [0.01563],
        [0.01953]],

       [[0.04688],
        [0.03516],
        [0.04297],
        ...,
        [0.03906],
        [0.04297],
        [0.03906]]], dtype=float16)

In [12]:
# NOTE(review): x_val already reports dtype float16 in the cell above,
# so this only makes a copy.
x_val = np.array(x_val, dtype=np.float16)

In [13]:
x_val.dtype

dtype('float16')

In [15]:
# Drop the trailing channel axis: (n, 2000, 1) -> (n, 2000).
# NOTE(review): not idempotent — a second run fails on the now 2-D array, and
# x_val no longer has the (2000, 1) per-sample shape the model takes as input.
x_val = x_val[:, :, 0]

In [38]:
# Tabular view of the 2-D x_val: one row per validation signal, one column per time step.
dataframe=pd.DataFrame(x_val) 

In [48]:
# Drop the trailing channel axis of `test`, then wrap it as a float16 DataFrame.
# NOTE(review): presumably `test` is the single sample picked in the
# "Testing a Sample" cell — confirm; re-running fails once `test` is a DataFrame.
test = test[:, :, 0]
test = pd.DataFrame(test, dtype='float16') 

In [49]:
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.378906,0.203125,0.246094,0.359375,0.136719,0.363281,0.230469,0.089844,0.074219,0.167969,...,0.144531,0.3125,0.242188,-0.28125,0.191406,0.34375,0.425781,0.316406,-0.28125,0.308594


In [38]:
# Written with the default header and index columns.
# NOTE(review): the file name has no '.csv' extension, unlike the later
# 'ONE_DEMO_withfault.csv' export, which also drops header and index.
test.to_csv('ONE_CSV_withfault')

In [39]:
dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.019531,0.027344,0.023438,0.023438,0.023438,0.027344,0.023438,0.019531,0.023438,0.019531,...,0.019531,0.023438,0.027344,0.023438,0.023438,0.027344,0.027344,0.019531,0.019531,0.023438
1,0.019531,0.023438,0.019531,0.019531,0.019531,0.019531,0.023438,0.023438,0.023438,0.019531,...,0.023438,0.019531,0.015625,0.015625,0.027344,0.015625,0.019531,0.023438,0.019531,0.023438
2,0.019531,0.019531,0.019531,0.023438,0.019531,0.023438,0.023438,0.023438,0.023438,-0.003906,...,0.027344,0.019531,0.019531,0.023438,0.019531,0.023438,0.023438,0.019531,0.023438,0.023438
3,0.027344,0.027344,0.027344,0.027344,0.031250,0.019531,0.023438,0.023438,0.031250,0.027344,...,0.031250,0.035156,0.031250,0.027344,0.035156,0.035156,0.035156,0.035156,0.031250,0.031250
4,0.023438,0.023438,0.023438,0.019531,0.019531,0.023438,0.023438,0.019531,0.023438,0.019531,...,0.019531,0.019531,0.023438,0.023438,0.023438,0.023438,0.031250,0.023438,0.027344,0.023438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1738,0.027344,0.023438,0.019531,0.023438,0.023438,0.023438,0.027344,0.019531,0.019531,0.023438,...,0.023438,0.019531,0.019531,0.023438,0.023438,0.023438,0.019531,0.015625,0.023438,0.019531
1739,0.019531,0.019531,0.023438,0.019531,0.019531,0.019531,0.015625,0.027344,0.019531,0.019531,...,0.019531,0.023438,0.027344,0.023438,0.019531,0.015625,0.019531,0.015625,0.015625,0.019531
1740,0.023438,0.019531,0.023438,0.023438,0.027344,0.023438,0.023438,0.023438,0.023438,0.027344,...,0.019531,0.019531,0.027344,0.019531,0.015625,0.023438,0.023438,0.019531,0.023438,0.015625
1741,0.019531,0.023438,0.027344,0.023438,0.027344,0.023438,0.027344,0.019531,0.023438,0.023438,...,0.023438,0.019531,0.023438,0.019531,0.019531,0.023438,0.019531,0.023438,0.015625,0.019531


In [58]:
# Second row of `dataframe` (positional index 1) as a Series.
ONE_DEMO = dataframe.iloc[1]

In [60]:
ONE_DEMO.head()

0    0.019531
1    0.023438
2    0.019531
3    0.019531
4    0.019531
Name: 1, dtype: float16

In [62]:
# One value per line, without header or index.
# NOTE(review): the file name has no '.csv' extension.
ONE_DEMO.to_csv('ONE_DEMO', header=False, index=False)

In [50]:
# Transpose the single-row frame into a single column.
# NOTE(review): not idempotent — each re-run flips the orientation back.
test = test.T

In [54]:
test

Unnamed: 0,0
0,0.378906
1,0.203125
2,0.246094
3,0.359375
4,0.136719
...,...
1995,0.343750
1996,0.425781
1997,0.316406
1998,-0.281250


In [55]:
# Export the transposed sample as one value per line, without header or index.
test.to_csv('ONE_DEMO_withfault.csv', header=False, index=False)