In [1]:
import os
import random
import pickle
import numpy as np
import tensorflow as tf
import pandas as pd

os.environ['TF_DETERMINISTIC_OPS'] = '1'

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

## ML

In [2]:
ingv_data = pd.read_csv('./INGV_ML_CSV_DATA.csv')
ingv_data.head()

Unnamed: 0,X,y
0,"[[0.001455342979170382, 0.012778562493622303, ...",1
1,"[[-0.0035912226885557175, -0.0085179153829813,...",0
2,"[[-0.6320589780807495, -0.3878442347049713, -0...",0
3,"[[0.0018271339358761907, 0.029985496774315834,...",1
4,"[[-0.01246583554893732, -0.013441404327750206,...",1


In [3]:
far_source_majority = ingv_data[ingv_data['y'] == 0] #39043 
near_source_minority = ingv_data[ingv_data['y'] == 1] #3110
bias = near_source_minority.shape[0]/far_source_majority.shape[0] #0.0796
print(f'minor: {near_source_minority.shape[0]} >> major: {far_source_majority.shape[0]} >> bias: {bias}')

minor: 3110 >> major: 11886 >> bias: 0.26165236412586235


In [4]:
from sklearn.utils import resample, shuffle

train = pd.concat([
    far_source_majority.sample(frac=0.8,random_state=200),
    near_source_minority.sample(frac=0.8,random_state=200)
])
test = pd.concat([
    far_source_majority.drop(far_source_majority.sample(frac=0.8,random_state=200).index),
    near_source_minority.drop(near_source_minority.sample(frac=0.8,random_state=200).index)
])

train = shuffle(train)
test = shuffle(test)

print('Near source data in training:',(train.y == 1).sum())
print('Far source data in training:',(train.y == 0).sum())
print('Near source data in test:',(test.y == 1).sum())
print('Far source data in test:',(test.y == 0).sum())

Near source data in training: 2488
Far source data in training: 31234
Near source data in test: 622
Far source data in test: 7809


In [20]:
# upsampling or downsampling

MODE = 'DOWNSAMPLING'
# Separate majority and minority classes in training data for up sampling 
data_majority = train[train['y'] == 0]
data_minority = train[train['y'] == 1]

print("majority class before upsample:", data_majority.shape)
print("minority class before upsample:", data_minority.shape)

if MODE == 'UPSAMPLING':
    # Upsample minority class
    data_minority_upsampled = resample(
        data_minority,
        replace=True,     # sample with replacement
        n_samples= data_majority.shape[0],    # to match majority class
        random_state=123) # reproducible results

    # Combine majority class with upsampled minority class
    data_resampled = pd.concat([data_majority, data_minority_upsampled])
else:
    # Downsample majority class
    data_majority_downsampled = resample(
        data_majority, 
        replace=True,     # sample with replacement
        n_samples= int(0.8 * data_minority.shape[0]),    # to match majority class
        random_state=123) # reproducible results

    # Combine majority class with downsampled minority class
    data_resampled = pd.concat([data_majority_downsampled, data_minority])

data_resampled = shuffle(data_resampled)
# Display new class counts
print(f"After {MODE}\n",data_resampled.y.value_counts(),sep = "")

majority class before upsample: (31234, 2)
minority class before upsample: (2488, 2)
After DOWNSAMPLING
1    2488
0    1990
Name: y, dtype: int64


In [6]:
# evaluate test
data_resampled.head()

Unnamed: 0,X,y
24547,"[[0.0009152837446890771, 0.0001872387656476348...",0
41959,"[[-0.001398019609041512, 2.772395237116143e-05...",0
33984,"[[0.0006510239909403026, -0.002088340930640697...",1
38895,"[[-0.01728072389960289, -0.07464507222175598, ...",1
23943,"[[0.018745873123407364, -0.11022474616765976, ...",0


In [7]:
# data_resampled.to_csv('TRAIN_DOWNSAMPLED_INGV_ML_CSV_DATA.csv')

In [21]:
import os
import random
import pickle
import numpy as np
import tensorflow as tf
import pandas as pd

os.environ['TF_DETERMINISTIC_OPS'] = '1'

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [15]:
# data_resampled = pd.read_csv('TRAIN_DOWNSAMPLED_INGV_ML_CSV_DATA.csv')

In [22]:
import json

import numpy as np
from sklearn.model_selection import train_test_split


X_train = np.array(data_resampled['X'].apply(json.loads).to_list())
y_train = np.array(data_resampled['y'].to_list())

print(len(X_train), len(y_train))
print(X_train.shape, y_train.shape)

n_features = X_train.shape[1]
n_steps = X_train.shape[2]

4478 4478
(4478, 13, 100) (4478,)


In [23]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [26]:
# define model
model = Sequential()
model.add(
    LSTM(
        100,
        activation='sigmoid',
        input_shape=(n_features, n_steps),
        return_sequences=False,
        # dropout=0.9,
        # recurrent_dropout=0.9
    )
)
model.add(Dense(1))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())
# bias = 0.0796557641574674
# I have defined weight of majority class to be 1 and of minority class to be a multiple of 1/bias
class_weights = {0: 1, 1: 1.6/bias}
# fit model
history = model.fit(X_train, y_train, epochs=360, batch_size=30)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_5 (Dense)             (None, 1)                 101       
                                                                 
Total params: 80,501
Trainable params: 80,501
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/360
Epoch 2/360
Epoch 3/360
Epoch 4/360
Epoch 5/360
Epoch 6/360
Epoch 7/360
Epoch 8/360
Epoch 9/360
Epoch 10/360
Epoch 11/360
Epoch 12/360
Epoch 13/360
Epoch 14/360
Epoch 15/360
Epoch 16/360
Epoch 17/360
Epoch 18/360
Epoch 19/360
Epoch 20/360
Epoch 21/360
Epoch 22/360
Epoch 23/360
Epoch 24/360
Epoch 25/360
Epoch 26/360
Epoch 27/360
Epoch 28/360
Epoch 29/360
Epoch 30/360
Epoch 31/360
Epoch 32/360
Epoch 33/360
Epoch 34/360
Epoch 35

In [25]:
# evaluate training
train_acc = model.evaluate(X_train, y_train, verbose=0)
print("Accuracy: %.2f%%" % (train_acc[1]*100))

Accuracy: 75.08%
