In [None]:
# Import required Packages
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import torch
import sklearn

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping
from keras.layers import LeakyReLU


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the pre-computed embeddings; the archive stores the two classes
# under the "positive" and "negative" keys.
data = np.load("/kaggle/input/bert-all/data31_bert.npz")
positive, negative = data["positive"], data["negative"]

# One label per sample: 1.0 for positives, 0.0 for negatives.
Y_positive = np.ones(len(positive))
Y_negative = np.zeros(len(negative))

# Positives first, then negatives, for both the features and the labels.
X = np.concatenate([positive, negative])
Y = np.concatenate([Y_positive, Y_negative])
print(X.shape)

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
# Set aside 5% of the samples (X_t / Y_t); the seed is fixed so the split is repeatable.
X_train, X_t, Y_train, Y_t = train_test_split(
    X,
    Y,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Sanity check: shapes of the training features, training labels and held-out features.
for split_array in (X_train, Y_train, X_t):
    print(split_array.shape)

In [None]:
# Train the model on one train/validation split and return its accuracy and MCC scores

def get_score(model, X_train, X_test, y_train, y_test,
              epochs=30, batch_size=1024 * 2, callbacks=None):
    """Fit ``model`` on one split and score it on the held-out part.

    Parameters
    ----------
    model : compiled Keras-style model
        Must expose ``fit``, ``evaluate`` and ``predict``. ``evaluate`` is
        unpacked as ``(loss, accuracy)``, which matches a model compiled with
        a single accuracy metric (as ``compile_model`` in this notebook does).
    X_train, y_train : array-like
        Samples and binary labels used for fitting.
    X_test, y_test : array-like
        Samples and binary labels used as validation data during fitting and
        for the final evaluation.
    epochs : int, default 30
        Number of training epochs.
    batch_size : int, default 2048
        Mini-batch size passed to ``fit``.
    callbacks : list or None, default None
        Optional Keras callbacks (for example an ``EarlyStopping`` instance).
        ``None`` trains for the full ``epochs`` with no callbacks.

    Returns
    -------
    tuple
        ``(train_acc, test_acc, history, mcc)`` where ``history`` is the
        object returned by ``fit`` and ``mcc`` is the Matthews correlation
        coefficient of the 0.5-thresholded predictions on ``X_test``.
    """
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        callbacks=callbacks,
        verbose=0,
    )

    # Evaluate the fitted model; the loss value is not needed here.
    _, train_acc = model.evaluate(X_train, y_train, verbose=0)
    _, test_acc = model.evaluate(X_test, y_test, verbose=0)

    # Threshold the predicted probabilities once and flatten the (n, 1)
    # output so it lines up with a 1-D label vector.
    y_prediction = (model.predict(X_test) > 0.5).astype(int).ravel()
    mcc = sklearn.metrics.matthews_corrcoef(y_test, y_prediction)
    print('mcc:', mcc)
    return train_acc, test_acc, history, mcc

In [None]:

def build_model(input_shape=(31, 1024)):
  """Build the (uncompiled) 1-D convolutional binary classifier.

  Architecture: two Conv1D(kernel_size=1) + MaxPooling1D(2) + Dropout(0.5)
  stages (144 then 128 filters), followed by Flatten, Dense(128), Dropout(0.5),
  Dense(16) and a single sigmoid output unit.

  Parameters
  ----------
  input_shape : tuple, default (31, 1024)
      Shape of one sample, passed to the first Conv1D layer. The default
      reproduces the original hard-coded value. The first dimension is halved
      by each of the two pooling layers, so it must be at least 4.

  Returns
  -------
  The assembled ``Sequential`` model; the caller is responsible for compiling it.
  """
  model = models.Sequential()
  # Stage 1: position-wise (kernel_size=1) convolution over the input features.
  model.add(layers.Conv1D(filters = 144, kernel_size = 1, activation='relu', input_shape= input_shape, padding="same"))
  model.add(layers.MaxPooling1D(pool_size=2))
  model.add(layers.Dropout(0.5))
  # Stage 2: second position-wise convolution on the pooled sequence.
  model.add(layers.Conv1D(filters = 128, kernel_size = 1, activation='relu'))
  model.add(layers.MaxPooling1D(pool_size=2))
  model.add(layers.Dropout(0.5))
  # Classifier head.
  model.add(layers.Flatten())
  model.add(layers.Dense(128, activation='relu'))
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(16, activation='relu'))
  model.add(layers.Dense(1, activation = 'sigmoid'))
  return model

In [None]:
def compile_model(model):
    """Compile ``model`` for binary classification and hand it back.

    Uses the Adam optimizer, binary cross-entropy loss and tracks accuracy.
    The same model object that was passed in is returned.
    """
    bce_loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(
        loss=bce_loss,
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model



In [None]:
# 10-fold stratified cross-validation over the training split (X_train, Y_train).
folds = StratifiedKFold(n_splits=10)
scores = {}
scores['train_acc'] = []
scores['val_acc'] = []
# Start the running bests at -inf so the first fold always sets `hist` and
# writes both checkpoints, even if its accuracy or MCC is zero or negative.
max_test_mcc = -np.inf
max_val_acc = -np.inf
mcc_hist = []
acc_hist = []
conf_mat_hist = []  # kept for compatibility; not populated in this cell
f1_s_hist = []      # kept for compatibility; not populated in this cell
# Fold data gets its own names: X_train / Y_train (and X / Y) are never
# rebound here, so re-running this cell always sees the same training split.
for train_index, test_index in folds.split(X_train, Y_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[test_index]
    y_fold_train, y_fold_val = Y_train[train_index], Y_train[test_index]

    # A fresh, freshly compiled model per fold so no weights carry over.
    model = build_model()
    model = compile_model(model)
    train_acc, val_acc, history, mcc = get_score(model, X_fold_train, X_fold_val, y_fold_train, y_fold_val)
    print('-----------------------------------------------')
    print(train_acc, val_acc)
    print('-----------------------------------------------')

    mcc_hist.append(mcc)
    acc_hist.append(val_acc)

    # Keep the model (and its training history) with the best validation accuracy.
    if val_acc > max_val_acc:
        hist = history
        model.save('/kaggle/working/bert_win31_acc.h5')
        max_val_acc = val_acc

    # Independently keep the model with the best validation MCC.
    if mcc > max_test_mcc:
        model.save('/kaggle/working/bert_win31_mcc.h5')
        max_test_mcc = mcc
    scores['train_acc'].append(train_acc)
    scores['val_acc'].append(val_acc)

    
# Learning curves for the fold whose History object was kept in `hist`
# (the fold with the highest validation accuracy in the loop above).
curves = hist.history
train_acc_history, val_acc_history = curves['accuracy'], curves['val_accuracy']
loss_history, val_loss_history = curves['loss'], curves['val_loss']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(train_acc_history) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,7))

# Left panel: accuracy; right panel: loss. Each shows training vs validation.
panels = (
    (ax1, train_acc_history, 'Training Acc', val_acc_history, 'Validation acc',
     'Training and validation accuracy'),
    (ax2, loss_history, 'Training Loss', val_loss_history, 'Validation Loss',
     'Training and validation loss'),
)
for axis, train_curve, train_label, val_curve, val_label, panel_title in panels:
    axis.plot(epochs, train_curve, label=train_label)
    axis.plot(epochs, val_curve, label=val_label)
    axis.set_title(panel_title)
    axis.legend()

fig.savefig('xyz.jpg')

    

In [None]:
# Cross-validation summary: mean then standard deviation across folds,
# first for MCC and then for validation accuracy.
for fold_values in (mcc_hist, acc_hist):
    print(np.mean(fold_values))
    print(np.std(fold_values))

In [None]:
# model.save('/kaggle/working/model6412864w21.h5')

In [None]:
from tensorflow.keras.models import load_model
# Reload the checkpoint saved for the best validation accuracy and show its layers.
best_acc_checkpoint = "/kaggle/working/bert_win31_acc.h5"
new_model = load_model(best_acc_checkpoint)
new_model.summary()

In [None]:
# The independent test set lives in two archives: positives are read from the
# first ("positive" key) and negatives from the second ("negative" key).
test = np.load("/kaggle/input/independent-bert/bert_test.npz")
test1 = np.load("/kaggle/input/independent-bert-all/bert_neg_independent_test.npz")
test_pos = test["positive"]
test_neg = test1["negative"]
for test_array in (test_neg, test_pos):
    print(test_array.shape)

In [None]:
# Ground-truth labels for the independent test set: 1.0 per positive, 0.0 per negative.
p_label = np.ones(len(test_pos))
n_label = np.zeros(len(test_neg))
for label_vector in (p_label, n_label):
    print(label_vector.shape)

In [None]:
# Stack positives above negatives; labels follow the same order and are
# shaped as a single column.
X_test_all = np.vstack((test_pos, test_neg))
Y_test_all = np.concatenate((p_label, n_label)).reshape(-1, 1)

In [None]:
# Predict on the independent test set and turn the model outputs into hard
# 0/1 labels using a 0.5 threshold.
predicted_scores = new_model.predict(X_test_all)
y_prediction = (predicted_scores > 0.5).astype(int)

# Confusion matrix of raw counts (no normalization is applied).
result = confusion_matrix(Y_test_all, y_prediction)

In [None]:
# Show the confusion matrix computed in the previous cell.
matrix_header = "Confusion Matrix: \n"
print(matrix_header, result)


In [None]:
import sklearn
# Matthews correlation coefficient on the independent test set.
# `y_prediction` already holds the 0.5-thresholded outputs of `new_model`, so
# the boolean view is derived from it instead of running inference a second time.
y_pred = y_prediction.astype(bool)
mcc = sklearn.metrics.matthews_corrcoef(Y_test_all, y_prediction)
print(mcc)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of independent-test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=Y_test_all, y_pred=y_prediction)
print(accuracy)

In [None]:
# F1 score on the independent test set.
# Note: this rebinds `result`, which previously held the confusion matrix.
result = f1_score(y_true=Y_test_all, y_pred=y_prediction)
print("F1_score: \n", result)

# 