In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import accuracy_score
import gc

In [None]:
# tf.debugging.set_log_device_placement(True)
# Create a TF1-compat session whose GPU options have allow_growth enabled and
# register it as the session used by the compat Keras backend.
tf_v1 = tf.compat.v1
config = tf_v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf_v1.Session(config=config)
tf_v1.keras.backend.set_session(sess)

In [None]:
# NOTE(review): presumably the half-width of the sequence window, since
# 2 * window_size + 1 = 31 matches the sequence length hard-coded in
# build_model's input_shape. The name is not referenced by any other visible
# cell -- confirm whether it is still needed.
window_size = 15

def build_model():
    """Build one convolutional branch of the two-input network.

    The branch is a ``Sequential`` stack over inputs of shape ``(31, 1024)``:
    a width-1 ``Conv1D`` (144 filters, ReLU, "same" padding), dropout 0.5,
    a second width-1 ``Conv1D`` (128 filters, ReLU), max pooling with pool
    size 2, dropout 0.5, flatten, and a 64-unit ReLU ``Dense`` layer.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    branch_layers = [
        layers.Conv1D(filters=144, kernel_size=1, activation='relu',
                      input_shape=(31, 1024), padding="same"),
        # layers.MaxPooling1D(pool_size=2),  # pooling after the first conv is disabled
        layers.Dropout(0.5),
        layers.Conv1D(filters=128, kernel_size=1, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.5),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
    ]
    return models.Sequential(branch_layers)


def get_model():
    """Assemble the two-input binary classifier from two ``build_model`` branches.

    The outputs of both branches are concatenated, passed through a 16-unit
    ReLU ``Dense`` layer and dropout 0.2, and reduced to a single sigmoid unit.

    Returns:
        An uncompiled functional ``Model`` named "merged_layers" whose inputs
        are the two branch inputs, in branch order.
    """
    first_branch = build_model()
    second_branch = build_model()

    # Join the two 64-unit branch outputs into one feature vector.
    branch_outputs = [first_branch.output, second_branch.output]
    merged = layers.concatenate(branch_outputs, name="concatenated_layer")

    hidden = layers.Dense(16, activation='relu')(merged)
    hidden = layers.Dropout(0.2)(hidden)
    prediction = layers.Dense(1, activation='sigmoid')(hidden)

    return models.Model(
        inputs=[first_branch.input, second_branch.input],
        outputs=prediction,
        name="merged_layers",
    )
# model = get_model()
# model.summary()



In [None]:
# Build one instance of the merged two-branch network and print its
# layer-by-layer summary. This instance is neither compiled nor trained here.
model = get_model()
model.summary()

In [None]:
def compile_model(model):
  """Compile ``model`` in place and return the same object.

  The model is configured with the 'adam' optimizer, a binary cross-entropy
  loss, and 'accuracy' as its only metric.
  """
  compile_kwargs = dict(
      optimizer='adam',
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=['accuracy'],
  )
  model.compile(**compile_kwargs)
  return model


def get_score(model, X_train, X_test, y_train, y_test):
  """Fit ``model`` and report its accuracy on the training and validation data.

  Args:
    model: A compiled Keras model. Assumed to be compiled with exactly one
      metric (as ``compile_model`` does), so ``evaluate`` returns
      ``(loss, accuracy)``.
    X_train, y_train: Training inputs and labels passed to ``fit``.
    X_test, y_test: Data used both as ``validation_data`` during fitting and
      for the final validation accuracy.

  Returns:
    ``(train_acc, test_acc, history)`` where ``history`` is the object
    returned by ``model.fit``.
  """
  # Patient early stopping: halt when val_accuracy has not improved for 10
  # epochs. The callback must be handed to fit(); otherwise it has no effect.
  es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=10)
  history = model.fit(
      X_train, y_train,
      epochs=25,
      batch_size=1024*2,
      validation_data=(X_test, y_test),
      callbacks=[es],
      verbose=0,
  )

  # Evaluate the fitted model; the first returned value (loss) is not needed.
  _, train_acc = model.evaluate(X_train, y_train, verbose=0)
  _, test_acc = model.evaluate(X_test, y_test, verbose=0)

  return train_acc, test_acc, history

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# Load the pre-computed BERT and T5 feature archives. Each .npz file stores a
# 'positive' and a 'negative' array; the context managers close the archives
# as soon as the arrays have been read into memory.
with np.load('/kaggle/input/bert-all/data31_bert.npz') as data_bert:
    positive_bert = data_bert['positive']
    negative_bert = data_bert['negative']
with np.load('/kaggle/input/uni-31/data31.npz') as data_T5:
    positive_T5 = data_T5['positive']
    negative_T5 = data_T5['negative']

# Stack the BERT features first and the T5 features second. Later cells split
# the combined array again at index 31 along axis 1 to feed the two branches.
positive_dataset = np.hstack((positive_bert, positive_T5))
negative_dataset = np.hstack((negative_bert, negative_T5))

# Label positives 1 and negatives 0, one label per sample.
p_label = np.ones(positive_dataset.shape[0])
n_label = np.zeros(negative_dataset.shape[0])

# Positives come first, then negatives; Y is a column vector aligned with X.
X = np.vstack([positive_dataset, negative_dataset])
Y = np.vstack([p_label.reshape(-1,1), n_label.reshape(-1,1)])

print(X.shape)
print(Y.shape)

# Drop the intermediate arrays so only X and Y stay resident in kernel memory.
del data_bert
del positive_bert
del negative_bert
del data_T5
del positive_T5
del negative_T5
del positive_dataset
del negative_dataset
gc.collect()



In [None]:
import sklearn
from sklearn.model_selection import train_test_split
import gc
# Hold out 5% of the data as a fixed test set; the remaining 95% is used for
# 10-fold stratified cross-validation below.
X_train, X_t, Y_train, Y_t = train_test_split(X, Y, test_size=0.05, random_state=42)
folds = StratifiedKFold(n_splits=10)
scores = {'train_acc': [], 'val_acc': []}
max_val_acc = 0
max_test_mcc = 0
max_mcc = 0
# NOTE: X and Y are rebound to the 95% training portion, so this cell is not
# idempotent -- re-running it without reloading the data splits the already
# reduced arrays again.
X = X_train
Y = Y_train

# Per-fold metrics on the held-out test set; summarised in a later cell.
mcc_hist = []
acc_hist = []
conf_mat_hist = []
f1_s_hist = []

# The first 31 positions along axis 1 feed the first branch, the rest the second.
X_te = [X_t[:, :31], X_t[:, 31:]]
y_true = Y_t
for train_index, test_index in folds.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # A fresh, freshly compiled model per fold so no weights leak between folds.
    model = get_model()
    model = compile_model(model)
    X_train = [X_train[:, :31], X_train[:, 31:]]
    X_test = [X_test[:, :31], X_test[:, 31:]]
    train_acc, val_acc, history = get_score(model, X_train, X_test, y_train, y_test)
    scores['train_acc'].append(train_acc)
    scores['val_acc'].append(val_acc)

    print('-----------------------------------------------')
    # Threshold the sigmoid outputs once at 0.5 to obtain 0/1 class labels.
    y_prediction = (model.predict(X_te) > 0.5).astype(int)
    conf = confusion_matrix(y_true, y_prediction , normalize='pred')
    print("Confusion Matrix: \n",conf)
    mcc = sklearn.metrics.matthews_corrcoef(y_true, y_prediction)
    print('mcc:', mcc)
    accuracy = accuracy_score(y_true, y_prediction)
    print('acc:',accuracy)
    print('-----------------------------------------------')

    # Record this fold's metrics; without these appends the summary cell that
    # prints np.mean / np.std of the histories only sees empty lists.
    mcc_hist.append(mcc)
    acc_hist.append(accuracy)
    conf_mat_hist.append(conf)
    f1_s_hist.append(sklearn.metrics.f1_score(y_true, y_prediction))

    # Keep the model with the best MCC on the held-out test set seen so far.
    if mcc > max_test_mcc:
        model.save('/kaggle/working/comb_mcc.h5')
        max_test_mcc = mcc

    # Free the fold's model and data before the next iteration.
    del model
    del X_train
    del X_test
    del y_train
    del y_test
    gc.collect()

In [None]:
import matplotlib.pyplot as plt

# `history` is the Keras History object assigned inside the cross-validation
# loop, so after the loop it holds the curves of the last fold only.
# (The previous name `hist` is not defined anywhere in the visible notebook.)
train_acc_history = history.history['accuracy']
val_acc_history = history.history['val_accuracy']
loss_history = history.history['loss']
val_loss_history = history.history['val_loss']

# Epochs are numbered from 1; the length follows the recorded history.
epochs = range(1, len(train_acc_history) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,7))
ax1.plot(epochs, train_acc_history, label='Training Acc')
ax1.plot(epochs, val_acc_history, label='Validation acc')
ax1.set_title('Training and validation accuracy')
ax1.set_xlabel('Epoch')
ax1.legend()
ax2.plot(epochs, loss_history, label='Training Loss')
ax2.plot(epochs, val_loss_history, label='Validation Loss')
ax2.set_title('Training and validation loss')
ax2.set_xlabel('Epoch')
ax2.legend()

fig.savefig('xyz.jpg')

In [None]:
# Print mean then standard deviation, first for the MCC history and then for
# the accuracy history.
for metric_values in (mcc_hist, acc_hist):
    print(np.mean(metric_values))
    print(np.std(metric_values))

In [None]:
# Load the independent test features. The context managers close each .npz
# archive once its arrays have been read into memory.
with np.load('/kaggle/input/independent-bert-all/bert_neg_independent_test.npz') as bert_neg_archive:
    bert_neg_test_i = bert_neg_archive['negative']
with np.load('/kaggle/input/independent-bert-all/bert_test.npz') as bert_pos_archive:
    bert_pos_test_i = bert_pos_archive['positive']

with np.load('/kaggle/input/independent-t5-test/t5_independent_test.npz') as data:
    t5_neg_test_i = data['negative']
    t5_pos_test_i = data['positive']

# BERT features first, T5 features second -- the same order as the training data.
positive_dataset_test = np.hstack((bert_pos_test_i, t5_pos_test_i))
negative_dataset_test = np.hstack((bert_neg_test_i, t5_neg_test_i))

# Label positives 1 and negatives 0, one label per sample.
p_label = np.ones(positive_dataset_test.shape[0])
n_label = np.zeros(negative_dataset_test.shape[0])

# NOTE: this rebinds X_t / Y_t, replacing the 5% hold-out split created in the
# cross-validation cell with the independent test set.
X_t = np.vstack((positive_dataset_test, negative_dataset_test))
Y_t = np.vstack((p_label.reshape(-1,1), n_label.reshape(-1,1)))



In [None]:
import sklearn
# Load a previously saved combined model and score it on the current X_t / Y_t.
new_model = tf.keras.models.load_model('/kaggle/input/combined-model-1/comb_mcc (6).h5')
y_true = Y_t

# Split the stacked features at index 31 along axis 1 into the two model inputs.
X_te = [X_t[:, :31], X_t[:, 31:]]

# Threshold the predicted probabilities at 0.50 to obtain boolean class labels.
probabilities = new_model.predict(X_te)
y_pred = probabilities > 0.50

mcc = sklearn.metrics.matthews_corrcoef(y_true, y_pred)
print(mcc)

In [None]:
# Store the results under names that do not shadow the `f1_score` and
# `confusion_matrix` functions imported at the top of the notebook; rebinding
# those names to values breaks any later call such as confusion_matrix(...).
f1 = sklearn.metrics.f1_score(y_true, y_pred)
print("f1 score: ",f1)
acc = sklearn.metrics.accuracy_score(y_true, y_pred)
print("acc: ", acc)
conf_mat = sklearn.metrics.confusion_matrix(y_true, y_pred)
print("confusion matrix: ",conf_mat)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# True labels (y_true) and predicted labels (y_pred) are reused from the evaluation cell above

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Create heatmap. fmt="d" prints the integer counts in full; the default
# annotation format ('.2g') abbreviates counts with three or more digits.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")

# Add axis labels and title
plt.xlabel("Predicted labels")
plt.ylabel("True labels")
plt.title("Confusion Matrix")

# Display the heatmap
plt.show()

In [None]:
#TSNE plot
from sklearn.manifold import TSNE

# The default of 1,000 iterations gives fine results, but I'm training for longer just to eke
# out some marginal improvements. NB: This takes almost an hour!
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne = TSNE(random_state=1, n_iter=15000, metric="cosine")
# Take the 1024-dim vector at position 11 of every sample. The row count is
# inferred (-1) instead of hard-coded, so the cell works for any number of samples.
# NOTE(review): the name suggests the middle position, but the centre of a
# 31-wide window would be index 15 -- confirm that 11 is intended.
X_mid = X[:, 11].reshape((-1, 1024))
print(X_mid.shape)
embs = tsne.fit_transform(X_mid)


In [None]:
from matplotlib import pyplot as plt
FS = (10, 8)
fig, ax = plt.subplots(figsize=FS)
# Low alpha keeps points translucent, so dense regions of overlapping points
# show up as darker areas.
emb_x, emb_y = embs[:, 0], embs[:, 1]
ax.scatter(emb_x, emb_y, alpha=.1);