In [1]:
import os 
import pickle
import numpy as np
from os.path import join  

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.distribute import MirroredStrategy, OneDeviceStrategy

from utils import *

datasets = join(os.getcwd(), "datasets")
preprocessed_datasets = join(datasets, "preprocessed")

X_all_sep, y_all, X_test_sep = pickle.load(open(join(preprocessed_datasets, "cleaned_separated.pickle"), 'rb'))
X_all_ohe, _, X_test_ohe = pickle.load(open(join(preprocessed_datasets, "cleaned_ohe.pickle"), 'rb'))

ohe_split =  X_all_ohe.shape[1] - X_all_sep[0].shape[1]
X_all = (*X_all_sep, X_all_ohe[:,-ohe_split:])
X_test = (*X_test_sep, X_test_ohe[:,-ohe_split:])

strategy = OneDeviceStrategy("CPU")

In [2]:
def train_test_tuple_split(X, y, train_size, seed=None):
    def apply_tuple_split(X, split):
        train, test = [], []
        for x in X:
            train.append(x[:split])
            test.append(x[split:])
        return tuple(train), tuple(test)
    
    np.random.seed(seed)
    
    indices_ = np.arange(len(y))
    np.random.shuffle(indices_)
    
    splitter_indice = int(len(y) * train_size)
    train_indices, test_indices = np.split(indices_, [splitter_indice])
    
    X_train, X_test = apply_tuple_split(X, splitter_indice)
    y_train, y_test = y[train_indices], y[test_indices]
    
    return X_train, X_test, y_train, y_test
    

X_train, X_val, y_train, y_val = train_test_tuple_split(X_all, y_all, .75, seed=42)

In [3]:
with strategy.scope():
    
    conv_len, vp_len, ohe_len = len(X_train[0][-1]), len(X_train[1][-1]), len(X_train[2][-1])
    # vocab_dim = int(1.2*len(np.unique(np.r_[X_train[1], X_test[1]])))
    vocab_dim = int(np.unique(np.r_[X_train[1], X_test[1]]).max())+1
    
    conv_input = tf.keras.layers.Input(shape=(conv_len,))
    
    
    vp_input = tf.keras.layers.Input(shape=(vp_len,))
    embedded = tf.keras.layers.Embedding(vocab_dim, 128, input_length=vp_len)(vp_input)
    dense_emb = tf.keras.layers.Dense(1)(embedded)
    flatten_emb = tf.keras.layers.Flatten()(dense_emb)
    
    ohe_input = tf.keras.layers.Input(shape=(ohe_len,))
    
    concat = tf.keras.layers.Concatenate()([conv_input, flatten_emb, ohe_input])
    
    output = tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.L1(0.01), activation='sigmoid')(concat)
    
    model = tf.keras.models.Model(inputs=[conv_input, vp_input, ohe_input], outputs=output)
    
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(lr=1e-3),
        metrics=[bcr, p1, p2, m1, m2, "accuracy"]
    )

callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_bcr', factor=.5, patience=10, verbose=2),
    BCREarlyStopping(patience=30, restore_best_weights=True)
]

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 6)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 6, 128)       4480        input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 6, 1)         129         embedding[0][0]                  
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 1306624)]    0                                            
______________________________________________________________________________________________

In [4]:
history = model.fit(X_train, y_train, 
                    validation_data=(X_val, y_val),
                    batch_size=120,
                    epochs=200,
                    callbacks=callbacks
                   )

Epoch 1/200


  return bcr - bcr_delta * (1 - np.exp(-bcr_delta / sigma))


New best p value : 0.5
Epoch 2/200


  return bcr - bcr_delta * (1 - np.exp(-bcr_delta / sigma))


Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 12/200
New best p value : 0.5145556734294201
Epoch 13/200
Epoch 14/200
New best p value : 0.5614986011618932
Epoch 15/200
New best p value : 0.5807903942661091
Epoch 16/200
Epoch 17/200


Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200

Epoch 00029: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200


Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Restoring model weights from the end of the best epoch. Best value : 0.581
Epoch 00045: early stopping


In [5]:
report = Report(model, X_train, y_train, X_val, y_val).to_stdout()

------------------ Report for Functional -------------------

P score : 0.573
BCR     : 0.673
BCR hat : 0.569
