# Imports & constants

In [None]:
from os.path import abspath
from pathlib import Path
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, TFDistilBertModel

# Hugging Face checkpoint name used to load both the tokenizer and the encoder.
transformer_checkpoint = 'distilbert-base-uncased'
# CSV location: <two directories above the current working directory>/raw/dataset.csv
dataset_path = Path(abspath('.')).parent.parent / "raw" / "dataset.csv"

# Chunk size used while tokenizing and batch size of the tf.data pipelines.
BATCH_SIZE = 256
# Sequence length declared for the model's token-id and attention-mask inputs.
MAX_LENGTH = 512
# Number of epochs passed to model.fit.
EPOCHS = 10
# Width of the graph-feature input vector declared in build_model.
GRAPH_DIM = 5

# Fraction of all rows reserved for training + validation.
TRAIN_SPLIT = .8
# Fraction of that training share that is moved to the validation set.
VAL_SPLIT = .1

# Dataset

In [None]:
# Load the raw CSV, treating only '\n' as the row terminator.
dataset = pd.read_csv(dataset_path, lineterminator='\n')
# Print the column names, dtypes and non-null counts for a quick sanity check.
dataset.info()

# Data imputation

In [None]:
# Missing descriptions become empty strings so they can be concatenated later.
dataset["description"] = dataset["description"].fillna("")

# Data split (train, val, test)

In [None]:
# Row budget: TRAIN_SPLIT of all rows is set aside for train + validation,
# VAL_SPLIT of that share is then moved from train to validation, and
# whatever is left over becomes the test set.
train_len = int(len(dataset) * TRAIN_SPLIT)
val_len = int(train_len * VAL_SPLIT)
train_len = train_len - val_len
test_len = len(dataset) - (train_len + val_len)
print(f"Train: {train_len}, Val: {val_len} & Test: {test_len}")

# Rows are cut in file order (nothing is shuffled here) at two boundaries:
# [0, train_len) -> train, [train_len, train_len + val_len) -> val, rest -> test.
# Every column except the last is a feature; the last column is the label.
X_train, X_val, X_test = np.split(
    dataset.iloc[:, :-1].values, [train_len, train_len + val_len]
)
y_train, y_val, y_test = np.split(
    dataset.iloc[:, -1].values, [train_len, train_len + val_len]
)

# Divide data into graph & nlp

In [None]:
def divide_dataset(
    dataset: np.ndarray,
    encoder: Optional[LabelEncoder] = None,
) -> Tuple[tf.Tensor, np.ndarray]:
    """Split a feature matrix into its graph part and its text part.

    Columns 1, 2, 3, 4 and 8 form the graph features; columns 5, 6 and 7 are
    returned untouched as the text features. The second graph column (source
    column 2) is label-encoded to integers before the graph part is converted
    to an int32 tensor.

    Args:
        dataset: 2-D array of feature rows with at least nine columns.
        encoder: Optional ``LabelEncoder`` used for the encoded column. If it
            has already been fitted it is only applied (``transform``), which
            lets callers fit on one split and reuse the exact same mapping on
            the others. If omitted, or not yet fitted, the encoder is fitted
            on this call's data, which is the original behaviour.

    Returns:
        ``(graph_X, nlp_X)``: the int32 graph tensor and the text columns.
    """
    # Indexing with a list of columns copies the data, so the assignment
    # below never writes back into the caller's array.
    graph_X = dataset[:, [1, 2, 3, 4, 8]]
    nlp_X = dataset[:, [5, 6, 7]]

    if encoder is None:
        encoder = LabelEncoder()

    # A fitted scikit-learn estimator exposes `classes_`; reuse that mapping
    # instead of refitting, because a refit on a split that lacks one of the
    # classes would assign different integers to the same value.
    if hasattr(encoder, "classes_"):
        graph_X[:, 1] = encoder.transform(graph_X[:, 1])
    else:
        graph_X[:, 1] = encoder.fit_transform(graph_X[:, 1])

    return tf.constant(graph_X, dtype='int32'), nlp_X

def prepare_nlp_ds(nlp_dataset: np.ndarray) -> List[str]:
    """Join the first and third text column of every row with a single space.

    The middle column (index 1) is not used.
    """
    return [record[0] + " " + record[2] for record in nlp_dataset]

In [None]:
# Separate each partition (train, val, test - in that order) into its
# graph-feature tensor and its raw text columns.
(graph_train, nlp_train), (graph_val, nlp_val), (graph_test, nlp_test) = (
    divide_dataset(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Notebook display: shape of the training graph-feature tensor.
graph_train.shape

# Batch encode

In [None]:
def batch_encode(
    tokenizer: DistilBertTokenizerFast,
    nlp_dataset: np.ndarray,
) -> Tuple[tf.Tensor, tf.Tensor]:
    """Tokenize the text columns in chunks of BATCH_SIZE rows.

    Each chunk is first flattened to one string per row by ``prepare_nlp_ds``
    and then tokenized with padding and truncation to exactly MAX_LENGTH
    tokens.

    Args:
        tokenizer: Tokenizer used to encode the joined strings.
        nlp_dataset: Text columns as produced by ``divide_dataset``.

    Returns:
        ``(input_ids, attention_mask)`` tensors with one row per input row.
    """
    input_ids = []
    attention_mask = []
    for start in range(0, len(nlp_dataset), BATCH_SIZE):
        batch = prepare_nlp_ds(nlp_dataset[start:start + BATCH_SIZE])
        tokens = tokenizer(
            batch,
            padding='max_length',
            truncation=True,
            # Passed explicitly so the padded width always equals MAX_LENGTH,
            # the width build_model declares for its inputs, instead of
            # depending on the tokenizer's own default maximum length.
            max_length=MAX_LENGTH,
            return_attention_mask=True,
            return_token_type_ids=False,
        )
        input_ids.extend(tokens["input_ids"])
        attention_mask.extend(tokens["attention_mask"])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

# Load the fast DistilBERT tokenizer for the configured checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(transformer_checkpoint)

In [None]:
# Tokenize the text of each partition (train, val, test - in that order) into
# token-id and attention-mask tensors.
(
    (nlp_train_ids, nlp_train_attention),
    (nlp_val_ids, nlp_val_attention),
    (nlp_test_ids, nlp_test_attention),
) = (batch_encode(tokenizer, split) for split in (nlp_train, nlp_val, nlp_test))

In [None]:
# Load the pretrained DistilBERT encoder and mark every one of its layers as
# non-trainable, so fitting the combined model leaves its weights unchanged.
distilBERT = TFDistilBertModel.from_pretrained(transformer_checkpoint)
for layer in distilBERT.layers:
    layer.trainable = False

In [None]:
def build_model(transformer: TFDistilBertModel) -> tf.keras.Model:
    """Assemble the two-branch binary classifier.

    The text branch feeds token ids and the attention mask through
    ``transformer`` and keeps the vector at sequence position 0 of its first
    output (presumably the [CLS] embedding of the last hidden state). The
    graph branch sends the GRAPH_DIM integer features through a 5-unit ReLU
    layer. Both branches are concatenated and mapped to one sigmoid unit.

    Args:
        transformer: Encoder applied to the token-id and attention inputs.

    Returns:
        A Keras model taking ``[input_ids, input_attention, input_graph_stats]``.
    """
    layers = tf.keras.layers

    # Inputs, declared in the order the model expects them.
    token_ids = layers.Input(shape=(MAX_LENGTH,), name='input_ids', dtype='int32')
    attention = layers.Input(shape=(MAX_LENGTH,), name='input_attention', dtype='int32')
    graph_stats = layers.Input(shape=(GRAPH_DIM,), name='input_graph_stats', dtype='int32')

    # Text branch: first element of the transformer output, position 0 only.
    hidden_states = transformer([token_ids, attention])[0]
    cls_embedding = hidden_states[:, 0, :]

    # Graph branch, then fusion of both branches and the sigmoid head.
    graph_branch = layers.Dense(5, activation='relu')(graph_stats)
    merged = layers.concatenate([graph_branch, cls_embedding])
    prediction = layers.Dense(1, activation='sigmoid')(merged)

    return tf.keras.Model(inputs=[token_ids, attention, graph_stats], outputs=prediction)

In [None]:
# Build the classifier around the DistilBERT encoder and configure it for
# binary classification: Adam optimizer, binary cross-entropy loss (matching
# the single sigmoid output) and accuracy as the reported metric.
model = build_model(distilBERT)
model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),  # due to some issues with Apple M1
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)
# Print the layer-by-layer summary of the assembled model.
model.summary()

# Tensorflow dataset

In [None]:
# Each element is ((token ids, attention mask, graph features), label), with
# the inputs in the same order as the model's input list. Elements are cached,
# grouped into batches of BATCH_SIZE and prefetched; no shuffling is applied.
train_ds = (
    tf.data.Dataset.from_tensor_slices(
        ((nlp_train_ids, nlp_train_attention, graph_train), y_train)
    )
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
val_ds = (
    tf.data.Dataset.from_tensor_slices(
        ((nlp_val_ids, nlp_val_attention, graph_val), y_val)
    )
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Train for EPOCHS epochs inside an explicit '/GPU:0' device scope, using
# val_ds as validation data; the object returned by fit is kept in `history`.
with tf.device('/GPU:0'):
    history = model.fit(
        x=train_ds,
        epochs=EPOCHS,
        validation_data=val_ds
    )