# Imports & constants

In [1]:
from pathlib import Path
from os.path import abspath
from typing import Tuple, List

import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from livelossplot import PlotLossesKeras
from scipy.stats import ttest_rel
from tabulate import tabulate

# Hugging Face checkpoint used for both the tokenizer and the encoder.
transformer_checkpoint = 'distilbert-base-uncased'
# The CSV lives two directories above the current working directory, in raw/.
dataset_path = Path(abspath('.')).parent.parent / "raw" / "dataset.csv"

BATCH_SIZE = 128   # rows per tokenisation chunk and per tf.data batch
MAX_LENGTH = 512   # maximum token sequence length
EPOCHS = 80        # upper bound on training epochs
GRAPH_DIM = 5      # width of the graph-feature input of the model

SEED = 42

# Seed Python's `random`, NumPy and TensorFlow together so that weight
# initialisation and dropout are repeatable across Restart & Run All;
# without this only the train/val/test split was deterministic.
tf.keras.utils.set_random_seed(SEED)

# Dataset

In [None]:
# Read the raw CSV, treating only '\n' as the record separator.
dataset = pd.read_csv(
    dataset_path,
    lineterminator='\n',
)
# Column names, dtypes and non-null counts.
dataset.info()

# Data imputation

In [None]:
# Missing descriptions become empty strings so later string concatenation works.
dataset["description"] = dataset["description"].fillna("")

# Data split (train, val, test)

In [None]:
# Every column but the last is a feature; the last column is the label.
features = dataset.iloc[:, :-1].values
labels = dataset.iloc[:, -1].values

# 80 % train, 20 % held out
X_train, X_holdout, y_train, y_holdout = train_test_split(
    features, labels, test_size=.2, random_state=SEED
)

# the held-out part is halved into test and validation
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=.5, random_state=SEED
)

# Divide data into graph & nlp

In [None]:
def divide_dataset(dataset: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]:
    """Split a feature matrix into its graph part and its text part.

    Args:
        dataset: 2-D array of feature rows; columns are addressed by position.

    Returns:
        ``(graph_X, nlp_X)`` where ``graph_X`` is an int32 tensor built from
        columns 1, 2, 3, 4 and 8 (column 2 label-encoded) and ``nlp_X`` is the
        untouched array of columns 5, 6 and 7.
    """

    bool_encoder = LabelEncoder()
    # Fancy indexing copies, so the caller's array is not modified below.
    graph_X = dataset[:, [1, 2, 3, 4, 8]]
    nlp_X = dataset[:, [5, 6, 7]]
    # NOTE(review): the encoder is fitted on each split separately; if a split
    # held only one distinct value its code could differ from the other
    # splits - confirm every split contains all values of this column.
    graph_X[:, 1] = bool_encoder.fit_transform(graph_X[:, 1])

    return tf.constant(graph_X, dtype='int32'), nlp_X

def prepare_nlp_ds(nlp_dataset: np.ndarray) -> List[str]:
    """Join the first and third field of every row with a single space.

    The middle field (index 1) of each row is ignored.
    """
    return [record[0] + " " + record[2] for record in nlp_dataset]

In [None]:
# Apply the same graph/text column split to train, validation and test, in that order.
(graph_train, nlp_train), (graph_val, nlp_val), (graph_test, nlp_test) = map(
    divide_dataset, (X_train, X_val, X_test)
)

In [None]:
# Sanity check: shape of the graph-feature tensor for the training split.
graph_train.shape

# Batch encode

In [None]:
def batch_encode(tokenizer: DistilBertTokenizerFast, nlp_dataset: np.ndarray) -> Tuple[tf.Tensor, tf.Tensor]:
    """Tokenise the text columns of a split in chunks of ``BATCH_SIZE`` rows.

    Args:
        tokenizer: tokenizer called on each chunk of prepared strings.
        nlp_dataset: text part of a split, as returned by ``divide_dataset``.

    Returns:
        ``(input_ids, attention_mask)`` tensors covering every row of
        ``nlp_dataset``, each sequence padded/truncated to ``MAX_LENGTH``.
    """
    input_ids = []
    attention_mask = []
    for start in range(0, len(nlp_dataset), BATCH_SIZE):
        batch = prepare_nlp_ds(nlp_dataset[start: start + BATCH_SIZE])
        tokens = tokenizer(
            batch,
            padding='max_length',
            truncation=True,
            # Be explicit instead of relying on the tokenizer's built-in
            # model_max_length, so every chunk has the same width.
            max_length=MAX_LENGTH,
            return_attention_mask=True,
            return_token_type_ids=False,
        )
        input_ids.extend(tokens["input_ids"])
        attention_mask.extend(tokens["attention_mask"])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

# Fast tokenizer that belongs to the checkpoint named in `transformer_checkpoint`.
tokenizer = DistilBertTokenizerFast.from_pretrained(transformer_checkpoint)

In [None]:
def get_distill_bert() -> TFDistilBertModel:
    """Load the pretrained checkpoint with every top-level layer frozen."""
    frozen_bert = TFDistilBertModel.from_pretrained(transformer_checkpoint)

    # The encoder is used as a fixed feature extractor, never fine-tuned.
    for sub_layer in frozen_bert.layers:
        sub_layer.trainable = False

    return frozen_bert

In [None]:
# Tokenise train, validation and test text (in that order) with the shared tokenizer.
(
    (nlp_train_ids, nlp_train_attention),
    (nlp_val_ids, nlp_val_attention),
    (nlp_test_ids, nlp_test_attention),
) = (batch_encode(tokenizer, text_split) for text_split in (nlp_train, nlp_val, nlp_test))

In [None]:
def _token_batches(ids: tf.Tensor, attention: tf.Tensor) -> tf.data.Dataset:
    """Batch one split's (input_ids, attention_mask) pairs for the encoder."""
    pairs = tf.data.Dataset.from_tensor_slices((ids, attention))
    return pairs.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


def _cls_embeddings(bert: TFDistilBertModel, batches: tf.data.Dataset) -> np.ndarray:
    """Run `bert` over `batches` and stack the first-token vector of each row."""
    # Start from an empty float64 (0, 768) array: the result keeps that dtype
    # and shape contract even when the split has no rows.
    chunks = [np.empty((0, 768))]
    for batch_ids, batch_attention in batches:
        hidden_states = bert([batch_ids, batch_attention])[0]
        # Position 0 of every sequence is the [CLS] token.
        chunks.append(hidden_states[:, 0, :].numpy())
    # One concatenation at the end instead of np.append per batch, which
    # re-copied everything accumulated so far on every iteration.
    return np.concatenate(chunks, axis=0)


def save_embeddings() -> None:
    """Embed the train, val and test splits and save each as ``<split>.embd.npy``.

    Reads the tokenised splits from the notebook globals and writes the files
    to the current working directory (``np.save`` appends the ``.npy`` suffix).
    """
    bert = get_distill_bert()
    # Datasets are built outside the device scope; only the encoder forward
    # passes run inside it.
    splits = [
        ("train", _token_batches(nlp_train_ids, nlp_train_attention)),
        ("val", _token_batches(nlp_val_ids, nlp_val_attention)),
        ("test", _token_batches(nlp_test_ids, nlp_test_attention)),
    ]
    with tf.device('/GPU:0'):
        for split_name, batches in splits:
            print(f"Embedding {split_name} dataset")
            np.save(f'{split_name}.embd', _cls_embeddings(bert, batches))

In [None]:
# Compute the embeddings for all three splits and write them to disk.
save_embeddings()

In [None]:
def build_model(with_grap_meta: bool = False) -> tf.keras.Model:
    """Build the binary classifier head that sits on top of the saved embeddings.

    Args:
        with_grap_meta: when True the model takes a second input of width
            ``GRAPH_DIM`` that is concatenated to the embedding branch.

    Returns:
        An uncompiled Keras model with a single sigmoid output unit.
    """
    layers = tf.keras.layers

    embedding_in = layers.Input(shape=(768,), name='input_embedding_layer', dtype='float32')
    hidden = layers.Dense(64, activation='relu')(embedding_in)
    hidden = layers.BatchNormalization()(hidden)

    if with_grap_meta:
        graph_in = layers.Input(shape=(GRAPH_DIM,), name='input_graph_stats', dtype='float32')
        hidden = layers.concatenate([hidden, graph_in])
        model_inputs = [embedding_in, graph_in]
    else:
        model_inputs = embedding_in

    hidden = layers.Dense(256, activation='relu')(hidden)
    hidden = layers.Dropout(.1)(hidden)
    hidden = layers.Flatten()(hidden)
    # This layer is created without an activation argument.
    hidden = layers.Dense(40)(hidden)
    hidden = layers.Dropout(.1)(hidden)
    prediction = layers.Dense(1, activation='sigmoid')(hidden)

    return tf.keras.Model(inputs=model_inputs, outputs=prediction)


# Graph model

In [None]:
# Classifier that sees both the text embedding and the graph features.
graph_model = build_model(with_grap_meta=True)

graph_metrics = ['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()]
graph_model.compile(
    # legacy optimizer: due to some issues with Apple M1
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=graph_metrics,
)
graph_model.summary()

In [None]:
# Reload the embeddings written to disk earlier, one array per split.
train_embeddings, val_embeddings, test_embeddings = (
    np.load(embedding_file)
    for embedding_file in ('train.embd.npy', 'val.embd.npy', 'test.embd.npy')
)

train_embeddings.shape

In [None]:
# Each element is ((embedding, graph_features), label).
# Train and validation pipelines are cached, batched and prefetched here.
graph_train_ds = (
    tf.data.Dataset.from_tensor_slices(((train_embeddings, graph_train), y_train))
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
graph_val_ds = (
    tf.data.Dataset.from_tensor_slices(((val_embeddings, graph_val), y_val))
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# The test pipeline is left unbatched at this point.
graph_test_ds = tf.data.Dataset.from_tensor_slices(((test_embeddings, graph_test), y_test))

In [None]:
# Stop once val_loss has not improved by at least 0.001 for 15 epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    min_delta=.001,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.2 when the monitored metric plateaus.
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(factor=.2)
callbacks = [early_stopping, lr_on_plateau, PlotLossesKeras()]

with tf.device('/GPU:0'):
    graph_history = graph_model.fit(
        graph_train_ds,
        validation_data=graph_val_ds,
        epochs=EPOCHS,
        callbacks=callbacks,
    )

In [None]:
# Persist the trained embedding + graph model under the name 'best_graph'.
graph_model.save('best_graph')

In [None]:
# The test dataset was left unbatched when it was built, so batch it now.
graph_test_batches = graph_test_ds.batch(BATCH_SIZE)
with tf.device('/GPU:0'):
    graph_model.evaluate(graph_test_batches)

# NLP model

In [None]:
# Classifier that sees the text embedding only (no graph input).
nlp_model = build_model(with_grap_meta=False)

nlp_metrics = ['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()]
nlp_model.compile(
    # legacy optimizer: due to some issues with Apple M1
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=nlp_metrics,
)
nlp_model.summary()


In [None]:
# Fresh callback instances for this run, with the same settings as the graph model's.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    min_delta=.001,
    restore_best_weights=True,
)
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(factor=.2)
callbacks = [early_stopping, lr_on_plateau, PlotLossesKeras()]

# Each element is (embedding, label); train/val are cached, batched and prefetched.
nlp_train_ds = (
    tf.data.Dataset.from_tensor_slices((train_embeddings, y_train))
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
nlp_val_ds = (
    tf.data.Dataset.from_tensor_slices((val_embeddings, y_val))
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# The test pipeline is left unbatched at this point.
nlp_test_ds = tf.data.Dataset.from_tensor_slices((test_embeddings, y_test))

with tf.device('/GPU:0'):
    nlp_history = nlp_model.fit(
        nlp_train_ds,
        validation_data=nlp_val_ds,
        epochs=EPOCHS,
        callbacks=callbacks,
    )

In [None]:
# Persist the trained embedding-only model under the name 'best_nlp'.
nlp_model.save('best_nlp')

# NLP model evaluation

In [None]:
# The test dataset was left unbatched when it was built, so batch it now.
nlp_test_batches = nlp_test_ds.batch(BATCH_SIZE)
with tf.device('/GPU:0'):
    nlp_model.evaluate(nlp_test_batches)

# Results

In [9]:
# Display names of the compared models; row i of `scores` belongs to models[i].
models = ['distilBERT + none', 'bert none', 'distilBERT + graph', 'bert + graph', 'lstm unidirectional', 'lstm bidirectional', 'lstm bidirectional + dense64', 'lstm bidirectional + graph']
# Three hand-entered scores per model, paired column-by-column in the t-tests.
# NOTE(review): the meaning of the three columns is not stated in this
# notebook - presumably test-set metrics; confirm and label them.
scores = np.array([
    [0.87052, 0.8498, 0.7115],
    [0.8669, 0.8596, 0.6863],
    [0.8663, 0.8454, 0.6948],
    [0.8474, 0.8495, 0.6771],
    [0.8945, 0.84120, 0.8169],
    [0.8971, 0.84257, 0.82544],
    [0.8894, 0.8250, 0.8202],
    [0.89017, 0.8211, 0.82938],
])

# Statistics

In [23]:
def calculate_t_statistics(scores: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Run a paired t-test between every ordered pair of rows of ``scores``.

    Args:
        scores: 2-D array with one row of scores per model.

    Returns:
        ``(t_statistic, p_value)``, two square arrays with one row and column
        per row of ``scores``. Entry ``[i, j]`` comes from
        ``ttest_rel(scores[i], scores[j])``. The diagonal is NaN because a
        model cannot be compared with itself.
    """
    # Size the matrices from the argument, not from the global `models`
    # list, so the function works for any number of rows.
    n_models = len(scores)
    t_statistic = np.full((n_models, n_models), np.nan)
    p_value = np.full((n_models, n_models), np.nan)

    for i in range(n_models):
        for j in range(n_models):
            if i == j:
                # A self-comparison has all-zero differences (0/0); keep NaN
                # explicitly rather than provoking the degenerate computation.
                continue
            t_statistic[i, j], p_value[i, j] = ttest_rel(scores[i], scores[j])

    return t_statistic, p_value

def print_statistics(scores: np.ndarray) -> np.ndarray:
    """Print pairwise t-test tables for ``models`` and return the significance mask.

    Four tables are printed: t-statistics, p-values, an "advantage" mask
    (1 where the t-statistic is positive) and a significance mask (1 where
    the p-value is at most 0.05).

    Returns:
        The significance mask as a square float array of zeros and ones.
    """
    alpha = .05
    row_labels = [[name] for name in models]
    t_statistics, p_value = calculate_t_statistics(scores)

    def labelled(matrix: np.ndarray) -> np.ndarray:
        # Prepend the model name to every row so tabulate shows a label column.
        return np.concatenate((row_labels, matrix), axis=1)

    t_text = tabulate(labelled(t_statistics), models, floatfmt=".2f")
    p_text = tabulate(labelled(p_value), models, floatfmt=".2f")
    print("t-statistic:\n", t_text, "\n\np-value\n", p_text)

    # NaN entries compare False in both masks below, so they stay 0.
    advantage = np.where(t_statistics > 0, 1.0, 0.0)
    print("Advantage:\n", tabulate(labelled(advantage), models))

    significance = np.where(p_value <= alpha, 1.0, 0.0)
    print("Statistical significance (alpha = 0.05):\n", tabulate(labelled(significance), models))

    return significance


In [24]:
# Print the comparison tables and keep the 0/1 significance mask.
significance = print_statistics(scores)

# One column per model: column i of the mask, keyed by the model's name.
raw_df = dict(zip(models, significance.T))
df = pd.DataFrame(raw_df, index=models, columns=models)
df.to_csv('wyniki.csv')

t-statistic:
                                 distilBERT + none    bert none    distilBERT + graph    bert + graph    lstm unidirectional    lstm bidirectional    lstm bidirectional + dense64    lstm bidirectional + graph
----------------------------  -------------------  -----------  --------------------  --------------  ---------------------  --------------------  ------------------------------  ----------------------------
distilBERT + none                          nan            0.62                  2.04            1.92                  -1.19                 -1.23                           -0.87                         -0.84
bert none                                   -0.62       nan                     0.32            3.93                  -1.06                 -1.10                           -0.82                         -0.80
distilBERT + graph                          -2.04        -0.32                nan               1.45                  -1.29                 -1.32         