# RATIO 2019 - Benchmarking Workshop

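Setup: install MXNet and GluonNLP (see https://gluon-nlp.mxnet.io/install.html), then download the BERT sentence-embedding scripts and link them as `bert`: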

```
pip install --upgrade 'mxnet>=1.3.0'
pip install gluonnlp
wget https://gluon-nlp.mxnet.io/_downloads/sentence_embedding.zip
unzip sentence_embedding.zip
ln -s sentence_embedding/bert bert
```

In [1]:
import datetime
import logging
import os
import random
import time
import warnings

import csv
import gluonnlp as nlp
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
import seaborn as sns

from bert import *
from mxboard import SummaryWriter
from mxnet import gluon
from mxnet.gluon.data import Dataset, SimpleDataset
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn import utils
from tqdm import tqdm

In [2]:
%matplotlib inline

In [3]:
warnings.filterwarnings('ignore')

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

In [4]:
# set repeatable random state
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)

In [5]:
# apply progress bars for pandas .apply() -> .progress_apply()
tqdm.pandas()

In [6]:
# make tqdm jupyter friendly
from tqdm import tqdm_notebook as tqdm
# for .progress_apply() we have to hack it like this?
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
class Timer:
    """Context manager that prints the wall-clock time spent in its block.

    Parameters
    ----------
    name : str or None
        Optional label that is included in the printed message.

    Attributes
    ----------
    time_start : float or None
        `time.time()` value taken when the block was entered.
    time_delta : datetime.timedelta or None
        Duration of the block, available after the block has been left.
    """

    def __init__(self, name=None):
        self.name = name
        self.time_start = None
        self.time_delta = None

    def __enter__(self):
        self.time_start = time.time()
        # Return the instance so `with Timer(...) as t:` binds the timer
        # (previously `t` was None).
        return self

    def __exit__(self, *exc):
        time_end = time.time()
        # Keep the measurement on the instance so callers can reuse it.
        self.time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, self.time_delta)))
        else:
            print(("Time: {}".format(self.time_delta)))
        # Implicitly returns None: exceptions raised in the block propagate.

# Task 1 - Same Side Classification

In [8]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'
new_within_test = 'data/same-side-classification/within-topic/within_test.csv'

### Load within-topics and cross-topics data

In [9]:
with Timer("read cross"):
    cross_traindev_df = pd.read_csv(data_cross_path.format('training'),
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL,
                                    encoding='utf-8',
                                    escapechar='\\',
                                    doublequote=False,
                                    index_col='id')
    cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id')

with Timer("read within"):
    within_traindev_df = pd.read_csv(data_within_path.format('training'),
                                     quotechar='"',
                                     quoting=csv.QUOTE_ALL,
                                     encoding='utf-8',
                                     escapechar='\\',
                                     doublequote=False,
                                     index_col='id')
    # within_test_df = pd.read_csv(data_within_path.format('test'),
    #                              quotechar='"',
    #                              quoting=csv.QUOTE_ALL,
    #                              encoding='utf-8',
    #                              escapechar='\\',
    #                              doublequote=True,  # <-- change, "" as quote escape in text?
    #                              index_col='id')
    within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id')

with Timer("read new within"):
    new_within_test_df = pd.read_csv(new_within_test, index_col='id')

Time for [read cross]: 0:00:00.864055
Time for [read within]: 0:00:00.845692
Time for [read new within]: 0:00:00.364691


In [None]:
! head -n 5 data/same-side-classification/within-topic/test.csv

In [None]:
! head -n 5 data/same-side-classification/within-topic/within_test.csv

In [10]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    """Derive the coarse topic tag of a row from its 'topic' field.

    The topic is lower-cased and stripped, then searched for the two topics
    in focus. "abortion" takes precedence over "gay marriage"; rows matching
    neither get the tag 'NA'. The tag is written to `row['tag']` and the
    same (mutated) row is returned.
    """
    normalized_topic = row['topic'].lower().strip()

    tag = 'NA'
    # Order matters: the first matching topic wins.
    for known_topic in ('abortion', 'gay marriage'):
        if known_topic in normalized_topic:
            tag = known_topic
            break

    row['tag'] = tag
    return row


with Timer("tag cross traindev"):
    cross_traindev_df = cross_traindev_df.progress_apply(add_tag, axis=1)
with Timer("tag cross test"):
    cross_test_df = cross_test_df.progress_apply(add_tag, axis=1)

with Timer("tag within traindev"):
    within_traindev_df = within_traindev_df.progress_apply(add_tag, axis=1)
with Timer("tag within test"):
    within_test_df = within_test_df.progress_apply(add_tag, axis=1)
with Timer("tag new within test"):
    new_within_test_df = new_within_test_df.progress_apply(add_tag, axis=1)

HBox(children=(IntProgress(value=0, max=61048), HTML(value='')))


Time for [tag cross traindev]: 0:00:32.645155


HBox(children=(IntProgress(value=0, max=6163), HTML(value='')))


Time for [tag cross test]: 0:00:03.295782


HBox(children=(IntProgress(value=0, max=63903), HTML(value='')))


Time for [tag within traindev]: 0:00:34.389123


HBox(children=(IntProgress(value=0, max=3552), HTML(value='')))


Time for [tag within test]: 0:00:01.933077


HBox(children=(IntProgress(value=0, max=31475), HTML(value='')))


Time for [tag new within test]: 0:00:16.910464


### Load artificial evalset

In [11]:
fn_art_eval = "data/artificial_evalset/artificial_evalset.tsv"

In [12]:
# `pd.DataFrame.from_csv` was deprecated and later removed from pandas;
# `pd.read_csv` is the supported way to read the tab-separated file.
artificial_evalset_df = pd.read_csv(fn_art_eval, sep='\t', index_col=None)

# Rename the third column to "type"; the other column names are kept.
new_cols = artificial_evalset_df.columns.to_list()
new_cols[2] = "type"
artificial_evalset_df.columns = new_cols

In [13]:
def fix_cols(row):
    """Add the id and topic fields the same-side pipeline expects.

    `argument1_id` is the row's `arg_id`, `argument2_id` is
    "<arg_id>-<type>", and `topic` is set to the constant "gay marriage".
    The row is modified in place and returned.
    """
    source_id = row['arg_id']
    row["argument1_id"] = source_id
    # The second argument is a variant of the first; its id carries the type.
    row["argument2_id"] = "{}-{}".format(source_id, row['type'])
    row["topic"] = "gay marriage"
    return row

artificial_evalset_df = artificial_evalset_df.apply(fix_cols, axis=1)

In [14]:
def add_artificial_tag(row):
    """Tag a row of the artificial evalset with the constant tag "gay marriage".

    Deliberately not named `add_tag`: a second `add_tag` definition would
    shadow the topic-based `add_tag` defined earlier in this notebook, and
    every later `add_tag` call on other datasets would then label all rows
    "gay marriage" regardless of their topic.
    """
    row["tag"] = "gay marriage"
    return row

artificial_evalset_df = artificial_evalset_df.apply(add_artificial_tag, axis=1)

In [None]:
# NOTE(review): `tokenize_arguments`, `sentenize_arguments` and `get_overview`
# are defined in later cells of this notebook (the first two only when
# `stats` is True), so this cell fails when the notebook is run top to
# bottom on a fresh kernel.
artificial_evalset_df = artificial_evalset_df.apply(tokenize_arguments, axis=1)
artificial_evalset_df = artificial_evalset_df.apply(sentenize_arguments, axis=1)

get_overview(artificial_evalset_df)

artificial_evalset_df.describe()

### Get an overview about each dataset

In [15]:
# requires nltk  wordtokenize
# from nltk.tokenize import sent_tokenize, word_tokenize
# model uses BERT Tokenizer ...

def get_overview(df, task='same-side', class_name='is_same_side'):
    """Print instance counts and argument-uniqueness figures for a dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tag', 'argument1' and 'argument2'.
    task : str
        Label printed in the header line.
    class_name : str
        Name of the label column. Per-class counts are printed only when
        this column is present in `df`.

    Returns
    -------
    None
        All results are reported via `print`.

    Notes
    -----
    The word/sentence length statistics that used to follow a bare `return`
    were unreachable (and relied on nltk names not imported at this point),
    so they have been dropped here.
    """
    has_labels = class_name in df.columns

    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', len(df))
    print('\n')

    print('For each topic:')
    # Group by the column name, not a one-element list: with a list, newer
    # pandas yields 1-tuples as keys, which would print as "('abortion',)".
    for tag, tag_df in df.groupby('tag'):
        print(tag, ': ', len(tag_df), ' instances')
        if has_labels:
            for is_same_side, side_df in tag_df.groupby(class_name):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if has_labels:
        print('For each class value:')
        for class_value, class_df in df.groupby(class_name):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    # Uniqueness across both columns: an argument may occur on either side.
    arguments = np.concatenate([df['argument1'].values, df['argument2'].values])
    print('Unique total arguments:', len(set(list(arguments))), '\n')

In [None]:
with Timer("overview cross"):
    get_overview(cross_traindev_df)

In [None]:
with Timer("overview within"):
    get_overview(within_traindev_df)

##### Count raw length

In [None]:
def compute_arg_len(row):
    """Add raw (character) length statistics for both arguments to `row`.

    Adds `argument1_len`, `argument2_len`, their signed difference
    `argument12_len_diff` (argument1 minus argument2) and its absolute
    value `argument12_len_diff_abs`. The row is modified and returned.
    """
    row['argument1_len'] = len(row['argument1'])
    row['argument2_len'] = len(row['argument2'])
    row['argument12_len_diff'] = row['argument1_len'] - row['argument2_len']
    # Fixed: the closing parenthesis was missing, which made the whole cell
    # a SyntaxError.
    row['argument12_len_diff_abs'] = np.abs(row['argument12_len_diff'])
    return row


cross_traindev_df = cross_traindev_df.progress_apply(compute_arg_len, axis=1)
within_traindev_df = within_traindev_df.progress_apply(compute_arg_len, axis=1)
cross_test_df = cross_test_df.progress_apply(compute_arg_len, axis=1)
within_test_df = within_test_df.progress_apply(compute_arg_len, axis=1)

In [None]:
cross_traindev_df.describe()

In [None]:
within_traindev_df.describe()

In [None]:
within_test_df.describe()

##### Tokenize and count tokens

In [16]:
stats = False

In [17]:
if stats:
    ctx = mx.cpu()
    _, vocabulary = nlp.model.get_model('bert_12_768_12',
                                        dataset_name='book_corpus_wiki_en_uncased',
                                        pretrained=True, ctx=ctx, use_pooler=True,
                                        use_decoder=False, use_classifier=False)
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    tokenizer = bert_tokenizer

In [18]:
if stats:
    from nltk.tokenize import sent_tokenize, word_tokenize
    # nltk.download('punct')


    # tokenizer from BERT
    def tokenize_arguments(row):
        """Tokenize both arguments of a row and add token-count statistics.

        Uses the notebook-level `tokenizer`. Adds the token lists, their
        lengths, and the sum, half-sum, signed difference and absolute
        difference of the two lengths to `row`, which is returned.
        """
        sides = ('argument1', 'argument2')

        # token lists first, then their counts (keeps the column order)
        for side in sides:
            row[side + '_tokens'] = tokenizer(row[side])
        for side in sides:
            row[side + '_len'] = len(row[side + '_tokens'])

        # statistics derived from the two token counts
        total = row['argument1_len'] + row['argument2_len']
        row['argument12_len_sum'] = total
        row['argument12_len_sum_half'] = row['argument12_len_sum'] / 2
        delta = row['argument1_len'] - row['argument2_len']
        row['argument12_len_diff'] = delta
        row['argument12_len_diff_abs'] = np.abs(row['argument12_len_diff'])
        return row


    cross_traindev_df = cross_traindev_df.progress_apply(tokenize_arguments, axis=1)
    within_traindev_df = within_traindev_df.progress_apply(tokenize_arguments, axis=1)
    cross_test_df = cross_test_df.progress_apply(tokenize_arguments, axis=1)
    within_test_df = within_test_df.progress_apply(tokenize_arguments, axis=1)

In [19]:
if stats:
    # [len(sent_tokenize(x)) for x in df['argument2'].values])
    from nltk.tokenize import sent_tokenize, word_tokenize
    # nltk.download('punct')


    # tokenizer from BERT
    def sentenize_arguments(row):
        """Split both arguments into sentences and add sentence-count statistics.

        Uses nltk's `sent_tokenize`. Adds the sentence lists, their counts,
        and the sum, half-sum, signed difference and absolute difference of
        the two counts to `row`, which is returned.
        """
        sides = ('argument1', 'argument2')

        # sentence lists first, then their counts (keeps the column order)
        for side in sides:
            row[side + '_sentences'] = sent_tokenize(row[side])
        for side in sides:
            row[side + '_sent_num'] = len(row[side + '_sentences'])

        # statistics derived from the two sentence counts
        total = row['argument1_sent_num'] + row['argument2_sent_num']
        row['argument12_sent_num_sum'] = total
        row['argument12_sent_num_sum_half'] = row['argument12_sent_num_sum'] / 2
        delta = row['argument1_sent_num'] - row['argument2_sent_num']
        row['argument12_sent_num_diff'] = delta
        row['argument12_sent_num_diff_abs'] = np.abs(row['argument12_sent_num_diff'])
        return row


    cross_traindev_df = cross_traindev_df.progress_apply(sentenize_arguments, axis=1)
    within_traindev_df = within_traindev_df.progress_apply(sentenize_arguments, axis=1)
    cross_test_df = cross_test_df.progress_apply(sentenize_arguments, axis=1)
    within_test_df = within_test_df.progress_apply(sentenize_arguments, axis=1)

In [20]:
if stats and False:
    import pickle

    with open("data/same-side-classification/cross_traindev_df_stats.p", "wb") as fp:
        pickle.dump(cross_traindev_df, fp, pickle.HIGHEST_PROTOCOL)
    with open("data/same-side-classification/within_traindev_df_stats.p", "wb") as fp:
        pickle.dump(within_traindev_df, fp, pickle.HIGHEST_PROTOCOL)
    with open("data/same-side-classification/cross_test_df_stats.p", "wb") as fp:
        pickle.dump(cross_test_df, fp, pickle.HIGHEST_PROTOCOL)
    with open("data/same-side-classification/within_test_df_stats.p", "wb") as fp:
        pickle.dump(within_test_df, fp, pickle.HIGHEST_PROTOCOL)

In [21]:
if stats:
    def get_overview(df, task='same-side', class_name='is_same_side'):
        """Print counts, uniqueness and length statistics for a dataset.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain 'tag', 'argument1', 'argument2' and the length
            columns 'argument1_len', 'argument2_len', 'argument1_sent_num'
            and 'argument2_sent_num'.
        task : str
            Label printed in the header line.
        class_name : str
            Name of the label column; per-class output is printed only when
            this column is present in `df`.

        Returns
        -------
        None
            All results are reported via `print`.
        """

        def print_unique(frame, prefix=''):
            # Unique arguments per column and across both columns.
            print(prefix + 'Unique argument1:', len(frame['argument1'].unique()))
            print(prefix + 'Unique argument2:', len(frame['argument2'].unique()))
            arguments = np.concatenate([frame['argument1'].values,
                                        frame['argument2'].values])
            print(prefix + 'Unique total arguments:', len(set(list(arguments))), '\n')

        def print_lengths(title, values, unit):
            # Min / max / mean of a list of per-argument lengths.
            print(title)
            print('\tshortest argument:', min(values), unit)
            print('\tlongest argument:', max(values), unit)
            print('\targument average length:', np.mean(values),
                  unit)

        has_labels = class_name in df.columns

        print("Task: ", task)
        print('=' * 40, '\n')

        print('Total instances: ', len(df))
        print('\n')

        print('For each topic:')
        # Group by the column name, not a one-element list: with a list,
        # newer pandas yields 1-tuples as keys ("('abortion',)").
        for tag, tag_df in df.groupby('tag'):
            print(tag, ': ', len(tag_df), ' instances')
            print('')
            print_unique(tag_df, prefix='\t\t')
            if has_labels:
                for is_same_side, side_df in tag_df.groupby(class_name):
                    print('\t\t', is_same_side, ': ', len(side_df), ' instances')
        print('\n')

        if has_labels:
            print('For each class value:')
            for class_value, class_df in df.groupby(class_name):
                print(class_value, ': ', len(class_df), ' instances')
                print_unique(class_df, prefix='\t\t')
            print('\n')

        print_unique(df)

        print('-' * 40, '\n')

        # Lengths of argument1 and argument2 are pooled into one list.
        arguments_length_lst = list(df['argument1_len'].values)
        arguments_length_lst.extend(list(df['argument2_len'].values))
        print_lengths('Words:', arguments_length_lst, ' words')

        arguments_sent_length_lst = list(df['argument1_sent_num'].values)
        arguments_sent_length_lst.extend(list(df['argument2_sent_num'].values))
        print_lengths('Sentences:', arguments_sent_length_lst, ' sentences')

In [22]:
if stats:
    # A bare expression inside an `if` block is not echoed by the notebook
    # (only a cell's last top-level expression is), so display it explicitly.
    display(cross_traindev_df.describe())

In [23]:
if stats:
    # A bare expression inside an `if` block is not echoed by the notebook
    # (only a cell's last top-level expression is), so display it explicitly.
    display(within_traindev_df.describe())

In [24]:
if stats:
    # A bare expression inside an `if` block is not echoed by the notebook
    # (only a cell's last top-level expression is), so display it explicitly.
    display(within_test_df.describe())

In [25]:
if stats:
    def plot_lengths(df, slicen=None, abs_diff=True, title=None):
        """Plot the lengths of both arguments and their difference.

        Parameters
        ----------
        df : pandas.DataFrame or None
            Needs the columns 'argument1_len', 'argument2_len' and
            'argument12_len_diff'. If None, a message is printed and
            nothing is plotted.
        slicen : slice or None
            Optional slice applied to all three series before plotting.
        abs_diff : bool
            Plot the absolute value of the difference when True.
        title : str or None
            Title of the upper plot; a default is used when empty/None.
        """
        if df is None:
            print("no lengths to plot")
            return

        diff_series = df['argument12_len_diff']
        if abs_diff:
            diff_series = np.abs(diff_series)

        series = [df['argument1_len'], df['argument2_len'], diff_series]
        if slicen is not None:
            series = [s[slicen] for s in series]
        lens_first, lens_second, diff_series = series

        positions = np.arange(len(lens_first))  # arange/linspace

        # upper panel: both length curves
        plt.subplot(2, 1, 1)
        for values, legend_label in ((lens_first, 'argument1'),
                                     (lens_second, 'argument2')):
            plt.plot(positions, values, label=legend_label)  # line: '-', 'o-', '.-'
        plt.legend()
        plt.title(title if title else 'Lengths of arguments')
        plt.ylabel('Lengths of arguments 1 and 2')

        # lower panel: (absolute) length difference
        plt.subplot(2, 1, 2)
        plt.plot(positions, diff_series)
        plt.xlabel('Index')
        plt.ylabel('Differences')

        plt.show()


    plot_lengths(within_traindev_df, slice(None, None, 500), title='Length of arguments within train/dev, every 500')
    plot_lengths(cross_traindev_df, slice(None, None, 500), title='Length of arguments cross train/dev, every 500')
    plot_lengths(within_test_df, slice(None, None, 1), title='Length of arguments within test')

## Train model - Baseline

In [26]:
names_columns_X = ['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']
names_columns_X2 = ['argument1', 'argument2', 'tag']
names_columns_y = ['is_same_side']

### train dev set - 70% 30%

In [27]:
def get_train_test_sets(df, ratio=0.30, random_state=1):
    """Shuffle and split `df` into train and held-out feature/label frames.

    Features are the `names_columns_X` columns and labels the
    `names_columns_y` columns of `df`. `ratio` is the share of rows that
    goes to the held-out part; `random_state` seeds the shuffle.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    features = df[names_columns_X]
    labels = df[names_columns_y]

    split_options = dict(test_size=ratio,
                         random_state=random_state,
                         shuffle=True)
    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        **split_options)
    return X_train, X_test, y_train, y_test

### distinct train dev sets

In [28]:
import pickle


def load_distinct_df_raw(name="within"):
    """Load the pickled train and dev objects of a "distinct" split.

    Reads `data/distinct_sets/<name>/<name>_train_arg_pickle.pkl` and the
    matching `..._dev_...` file.

    NOTE: unpickling can execute arbitrary code; only load files you trust.

    Returns
    -------
    tuple
        (train_df, dev_df), i.e. the two unpickled objects.
    """
    path_template = "data/distinct_sets/{name}/{name}_{mode}_arg_pickle.pkl"
    paths = [path_template.format(mode=mode, name=name)
             for mode in ("train", "dev")]

    loaded = []
    for path in paths:
        with open(path, "rb") as fp:
            loaded.append(pickle.load(fp))

    train_df, dev_df = loaded
    return train_df, dev_df


def load_distinct_data(name="within"):
    """Load a "distinct" split and separate features from labels.

    Returns
    -------
    tuple
        (X_train, X_dev, y_train, y_dev), where the X frames hold the
        `names_columns_X` columns and the y frames the `names_columns_y`
        columns of the loaded train/dev data.
    """
    train_df, dev_df = load_distinct_df_raw(name)

    # select the train columns first, then the dev columns
    (X_train, y_train), (X_dev, y_dev) = [
        (frame[names_columns_X], frame[names_columns_y])
        for frame in (train_df, dev_df)
    ]
    return X_train, X_dev, y_train, y_dev

In [None]:
distinct_within_train_df, distinct_within_dev_df = load_distinct_df_raw("within")

distinct_within_train_df = distinct_within_train_df.progress_apply(add_tag, axis=1)
distinct_within_dev_df = distinct_within_dev_df.progress_apply(add_tag, axis=1)

get_overview(distinct_within_train_df, task="same-side distinct within train")
get_overview(distinct_within_dev_df, task="same-side distinct within dev")

In [None]:
distinct_cross_train_df, distinct_cross_dev_df = load_distinct_df_raw("cross")

distinct_cross_train_df = distinct_cross_train_df.progress_apply(add_tag, axis=1)
distinct_cross_dev_df = distinct_cross_dev_df.progress_apply(add_tag, axis=1)

get_overview(distinct_cross_train_df, task="same-side distinct cross train")
get_overview(distinct_cross_dev_df, task="same-side distinct cross dev")

In [None]:
get_overview(artificial_evalset_df, task="same-side artificial evalset")

### within as a dev set for cross etc.

In [29]:
def split_within_by_topic(within_df):
    """Split a tagged within-topic frame into its two topics.

    Parameters
    ----------
    within_df : pandas.DataFrame
        Must contain a 'tag' column with the values "abortion" and
        "gay marriage", plus the `names_columns_X` and `names_columns_y`
        columns.

    Returns
    -------
    tuple
        (X_abortion, X_gay_marriage, y_abortion, y_gay_marriage)

    Raises
    ------
    KeyError
        If one of the two tags does not occur in `within_df`.
    """
    # Group by the column name, not a one-element list: with a list, newer
    # pandas expects tuple keys in `get_group` and warns/fails on the plain
    # strings used below.
    groups = within_df.groupby('tag')
    abortion_df = groups.get_group("abortion")
    gay_marriage_df = groups.get_group("gay marriage")

    X_abortion = abortion_df[names_columns_X]
    y_abortion = abortion_df[names_columns_y]
    X_gay_marriage = gay_marriage_df[names_columns_X]
    y_gay_marriage = gay_marriage_df[names_columns_y]

    return X_abortion, X_gay_marriage, y_abortion, y_gay_marriage

---

### BERT

- https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html

In [30]:
class MyBERTDataset(SimpleDataset):
    """Dataset of `[argument1, argument2, label]` samples built from DataFrames.

    Parameters
    ----------
    X : pandas.DataFrame
        Provides the columns 'argument1' and 'argument2'.
    y : pandas.DataFrame or None
        Provides the column 'is_same_side'; joined to `X` on the index.
        When None (unlabelled data), every sample gets the label None.
    """

    def __init__(self, X, y=None):
        self._X = X
        self._y = y
        super(MyBERTDataset, self).__init__(self._convert())

    def _convert(self):
        """Build the list of samples that is handed to `SimpleDataset`."""
        if self._y is None:
            # prediction mode: no label available
            return [[row['argument1'], row['argument2'], None]
                    for _, row in self._X.iterrows()]

        labelled = self._X.merge(self._y, left_index=True, right_index=True)
        # label is the integer 1 for same side, 0 otherwise
        return [[row['argument1'], row['argument2'],
                 1 if str(row['is_same_side']) == "True" else 0]
                for _, row in labelled.iterrows()]

###### my own `BERTDatasetTransform` for extracting chunks from arguments or last part etc.

```python
transform = dataset.BERTDatasetTransform(bert_tokenizer, 512,
                                         labels=['0', '1'],
                                         label_dtype='int32',
                                         pad=True,
                                         pair=True)
```

`bert/dataset.py`, around line 454 (opened locally via http://localhost:9001/edit/bert/dataset.py):
```python
# substitute with my own (e. g. last part, many parts etc.)
def __init__(...):
    self._bert_xform = BERTSentenceTransform(tokenizer, max_seq_length, pad=pad, pair=pair)
```
https://gluon-nlp.mxnet.io/master/_modules/gluonnlp/data/transforms.html#BERTSentenceTransform
```python
# substitute with my own (e. g. only last part (trim from start))
self._truncate_seq_pair(tokens_a, tokens_b, self._max_seq_length - 3)
```

https://mxnet.incubator.apache.org/_modules/mxnet/gluon/data/dataset.html#Dataset.transform

In [31]:
from gluonnlp.data import BERTSentenceTransform


class LastPartBERTSentenceTransform(BERTSentenceTransform):
    """`BERTSentenceTransform` variant that keeps the last part of the input.

    Overrides `_truncate_seq_pair` so that over-long pairs are shortened by
    dropping tokens from the front instead of the back.
    """

    def __init__(self, tokenizer, max_seq_length, pad=True, pair=True):
        super(LastPartBERTSentenceTransform, self).__init__(tokenizer, max_seq_length, pad=pad, pair=pair)

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate a sequence pair in place to at most `max_length` tokens.

        Tokens are removed from the START of the lists, so the end of each
        argument is what remains. (The previous docstring said "from end",
        which contradicted the `pop(0)` calls.)
        """
        # Simple heuristic: always shorten the currently longer sequence by
        # one token (on a tie: tokens_b). If one sequence is very short, each
        # of its tokens likely carries more information than a token of the
        # longer one, so it is preserved as long as possible.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop(0)
            else:
                tokens_b.pop(0)


class FirstAndLastPartBERTSentenceTransform(BERTSentenceTransform):
    """Sentence transform that encodes every sample twice.

    The "prolog" encoding keeps the first part of over-long inputs
    (truncation at the end); the "epilog" encoding keeps the last part
    (truncation at the start). Inputs that already fit yield two identical
    encodings.

    Relies on `self._tokenizer`, `self._max_seq_length`, `self._pad` and
    `self._pair`, which are presumably set by the parent constructor.
    """

    def __init__(self, tokenizer, max_seq_length, pad=True, pair=True):
        super(FirstAndLastPartBERTSentenceTransform,
              self).__init__(tokenizer, max_seq_length, pad=pad, pair=pair)

    def __call__(self, line):
        """Encode one sample as prolog and epilog BERT inputs.

        Parameters
        ----------
        line : sequence of str
            `[text_a, text_b]` in pair mode (exactly two entries are
            required), otherwise `text_a` is taken from `line[0]`.

        Returns
        -------
        tuple of np.ndarray (all int32)
            (input_ids, valid_length, segment_ids,
             input_ids_epi, valid_length_epi, segment_ids_epi)
        """
        text_a = line[0]
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer(text_a)
        # independent copies, because truncation happens in place
        tokens_a_epi = tokens_a.copy()
        tokens_b = None
        tokens_b_epi = None

        if self._pair:
            tokens_b = self._tokenizer(text_b)
            tokens_b_epi = tokens_b.copy()

        if tokens_b:
            # "- 3" accounts for [CLS], [SEP], [SEP]
            self._truncate_seq_pair_prolog(tokens_a, tokens_b,
                                           self._max_seq_length - 3)
            self._truncate_seq_pair_epilog(tokens_a_epi, tokens_b_epi,
                                           self._max_seq_length - 3)
        else:
            # "- 2" accounts for [CLS] and [SEP]
            single_limit = self._max_seq_length - 2
            if len(tokens_a) > single_limit:
                tokens_a = tokens_a[0:single_limit]
            if len(tokens_a_epi) > single_limit:
                # Fixed: the epilog must keep the LAST tokens; it used to
                # keep the first ones, duplicating the prolog.
                tokens_a_epi = tokens_a_epi[len(tokens_a_epi) - single_limit:]

        vocab = self._tokenizer.vocab
        tokens, tokens_epi = [], []
        tokens.append(vocab.cls_token)
        tokens_epi.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens_epi.extend(tokens_a_epi)
        tokens.append(vocab.sep_token)
        tokens_epi.append(vocab.sep_token)
        # segment 0: [CLS] + first text + [SEP]
        segment_ids = [0] * len(tokens)
        segment_ids_epi = [0] * len(tokens_epi)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens_epi.extend(tokens_b_epi)
            tokens.append(vocab.sep_token)
            tokens_epi.append(vocab.sep_token)
            # segment 1: second text + [SEP]
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))
            # Fixed: size the epilog segment ids from the epilog tokens
            # (previously `len(tokens)` was used here).
            segment_ids_epi.extend([1] * (len(tokens_epi) - len(segment_ids_epi)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
        input_ids_epi = self._tokenizer.convert_tokens_to_ids(tokens_epi)
        # number of real (non-padding) tokens
        valid_length = len(input_ids)
        valid_length_epi = len(input_ids_epi)

        if self._pad:
            # fill both encodings up to max_seq_length
            padding_length = self._max_seq_length - valid_length
            padding_length_epi = self._max_seq_length - valid_length_epi
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            input_ids_epi.extend([vocab[vocab.padding_token]] *
                                 padding_length_epi)
            segment_ids.extend([0] * padding_length)
            segment_ids_epi.extend([0] * padding_length_epi)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32'), np.array(input_ids_epi, dtype='int32'),\
            np.array(valid_length_epi, dtype='int32'), np.array(segment_ids_epi, dtype='int32')

    def _truncate_seq_pair_prolog(self, tokens_a, tokens_b, max_length):
        """Truncate a pair in place to `max_length` tokens, removing from the END."""
        # Simple heuristic: always shorten the currently longer sequence by
        # one token (on a tie: tokens_b), so a short sequence is preserved as
        # long as possible.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _truncate_seq_pair_epilog(self, tokens_a, tokens_b, max_length):
        """Truncate a pair in place to `max_length` tokens, removing from the START."""
        # Same heuristic as the prolog variant, but tokens are dropped from
        # the front so that the end of each text is kept.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop(0)
            else:
                tokens_b.pop(0)

In [32]:
class LastPartBERTDatasetTransform(dataset.BERTDatasetTransform):
    """`BERTDatasetTransform` that uses `LastPartBERTSentenceTransform`.

    Behaves like the parent transform, except that the sentence transform
    stored in `_bert_xform` is replaced after the parent constructor ran.
    """

    def __init__(self,
                 tokenizer,
                 max_seq_length,
                 labels=None,
                 pad=True,
                 pair=True,
                 label_dtype='float32'):
        super(LastPartBERTDatasetTransform, self).__init__(
            tokenizer,
            max_seq_length,
            labels=labels,
            pad=pad,
            pair=pair,
            label_dtype=label_dtype)
        # swap in the sentence transform that truncates from the start
        self._bert_xform = LastPartBERTSentenceTransform(
            tokenizer, max_seq_length, pad=pad, pair=pair)


class FirstAndLastPartBERTDatasetTransform(dataset.BERTDatasetTransform):
    """`BERTDatasetTransform` producing prolog and epilog encodings per sample.

    The sentence transform in `_bert_xform` is replaced by a
    `FirstAndLastPartBERTSentenceTransform`, and `__call__` passes its six
    outputs through, followed by the label when one is given.
    """

    def __init__(self,
                 tokenizer,
                 max_seq_length,
                 labels=None,
                 pad=True,
                 pair=True,
                 label_dtype='float32'):
        parent_options = dict(labels=labels,
                              pad=pad,
                              pair=pair,
                              label_dtype=label_dtype)
        super(FirstAndLastPartBERTDatasetTransform,
              self).__init__(tokenizer, max_seq_length, **parent_options)
        self._bert_xform = FirstAndLastPartBERTSentenceTransform(
            tokenizer, max_seq_length, pad=pad, pair=pair)

    def __call__(self, line):
        """Transform one sample whose last entry is the label (or None).

        Returns the six arrays of the sentence transform; if the label is
        not None, a one-element label array is appended as seventh entry.
        """
        # everything except the last entry is text; the last entry is the label
        (input_ids, valid_length, segment_ids,
         input_ids_epi, valid_length_epi, segment_ids_epi) = self._bert_xform(line[:-1])
        encoded = (input_ids, valid_length, segment_ids,
                   input_ids_epi, valid_length_epi, segment_ids_epi)

        raw_label = line[-1]
        # a label of None means we are predicting on unlabelled data
        if raw_label is None:
            return encoded

        if self.labels:  # for classification task
            raw_label = self._label_map[raw_label]
        label = np.array([raw_label], dtype=self.label_dtype)

        return encoded + (label,)

In [33]:
from mxnet.gluon import Block
from mxnet.gluon import nn


class BERTProEpiClassifier(Block):
    """Sentence (pair) classifier on two BERT encodings of the same sample.

    The same BERT encoder is applied to a "prolog" and an "epilog" encoding
    of the input. The two pooled outputs are concatenated and passed through
    an optional Dropout and a Dense layer for classification.

    Parameters
    ----------
    bert: BERTModel
        Bidirectional encoder with transformer.
    num_classes : int, default is 2
        The number of target classes.
    dropout : float or None, default 0.0.
        Dropout probability for the concatenated bert output.
    prefix : str or None
        See document of `mx.gluon.Block`.
    params : ParameterDict or None
        See document of `mx.gluon.Block`.
    """

    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=0.0,
                 prefix=None,
                 params=None):
        super(BERTProEpiClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))

    def forward(self,
                inputs,
                token_types,
                valid_length=None,
                inputs_epi=None,
                token_types_epi=None,
                valid_length_epi=None):  # pylint: disable=arguments-differ
        """Generate the unnormalized scores for the given input sequences.

        Parameters
        ----------
        inputs : NDArray, shape (batch_size, seq_length)
            Input words for the sequences.
        token_types : NDArray, shape (batch_size, seq_length)
            Token types for the sequences, used to indicate whether the word belongs to the
            first sentence or the second one.
        valid_length : NDArray or None, shape (batch_size)
            Valid length of the sequence. This is used to mask the padded tokens.
        inputs_epi : NDArray or None, shape (batch_size, seq_length)
            Input words for the epilog sequences.
        token_types_epi : NDArray or None, shape (batch_size, seq_length)
            Token types for the epilog sequences.
        valid_length_epi : NDArray or None, shape (batch_size)
            Valid length of the epilog sequence. This is used to mask the padded tokens.

        If both `inputs_epi` and `token_types_epi` are None, the prolog
        inputs (including `valid_length`) are reused for the epilog pass.

        Returns
        -------
        outputs : NDArray
            Shape (batch_size, num_classes), outputs of classifier.
        """
        # Documented fallback, previously commented out: without it the
        # default call passed None into the encoder for the epilog pass.
        if inputs_epi is None and token_types_epi is None:
            inputs_epi = inputs
            token_types_epi = token_types
            valid_length_epi = valid_length

        _, pooler_out = self.bert(inputs, token_types, valid_length)
        _, pooler_out_epi = self.bert(inputs_epi, token_types_epi, valid_length_epi)
        # concatenate along the feature axis
        pooler_concat = mx.nd.concat(pooler_out, pooler_out_epi, dim=1)
        return self.classifier(pooler_concat)

In [34]:
def _setup_bert_bce(ctx, max_len, model_cls, transform_cls):
    """Build the fine-tuning setup shared by every ``setup_bert*`` variant.

    Loads the pretrained ``bert_12_768_12`` encoder (``book_corpus_wiki_en_uncased``),
    wraps it with ``model_cls`` using a single output unit, and combines it with a
    sigmoid binary cross-entropy loss, an accuracy metric, the matching tokenizer
    and a sentence-pair dataset transform.

    Parameters
    ----------
    ctx : mx.Context
        Device the encoder is loaded on and the classifier head is initialized on.
    max_len : int
        Maximum sequence length handed to the dataset transform.
    model_cls : callable
        Called as ``model_cls(bert_base, num_classes=1, dropout=0.1)``; the result
        must expose a ``classifier`` block.
    transform_cls : callable
        Called as ``transform_cls(tokenizer, max_len, labels=all_labels,
        label_dtype='int32', pad=True, pair=True)``.

    Returns
    -------
    tuple
        ``(model, vocabulary, ctx, bert_tokenizer, transform, loss_function,
        metric, all_labels)``
    """
    bert_base, vocabulary = nlp.model.get_model(
        'bert_12_768_12',
        dataset_name='book_corpus_wiki_en_uncased',
        pretrained=True,
        ctx=ctx,
        use_pooler=True,
        use_decoder=False,
        use_classifier=False)

    model = model_cls(bert_base, num_classes=1, dropout=0.1)
    # the encoder weights are pretrained; only the classifier layer needs initializing
    model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
    model.hybridize(static_alloc=True)

    # single output unit + sigmoid binary cross entropy (the model emits raw logits)
    loss_function = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    loss_function.hybridize(static_alloc=True)

    metric = mx.metric.Accuracy()

    # use the vocabulary from the pre-trained model for tokenization
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    # the labels for the two classes
    all_labels = [0, 1]
    # pair=True: every sample is transformed as a sentence pair
    transform = transform_cls(bert_tokenizer, max_len,
                              labels=all_labels,
                              label_dtype='int32',
                              pad=True,
                              pair=True)

    return model, vocabulary, ctx, bert_tokenizer, transform, loss_function, metric, all_labels


def setup_bert(ctx=None):
    """BERTClassifier, max_len 512, ``LastPartBERTDatasetTransform``.

    ``ctx`` defaults to ``mx.gpu(1)``; pass e.g. ``mx.cpu()`` if no GPU is available.
    Returns the tuple described in ``_setup_bert_bce``.
    """
    return _setup_bert_bce(mx.gpu(1) if ctx is None else ctx, 512,
                           bert.BERTClassifier, LastPartBERTDatasetTransform)


def setup_bert_pro128bce(ctx=None):
    """BERTClassifier, max_len 128, stock ``dataset.BERTDatasetTransform``.

    ``ctx`` defaults to ``mx.gpu(0)``; pass e.g. ``mx.cpu()`` if no GPU is available.
    Returns the tuple described in ``_setup_bert_bce``.
    """
    return _setup_bert_bce(mx.gpu(0) if ctx is None else ctx, 128,
                           bert.BERTClassifier, dataset.BERTDatasetTransform)


def setup_bert_epi128bce(ctx=None):
    """BERTClassifier, max_len 128, ``LastPartBERTDatasetTransform``.

    ``ctx`` defaults to ``mx.gpu(0)``; pass e.g. ``mx.cpu()`` if no GPU is available.
    Returns the tuple described in ``_setup_bert_bce``.
    """
    return _setup_bert_bce(mx.gpu(0) if ctx is None else ctx, 128,
                           bert.BERTClassifier, LastPartBERTDatasetTransform)


def setup_bert_proepi512bce(ctx=None):
    """BERTProEpiClassifier, max_len 512, ``FirstAndLastPartBERTDatasetTransform``.

    ``ctx`` defaults to ``mx.gpu(1)``; pass e.g. ``mx.cpu()`` if no GPU is available.
    Returns the tuple described in ``_setup_bert_bce``.
    """
    return _setup_bert_bce(mx.gpu(1) if ctx is None else ctx, 512,
                           BERTProEpiClassifier,
                           FirstAndLastPartBERTDatasetTransform)


def setup_bert_pro512bce(ctx=None):
    """BERTClassifier, max_len 512, stock ``dataset.BERTDatasetTransform``.

    ``ctx`` defaults to ``mx.gpu(0)``; pass e.g. ``mx.cpu()`` if no GPU is available.
    Returns the tuple described in ``_setup_bert_bce``.
    """
    return _setup_bert_bce(mx.gpu(0) if ctx is None else ctx, 512,
                           bert.BERTClassifier, dataset.BERTDatasetTransform)


def setup_bert_epi512bce(ctx=None):
    """BERTClassifier, max_len 512, ``LastPartBERTDatasetTransform``.

    ``ctx`` defaults to ``mx.gpu(0)``; pass e.g. ``mx.cpu()`` if no GPU is available.
    Returns the tuple described in ``_setup_bert_bce``.
    """
    return _setup_bert_bce(mx.gpu(0) if ctx is None else ctx, 512,
                           bert.BERTClassifier, LastPartBERTDatasetTransform)

In [35]:
def transform_dataset(X, y, transform):
    """Wrap ``X``/``y`` in a ``MyBERTDataset`` and apply ``transform`` to it.

    Returns
    -------
    tuple
        ``(raw_dataset, transformed_dataset)`` - the untransformed
        ``MyBERTDataset`` and the result of its ``.transform(transform)`` call.
    """
    raw_dataset = MyBERTDataset(X, y)
    return raw_dataset, raw_dataset.transform(transform)


def predict_out_to_ys(all_predictions, all_labels):
    """Flatten per-batch prediction records into two numpy arrays.

    Parameters
    ----------
    all_predictions : iterable of (batch_id, labels, predictions)
        ``labels`` and ``predictions`` must offer ``.asnumpy()``; ``labels`` is
        additionally read through ``.T[0]`` (i.e. its first column).
    all_labels : list
        Currently unused (kept for a possible label-id -> label mapping).

    Returns
    -------
    y_true : np.ndarray
        Concatenated first-column label values of all batches.
    y_pred : np.ndarray
        Stacked prediction rows of all batches, in the per-row shape the
        predictions were given in.
    """
    gold, predicted = [], []

    for _batch_id, batch_labels, batch_preds in all_predictions:
        # labels come as a column; .T[0] picks that column as a flat vector
        gold += list(batch_labels.T[0].asnumpy())
        # predictions are kept row by row, exactly as delivered
        predicted += list(batch_preds.asnumpy())

    return np.array(gold), np.array(predicted)

Multi-GPU?
- https://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html

In [36]:
def train(model,
          data_train,
          ctx,
          metric,
          loss_function,
          batch_size=32,
          lr=5e-6,
          num_epochs=3,
          sw=None,
          checkpoint_dir="data",
          use_checkpoints=True):
    """Fine-tune ``model`` on ``data_train`` on a single device.

    Parameters
    ----------
    model : callable block
        Called as ``model(token_ids, segment_ids, valid_length)``; returns logits.
    data_train : dataset
        Samples of ``(token_ids, valid_length, segment_ids, label)``.
    ctx : mx.Context
        Device every batch is copied to.
    metric : metric object
        Reset at the start of every trained epoch, updated after every batch.
    loss_function : callable
        Called as ``loss_function(logits, label)``; its mean is back-propagated.
    batch_size, lr, num_epochs
        Bucket batch size, Adam learning rate and number of epochs.
    sw : summary writer or None
        If given, receives the ``T-ls`` and ``T-acc`` scalars for every batch.
    checkpoint_dir : str or None
        Directory for ``bert.model.checkpoint<epoch>.params`` files
        (``None`` = current working directory).
    use_checkpoints : bool
        If true, an epoch whose checkpoint file exists is not trained - its
        parameters are loaded instead - and every trained epoch is saved.

    Returns
    -------
    list of (metric value, batch loss)
        One entry per trained batch.
    """
    report_every = 500

    with Timer("setup training"):
        # element 1 of every sample (the valid length) drives the bucketing
        sample_lengths = [int(sample[1]) for sample in tqdm(data_train)]
        bucket_sampler = nlp.data.FixedBucketSampler(lengths=sample_lengths,
                                                     batch_size=batch_size,
                                                     shuffle=True)
        loader = mx.gluon.data.DataLoader(data_train,
                                          batch_sampler=bucket_sampler)

        optimizer_args = {'learning_rate': lr, 'epsilon': 1e-9}
        trainer = gluon.Trainer(model.collect_params(), 'adam', optimizer_args)

        # only parameters that receive gradients take part in the clipping below
        trainable = [
            param for param in model.collect_params().values()
            if param.grad_req != 'null'
        ]

    stats = []
    with Timer("training"):
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                ckpt_name = "bert.model.checkpoint{}.params".format(epoch_id)
                ckpt_path = (ckpt_name if checkpoint_dir is None else
                             os.path.join(checkpoint_dir, ckpt_name))
                if os.path.exists(ckpt_path):
                    # epoch was already trained in an earlier run: restore and skip
                    model.load_parameters(ckpt_path, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                running_loss = 0
                step_offset = epoch_id * len(loader)
                interval_start = time.time()  # time keeping
                for batch_no, batch in enumerate(tqdm(loader), start=1):
                    token_ids, valid_length, segment_ids, label = batch
                    global_step = step_offset + batch_no

                    with mx.autograd.record():
                        # load data to the device
                        token_ids = token_ids.as_in_context(ctx)
                        valid_length = valid_length.as_in_context(ctx)
                        segment_ids = segment_ids.as_in_context(ctx)
                        label = label.as_in_context(ctx)

                        # forward computation
                        out = model(token_ids, segment_ids,
                                    valid_length.astype('float32'))
                        label = label.astype('float32')
                        ls = loss_function(out, label).mean()

                    # backward computation
                    ls.backward()

                    # gradient clipping, then the optimizer step
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(trainable, 1)
                    trainer.update(1)

                    loss_value = ls.asscalar()
                    running_loss += loss_value

                    # logits -> hard 0/1 decisions for the metric
                    decisions = out.sigmoid().round().astype('int32')
                    metric.update([label.astype('int32')], [decisions])
                    accuracy = metric.get()[1]
                    stats.append((accuracy, loss_value))

                    if sw:
                        sw.add_scalar(tag='T-ls', value=loss_value, global_step=global_step)
                        sw.add_scalar(tag='T-acc', value=accuracy, global_step=global_step)

                    if batch_no % report_every == 0:
                        elapsed = datetime.timedelta(
                            seconds=(time.time() - interval_start))
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(epoch_id, batch_no, len(loader),
                                    running_loss / report_every,
                                    trainer.learning_rate, accuracy, elapsed))
                        interval_start = time.time()
                        running_loss = 0

            if use_checkpoints:
                model.save_parameters(ckpt_path)

    return stats


def train_proepi(model,
                 data_train,
                 ctx,
                 metric,
                 loss_function,
                 batch_size=32,
                 lr=5e-6,
                 num_epochs=3,
                 sw=None,
                 checkpoint_dir="data",
                 use_checkpoints=True):
    """Fine-tune a two-input ("pro" + "epi") ``model`` on a single device.

    Works like ``train`` but every sample carries two encoded sequences:
    ``(token_ids, valid_length, segment_ids, token_ids_epi, valid_length_epi,
    segment_ids_epi, label)``. The model is called as
    ``model(token_ids, segment_ids, valid_length,
    token_ids_epi, segment_ids_epi, valid_length_epi)``.

    Parameters
    ----------
    model, data_train, ctx, metric, loss_function
        Network, transformed dataset, target device, running metric and loss.
    batch_size, lr, num_epochs
        Bucket batch size, Adam learning rate and number of epochs.
    sw : summary writer or None
        If given, receives the ``T-ls`` and ``T-acc`` scalars for every batch.
    checkpoint_dir : str or None
        Directory for ``bert.model.checkpoint<epoch>.params`` files
        (``None`` = current working directory).
    use_checkpoints : bool
        If true, an epoch whose checkpoint file exists is not trained - its
        parameters are loaded instead - and every trained epoch is saved.

    Returns
    -------
    list of (metric value, batch loss)
        One entry per trained batch.
    """
    report_every = 500

    with Timer("setup training"):
        # element 1 of every sample (the first valid length) drives the bucketing
        sample_lengths = [int(sample[1]) for sample in tqdm(data_train)]
        bucket_sampler = nlp.data.FixedBucketSampler(lengths=sample_lengths,
                                                     batch_size=batch_size,
                                                     shuffle=True)
        loader = mx.gluon.data.DataLoader(data_train,
                                          batch_sampler=bucket_sampler)

        optimizer_args = {'learning_rate': lr, 'epsilon': 1e-9}
        trainer = gluon.Trainer(model.collect_params(), 'adam', optimizer_args)

        # only parameters that receive gradients take part in the clipping below
        trainable = [
            param for param in model.collect_params().values()
            if param.grad_req != 'null'
        ]

    stats = []
    with Timer("training"):
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                ckpt_name = "bert.model.checkpoint{}.params".format(epoch_id)
                ckpt_path = (ckpt_name if checkpoint_dir is None else
                             os.path.join(checkpoint_dir, ckpt_name))
                if os.path.exists(ckpt_path):
                    # epoch was already trained in an earlier run: restore and skip
                    model.load_parameters(ckpt_path, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                running_loss = 0
                step_offset = epoch_id * len(loader)
                interval_start = time.time()  # time keeping
                for batch_no, batch in enumerate(tqdm(loader), start=1):
                    (token_ids, valid_length, segment_ids, token_ids_epi,
                     valid_length_epi, segment_ids_epi, label) = batch
                    global_step = step_offset + batch_no

                    with mx.autograd.record():
                        # load data to the device
                        token_ids = token_ids.as_in_context(ctx)
                        valid_length = valid_length.as_in_context(ctx)
                        segment_ids = segment_ids.as_in_context(ctx)
                        token_ids_epi = token_ids_epi.as_in_context(ctx)
                        valid_length_epi = valid_length_epi.as_in_context(ctx)
                        segment_ids_epi = segment_ids_epi.as_in_context(ctx)
                        label = label.as_in_context(ctx)

                        # forward computation over both sequences
                        out = model(token_ids, segment_ids,
                                    valid_length.astype('float32'),
                                    token_ids_epi, segment_ids_epi,
                                    valid_length_epi.astype('float32'))
                        label = label.astype('float32')
                        ls = loss_function(out, label).mean()

                    # backward computation
                    ls.backward()

                    # gradient clipping, then the optimizer step
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(trainable, 1)
                    trainer.update(1)

                    loss_value = ls.asscalar()
                    running_loss += loss_value

                    # logits -> hard 0/1 decisions for the metric
                    decisions = out.sigmoid().round().astype('int32')
                    metric.update([label.astype('int32')], [decisions])
                    accuracy = metric.get()[1]
                    stats.append((accuracy, loss_value))

                    if sw:
                        sw.add_scalar(tag='T-ls', value=loss_value, global_step=global_step)
                        sw.add_scalar(tag='T-acc', value=accuracy, global_step=global_step)

                    if batch_no % report_every == 0:
                        elapsed = datetime.timedelta(
                            seconds=(time.time() - interval_start))
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(epoch_id, batch_no, len(loader),
                                    running_loss / report_every,
                                    trainer.learning_rate, accuracy, elapsed))
                        interval_start = time.time()
                        running_loss = 0

            if use_checkpoints:
                model.save_parameters(ckpt_path)

    return stats

In [37]:
def train_multi(model,
                data_train,
                ctx,
                metric,
                loss_function,
                batch_size=32,
                lr=5e-6,
                num_epochs=3,
                checkpoint_dir="data",
                use_checkpoints=True):
    """Fine-tune a single-input ``model`` with every batch split over several devices.

    Parameters
    ----------
    model : callable block
        Called per device shard as ``model(token_ids, segment_ids, valid_length)``.
    data_train : dataset
        Samples of ``(token_ids, valid_length, segment_ids, label)``.
    ctx : list of mx.Context
        Devices each batch is split over (``gluon.utils.split_and_load``).
    metric : metric object
        Reset at the start of every trained epoch, updated once per shard.
    loss_function : callable
        Called as ``loss_function(logits, label)``; the per-shard mean is
        back-propagated.
    batch_size, lr, num_epochs
        Bucket batch size, Adam learning rate and number of epochs.
    checkpoint_dir : str or None
        Directory for ``bert.model.checkpoint<epoch>.params`` files
        (``None`` = current working directory).
    use_checkpoints : bool
        If true, an epoch whose checkpoint file exists is not trained - its
        parameters are loaded instead - and every trained epoch is saved.

    Returns
    -------
    list of (metric value, list of per-shard losses)
        One entry per trained batch.
    """
    with Timer("setup training"):
        train_sampler = nlp.data.FixedBucketSampler(
            lengths=[int(item[1]) for item in tqdm(data_train)],
            batch_size=batch_size,
            shuffle=True)
        bert_dataloader = mx.gluon.data.DataLoader(data_train,
                                                   batch_sampler=train_sampler)

        # update_on_kvstore=False: gradients are all-reduced and clipped
        # explicitly below before the optimizer step is applied
        trainer = gluon.Trainer(model.collect_params(),
                                'adam', {
                                    'learning_rate': lr,
                                    'epsilon': 1e-9
                                },
                                update_on_kvstore=False)

        # collect all differentiable parameters
        # grad_req == 'null' indicates no gradients are calculated (e.g. constant parameters)
        # the gradients for these params are clipped later
        params = [
            p for p in model.collect_params().values() if p.grad_req != 'null'
        ]

    log_interval = 500
    with Timer("training"):
        stats = list()
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                epoch_checkpoint_savefile = "bert.model.checkpoint{}.params".format(
                    epoch_id)
                if checkpoint_dir is not None:
                    epoch_checkpoint_savefile = os.path.join(
                        checkpoint_dir, epoch_checkpoint_savefile)
                if os.path.exists(epoch_checkpoint_savefile):
                    # epoch was already trained in an earlier run: restore and skip
                    model.load_parameters(epoch_checkpoint_savefile, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                step_loss = 0
                t_p = time.time()  # time keeping
                for batch_id, (token_ids, valid_length, segment_ids,
                               label) in enumerate(bert_dataloader):
                    with mx.autograd.record():
                        # split every tensor of the batch across the devices
                        token_ids = gluon.utils.split_and_load(
                            token_ids, ctx, even_split=False)
                        valid_length = gluon.utils.split_and_load(
                            valid_length, ctx, even_split=False)
                        segment_ids = gluon.utils.split_and_load(
                            segment_ids, ctx, even_split=False)
                        label = gluon.utils.split_and_load(label,
                                                           ctx,
                                                           even_split=False)

                        # forward computation, one shard per device; every
                        # shard holds one (tokens, segments, valid length) triple
                        out = [
                            model(t, s, v.astype('float32'))
                            for t, s, v in zip(token_ids, segment_ids,
                                               valid_length)
                        ]
                        ls = [
                            loss_function(o, l.astype('float32')).mean()
                            for o, l in zip(out, label)
                        ]

                    # backward computation
                    for l in ls:
                        l.backward()

                    # gradient clipping, then the optimizer step
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(1)

                    for l in ls:
                        step_loss += l.asscalar()
                    # logits -> hard 0/1 decisions for the metric
                    for o, l in zip(out, label):
                        metric.update([l.astype('int32')],
                                      [o.sigmoid().round().astype('int32')])
                    stats.append((metric.get()[1], [l.asscalar() for l in ls]))
                    if (batch_id + 1) % (log_interval) == 0:
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(
                                epoch_id, batch_id + 1, len(bert_dataloader),
                                step_loss / log_interval,
                                trainer.learning_rate,
                                metric.get()[1],
                                datetime.timedelta(seconds=(time.time() -
                                                            t_p))))
                        t_p = time.time()
                        step_loss = 0

            if use_checkpoints:
                model.save_parameters(epoch_checkpoint_savefile)

    return stats


def train_multi_proepi(model,
                       data_train,
                       ctx,
                       metric,
                       loss_function,
                       batch_size=32,
                       lr=5e-6,
                       num_epochs=3,
                       checkpoint_dir="data",
                       use_checkpoints=True):
    """Fine-tune a two-input ("pro" + "epi") ``model`` with batches split over devices.

    Every sample is ``(token_ids, valid_length, segment_ids, token_ids_epi,
    valid_length_epi, segment_ids_epi, label)``. Each tensor of a batch is split
    across ``ctx`` and the model is called per shard as
    ``model(token_ids, segment_ids, valid_length,
    token_ids_epi, segment_ids_epi, valid_length_epi)``.

    Parameters
    ----------
    model, data_train, metric, loss_function
        Network, transformed dataset, running metric and loss.
    ctx : list of mx.Context
        Devices each batch is split over (``gluon.utils.split_and_load``).
    batch_size, lr, num_epochs
        Bucket batch size, Adam learning rate and number of epochs.
    checkpoint_dir : str or None
        Directory for ``bert.model.checkpoint<epoch>.params`` files
        (``None`` = current working directory).
    use_checkpoints : bool
        If true, an epoch whose checkpoint file exists is not trained - its
        parameters are loaded instead - and every trained epoch is saved.

    Returns
    -------
    list of (metric value, list of per-shard losses)
        One entry per trained batch.
    """
    report_every = 500

    with Timer("setup training"):
        # element 1 of every sample (the first valid length) drives the bucketing
        sample_lengths = [int(sample[1]) for sample in tqdm(data_train)]
        bucket_sampler = nlp.data.FixedBucketSampler(lengths=sample_lengths,
                                                     batch_size=batch_size,
                                                     shuffle=True)
        loader = mx.gluon.data.DataLoader(data_train,
                                          batch_sampler=bucket_sampler)

        # update_on_kvstore=False: gradients are all-reduced and clipped
        # explicitly below before the optimizer step is applied
        optimizer_args = {'learning_rate': lr, 'epsilon': 1e-9}
        trainer = gluon.Trainer(model.collect_params(),
                                'adam',
                                optimizer_args,
                                update_on_kvstore=False)

        # only parameters that receive gradients take part in the clipping below
        trainable = [
            param for param in model.collect_params().values()
            if param.grad_req != 'null'
        ]

    stats = []
    with Timer("training"):
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                ckpt_name = "bert.model.checkpoint{}.params".format(epoch_id)
                ckpt_path = (ckpt_name if checkpoint_dir is None else
                             os.path.join(checkpoint_dir, ckpt_name))
                if os.path.exists(ckpt_path):
                    # epoch was already trained in an earlier run: restore and skip
                    model.load_parameters(ckpt_path, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                running_loss = 0
                interval_start = time.time()  # time keeping
                for batch_no, batch in enumerate(loader, start=1):
                    (token_ids, valid_length, segment_ids, token_ids_epi,
                     valid_length_epi, segment_ids_epi, label) = batch

                    with mx.autograd.record():
                        # split every tensor of the batch across the devices
                        (token_ids, valid_length, segment_ids, token_ids_epi,
                         valid_length_epi, segment_ids_epi, label) = [
                             gluon.utils.split_and_load(part, ctx,
                                                        even_split=False)
                             for part in (token_ids, valid_length,
                                          segment_ids, token_ids_epi,
                                          valid_length_epi, segment_ids_epi,
                                          label)
                         ]

                        # forward computation, one shard per device
                        outputs = []
                        for tok, vlen, seg, tok_epi, vlen_epi, seg_epi in zip(
                                token_ids, valid_length, segment_ids,
                                token_ids_epi, valid_length_epi,
                                segment_ids_epi):
                            outputs.append(
                                model(tok, seg, vlen.astype('float32'),
                                      tok_epi, seg_epi,
                                      vlen_epi.astype('float32')))
                        losses = [
                            loss_function(logits,
                                          target.astype('float32')).mean()
                            for logits, target in zip(outputs, label)
                        ]

                    # backward computation
                    for shard_loss in losses:
                        shard_loss.backward()

                    # gradient clipping, then the optimizer step
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(trainable, 1)
                    trainer.update(1)

                    loss_values = [shard_loss.asscalar() for shard_loss in losses]
                    for value in loss_values:
                        running_loss += value

                    # logits -> hard 0/1 decisions for the metric
                    for logits, target in zip(outputs, label):
                        metric.update([target.astype('int32')],
                                      [logits.sigmoid().round().astype('int32')])
                    accuracy = metric.get()[1]
                    stats.append((accuracy, loss_values))

                    if batch_no % report_every == 0:
                        elapsed = datetime.timedelta(
                            seconds=(time.time() - interval_start))
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(epoch_id, batch_no, len(loader),
                                    running_loss / report_every,
                                    trainer.learning_rate, accuracy, elapsed))
                        interval_start = time.time()
                        running_loss = 0

            if use_checkpoints:
                model.save_parameters(ckpt_path)

    return stats

In [38]:
def predict(model, data_predict, ctx, metric, loss_function, batch_size=32, sw=None):
    """Score `model` on a labelled dataset, batch by batch.

    Args:
        model: called as ``model(token_ids, segment_ids, valid_length)``.
        data_predict: dataset whose batches unpack to
            ``(token_ids, valid_length, segment_ids, label)``.
        ctx: context every batch array is moved to before the forward pass.
        metric: reset once at the start, then updated per batch with the
            int32 labels and the thresholded int32 outputs.
        loss_function: called as ``loss_function(out, label)``; the mean of
            its result is the batch loss.
        batch_size: batch size handed to the data loader.
        sw: optional summary writer; when truthy it receives the scalars
            ``'P-ls'`` and ``'P-acc'`` once per batch.

    Returns:
        ``(all_predictions, cum_loss)``: a list of ``(batch_id, label, out)``
        tuples (both arrays cast to int32) and the sum of the per-batch mean
        losses.
    """
    loader = mx.gluon.data.DataLoader(data_predict, batch_size=batch_size)
    collected = []

    with Timer("prediction"):
        metric.reset()
        cum_loss = 0
        for step, (tokens, lengths, segments, gold) in enumerate(tqdm(loader)):
            # move the whole batch to the target context
            tokens, lengths, segments, gold = (
                arr.as_in_context(ctx)
                for arr in (tokens, lengths, segments, gold))

            # forward pass and mean batch loss
            raw_out = model(tokens, segments, lengths.astype('float32'))
            gold_f32 = gold.astype('float32')
            batch_loss = loss_function(raw_out, gold_f32).mean()

            # sigmoid + round turns the raw output into 0/1 decisions
            decided = raw_out.sigmoid().round().astype('int32')
            gold_i32 = gold_f32.astype('int32')
            metric.update([gold_i32], [decided])

            # sum of per-batch means (not a sample-weighted total)
            loss_value = batch_loss.asscalar()
            cum_loss += loss_value

            if sw:
                sw.add_scalar(tag='P-ls', value=loss_value, global_step=step)
                sw.add_scalar(tag='P-acc', value=metric.get()[1], global_step=step)

            collected.append((step, gold_i32, decided))

    return collected, cum_loss


def predict_proepi(model, data_predict, ctx, metric, loss_function, batch_size=32, sw=None):
    """Score a two-input model on a labelled dataset, batch by batch.

    Same procedure as the single-input prediction loop, but every batch
    carries a second (``*_epi``) triple of arrays that is passed to the
    model as well.

    Args:
        model: called as ``model(token_ids, segment_ids, valid_length,
            token_ids_epi, segment_ids_epi, valid_length_epi)``.
        data_predict: dataset whose batches unpack to ``(token_ids,
            valid_length, segment_ids, token_ids_epi, valid_length_epi,
            segment_ids_epi, label)``.
        ctx: context every batch array is moved to before the forward pass.
        metric: reset once at the start, then updated per batch.
        loss_function: called as ``loss_function(out, label)``; the mean of
            its result is the batch loss.
        batch_size: batch size handed to the data loader.
        sw: optional summary writer; when truthy it receives the scalars
            ``'P-ls'`` and ``'P-acc'`` once per batch.

    Returns:
        ``(all_predictions, cum_loss)``: a list of ``(batch_id, label, out)``
        tuples (both arrays cast to int32) and the sum of the per-batch mean
        losses.
    """
    loader = mx.gluon.data.DataLoader(data_predict, batch_size=batch_size)
    collected = []

    with Timer("prediction"):
        metric.reset()
        cum_loss = 0
        for step, (tokens, lengths, segments, tokens_epi, lengths_epi,
                   segments_epi, gold) in enumerate(tqdm(loader)):
            # move the whole batch to the target context
            (tokens, lengths, segments, tokens_epi, lengths_epi,
             segments_epi, gold) = (
                 arr.as_in_context(ctx)
                 for arr in (tokens, lengths, segments, tokens_epi,
                             lengths_epi, segments_epi, gold))

            # forward pass over both inputs and mean batch loss
            raw_out = model(tokens, segments, lengths.astype('float32'),
                            tokens_epi, segments_epi,
                            lengths_epi.astype('float32'))
            gold_f32 = gold.astype('float32')
            batch_loss = loss_function(raw_out, gold_f32).mean()

            # sigmoid + round turns the raw output into 0/1 decisions
            decided = raw_out.sigmoid().round().astype('int32')
            gold_i32 = gold_f32.astype('int32')
            metric.update([gold_i32], [decided])

            # sum of per-batch means (not a sample-weighted total)
            loss_value = batch_loss.asscalar()
            cum_loss += loss_value

            if sw:
                sw.add_scalar(tag='P-ls', value=loss_value, global_step=step)
                sw.add_scalar(tag='P-acc', value=metric.get()[1], global_step=step)

            collected.append((step, gold_i32, decided))

    return collected, cum_loss

In [39]:
def predict_unknown(model, data_predict, ctx, label_map=None, batch_size=32):
    """Predict 0/1 decisions for an unlabelled dataset.

    Args:
        model: called as ``model(token_ids, segment_ids, valid_length)``.
        data_predict: dataset whose batches unpack to
            ``(token_ids, valid_length, segment_ids)``.
        ctx: context every batch array is moved to before the forward pass.
        label_map: optional lookup; when truthy each per-sample output is
            replaced by ``label_map[output]``.
        batch_size: batch size handed to the data loader.

    Returns:
        A numpy array built from all per-sample predictions in dataset order.
    """
    loader = mx.gluon.data.DataLoader(data_predict, batch_size=batch_size)
    collected = []

    with Timer("prediction"):
        for tokens, lengths, segments in tqdm(loader):
            # move the batch to the target context
            tokens = tokens.as_in_context(ctx)
            lengths = lengths.as_in_context(ctx)
            segments = segments.as_in_context(ctx)

            raw_out = model(tokens, segments, lengths.astype('float32'))

            # sigmoid + round -> binary 0/1, then leave mxnet for numpy
            batch_pred = raw_out.sigmoid().round().astype('int32').asnumpy()

            # optionally translate the decisions through the label mapping
            if label_map:
                batch_pred = [label_map[item] for item in batch_pred]

            collected.extend(batch_pred)

    return np.array(collected)


def predict_unknown_proepi(model, data_predict, ctx, label_map=None, batch_size=32):
    """Predict 0/1 decisions for an unlabelled dataset with a two-input model.

    Args:
        model: called as ``model(token_ids, segment_ids, valid_length,
            token_ids_epi, segment_ids_epi, valid_length_epi)``.
        data_predict: dataset whose batches unpack to ``(token_ids,
            valid_length, segment_ids, token_ids_epi, valid_length_epi,
            segment_ids_epi)``.
        ctx: context every batch array is moved to before the forward pass.
        label_map: optional lookup; when truthy each per-sample output is
            replaced by ``label_map[output]``.
        batch_size: batch size handed to the data loader.

    Returns:
        A numpy array built from all per-sample predictions in dataset order.
    """
    loader = mx.gluon.data.DataLoader(data_predict, batch_size=batch_size)
    collected = []

    with Timer("prediction"):
        for (tokens, lengths, segments, tokens_epi, lengths_epi,
             segments_epi) in tqdm(loader):
            # move the batch to the target context
            tokens = tokens.as_in_context(ctx)
            lengths = lengths.as_in_context(ctx)
            segments = segments.as_in_context(ctx)
            tokens_epi = tokens_epi.as_in_context(ctx)
            lengths_epi = lengths_epi.as_in_context(ctx)
            segments_epi = segments_epi.as_in_context(ctx)

            raw_out = model(tokens, segments, lengths.astype('float32'),
                            tokens_epi, segments_epi,
                            lengths_epi.astype('float32'))

            # sigmoid + round -> binary 0/1, then leave mxnet for numpy
            batch_pred = raw_out.sigmoid().round().astype('int32').asnumpy()

            # optionally translate the decisions through the label mapping
            if label_map:
                batch_pred = [label_map[item] for item in batch_pred]

            collected.extend(batch_pred)

    return np.array(collected)

In [40]:
def print_infos(vocabulary, data_train_raw, data_train):
    """Print the first sample before and after the BERT transformation.

    Args:
        vocabulary: indexable by the special tokens ``'[PAD]'``, ``'[CLS]'``
            and ``'[SEP]'``; also printed as a whole.
        data_train_raw: indexable dataset; fields 0..2 of its first item are
            printed (sentence a, sentence b, label).
        data_train: indexable dataset; fields 0..3 of its first item are
            printed (token ids, valid length, segment ids, label).
    """
    sample_id = 0

    # raw sample: sentence a, sentence b, then the label
    # (1 means equivalent, 0 means not equivalent)
    raw_sample = data_train_raw[sample_id]
    for column in range(3):
        print(raw_sample[column])

    print('vocabulary used for tokenization = \n%s' % vocabulary)
    special_tokens = (
        ('[PAD] token id = %s', '[PAD]'),
        ('[CLS] token id = %s', '[CLS]'),
        ('[SEP] token id = %s', '[SEP]'),
    )
    for template, token in special_tokens:
        print(template % vocabulary[token])

    # transformed sample, one field per line pair
    encoded_sample = data_train[sample_id]
    encoded_fields = (
        'token ids = \n%s',
        'valid length = \n%s',
        'segment ids = \n%s',
        'label = \n%s',
    )
    for position, template in enumerate(encoded_fields):
        print(template % encoded_sample[position])


def print_infos_proepi(vocabulary, data_train_raw, data_train):
    """Print the first sample of a two-input dataset before and after transformation.

    Args:
        vocabulary: indexable by the special tokens ``'[PAD]'``, ``'[CLS]'``
            and ``'[SEP]'``; also printed as a whole.
        data_train_raw: indexable dataset; fields 0..2 of its first item are
            printed (sentence a, sentence b, label).
        data_train: indexable dataset; fields 0..6 of its first item are
            printed (token ids, valid length, segment ids, the same three
            for the ``epi`` input, then the label).
    """
    sample_id = 0

    # raw sample: sentence a, sentence b, then the label
    # (1 means equivalent, 0 means not equivalent)
    raw_sample = data_train_raw[sample_id]
    for column in range(3):
        print(raw_sample[column])

    print('vocabulary used for tokenization = \n%s' % vocabulary)
    special_tokens = (
        ('[PAD] token id = %s', '[PAD]'),
        ('[CLS] token id = %s', '[CLS]'),
        ('[SEP] token id = %s', '[SEP]'),
    )
    for template, token in special_tokens:
        print(template % vocabulary[token])

    # transformed sample, one field per line pair
    encoded_sample = data_train[sample_id]
    encoded_fields = (
        'token ids = \n%s',
        'valid length = \n%s',
        'segment ids = \n%s',
        'epi token ids = \n%s',
        'epi valid length = \n%s',
        'epi segment ids = \n%s',
        'label = \n%s',
    )
    for position, template in enumerate(encoded_fields):
        print(template % encoded_sample[position])


def plot_train_stats(stats):
    """Plot per-batch training accuracy and loss in two stacked panels.

    Args:
        stats: sequence of ``(accuracy, loss)`` pairs, one per batch. When it
            is empty (or otherwise falsy) a notice is printed and nothing is
            drawn.
    """
    if not stats:
        print("no stats to plot")
        return

    # split the pairs into one accuracy series and one loss series
    accuracy_series, loss_series = zip(*stats)
    batch_axis = np.arange(len(stats))

    # upper panel: accuracy
    plt.subplot(2, 1, 1)
    plt.plot(batch_axis, accuracy_series)
    plt.title('Training BERTClassifier')
    plt.ylabel('Accuracy')

    # lower panel: loss, sharing the batch index as x axis
    plt.subplot(2, 1, 2)
    plt.plot(batch_axis, loss_series)
    plt.xlabel('Batches')
    plt.ylabel('Loss')

    plt.show()

### Evaluate

In [41]:
def compute_metrics(conf_mat, precision=3, dump=True):
    """Derive accuracy, precision, recall and F1 from a binary confusion matrix.

    Args:
        conf_mat: 2x2 confusion matrix laid out as ``[[tn, fp], [fn, tp]]``
            (any array-like with exactly four entries).
        precision: number of decimals used when printing.
        dump: when true, print the four metrics, one per line.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)`` of floats. A ratio whose
        denominator is zero (e.g. no positive predictions) is reported as
        ``0.0`` instead of ``nan``/``inf``.

    Raises:
        ValueError: if ``conf_mat`` does not contain exactly four entries.
    """
    cells = np.asarray(conf_mat).ravel()
    if cells.size != 4:
        raise ValueError(
            "compute_metrics expects a 2x2 confusion matrix, got shape {}".format(
                np.asarray(conf_mat).shape))
    tn, fp, fn, tp = (float(cell) for cell in cells)

    def _ratio(numerator, denominator):
        # an undefined ratio counts as 0.0 so downstream output never shows nan
        return numerator / denominator if denominator else 0.0

    acc = _ratio(tp + tn, tp + tn + fp + fn)
    prec = _ratio(tp, tp + fp)
    rec = _ratio(tp, tp + fn)
    f1 = _ratio(2 * (prec * rec), prec + rec)

    if dump:
        # note: the keyword `prec` below is the number of decimals, not the metric
        print("{:>10}: {:.{prec}f}".format("accuracy", acc, prec=precision))
        print("{:>10}: {:.{prec}f}".format("precision", prec, prec=precision))
        print("{:>10}: {:.{prec}f}".format("recall", rec, prec=precision))
        print("{:>10}: {:.{prec}f}".format("f1-score", f1, prec=precision))

    return prec, rec, f1, acc

In [42]:
def heatconmat(y_test, y_pred):
    """Draw the confusion matrix of `y_pred` against `y_test` as a heatmap.

    The class labels are the sorted union of the values in both inputs, so
    the tick labels stay aligned with the matrix rows and columns even when
    `y_pred` contains a class that never occurs in `y_test`.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
    """
    # sorted union of both label sets; passing it to confusion_matrix pins the
    # row/column order that the tick labels describe
    labels = np.union1d(y_test, y_pred)
    tick_labels = labels.tolist()

    sns.set_context('talk')
    plt.figure(figsize=(9, 6))
    ax = sns.heatmap(confusion_matrix(y_test, y_pred, labels=labels),
                     annot=True,
                     fmt='d',
                     cbar=False,
                     cmap='gist_earth_r',
                     xticklabels=tick_labels,
                     yticklabels=tick_labels)
    # rows are true labels, columns are predictions
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.show()


def report_training_results(y_test, y_pred, name=None, heatmap=True):
    """Print an evaluation report and return rounded F1 scores.

    Prints the confusion matrix, the metrics derived from it, optionally a
    heatmap, the rounded accuracy and the per-class classification report.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        name: optional run name shown in the report header.
        heatmap: when true, also draw the confusion-matrix heatmap.

    Returns:
        Dict with the keys ``'macro'`` and ``'micro'`` holding the respective
        F1 scores rounded to two decimals.
    """
    matrix = confusion_matrix(y_test, y_pred)

    print('Confusion Matrix:')
    print(matrix)
    print()
    compute_metrics(matrix)
    if heatmap:
        heatconmat(y_test, y_pred)
    print()
    print('Accuracy: ', round(accuracy_score(y_test, y_pred), 2), '\n')

    header_suffix = " for [{}]".format(name) if name else ""
    print('Report{}:'.format(header_suffix))
    print(classification_report(y_test, y_pred))

    # macro first, then micro
    return {
        averaging: round(
            f1_score(y_pred=y_pred, y_true=y_test, average=averaging), 2)
        for averaging in ('macro', 'micro')
    }

### (B.1) Within distinct datasets

In [None]:
# (B.1) Fine-tune on the "within" distinct split and evaluate on its dev part
# after every training round.
with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = load_distinct_data("within")

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    # print_infos(vocabulary, data_train_raw, data_train)

run_name = "within_traindev_epi512_BCE_distinct"
# Derive the run directory from run_name (instead of a shell `mkdir` carrying
# a second copy of the name) so the two cannot drift apart; exist_ok keeps
# the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    # print_infos(vocabulary, data_dev_raw, data_dev)

for epoch_id in range(3):
    # NOTE(review): num_epochs grows by one per round; presumably train()
    # resumes from the checkpoints in checkpoint_dir rather than starting
    # over -- confirm against train().
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
        # predict() resets `metric` first, so this value covers the dev set only
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - distinct within BCE epilog", heatmap=False)

    # overwrite the latest-weights file after every round
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Re-evaluate the saved (B.1) weights on the dev split.
# This cell defines only run_name; model, ctx, metric, loss_function,
# data_dev and all_labels must already exist in the kernel (the training
# cell above assigns them).
run_name = "within_traindev_epi512_BCE_distinct"
with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # restore the weights written by the training loop
    model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    # predict() resets `metric` first, so this value covers the dev set only
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - distinct within BCE epilog", heatmap=False)

```
Time for [prediction]: 0:00:57.187743

Accuracy: 0.6220657276995305

Confusion Matrix:
[[368 224]
 [259 427]]

Accuracy:  0.62 

Report for [BERTClassifier - distinct within 512 BCE epilog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.59      0.62      0.60       592
           1       0.66      0.62      0.64       686

    accuracy                           0.62      1278
   macro avg       0.62      0.62      0.62      1278
weighted avg       0.62      0.62      0.62      1278

Time for [6 - evaluate]: 0:00:57.491961
```

### (B.2) Cross distinct datasets

In [None]:
# (B.2) Fine-tune on the "cross" distinct split and evaluate on its dev part
# after every training round.
with Timer("1 - load cross test/train"):
    X_train, X_dev, y_train, y_dev = load_distinct_data("cross")

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    # print_infos(vocabulary, data_train_raw, data_train)

run_name = "cross_traindev_epi512_BCE_distinct"
# Derive the run directory from run_name (instead of a shell `mkdir` carrying
# a second copy of the name) so the two cannot drift apart; exist_ok keeps
# the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    # print_infos(vocabulary, data_dev_raw, data_dev)

for epoch_id in range(3):
    # NOTE(review): num_epochs grows by one per round; presumably train()
    # resumes from the checkpoints in checkpoint_dir rather than starting
    # over -- confirm against train().
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
        # predict() resets `metric` first, so this value covers the dev set only
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - distinct cross BCE epilog", heatmap=False)

    # overwrite the latest-weights file after every round
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Re-evaluate the saved (B.2) weights on the dev split.
# This cell defines only run_name; model, ctx, metric, loss_function,
# data_dev and all_labels must already exist in the kernel (the training
# cell above assigns them).
run_name = "cross_traindev_epi512_BCE_distinct"
with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # restore the weights written by the training loop
    model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    # predict() resets `metric` first, so this value covers the dev set only
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - distinct cross BCE epilog", heatmap=False)

```
Time for [prediction]: 0:02:32.090311
Accuracy: 0.7154929577464789

Confusion Matrix:
[[1449  386]
 [ 523  837]]

Accuracy:  0.72 

Report for [BERTClassifier - distinct cross BCE 512 epilog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.73      0.79      0.76      1835
           1       0.68      0.62      0.65      1360

    accuracy                           0.72      3195
   macro avg       0.71      0.70      0.70      3195
weighted avg       0.71      0.72      0.71      3195

Time for [6 - evaluate]: 0:02:32.553864
```

In [None]:
# Recompute accuracy/precision/recall/F1 from the confusion matrix shown in
# the (B.2) run log above.
compute_metrics([[1449, 386], [523, 837]])

### (B.3) within, cluster distinct

In [None]:
# Load the cluster-distinct "within" split: the pickle file holds two objects
# written back to back, the train frame first and the dev frame second.
# NOTE(review): `pickle` is not among the imports of the first notebook cell;
# confirm it is imported before this cell runs.
# pickle.load executes arbitrary code from the file -- only use trusted files.
fn_cluster_distinct_within = "data/distinct_sets_gw/within_split_by_clusters.pkl"

with open(fn_cluster_distinct_within, "rb") as fp:
    within_train_df = pickle.load(fp)
    within_dev_df = pickle.load(fp)

In [None]:
# (B.3) Fine-tune on the cluster-distinct "within" split and evaluate on its
# dev part after every training round.
with Timer("1 - load within cluster distinct test/train"):
    fn_cluster_distinct_within = "data/distinct_sets_gw/within_split_by_clusters.pkl"

    # The file holds two pickled objects back to back: train frame, then dev
    # frame. pickle.load executes arbitrary code -- only use trusted files.
    # NOTE(review): `pickle` is not among the imports of the first notebook
    # cell; confirm it is imported before this cell runs.
    with open(fn_cluster_distinct_within, "rb") as fp:
        within_train_df = pickle.load(fp)
        within_dev_df = pickle.load(fp)

    X_train = within_train_df[names_columns_X]
    y_train = within_train_df[names_columns_y]
    X_dev = within_dev_df[names_columns_X]
    y_dev = within_dev_df[names_columns_y]


with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    # print_infos(vocabulary, data_train_raw, data_train)

run_name = "within_traindev_epi512_BCE_distinct_cluster"
# Derive the run directory from run_name (instead of a shell `mkdir` carrying
# a second copy of the name) so the two cannot drift apart; exist_ok keeps
# the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    # print_infos(vocabulary, data_dev_raw, data_dev)

for epoch_id in range(3):
    # NOTE(review): num_epochs grows by one per round; presumably train()
    # resumes from the checkpoints in checkpoint_dir rather than starting
    # over -- confirm against train().
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
        # predict() resets `metric` first, so this value covers the dev set only
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - distinct cluster within BCE 512 epilog", heatmap=False)

    # overwrite the latest-weights file after every round
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Rebuild the (B.3) data and a fresh model without training, so the saved
# weights can be evaluated in the next cell.
with Timer("1 - load within cluster distinct test/train"):
    fn_cluster_distinct_within = "data/distinct_sets_gw/within_split_by_clusters.pkl"

    # The file holds two pickled objects back to back: train frame, then dev
    # frame. pickle.load executes arbitrary code -- only use trusted files.
    # NOTE(review): `pickle` is not among the imports of the first notebook
    # cell; confirm it is imported before this cell runs.
    with open(fn_cluster_distinct_within, "rb") as fp:
        within_train_df = pickle.load(fp)
        within_dev_df = pickle.load(fp)
    
    X_train = within_train_df[names_columns_X]
    y_train = within_train_df[names_columns_y]
    X_dev = within_dev_df[names_columns_X]
    y_dev = within_dev_df[names_columns_y]


with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

# only the dev split is transformed here; the training data is not needed
with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

In [None]:
# Evaluate the saved (B.3) weights on the dev split.
# This cell defines only run_name; model, ctx, metric, loss_function,
# data_dev and all_labels must already exist in the kernel (the cell above
# assigns them).
run_name = "within_traindev_epi512_BCE_distinct_cluster"
with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # restore the weights written by the training loop
    model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    # predict() resets `metric` first, so this value covers the dev set only
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - distinct cluster within BCE 512 epilog (3 epochs)", heatmap=False)

```
Time for [prediction]: 0:17:08.991174
Accuracy: 0.6497847321883247

Confusion Matrix:
[[8815 3593]
 [3972 5221]]

Accuracy:  0.65 

Report for [BERTClassifier - distinct cluster within BCE 512 epilog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.69      0.71      0.70     12408
           1       0.59      0.57      0.58      9193

    accuracy                           0.65     21601
   macro avg       0.64      0.64      0.64     21601
weighted avg       0.65      0.65      0.65     21601

Time for [6 - evaluate]: 0:17:11.581655
```

In [None]:
# Recompute accuracy/precision/recall/F1 from the confusion matrix shown in
# the (B.3) run log above.
compute_metrics([[8815, 3593], [3972, 5221]])

---

### (C.1) Within Pro 128 BCE

In [None]:
# (C.1) Fine-tune the pro128 BCE setup on a 0.1-ratio split of the "within"
# data and evaluate on the dev part after every training round.
with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro128bce()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)

run_name = "within_traindev_pro128_BCE"
# Derive the run directory from run_name (instead of a shell `mkdir` carrying
# a second copy of the name) so the two cannot drift apart; exist_ok keeps
# the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

for epoch_id in range(5):
    # NOTE(review): num_epochs grows by one per round; presumably train()
    # resumes from the checkpoints in checkpoint_dir rather than starting
    # over -- confirm against train().
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
        # predict() resets `metric` first, so this value covers the dev set only
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - within BCE 128 prolog", heatmap=False)

    # overwrite the latest-weights file after every round
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Stand-alone evaluation of the (C.1) run: rebuild the split and a fresh
# model, then load stored weights and score the dev part.
run_name = "within_traindev_pro128_BCE"

# NOTE(review): the split is regenerated here; it only equals the one used
# for training if get_train_test_sets is deterministic -- confirm.
with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro128bce()

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # checkpoint4 is presumably the checkpoint written after the fifth
    # (index 4) epoch, matching the "epoch 5" label below -- confirm in train()
    model.load_parameters("data/" + run_name + "/bert.model.checkpoint4.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
    # predict() resets `metric` first, so this value covers the dev set only
    print("Accuracy in epoch 5:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - within BCE 128 prolog", heatmap=False)

```
Time for [prediction]: 0:01:02.161065
Accuracy in epoch 2: 0.8490064152714755
Confusion Matrix:
[[2669  290]
 [ 675 2757]]

Accuracy:  0.85 

Report for [BERTClassifier - within 0.1 BCE 128 prolog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.80      0.90      0.85      2959
           1       0.90      0.80      0.85      3432

    accuracy                           0.85      6391
   macro avg       0.85      0.85      0.85      6391
weighted avg       0.86      0.85      0.85      6391

Time for [6 - evaluate - 2]: 0:01:02.227211
```

```
Time for [prediction]: 0:00:59.388592
Accuracy in epoch 5: 0.8662181192301674
Confusion Matrix:
[[2587  372]
 [ 483 2949]]

  accuracy: 0.866
 precision: 0.888
    recall: 0.859
  f1-score: 0.873

Accuracy:  0.87 

Report for [BERTClassifier - within BCE 128 prolog (5 epochs)]:
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      2959
           1       0.89      0.86      0.87      3432

    accuracy                           0.87      6391
   macro avg       0.87      0.87      0.87      6391
weighted avg       0.87      0.87      0.87      6391

Time for [6 - evaluate]: 0:00:59.567344
```

### (C.2) Cross Pro 128 BCE

In [None]:
# (C.2) Fine-tune the pro128 BCE setup on a 0.1-ratio split of the "cross"
# data and evaluate on the dev part after every training round.
with Timer("1 - load cross test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro128bce()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)

run_name = "cross_traindev_pro128_BCE"
# Derive the run directory from run_name (instead of a shell `mkdir` carrying
# a second copy of the name) so the two cannot drift apart; exist_ok keeps
# the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

for epoch_id in range(5):
    # NOTE(review): num_epochs grows by one per round; presumably train()
    # resumes from the checkpoints in checkpoint_dir rather than starting
    # over -- confirm against train().
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
        # predict() resets `metric` first, so this value covers the dev set only
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - cross BCE 128 prolog", heatmap=False)

    # overwrite the latest-weights file after every round
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Stand-alone evaluation of the (C.2) run: rebuild the split and a fresh
# model, then load stored weights and score the dev part.
run_name = "cross_traindev_pro128_BCE"

# NOTE(review): the split is regenerated here; it only equals the one used
# for training if get_train_test_sets is deterministic -- confirm.
with Timer("1 - load cross test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro128bce()

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # alternative: the latest-weights file written by the training loop
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # checkpoint4 is presumably the checkpoint written after the fifth
    # (index 4) epoch, matching the "epoch 5" label below -- confirm in train()
    model.load_parameters("data/" + run_name + "/bert.model.checkpoint4.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
    # predict() resets `metric` first, so this value covers the dev set only
    print("Accuracy in epoch 5:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - cross BCE 128 prolog", heatmap=False)

```
Time for [prediction]: 0:00:58.160635
Accuracy in epoch 4: 0.8014742014742015
Confusion Matrix:
[[2606  418]
 [ 794 2287]]

Accuracy:  0.8 

Report for [BERTClassifier - cross BCE 128 prolog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      3024
           1       0.85      0.74      0.79      3081

    accuracy                           0.80      6105
   macro avg       0.81      0.80      0.80      6105
weighted avg       0.81      0.80      0.80      6105

Time for [6 - evaluate - 4]: 0:00:58.344300
```

```
Time for [prediction]: 0:00:59.223507
Accuracy in epoch 4: 0.8134316134316134
Confusion Matrix:
[[2439  585]
 [ 554 2527]]

  accuracy: 0.813
 precision: 0.812
    recall: 0.820
  f1-score: 0.816

Accuracy:  0.81 

Report for [BERTClassifier - cross BCE 128 prolog (5 epochs)]:
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      3024
           1       0.81      0.82      0.82      3081

    accuracy                           0.81      6105
   macro avg       0.81      0.81      0.81      6105
weighted avg       0.81      0.81      0.81      6105

Time for [6 - evaluate - 4]: 0:00:59.415538
```

### (C.3) Within Epi 128 BCE

In [None]:
# (C.3) Fine-tune the epi128 BCE setup on a 0.1-ratio split of the "within"
# data and evaluate on the dev part after every training round.
with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi128bce()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)

run_name = "within_traindev_epi128_BCE"
# Derive the run directory from run_name (instead of a shell `mkdir` carrying
# a second copy of the name) so the two cannot drift apart; exist_ok keeps
# the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

for epoch_id in range(5):
    # NOTE(review): num_epochs grows by one per round; presumably train()
    # resumes from the checkpoints in checkpoint_dir rather than starting
    # over -- confirm against train().
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
        # predict() resets `metric` first, so this value covers the dev set only
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - within BCE 128 epilog", heatmap=False)

    # overwrite the latest-weights file after every round
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Stand-alone evaluation of the (C.3) run: rebuild the split and a fresh
# model, then load stored weights and score the dev part.
run_name = "within_traindev_epi128_BCE"

# NOTE(review): the split is regenerated here; it only equals the one used
# for training if get_train_test_sets is deterministic -- confirm.
with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi128bce()

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # alternative: the latest-weights file written by the training loop
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # checkpoint4 is presumably the checkpoint written after the fifth
    # (index 4) epoch, matching the "epoch 5" label below -- confirm in train()
    model.load_parameters("data/" + run_name + "/bert.model.checkpoint4.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
    # predict() resets `metric` first, so this value covers the dev set only
    print("Accuracy in epoch 5:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - within BCE 128 epilog", heatmap=False)

```
Time for [prediction]: 0:01:01.648894
Accuracy in epoch 4: 0.8543263964950711
Confusion Matrix:
[[2603  356]
 [ 575 2857]]

Accuracy:  0.85 

Report for [BERTClassifier - within BCE 128 epilog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      2959
           1       0.89      0.83      0.86      3432

    accuracy                           0.85      6391
   macro avg       0.85      0.86      0.85      6391
weighted avg       0.86      0.85      0.85      6391

Time for [6 - evaluate - 4]: 0:01:01.835131
```

```
Time for [prediction]: 0:00:59.473044
Accuracy in epoch 5: 0.8709122203098106
Confusion Matrix:
[[2577  382]
 [ 443 2989]]

  accuracy: 0.871
 precision: 0.887
    recall: 0.871
  f1-score: 0.879

Accuracy:  0.87 

Report for [BERTClassifier - within BCE 128 epilog (5 epochs)]:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      2959
           1       0.89      0.87      0.88      3432

    accuracy                           0.87      6391
   macro avg       0.87      0.87      0.87      6391
weighted avg       0.87      0.87      0.87      6391

Time for [6 - evaluate]: 0:00:59.655873
```

### (C.4) Cross Epi 128 BCE

In [None]:
# (C.4) Fine-tune the epi128 BCE setup on a 0.1-ratio split of the "cross"
# data and evaluate on the dev part after every training round.
with Timer("1 - load cross test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi128bce()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)

run_name = "cross_traindev_epi128_BCE"
# Derive the run directory from run_name (instead of a shell `mkdir` carrying
# a second copy of the name) so the two cannot drift apart; exist_ok keeps
# the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

for epoch_id in range(5):
    # NOTE(review): num_epochs grows by one per round; presumably train()
    # resumes from the checkpoints in checkpoint_dir rather than starting
    # over -- confirm against train().
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
        # predict() resets `metric` first, so this value covers the dev set only
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - cross BCE 128 epilog", heatmap=False)

    # overwrite the latest-weights file after every round
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Re-evaluate the stored checkpoint 4 (i.e. after the fifth epoch) of the
# cross-topic 128 epilog run on the held-out dev split.
run_name = "cross_traindev_epi128_BCE"

with Timer("1 - load cross test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi128bce()

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # Alternative: the last saved weights
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    model.load_parameters("data/" + run_name + "/bert.model.checkpoint4.params", ctx=ctx)
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
    print("Accuracy in epoch 5:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    # Report label carries the epoch count, like the other checkpoint
    # evaluation cells, so reports from different checkpoints stay distinguishable.
    report_training_results(y_true, y_pred, name="BERTClassifier - cross BCE 128 epilog (5 epochs)", heatmap=False)

```
Time for [prediction]: 0:01:02.160098
Accuracy in epoch 2: 0.8610974610974611
Confusion Matrix:
[[2679  345]
 [ 503 2578]]

Accuracy:  0.86 

Report for [BERTClassifier - cross BCE 128 epilog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      3024
           1       0.88      0.84      0.86      3081

    accuracy                           0.86      6105
   macro avg       0.86      0.86      0.86      6105
weighted avg       0.86      0.86      0.86      6105

Time for [6 - evaluate - 4]: 0:01:02.347843
```

```
Time for [prediction]: 0:00:59.955945
Accuracy in epoch 5: 0.884029484029484
Confusion Matrix:
[[2652  372]
 [ 336 2745]]

  accuracy: 0.884
 precision: 0.881
    recall: 0.891
  f1-score: 0.886

Accuracy:  0.88 

Report for [BERTClassifier - cross BCE 128 epilog (5 epochs)]:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      3024
           1       0.88      0.89      0.89      3081

    accuracy                           0.88      6105
   macro avg       0.88      0.88      0.88      6105
weighted avg       0.88      0.88      0.88      6105

Time for [6 - evaluate]: 0:01:00.147448
```

### (D.1) Within 512 Pro

In [None]:
# (D.1) Fine-tune on the within-topic train/dev split (10% held out) with the
# 512 prolog setup and evaluate on the held-out part after every training call.
with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro512bce()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)

run_name = "within_traindev_pro512_BCE"
# Create the run directory from run_name; exist_ok keeps the cell re-runnable
# (a plain `mkdir` errors out when the directory is already there).
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

for epoch_id in range(5):
    # NOTE(review): train() is called with num_epochs=epoch_id + 1 on every
    # iteration; how train() treats that across repeated calls is not visible
    # here -- confirm it continues training rather than repeating epochs.
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - within BCE 512 prolog", heatmap=False)

    # Overwritten on every iteration, so the file always holds the latest weights.
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Re-evaluate the stored checkpoint 4 of the within-topic 512 prolog run
# on the held-out dev split.
run_name = "within_traindev_pro512_BCE"

with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    (model, vocabulary, ctx, tokenizer, transform,
     loss_function, metric, all_labels) = setup_bert_pro512bce()

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"):
    with SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        # The per-epoch checkpoint is loaded; the last saved weights would be
        # "data/" + run_name + "/bert.model.params".
        model.load_parameters("data/" + run_name + "/bert.model.checkpoint4.params", ctx=ctx)
        all_predictions, cum_loss = predict(
            model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
        print("Accuracy in epoch 5:", metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(
            y_true, y_pred,
            name="BERTClassifier - within BCE 512 prolog (5 epochs)",
            heatmap=False)

```
Time for [prediction]: 0:03:17.058585
Accuracy in epoch 5: 0.9184791112501955
Confusion Matrix:
[[2736  223]
 [ 298 3134]]

  accuracy: 0.918
 precision: 0.934
    recall: 0.913
  f1-score: 0.923

Accuracy:  0.92 

Report for [BERTClassifier - within BCE 512 prolog (5 epochs)]:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2959
           1       0.93      0.91      0.92      3432

    accuracy                           0.92      6391
   macro avg       0.92      0.92      0.92      6391
weighted avg       0.92      0.92      0.92      6391

Time for [6 - evaluate]: 0:03:17.400522
```

### (D.3) Within 512 Epi

In [None]:
# (D.3) Fine-tune on the within-topic train/dev split (10% held out) with the
# 512 epilog setup and evaluate on the held-out part after every training call.
with Timer("1 - load within test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi512bce()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)

run_name = "within_traindev_epi512_BCE"
# Create the run directory from run_name; exist_ok avoids the
# "cannot create directory ... File exists" error of `mkdir` on a re-run.
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

for epoch_id in range(5):
    # NOTE(review): train() is called with num_epochs=epoch_id + 1 on every
    # iteration; how train() treats that across repeated calls is not visible
    # here -- confirm it continues training rather than repeating epochs.
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - within BCE 512 epilog", heatmap=False)

    # Overwritten on every iteration, so the file always holds the latest weights.
    model.save_parameters("data/" + run_name + "/bert.model.params")

Time for [1 - load within test/train]: 0:00:00.019201
Time for [2 - setup BERT model]: 0:00:02.404405
Time for [3 - prepare training data]: 0:00:03.082411
mkdir: cannot create directory ‘data/within_traindev_epi512_BCE’: File exists
Time for [5 - prepare eval data]: 0:00:00.350073


HBox(children=(IntProgress(value=0, max=57512), HTML(value='')))


Time for [setup training]: 0:04:03.167932


HBox(children=(IntProgress(value=0, max=9590), HTML(value='')))

2019-12-06 15:08:15,371 : INFO : successfully opened events file: data/within_traindev_epi512_BCE/events.out.tfevents.1575641295.cuda
2019-12-06 15:08:15,396 : INFO : wrote 1 event to disk
2019-12-06 15:08:15,398 : INFO : wrote 1 event to disk
2019-12-06 15:09:15,722 : INFO : wrote 138 events to disk
2019-12-06 15:10:16,201 : INFO : wrote 138 events to disk
2019-12-06 15:11:16,815 : INFO : wrote 138 events to disk
2019-12-06 15:12:17,443 : INFO : wrote 138 events to disk
2019-12-06 15:13:18,284 : INFO : wrote 138 events to disk
2019-12-06 15:14:19,089 : INFO : wrote 138 events to disk
2019-12-06 15:15:19,215 : INFO : wrote 136 events to disk


[Epoch 0 Batch 500/9590] loss=0.6767, lr=0.0000050, acc=0.577 - time 0:07:19.848978


2019-12-06 15:16:19,781 : INFO : wrote 138 events to disk
2019-12-06 15:17:20,603 : INFO : wrote 138 events to disk
2019-12-06 15:18:20,890 : INFO : wrote 136 events to disk
2019-12-06 15:19:21,321 : INFO : wrote 138 events to disk
2019-12-06 15:20:21,853 : INFO : wrote 138 events to disk
2019-12-06 15:21:22,360 : INFO : wrote 138 events to disk
2019-12-06 15:22:22,511 : INFO : wrote 136 events to disk


[Epoch 0 Batch 1000/9590] loss=0.6141, lr=0.0000050, acc=0.615 - time 0:07:20.038714


2019-12-06 15:23:22,736 : INFO : wrote 138 events to disk
2019-12-06 15:24:23,322 : INFO : wrote 138 events to disk
2019-12-06 15:25:23,895 : INFO : wrote 138 events to disk
2019-12-06 15:26:24,037 : INFO : wrote 136 events to disk
2019-12-06 15:27:24,065 : INFO : wrote 136 events to disk
2019-12-06 15:28:24,227 : INFO : wrote 138 events to disk
2019-12-06 15:29:24,790 : INFO : wrote 138 events to disk


[Epoch 0 Batch 1500/9590] loss=0.5399, lr=0.0000050, acc=0.652 - time 0:07:19.412128


2019-12-06 15:30:24,910 : INFO : wrote 136 events to disk
2019-12-06 15:31:24,996 : INFO : wrote 136 events to disk
2019-12-06 15:32:25,292 : INFO : wrote 136 events to disk
2019-12-06 15:33:25,453 : INFO : wrote 136 events to disk
2019-12-06 15:34:26,219 : INFO : wrote 138 events to disk
2019-12-06 15:35:26,455 : INFO : wrote 136 events to disk
2019-12-06 15:36:27,241 : INFO : wrote 138 events to disk
2019-12-06 15:37:27,711 : INFO : wrote 138 events to disk


[Epoch 0 Batch 2000/9590] loss=0.4622, lr=0.0000050, acc=0.683 - time 0:07:21.237450


2019-12-06 15:38:28,394 : INFO : wrote 138 events to disk
2019-12-06 15:39:28,457 : INFO : wrote 136 events to disk
2019-12-06 15:40:29,293 : INFO : wrote 138 events to disk
2019-12-06 15:41:29,917 : INFO : wrote 138 events to disk
2019-12-06 15:42:30,752 : INFO : wrote 138 events to disk
2019-12-06 15:43:31,115 : INFO : wrote 138 events to disk
2019-12-06 15:44:31,380 : INFO : wrote 136 events to disk


[Epoch 0 Batch 2500/9590] loss=0.3724, lr=0.0000050, acc=0.712 - time 0:07:20.319506


2019-12-06 15:45:31,469 : INFO : wrote 138 events to disk
2019-12-06 15:46:31,519 : INFO : wrote 136 events to disk
2019-12-06 15:47:31,934 : INFO : wrote 138 events to disk
2019-12-06 15:48:32,216 : INFO : wrote 136 events to disk
2019-12-06 15:49:32,919 : INFO : wrote 138 events to disk
2019-12-06 15:50:33,652 : INFO : wrote 138 events to disk
2019-12-06 15:51:33,660 : INFO : wrote 136 events to disk


[Epoch 0 Batch 3000/9590] loss=0.3556, lr=0.0000050, acc=0.734 - time 0:07:20.194741


2019-12-06 15:52:33,700 : INFO : wrote 136 events to disk
2019-12-06 15:53:33,732 : INFO : wrote 136 events to disk
2019-12-06 15:54:33,879 : INFO : wrote 136 events to disk
2019-12-06 15:55:34,193 : INFO : wrote 136 events to disk
2019-12-06 15:56:34,205 : INFO : wrote 136 events to disk
2019-12-06 15:57:34,670 : INFO : wrote 138 events to disk
2019-12-06 15:58:35,454 : INFO : wrote 138 events to disk
2019-12-06 15:59:36,021 : INFO : wrote 136 events to disk


[Epoch 0 Batch 3500/9590] loss=0.3260, lr=0.0000050, acc=0.750 - time 0:07:21.639886


2019-12-06 16:00:36,750 : INFO : wrote 136 events to disk
2019-12-06 16:01:37,117 : INFO : wrote 136 events to disk
2019-12-06 16:02:37,250 : INFO : wrote 136 events to disk
2019-12-06 16:03:38,028 : INFO : wrote 138 events to disk
2019-12-06 16:04:38,787 : INFO : wrote 138 events to disk
2019-12-06 16:05:39,571 : INFO : wrote 138 events to disk
2019-12-06 16:06:40,455 : INFO : wrote 138 events to disk


[Epoch 0 Batch 4000/9590] loss=0.3452, lr=0.0000050, acc=0.761 - time 0:07:22.273317


2019-12-06 16:07:40,710 : INFO : wrote 136 events to disk
2019-12-06 16:08:41,550 : INFO : wrote 138 events to disk
2019-12-06 16:09:41,996 : INFO : wrote 136 events to disk
2019-12-06 16:10:42,265 : INFO : wrote 136 events to disk
2019-12-06 16:11:43,080 : INFO : wrote 138 events to disk
2019-12-06 16:12:43,174 : INFO : wrote 136 events to disk
2019-12-06 16:13:43,614 : INFO : wrote 136 events to disk


[Epoch 0 Batch 4500/9590] loss=0.3475, lr=0.0000050, acc=0.769 - time 0:07:22.224663


2019-12-06 16:14:44,301 : INFO : wrote 138 events to disk
2019-12-06 16:15:45,132 : INFO : wrote 138 events to disk
2019-12-06 16:16:45,285 : INFO : wrote 136 events to disk
2019-12-06 16:17:45,868 : INFO : wrote 138 events to disk
2019-12-06 16:19:47,251 : INFO : wrote 138 events to disk
2019-12-06 16:20:47,775 : INFO : wrote 136 events to disk


[Epoch 0 Batch 5000/9590] loss=0.3316, lr=0.0000050, acc=0.778 - time 0:07:21.045190


2019-12-06 16:21:48,605 : INFO : wrote 138 events to disk
2019-12-06 16:22:48,640 : INFO : wrote 136 events to disk
2019-12-06 16:23:49,357 : INFO : wrote 138 events to disk
2019-12-06 16:24:49,523 : INFO : wrote 138 events to disk
2019-12-06 16:25:49,558 : INFO : wrote 138 events to disk
2019-12-06 16:26:49,716 : INFO : wrote 136 events to disk
2019-12-06 16:27:50,328 : INFO : wrote 138 events to disk
2019-12-06 16:28:50,975 : INFO : wrote 138 events to disk


[Epoch 0 Batch 5500/9590] loss=0.2788, lr=0.0000050, acc=0.786 - time 0:07:19.259725


2019-12-06 16:29:51,087 : INFO : wrote 136 events to disk
2019-12-06 16:30:51,849 : INFO : wrote 138 events to disk
2019-12-06 16:31:51,958 : INFO : wrote 136 events to disk
2019-12-06 16:32:52,489 : INFO : wrote 138 events to disk
2019-12-06 16:33:52,741 : INFO : wrote 136 events to disk
2019-12-06 16:34:53,217 : INFO : wrote 138 events to disk
2019-12-06 16:35:53,653 : INFO : wrote 136 events to disk


[Epoch 0 Batch 6000/9590] loss=0.2833, lr=0.0000050, acc=0.793 - time 0:07:21.022062


2019-12-06 16:36:54,151 : INFO : wrote 138 events to disk
2019-12-06 16:37:54,611 : INFO : wrote 140 events to disk
2019-12-06 16:38:54,705 : INFO : wrote 136 events to disk
2019-12-06 16:39:55,383 : INFO : wrote 138 events to disk
2019-12-06 16:40:56,005 : INFO : wrote 138 events to disk
2019-12-06 16:41:56,704 : INFO : wrote 138 events to disk
2019-12-06 16:42:56,711 : INFO : wrote 136 events to disk


[Epoch 0 Batch 6500/9590] loss=0.2924, lr=0.0000050, acc=0.798 - time 0:07:19.510354


2019-12-06 16:43:57,216 : INFO : wrote 136 events to disk
2019-12-06 16:44:58,082 : INFO : wrote 138 events to disk
2019-12-06 16:45:58,159 : INFO : wrote 136 events to disk
2019-12-06 16:46:58,333 : INFO : wrote 136 events to disk
2019-12-06 16:47:58,434 : INFO : wrote 136 events to disk
2019-12-06 16:48:58,730 : INFO : wrote 136 events to disk
2019-12-06 16:49:58,827 : INFO : wrote 136 events to disk
2019-12-06 16:50:59,120 : INFO : wrote 136 events to disk


[Epoch 0 Batch 7000/9590] loss=0.3004, lr=0.0000050, acc=0.803 - time 0:07:22.166044


2019-12-06 16:51:59,304 : INFO : wrote 136 events to disk
2019-12-06 16:52:59,609 : INFO : wrote 138 events to disk
2019-12-06 16:53:59,730 : INFO : wrote 136 events to disk
2019-12-06 16:55:00,670 : INFO : wrote 138 events to disk
2019-12-06 16:56:01,412 : INFO : wrote 138 events to disk
2019-12-06 16:57:01,916 : INFO : wrote 138 events to disk
2019-12-06 16:58:02,840 : INFO : wrote 138 events to disk


[Epoch 0 Batch 7500/9590] loss=0.2758, lr=0.0000050, acc=0.808 - time 0:07:20.224421


2019-12-06 16:59:03,212 : INFO : wrote 138 events to disk
2019-12-06 17:00:03,955 : INFO : wrote 138 events to disk
2019-12-06 17:01:04,084 : INFO : wrote 138 events to disk
2019-12-06 17:02:04,342 : INFO : wrote 136 events to disk
2019-12-06 17:03:04,715 : INFO : wrote 138 events to disk
2019-12-06 17:04:05,180 : INFO : wrote 138 events to disk
2019-12-06 17:05:05,566 : INFO : wrote 136 events to disk


[Epoch 0 Batch 8000/9590] loss=0.2786, lr=0.0000050, acc=0.812 - time 0:07:19.779985


2019-12-06 17:06:05,676 : INFO : wrote 136 events to disk
2019-12-06 17:07:06,103 : INFO : wrote 136 events to disk
2019-12-06 17:08:06,207 : INFO : wrote 136 events to disk
2019-12-06 17:09:06,473 : INFO : wrote 136 events to disk
2019-12-06 17:10:06,986 : INFO : wrote 136 events to disk
2019-12-06 17:11:07,362 : INFO : wrote 138 events to disk
2019-12-06 17:12:07,703 : INFO : wrote 136 events to disk


[Epoch 0 Batch 8500/9590] loss=0.2887, lr=0.0000050, acc=0.815 - time 0:07:22.386638


2019-12-06 17:13:08,624 : INFO : wrote 138 events to disk
2019-12-06 17:14:09,176 : INFO : wrote 138 events to disk
2019-12-06 17:15:09,214 : INFO : wrote 136 events to disk
2019-12-06 17:16:09,249 : INFO : wrote 136 events to disk
2019-12-06 17:17:10,103 : INFO : wrote 138 events to disk
2019-12-06 17:18:10,942 : INFO : wrote 138 events to disk
2019-12-06 17:19:10,952 : INFO : wrote 136 events to disk
2019-12-06 17:20:11,827 : INFO : wrote 138 events to disk


[Epoch 0 Batch 9000/9590] loss=0.2693, lr=0.0000050, acc=0.818 - time 0:07:20.954757


2019-12-06 17:21:12,302 : INFO : wrote 138 events to disk


### (D.2) Cross 512 Pro

In [None]:
# (D.2) Fine-tune on the cross-topic train/dev split (10% held out) with the
# 512 prolog setup and evaluate on the held-out part after every training call.
with Timer("1 - load cross test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro512bce()

with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)

run_name = "cross_traindev_pro512_BCE"
# Create the run directory from run_name; exist_ok keeps the cell re-runnable
# (a plain `mkdir` errors out when the directory is already there).
os.makedirs("data/" + run_name, exist_ok=True)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

for epoch_id in range(5):
    # NOTE(review): train() is called with num_epochs=epoch_id + 1 on every
    # iteration; how train() treats that across repeated calls is not visible
    # here -- confirm it continues training rather than repeating epochs.
    with Timer("4 - train model - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=epoch_id + 1, sw=sw, checkpoint_dir="data/" + run_name)
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - cross BCE 512 prolog", heatmap=False)

    # Overwritten on every iteration, so the file always holds the latest weights.
    model.save_parameters("data/" + run_name + "/bert.model.params")

In [None]:
# Re-evaluate the stored checkpoint 4 of the cross-topic 512 prolog run
# on the held-out dev split.
run_name = "cross_traindev_pro512_BCE"

with Timer("1 - load cross test/train"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df, ratio=0.1)

with Timer("2 - setup BERT model"):
    (model, vocabulary, ctx, tokenizer, transform,
     loss_function, metric, all_labels) = setup_bert_pro512bce()

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"):
    with SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        # The per-epoch checkpoint is loaded; the last saved weights would be
        # "data/" + run_name + "/bert.model.params".
        model.load_parameters("data/" + run_name + "/bert.model.checkpoint4.params", ctx=ctx)
        all_predictions, cum_loss = predict(
            model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
        print("Accuracy in epoch 5:", metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(
            y_true, y_pred,
            name="BERTClassifier - cross BCE 512 prolog (5 epochs)",
            heatmap=False)

In [None]:
# Same evaluation for checkpoint 2 (third epoch). Relies on run_name, model,
# data_dev, ctx, metric, loss_function and all_labels from the preceding cell.
with Timer("6 - evaluate"):
    with SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
        # The last saved weights would be "data/" + run_name + "/bert.model.params".
        model.load_parameters("data/" + run_name + "/bert.model.checkpoint2.params", ctx=ctx)
        all_predictions, cum_loss = predict(
            model, data_dev, ctx, metric, loss_function, batch_size=32, sw=sw)
        print("Accuracy in epoch 3:", metric.get()[1])

        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(
            y_true, y_pred,
            name="BERTClassifier - cross BCE 512 prolog (3 epochs)",
            heatmap=False)

```
Time for [prediction]: 0:03:12.088763
Accuracy in epoch 3: 0.8520884520884521
Confusion Matrix:
[[2608  416]
 [ 487 2594]]

  accuracy: 0.852
 precision: 0.862
    recall: 0.842
  f1-score: 0.852

Accuracy:  0.85 

Report for [BERTClassifier - cross BCE 512 prolog (3 epochs)]:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      3024
           1       0.86      0.84      0.85      3081

    accuracy                           0.85      6105
   macro avg       0.85      0.85      0.85      6105
weighted avg       0.85      0.85      0.85      6105

Time for [6 - evaluate]: 0:03:12.321989
```

```
Time for [prediction]: 0:03:29.716794
Accuracy in epoch 5: 0.8665028665028665

Confusion Matrix:
[[2766  258]
 [ 557 2524]]

  accuracy: 0.867
 precision: 0.907
    recall: 0.819
  f1-score: 0.861

Accuracy:  0.87 

Report for [BERTClassifier - cross BCE 512 prolog (5 epochs)]:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      3024
           1       0.91      0.82      0.86      3081

    accuracy                           0.87      6105
   macro avg       0.87      0.87      0.87      6105
weighted avg       0.87      0.87      0.87      6105

Time for [6 - evaluate - 4]: 0:03:29.954158

```

---

### E - artificial evalset

In [None]:
# Columns of the artificial evalset that the evaluation cells below select as
# model input (X_dev) before calling transform_dataset.
names_columns_X2 = ['argument1', 'argument2']
# X_arteval_dev = artificial_evalset_df[names_columns_X]
# y_arteval_dev = artificial_evalset_df[names_columns_y]

In [None]:
# Evaluate the within-topic 512 prolog checkpoint on the artificial evalset and
# store its predictions as an additional column of the shared predictions TSV.
fn = "data/artificial_evalset/artificial_evalset.pred.tsv"
exp_name = "within_traindev_pro512_BCE"
run_name = "artificial_evalset_" + exp_name

# Idempotent replacement for `! mkdir`, which fails when the directory exists.
os.makedirs("data/artificial_evalset/" + exp_name, exist_ok=True)

# Continue from the predictions file if earlier experiments already wrote one;
# otherwise artificial_evalset_df must already be defined by an earlier cell.
if os.path.exists(fn):
    # read_csv with index_col=0, parse_dates=True is the documented equivalent
    # of the deprecated (and since removed) DataFrame.from_csv.
    artificial_evalset_df = pd.read_csv(fn, sep='\t', index_col=0, parse_dates=True)

with Timer("1 - load artificial test"):
    X_dev, y_dev = artificial_evalset_df[names_columns_X2], artificial_evalset_df[names_columns_y]

with Timer("2 - setup BERT model"):
    # The checkpoint loaded below comes from the pro512 run, which was trained
    # with setup_bert_pro512bce(); use the same setup so model and transform
    # match the weights.
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_pro512bce()

with Timer("2 - load BERT model: {}".format(exp_name)):
    model.load_parameters("data/" + exp_name + "/bert.model.checkpoint4.params", ctx=ctx)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
    # No epoch loop in this cell, so no epoch number is reported (a leftover
    # epoch_id from a training cell would be misleading or undefined).
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - {} - artificial evalset".format(exp_name), heatmap=False)

with Timer("9 - store results"):
    col_name = "preds-{}".format(exp_name)
    artificial_evalset_df[col_name] = y_pred
    artificial_evalset_df.to_csv(fn, sep="\t")

In [None]:
# Evaluate the cross-topic 512 epilog (0.1 split) checkpoint on the artificial
# evalset and store its predictions as an additional column of the shared TSV.
fn = "data/artificial_evalset/artificial_evalset.pred.tsv"
exp_name = "cross_traindev_epi512_BCE_0.1"
run_name = "artificial_evalset_" + exp_name

# Idempotent replacement for `! mkdir`, which fails when the directory exists.
# NOTE(review): the directory name lacks the "_0.1" suffix of exp_name; kept
# as in the original -- confirm which name is intended.
os.makedirs("data/artificial_evalset/cross_traindev_epi512_BCE", exist_ok=True)

# Continue from the predictions file if earlier experiments already wrote one;
# otherwise artificial_evalset_df must already be defined by an earlier cell.
if os.path.exists(fn):
    # read_csv with index_col=0, parse_dates=True is the documented equivalent
    # of the deprecated (and since removed) DataFrame.from_csv.
    artificial_evalset_df = pd.read_csv(fn, sep='\t', index_col=0, parse_dates=True)

with Timer("1 - load artificial test"):
    X_dev, y_dev = artificial_evalset_df[names_columns_X2], artificial_evalset_df[names_columns_y]

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi512bce()

with Timer("2 - load BERT model: {}".format(exp_name)):
    model.load_parameters("data/" + exp_name + "/bert.model.checkpoint4.params", ctx=ctx)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - {} - artificial evalset".format(exp_name), heatmap=False)

with Timer("9 - store results"):
    col_name = "preds-{}".format(exp_name)
    artificial_evalset_df[col_name] = y_pred
    artificial_evalset_df.to_csv(fn, sep="\t")

In [None]:
# Quick check: number of predictions held in y_pred (set by the most recently
# executed evaluation cell).
len(y_pred)

In [None]:
# Evaluate the within-topic 128 epilog checkpoint on the artificial evalset and
# store its predictions as an additional column of the shared predictions TSV.
fn = "data/artificial_evalset/artificial_evalset.pred.tsv"
exp_name = "within_traindev_epi128_BCE"
run_name = "artificial_evalset_" + exp_name

# Idempotent replacement for `! mkdir`, which fails when the directory exists.
os.makedirs("data/artificial_evalset/" + exp_name, exist_ok=True)

# Continue from the predictions file if earlier experiments already wrote one;
# otherwise artificial_evalset_df must already be defined by an earlier cell.
if os.path.exists(fn):
    # read_csv with index_col=0, parse_dates=True is the documented equivalent
    # of the deprecated (and since removed) DataFrame.from_csv.
    artificial_evalset_df = pd.read_csv(fn, sep='\t', index_col=0, parse_dates=True)

with Timer("1 - load artificial test"):
    X_dev, y_dev = artificial_evalset_df[names_columns_X2], artificial_evalset_df[names_columns_y]

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi128bce()

with Timer("2 - load BERT model: {}".format(exp_name)):
    model.load_parameters("data/" + exp_name + "/bert.model.checkpoint4.params", ctx=ctx)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - {} - artificial evalset".format(exp_name), heatmap=False)

with Timer("9 - store results"):
    col_name = "preds-{}".format(exp_name)
    artificial_evalset_df[col_name] = y_pred
    artificial_evalset_df.to_csv(fn, sep="\t")

In [None]:
# Evaluate the cross-topic 128 epilog checkpoint on the artificial evalset and
# store its predictions as an additional column of the shared predictions TSV.
fn = "data/artificial_evalset/artificial_evalset.pred.tsv"
exp_name = "cross_traindev_epi128_BCE"
run_name = "artificial_evalset_" + exp_name

# Idempotent replacement for `! mkdir`, which fails when the directory exists.
os.makedirs("data/artificial_evalset/" + exp_name, exist_ok=True)

# Continue from the predictions file if earlier experiments already wrote one;
# otherwise artificial_evalset_df must already be defined by an earlier cell.
if os.path.exists(fn):
    # read_csv with index_col=0, parse_dates=True is the documented equivalent
    # of the deprecated (and since removed) DataFrame.from_csv.
    artificial_evalset_df = pd.read_csv(fn, sep='\t', index_col=0, parse_dates=True)

with Timer("1 - load artificial test"):
    X_dev, y_dev = artificial_evalset_df[names_columns_X2], artificial_evalset_df[names_columns_y]

with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert_epi128bce()

with Timer("2 - load BERT model: {}".format(exp_name)):
    model.load_parameters("data/" + exp_name + "/bert.model.checkpoint4.params", ctx=ctx)

with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)

with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - {} - artificial evalset".format(exp_name), heatmap=False)

with Timer("9 - store results"):
    col_name = "preds-{}".format(exp_name)
    artificial_evalset_df[col_name] = y_pred
    artificial_evalset_df.to_csv(fn, sep="\t")

In [None]:
def dump_art_eval_results(artificial_evalset_df):
    """Print per-criterion metrics for every prediction column of the evalset.

    For each column whose name starts with "preds-" (the remainder of the name
    is printed as the model name), the frame is grouped by its "type" column
    and, per group, a confusion matrix of "is_same_side" against the
    predictions is passed to compute_metrics().

    Args:
        artificial_evalset_df: DataFrame with the columns "type",
            "is_same_side" and one or more "preds-<model>" columns.

    Returns:
        None. All results are printed.
    """
    pred_prefix = "preds-"
    cols = [c for c in artificial_evalset_df.columns.tolist() if c.startswith(pred_prefix)]

    for col in cols:
        model_name = col[len(pred_prefix):]
        print("Model:", model_name, "\n")

        for crit, crit_df in artificial_evalset_df.groupby("type"):
            crit_df = crit_df[["is_same_side", col]].astype({"is_same_side": "int32"})
            labels = crit_df["is_same_side"].values
            preds = crit_df[col].values

            if "NEG" in crit:
                # Swap 0 <-> 1 in labels and predictions for "NEG" criteria
                # before building the confusion matrix.
                labels = [abs(v - 1) for v in labels]
                preds = [abs(v - 1) for v in preds]

            # Pin the label set so the matrix is always 2x2; without it a group
            # in which labels and predictions contain only one class yields a
            # 1x1 matrix. Assumes binary 0/1 values -- TODO confirm.
            conf_mat = confusion_matrix(labels, preds, labels=[0, 1])
            print("Criterion:", crit)
            compute_metrics(conf_mat)
            print()

        print("\n")

In [None]:
# Print the per-model, per-criterion metrics for all stored prediction columns.
dump_art_eval_results(artificial_evalset_df)

In [None]:
# Iterator over (type, sub-frame) groups; the next cell consumes one group per
# execution via next(gi).
gi = iter(artificial_evalset_df.groupby("type"))

In [None]:
# Take the next group from the iterator and show the raw confusion-matrix
# counts for one fixed prediction column. Each execution advances `gi`.
crit, df = next(gi)
print(crit)
df = df[["is_same_side", "preds-cross_traindev_epi512_BCE_0.1"]].astype({"is_same_side": "int32"})
# From here on `df` is a (labels, predictions) tuple, no longer a DataFrame.
df = df["is_same_side"].values, df["preds-cross_traindev_epi512_BCE_0.1"].values
# Unpacking into four names requires a 2x2 matrix.
# NOTE(review): `fn` (false negatives) rebinds the name the evaluation cells
# use for the predictions TSV path; those cells reassign it before use.
tn, fp, fn, tp = confusion_matrix(*df).ravel()
tn, fp, fn, tp

In [None]:
# NOTE(review): imported here instead of in the notebook's import cell.
from sklearn.metrics import precision_score, recall_score

# Cross-check: sklearn's precision and recall next to the same quantities
# computed by hand from the counts of the previous cell (`df` is the
# (labels, predictions) tuple defined there).
precision_score(*df), recall_score(*df), tp / (tp + fp), tp / (tp + fn)

---

### (A.1) Within topic - Training and evaluating model 

In [None]:
#within_traindev_df = within_traindev_df[:1000]

In [None]:
# 1. Getting train and dev data
with Timer("1 - test/train split"):
    # X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)
    X_train, X_dev, y_train, y_dev = load_distinct_data("within")

    # X_abortion, X_gay_marriage, y_abortion, y_gay_marriage = split_within_by_topic(within_traindev_df)

In [None]:
# 2. setup
with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
# print(model)

In [None]:
with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    print_infos(vocabulary, data_train_raw, data_train)

In [None]:
run_name = "within_traindev_epi512_BCE_0.1"

In [None]:
# Create the output directory for this run; derived from run_name so the two
# cannot drift apart, and exist_ok keeps the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

In [None]:
with Timer("4 - train model"), SummaryWriter(logdir="data/" + run_name, flush_secs=600) as sw:
    stats = train(model, data_train, ctx, metric, loss_function, batch_size=2, lr=5e-6, num_epochs=5, sw=sw, checkpoint_dir="data/" + run_name)
    model.save_parameters("data/" + run_name + "/bert.model.params")

    plot_train_stats(stats)

In [None]:
with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # bert.model.checkpoint4.params
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - BCE epilog 0.1 split", heatmap=False)

### Train and evaluate each epoch

In [None]:
# Alternate training and dev-set evaluation five times, printing a report after each round.
# NOTE(review): `model` is not re-created inside the loop while num_epochs grows
# (epoch_id + 1). If train() runs `num_epochs` full passes per call, the rounds add
# up to 1+2+3+4+5 = 15 passes rather than 5 — confirm against train().
for epoch_id in range(5):
    with Timer("4 - train model - {}".format(epoch_id)):
        # stats = train(model, data_train, ctx, metric, loss_function, batch_size=32, lr=5e-6, num_epochs=epoch_id + 1)
        stats = train(model, data_train, ctx, metric, loss_function, batch_size=2, lr=5e-6, num_epochs=epoch_id + 1)  # seq_len: 512
        # stats = train_multi(model, data_train, ctx, metric, loss_function, batch_size=4, lr=5e-6, num_epochs=epoch_id + 1)  # seq_len: 512
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)):
        # all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function)
        all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2)  # seq_len: 512
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])
        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true, y_pred, name="BERTClassifier - last part", heatmap=False)

    # same target file in every round: only the most recent parameters survive
    model.save_parameters("data/bert.model.params")

### (A.2) Cross topic - Training and evaluating model 

In [None]:
# 1. Getting train and dev data
with Timer("1 - test/train split"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df, ratio=0.1)
    # X_train, X_dev, y_train, y_dev = load_distinct_data("cross")

    X_abortion, X_gay_marriage, y_abortion, y_gay_marriage = split_within_by_topic(within_traindev_df)
    
    # cross:  abortion
    # within: abortion + gay marriage

In [None]:
# 2. setup
with Timer("2 - setup BERT model"):
    model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
with Timer("3 - prepare training data"):
    data_train_raw, data_train = transform_dataset(X_train, y_train, transform)
    print_infos(vocabulary, data_train_raw, data_train)

In [None]:
run_name = "cross_traindev_epi512_BCE_0.1"

In [None]:
# Create the output directory for this run; derived from run_name so the two
# cannot drift apart, and exist_ok keeps the cell re-runnable.
os.makedirs("data/" + run_name, exist_ok=True)

In [None]:
with Timer("4 - train model"), SummaryWriter(logdir="data/" + run_name, flush_secs=600) as sw:
    stats = train(model, data_train, ctx, metric, loss_function, batch_size=6, lr=5e-6, num_epochs=5, sw=sw, checkpoint_dir="data/" + run_name)
    model.save_parameters("data/" + run_name + "/bert.model.params")

    plot_train_stats(stats)

In [None]:
with Timer("5 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("6 - evaluate"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # bert.model.checkpoint4.params
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=6, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - cross BCE epilog 0.1 split", heatmap=False)

### (A.3) cross with distinct topics from within

In [None]:
with Timer("7 - prepare eval data - for within foreign topic"):
    data_dev_raw, data_dev = transform_dataset(X_gay_marriage, y_gay_marriage, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("8 - evaluate - within foreign topic"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # bert.model.checkpoint4.params
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - within foreign topic BCE epilog 0.1 split", heatmap=False)

In [None]:
with Timer("7 - prepare eval data - for within same topic"):
    data_dev_raw, data_dev = transform_dataset(X_abortion, y_abortion, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("8 - evaluate - within same topic"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # bert.model.checkpoint4.params
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - within same topic BCE epilog 0.1 split", heatmap=False)

In [None]:
with Timer("1 - test/train split (within)"):
    _, X_dev_within, _, y_dev_within = get_train_test_sets(within_traindev_df, ratio=0.1)

with Timer("7 - prepare eval data - for within both topics (0.1 split)"):
    data_dev_raw, data_dev = transform_dataset(X_dev_within, y_dev_within, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("8 - evaluate - within both topics"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # bert.model.checkpoint4.params
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - within both topics topic BCE epilog 0.1 split", heatmap=False)

In [None]:
with Timer("1 - test/train split (within)"):
    _, X_dev_within, _, y_dev_within = get_train_test_sets(within_traindev_df, ratio=0.3)

with Timer("7 - prepare eval data - for within both topics (0.3 split)"):
    data_dev_raw, data_dev = transform_dataset(X_dev_within, y_dev_within, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("8 - evaluate - within both topics (0.3 split)"), SummaryWriter(logdir="data/" + run_name, flush_secs=60) as sw:
    # model.load_parameters("data/" + run_name + "/bert.model.params", ctx=ctx)
    # bert.model.checkpoint4.params
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2, sw=sw)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier - within both topics topic (0.3 within) BCE epilog 0.1 split", heatmap=False)

.

In [None]:
# Alternate training and evaluation five times, printing a report after each round.
# NOTE(review): `model` is not re-created inside the loop while num_epochs grows
# (epoch_id + 1). If train() runs `num_epochs` full passes per call, the rounds add
# up to 1+2+3+4+5 = 15 passes rather than 5 — confirm against train().
# NOTE(review): data_train / data_dev are whatever earlier cells assigned last; when
# the notebook is run top to bottom, data_dev is the "within" 0.3-split eval set at
# this point — confirm that this is the intended evaluation data.
# NOTE(review): the checkpoint/parameter directory below has no "_0.1" suffix, unlike
# the run_name used by the cells of section (A.2) — confirm which run this belongs to.
for epoch_id in range(5):
    with Timer("4 - train model - {}".format(epoch_id)):
        stats = train(model,
                      data_train,
                      ctx,
                      metric,
                      loss_function,
                      batch_size=2,
                      lr=5e-6,
                      num_epochs=epoch_id + 1,
                      checkpoint_dir='data/cross_traindev_epi512_BCE')
        plot_train_stats(stats)

    with Timer("6 - evaluate - {}".format(epoch_id)):
        all_predictions, cum_loss = predict(model,
                                            data_dev,
                                            ctx,
                                            metric,
                                            loss_function,
                                            batch_size=2)
        print("Accuracy in epoch {}:".format(epoch_id), metric.get()[1])
        y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
        report_training_results(y_true,
                                y_pred,
                                name="BERTClassifier",
                                heatmap=False)

    # same target file in every round: only the most recent parameters survive
    model.save_parameters(
        "data/cross_traindev_epi512_BCE/bert.model.params")

In [None]:
with Timer("11 - test/train split"):
    # evaluate on "within" test-data
    _, X_dev, _, y_dev = get_train_test_sets(within_traindev_df)

with Timer("12 - prepare eval data"):
    data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
    print_infos(vocabulary, data_dev_raw, data_dev)

with Timer("13 - evaluate"):
    # model from "cross"
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier cross with within", heatmap=False)

---
---

#### Test Cross-Model with Within-Test

5 epochs of cross

In [None]:
model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
model.load_parameters('data/cross_traindev_epi512_BCE/bert.model.checkpoint4.params', ctx=ctx)

In [None]:
_, X_dev, _, y_dev = get_train_test_sets(within_traindev_df)

data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2)
print("Accuracy:", metric.get()[1])

y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
report_training_results(y_true, y_pred, name="BERTClassifier cross with within", heatmap=False)

```
Time for [prediction]: 0:24:48.940295
Accuracy: 0.8536330916488446
Confusion Matrix:
[[7659 1174]
 [1632 8706]]

Accuracy:  0.85 

Report for [BERTClassifier cross with within]:
              precision    recall  f1-score   support

           0       0.82      0.87      0.85      8833
           1       0.88      0.84      0.86     10338

    accuracy                           0.85     19171
   macro avg       0.85      0.85      0.85     19171
weighted avg       0.85      0.85      0.85     19171
```

#### Test Within-Model with Cross-Test

5 epochs of within

In [None]:
model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
model.load_parameters('data/within_traindev_epi512_BCE/bert.model.checkpoint4.params', ctx=ctx)

In [None]:
_, X_dev, _, y_dev = get_train_test_sets(cross_traindev_df)

data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("evaluate within with cross"):
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier within with cross", heatmap=False)

```
Time for [prediction]: 0:22:17.542674
Accuracy: 0.9379197379197379
Confusion Matrix:
[[8397  539]
 [ 598 8781]]

Accuracy:  0.94 

Report for [BERTClassifier]:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      8936
           1       0.94      0.94      0.94      9379

    accuracy                           0.94     18315
   macro avg       0.94      0.94      0.94     18315
weighted avg       0.94      0.94      0.94     18315

Time for [6 - evaluate]: 0:22:19.841677
```

#### Test Within-Model with Within-Test

5 epochs of within

In [None]:
model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
model.load_parameters('data/within_traindev_epi512_BCE/bert.model.checkpoint4.params', ctx=ctx)

In [None]:
_, X_dev, _, y_dev = get_train_test_sets(within_traindev_df)

data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
with Timer("evaluate within with within"):
    all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2)
    print("Accuracy:", metric.get()[1])

    y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
    report_training_results(y_true, y_pred, name="BERTClassifier within with within", heatmap=False)

```
Time for [prediction]: 0:19:51.733113
Accuracy: 0.9069427781545042
Confusion Matrix:
[[7972  861]
 [ 923 9415]]

Accuracy:  0.91 

Report for [BERTClassifier within with within]:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      8833
           1       0.92      0.91      0.91     10338

    accuracy                           0.91     19171
   macro avg       0.91      0.91      0.91     19171
weighted avg       0.91      0.91      0.91     19171

Time for [evaluate within with cross]: 0:19:52.352049
```

#### Test Cross-Model with Cross-Test

5 epochs of cross

In [None]:
model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
model.load_parameters('data/cross_traindev_epi512_BCE/bert.model.checkpoint4.params', ctx=ctx)

In [None]:
_, X_dev, _, y_dev = get_train_test_sets(cross_traindev_df)

data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2)
print("Accuracy:", metric.get()[1])

y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
report_training_results(y_true, y_pred, name="BERTClassifier cross", heatmap=False)

```
Time for [prediction]: 0:23:28.845010
Accuracy: 0.9197925197925197
Confusion Matrix:
[[8329  607]
 [ 862 8517]]

Accuracy:  0.92 

Report for [BERTClassifier cross]:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      8936
           1       0.93      0.91      0.92      9379

    accuracy                           0.92     18315
   macro avg       0.92      0.92      0.92     18315
weighted avg       0.92      0.92      0.92     18315
```

---
---

#### Details on wrongly classified arguments

within_traindev

In [None]:
model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

model.load_parameters('data/within_traindev_epi512_BCE/bert.model.checkpoint4.params', ctx=ctx)

In [None]:
_, X_dev, _, y_dev = get_train_test_sets(within_traindev_df)

data_dev_raw, data_dev = transform_dataset(X_dev, y_dev, transform)
# print_infos(vocabulary, data_dev_raw, data_dev)

In [None]:
all_predictions, cum_loss = predict(model, data_dev, ctx, metric, loss_function, batch_size=2)
print("Accuracy:", metric.get()[1])

y_true, y_pred = predict_out_to_ys(all_predictions, all_labels)
report_training_results(y_true, y_pred, name="BERTClassifier within-within", heatmap=False)

In [None]:
# Predictions as a one-column boolean frame; y_pred carries no ids, so the
# frame gets a default positional index.
dev_pred_df = pd.DataFrame(data=y_pred, columns=["prediction"], dtype="bool")

# Inputs + gold labels + predictions in one frame: join X/y, switch to a
# positional index to attach the predictions, restore 'id' as index and
# finally re-apply the tag value row by row.
dev_df = (
    X_dev.join(y_dev)
    .reset_index()
    .merge(dev_pred_df, left_index=True, right_index=True, how='inner')
    .set_index('id')
    .progress_apply(add_tag, axis=1)
)
# info
dev_df.info()

In [None]:
import pickle


dev_df_ser_file = "data/within_traindev_epi512_BCE/eval_dev_df.pickle"


with open(dev_df_ser_file, "wb") as f:
    pickle.dump(dev_df, f, protocol=pickle.HIGHEST_PROTOCOL)

with open(dev_df_ser_file, "rb") as f:
    dev_df = pickle.load(f)


dev_df.info()

In [None]:
FPFN_df = dev_df[(dev_df['is_same_side'] != dev_df['prediction'])]  #  and (dev_df['tag'] != 'abortion')
FPFN_df.info()
FPFN_df.head()

In [None]:
import html
import re

from IPython.display import HTML, display
#import tabulate
#display(HTML(tabulate.tabulate(table, tablefmt='html')))


def print_args(df, idx, add_linebreaks=True):
    """Show one argument pair of ``df`` side by side as an HTML table.

    Prints the row's position, ``tag``, ``topic`` and ``is_same_side`` value as
    plain text and then renders ``argument1`` / ``argument2`` in a two-column
    table.

    Args:
        df: DataFrame with the columns ``tag``, ``topic``, ``is_same_side``,
            ``argument1`` and ``argument2``.
        idx: Positional row index (used with ``df.iloc``).
        add_linebreaks: If true, insert a ``<br/>`` after every run of
            sentence-ending punctuation (``.``, ``?``, ``!``, ``:``),
            including a directly following double quote.

    Raises:
        IndexError: If ``idx`` is out of bounds for ``df``.
    """
    row = df.iloc[idx]
    print('IDX: {}, tag: {}, topics: {}'.format(idx, row['tag'], row['topic']))
    print('Is-Same-Side: {}'.format(row['is_same_side']))

    # The arguments are free text: escape &, < and > so they are displayed
    # verbatim instead of being interpreted as markup. Quotes stay untouched
    # (quote=False) - they are harmless in element content and the sentence
    # pattern below has to see them.
    arg1 = html.escape(str(row['argument1']), quote=False)
    arg2 = html.escape(str(row['argument2']), quote=False)
    if add_linebreaks:
        # punctuation run, optionally followed by a closing double quote
        pat = re.compile(r'([.?!:]+"?)')
        arg1 = pat.sub(r'\1<br/>', arg1)
        arg2 = pat.sub(r'\1<br/>', arg2)

    display(HTML('''<table>
        <tr>
            <td style="border-right:1px dashed black;">{arg1}</td>
            <td>{arg2}</td>
        </tr>
    </table>'''.format(arg1=arg1, arg2=arg2)))

In [None]:
# Show the first ten misclassified pairs (fewer if there are not that many);
# a plain loop, since print_args is called for its output only.
for i in range(min(10, len(FPFN_df))):
    print_args(FPFN_df, i)

In [None]:
# tokenizer from BERT
def tokenize_arguments(row):
    """Add token lists and token-count statistics for both arguments to ``row``.

    Uses the notebook-level ``tokenizer`` and writes six new fields into the
    given row: ``argument{1,2}_tokens``, ``argument{1,2}_len``,
    ``argument12_len_diff`` (signed, argument1 minus argument2) and
    ``argument12_len_diff_abs``.

    Args:
        row: A DataFrame row with ``argument1`` and ``argument2`` entries.

    Returns:
        The same row object, extended by the new fields.
    """
    arg_names = ('argument1', 'argument2')

    # tokens first, counts second - keeps the resulting column order
    for arg in arg_names:
        row[arg + '_tokens'] = tokenizer(row[arg])
    for arg in arg_names:
        row[arg + '_len'] = len(row[arg + '_tokens'])

    # signed and absolute difference in token count
    delta = row['argument1_len'] - row['argument2_len']
    row['argument12_len_diff'] = delta
    row['argument12_len_diff_abs'] = np.abs(delta)
    return row


FPFN_df = FPFN_df.progress_apply(tokenize_arguments, axis=1)
FPFN_df.describe()

In [None]:
FPFN_df.plot()

---

# Make final results/predictions

In [None]:
model, vocabulary, ctx, tokenizer, transform, loss_function, metric, all_labels = setup_bert()

In [None]:
# model.load_parameters('data/within_traindev_epi512_BCE/bert.model.checkpoint4.params', ctx=ctx)
# model.load_parameters('data/cross_traindev_epi512_BCE/bert.model.checkpoint4.params', ctx=ctx)
model.load_parameters('data/within_traindev_epi512_BCE_0.1/bert.model.checkpoint4.params', ctx=ctx)

In [None]:
#X_pred = within_test_df[['argument1', 'argument2', 'topic']]
#X_pred = cross_test_df[['argument1', 'argument2', 'topic']]
X_pred = new_within_test_df[['argument1', 'argument2', 'topic']]
y_pred = None

data_pred_raw, data_pred = transform_dataset(X_pred, y_pred, transform)

In [None]:
# data_pred_raw[0]

In [None]:
# data_pred[0]

In [None]:
# label_map=all_labels
predictions = predict_unknown(model, data_pred, ctx, label_map=None, batch_size=1)

In [None]:
assert len(data_pred) == len(predictions) == len(X_pred)

In [None]:
# Predictions as a one-column boolean frame.
# bool works because we mapped 0 to False, 1 to True, is default conversion
test_pred_df = pd.DataFrame(data=predictions, columns=["prediction"], dtype="bool")

# Attach the predictions to the inputs by position, restore 'id' as index
# and re-apply the tag value row by row.
# test_df = X_pred.join(y_pred)
test_df = (
    X_pred.reset_index()
    .merge(test_pred_df, left_index=True, right_index=True, how='inner')
    .set_index('id')
    .progress_apply(add_tag, axis=1)
)
# info
test_df.info()

In [None]:
import pickle

# ser_fn = "data/within_traindev_epi512_BCE/within_test_pred_df.pickle"
# ser_fn = "data/cross_traindev_epi512_BCE/cross_test_pred_df.pickle"
# ser_fn = "data/cross_traindev_epi512_BCE/within_with_cross_model_test_pred_df.pickle"
# ser_fn = "data/within_traindev_epi512_BCE/cross_with_within_model_test_pred_df.pickle"
# ser_fn = "data/within_traindev_epi512_BCE_0.1/within_test_pred_df.pickle"
# ser_fn = "data/within_traindev_epi512_BCE_0.1/cross_with_within_model_test_pred_df.pickle"
ser_fn = "data/within_traindev_epi512_BCE_0.1/new_within_test_pred_df.pickle"

with open(ser_fn, "wb") as f:
    pickle.dump(test_df, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
next(test_df.itertuples())

In [None]:
# res_fn = "data/within_traindev_epi512_BCE/within_results.csv"
# res_fn = "data/cross_traindev_epi512_BCE/cross_results.csv"
# res_fn = "data/cross_traindev_epi512_BCE/within_with_cross_model_results.csv"
# res_fn = "data/within_traindev_epi512_BCE/cross_with_within_model_results.csv"
# res_fn = "data/within_traindev_epi512_BCE_0.1/within_results.csv"
# res_fn = "data/within_traindev_epi512_BCE_0.1/cross_with_within_model_results.csv"
res_fn = "data/within_traindev_epi512_BCE_0.1/new_within_results.csv"

# Submission file: quoted header, then one `<id>,"<True|False>"` line per row.
with open(res_fn, "w") as of:
    of.write('"id","label"\n')
    of.writelines(
        '{},"{}"\n'.format(row_id, str(prediction))
        for row_id, prediction in zip(test_df.index, test_df['prediction'])
    )

In [None]:
%%bash
cd data/within_traindev_epi512_BCE_0.1/
cp cross_with_within_model_results.csv cross.csv
cp within_results.csv within.csv

In [None]:
%%bash
# stop if the run directory is missing instead of compressing files elsewhere
cd data/within_traindev_epi512_BCE_0.1/ || exit 1
# -f: replace archives left over from a previous run instead of failing
gzip -f cross.csv
gzip -f within.csv

In [None]:
%%bash
# stop if the run directory is missing instead of compressing files elsewhere
cd data/within_traindev_epi512_BCE_0.1/ || exit 1
# -c writes the archive to stdout, so the original CSV is kept (the "test read"
# cell further down still opens it) and a stale archive is simply overwritten
gzip -c new_within_results.csv > new_within_results.csv.gz

**NOTE**: do this for within and cross !!!

In [None]:
# test read
# temp_test_df = pd.read_csv("data/within_traindev_epi512_BCE_0.1/cross_with_within_model_results.csv", index_col='id')
temp_test_df = pd.read_csv("data/within_traindev_epi512_BCE_0.1/new_within_results.csv", index_col='id')
temp_test_df.info()
temp_test_df.iloc[10]