# RATIO 2019 - Benchmarking Workshop

PyTorch + Transformers

```bash
conda activate argmining19-ssc
pip install transformers
pip install future  # for torch.utils.tensorboard
pip install tensorboardX
```

In [1]:
import datetime
import logging
import os
import pickle
import random
import time
import warnings
from functools import partial
from multiprocessing import Pool, cpu_count

import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#from mxnet.gluon.data import Dataset, SimpleDataset

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
# from tensorboardX import SummaryWriter
from transformers import AdamW, WarmupLinearSchedule

from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score)
from sklearn.model_selection import train_test_split
from sklearn import utils
from tqdm import tqdm, trange

I1113 13:30:18.339949 140108908386112 file_utils.py:39] PyTorch version 1.1.0 available.
I1113 13:30:18.394930 140108908386112 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
%matplotlib inline

In [3]:
warnings.filterwarnings('ignore')

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

logger = logging.getLogger("NB: pytorch-BERT")

In [4]:
# set repeatable random state
np.random.seed(100)
random.seed(100)
# https://pytorch.org/docs/stable/notes/randomness.html
_ = torch.manual_seed(0)

In [5]:
# apply progress bars for pandas .apply() -> .progress_apply()
tqdm.pandas()

In [6]:
# make tqdm jupyter friendly
from tqdm import tqdm_notebook as tqdm
# for .progress_apply() we have to hack it like this?
tqdm().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
class Timer:
    def __init__(self, name=None):
        self.name = name

    def __enter__(self):
        self.time_start = time.time()

    def __exit__(self, *exc):
        time_end = time.time()
        time_delta = datetime.timedelta(seconds=(time_end - self.time_start))
        if self.name:
            print(("Time for [{}]: {}".format(self.name, time_delta)))
        else:
            print(("Time: {}".format(time_delta)))

# Task 1 - Same Side Classification

In [8]:
load_new = False
# store tagged data in pickle object

In [9]:
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'
new_within_test = 'data/same-side-classification/within-topic/within_test.csv'

### Load within-topics and cross-topics data

In [10]:
if load_new:
    with Timer("read cross"):
        cross_traindev_df = pd.read_csv(data_cross_path.format('training'),
                                        quotechar='"',
                                        quoting=csv.QUOTE_ALL,
                                        encoding='utf-8',
                                        escapechar='\\',
                                        doublequote=False,
                                        index_col='id')
        cross_test_df = pd.read_csv(data_cross_path.format('test'), index_col='id')

    with Timer("read within"):
        within_traindev_df = pd.read_csv(data_within_path.format('training'),
                                         quotechar='"',
                                         quoting=csv.QUOTE_ALL,
                                         encoding='utf-8',
                                         escapechar='\\',
                                         doublequote=False,
                                         index_col='id')
        # within_test_df = pd.read_csv(data_within_path.format('test'),
        #                              quotechar='"',
        #                              quoting=csv.QUOTE_ALL,
        #                              encoding='utf-8',
        #                              escapechar='\\',
        #                              doublequote=True,  # <-- change, "" as quote escape in text?
        #                              index_col='id')
        within_test_df = pd.read_csv(data_within_path.format('test'), index_col='id')

    with Timer("read new within"):
        new_within_test_df = pd.read_csv(new_within_test, index_col='id')

In [11]:
#! head -n 5 data/same-side-classification/within-topic/test.csv

In [12]:
#! head -n 5 data/same-side-classification/within-topic/within_test.csv

In [13]:
if load_new:
    # Adding a tag for the topics in focus: "gay marriage" and "abortion"
    def add_tag(row):
        title = row['topic'].lower().strip()
        if "abortion" in title:
            row['tag'] = 'abortion'
        elif "gay marriage"  in title:
            row['tag'] = 'gay marriage'
        else:
            row['tag'] = 'NA'
        return row


    with Timer("tag cross traindev"):
        cross_traindev_df = cross_traindev_df.progress_apply(add_tag, axis=1)
    with Timer("tag cross test"):
        cross_test_df = cross_test_df.progress_apply(add_tag, axis=1)

    with Timer("tag within traindev"):
        within_traindev_df = within_traindev_df.progress_apply(add_tag, axis=1)
    with Timer("tag within test"):
        within_test_df = within_test_df.progress_apply(add_tag, axis=1)
    with Timer("tag new within test"):
        new_within_test_df = new_within_test_df.progress_apply(add_tag, axis=1)

### Cache data pre-processing

In [14]:
FN_TAGGED = "data/same-side-classification/tagged_data.pkl"

In [15]:
if load_new:
    with open(FN_TAGGED, "wb") as fp:
        pickle.dump(cross_traindev_df, fp)
        pickle.dump(cross_test_df, fp)
        pickle.dump(within_traindev_df, fp)
        pickle.dump(within_test_df, fp)
        pickle.dump(new_within_test_df, fp)

In [16]:
with open(FN_TAGGED, "rb") as fp:
    cross_traindev_df = pickle.load(fp)
    cross_test_df = pickle.load(fp)
    within_traindev_df = pickle.load(fp)
    within_test_df = pickle.load(fp)
    new_within_test_df = pickle.load(fp)

### Get an overview about each dataset

In [17]:
# requires nltk  wordtokenize
# from nltk.tokenize import sent_tokenize, word_tokenize
# model uses BERT Tokenizer ...

def get_overview(df, task='same-side', class_name='is_same_side'):
    # Total instance numbers
    total = len(df)
    print("Task: ", task)
    print('=' * 40, '\n')

    print('Total instances: ', total)
    print('\n')

    print('For each topic:')
    for tag, tag_df in df.groupby(['tag']):
        print(tag, ': ', len(tag_df), ' instances')
        if class_name in df.columns:
            for is_same_side, side_df in tag_df.groupby([class_name]):
                print('\t\t', is_same_side, ': ', len(side_df), ' instances')
    print('\n')

    if class_name in df.columns:
        print('For each class value:')
        for class_value, class_df in df.groupby([class_name]):
            print(class_value, ': ', len(class_df), ' instances')
        print('\n')

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))
    arguments = df['argument1'].values
    arguments = np.concatenate([arguments, df['argument2'].values])

    print('Unique total arguments:', len(set(list(arguments))), '\n')
    
    return

    print('-' * 40, '\n')

    arguments_length_lst = [
        len(word_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_length_lst.extend(
        [len(word_tokenize(x)) for x in df['argument2'].values])
    print('Words:')
    print('\tshortest argument:', min(arguments_length_lst), ' words')
    print('\tlongest argument:', max(arguments_length_lst), ' words')
    print('\targument average length:', np.mean(arguments_length_lst),
          ' words')

    arguments_sent_length_lst = [
        len(sent_tokenize(x)) for x in df['argument1'].values
    ]
    arguments_sent_length_lst.extend(
        [len(sent_tokenize(x)) for x in df['argument2'].values])
    print('Sentences:')
    print('\tshortest argument:', min(arguments_sent_length_lst), ' sentences')
    print('\tlongest argument:', max(arguments_sent_length_lst), ' sentences')
    print('\targument average length:', np.mean(arguments_sent_length_lst),
          ' sentences')

In [18]:
with Timer("overview cross"):
    get_overview(cross_traindev_df)

Task:  same-side

Total instances:  61048


For each topic:
abortion :  61048  instances
		 False :  29853  instances
		 True :  31195  instances


For each class value:
False :  29853  instances
True :  31195  instances


Unique argument1: 7828
Unique argument2: 7806
Unique total arguments: 9361 

Time for [overview cross]: 0:00:00.188041


In [19]:
with Timer("overview within"):
    get_overview(within_traindev_df)

Task:  same-side

Total instances:  63903


For each topic:
abortion :  40840  instances
		 False :  20006  instances
		 True :  20834  instances
gay marriage :  23063  instances
		 False :  9786  instances
		 True :  13277  instances


For each class value:
False :  29792  instances
True :  34111  instances


Unique argument1: 10508
Unique argument2: 10453
Unique total arguments: 13574 

Time for [overview within]: 0:00:00.233143


##### Count raw length

In [20]:
if False:
    def compute_arg_len(row):
        row['argument1_len'] = len(row['argument1'])
        row['argument2_len'] = len(row['argument2'])
        row['argument12_len_diff'] = row['argument1_len'] - row['argument2_len']
        row['argument12_len_diff_abs'] = np.abs(row['argument12_len_diff'])
        return row


    cross_traindev_df = cross_traindev_df.progress_apply(compute_arg_len, axis=1)
    within_traindev_df = within_traindev_df.progress_apply(compute_arg_len, axis=1)
    cross_test_df = cross_test_df.progress_apply(compute_arg_len, axis=1)
    within_test_df = within_test_df.progress_apply(compute_arg_len, axis=1)

    #cross_traindev_df.describe()
    #within_traindev_df.describe()
    #within_test_df.describe()

##### Tokenize and count tokens

In [21]:
# BERT Tokenizer

# config_class, model_class, tokenizer_class = BertConfig, BertForSequenceClassification, BertTokenizer

if False:
    ctx = mx.cpu()
    _, vocabulary = nlp.model.get_model('bert_12_768_12',
                                        dataset_name='book_corpus_wiki_en_uncased',
                                        pretrained=True, ctx=ctx, use_pooler=True,
                                        use_decoder=False, use_classifier=False)
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    tokenizer = bert_tokenizer

if False:
    from nltk.tokenize import sent_tokenize, word_tokenize
    # nltk.download('punct')


    # tokenizer from BERT
    def tokenize_arguments(row):
        # tokenize
        row['argument1_tokens'] = tokenizer(row['argument1'])
        row['argument2_tokens'] = tokenizer(row['argument2'])

        # count tokens
        row['argument1_len'] = len(row['argument1_tokens'])
        row['argument2_len'] = len(row['argument2_tokens'])
        # token number diff
        row['argument12_len_diff'] = row['argument1_len'] - row['argument2_len']
        row['argument12_len_diff_abs'] = np.abs(row['argument12_len_diff'])
        return row


    cross_traindev_df = cross_traindev_df.progress_apply(tokenize_arguments, axis=1)
    within_traindev_df = within_traindev_df.progress_apply(tokenize_arguments, axis=1)
    cross_test_df = cross_test_df.progress_apply(tokenize_arguments, axis=1)
    within_test_df = within_test_df.progress_apply(tokenize_arguments, axis=1)

    #cross_traindev_df.describe()
    #within_traindev_df.describe()
    #within_test_df.describe()

In [22]:
if False:
    def plot_lengths(df, slicen=None, abs_diff=True, title=None):
        if df is None:
            print("no lengths to plot")
            return

        arg1_lens = df['argument1_len']
        arg2_lens = df['argument2_len']
        arg_diff_len = df['argument12_len_diff']

        if abs_diff:
            arg_diff_len = np.abs(arg_diff_len)

        if slicen is not None:
            arg1_lens = arg1_lens[slicen]
            arg2_lens = arg2_lens[slicen]
            arg_diff_len = arg_diff_len[slicen]

        x = np.arange(len(arg1_lens))  # arange/linspace

        plt.subplot(2, 1, 1)
        plt.plot(x, arg1_lens, label='argument1')  # Linie: '-', 'o-', '.-'
        plt.plot(x, arg2_lens, label='argument2')  # Linie: '-', 'o-', '.-'
        plt.legend()
        plt.title('Lengths of arguments' if not title else title)
        plt.ylabel('Lengths of arguments 1 and 2')

        plt.subplot(2, 1, 2)
        plt.plot(x, arg_diff_len)
        plt.xlabel('Index')
        plt.ylabel('Differences')

        plt.show()


    plot_lengths(within_traindev_df, slice(None, None, 500), title='Length of arguments within train/dev, every 500')
    plot_lengths(cross_traindev_df, slice(None, None, 500), title='Length of arguments cross train/dev, every 500')
    plot_lengths(within_test_df, slice(None, None, 1), title='Length of arguments within test')

## Train model - Baseline

### train dev set - 70% 30%

In [23]:
def get_train_test_sets(df, ratio=0.30, random_state=1):
    X = df[['argument1', 'argument2', 'argument1_id', 'argument2_id', 'topic']]
    y = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=ratio,
                                                        random_state=random_state,
                                                        shuffle=True)
    return X_train, X_test, y_train, y_test

# Transformer

**_Base code from [gh:grenwi](https://github.com/grenwi/argmining19-same-side-classification)_**

### Loss etc.

- [BertForSequenceClassification](https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py#L962)
- [BCEWithLogitsLoss](https://pytorch.org/docs/stable/nn.html#bcewithlogitsloss)
- [transformers GLUE ..](https://github.com/huggingface/transformers/tree/master/examples#glue)
- 

In [24]:
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
from transformers import BertConfig, BertModel, BertPreTrainedModel

# https://huggingface.co/transformers/_modules/transformers/configuration_bert.html


# see: BertForSequenceClassification
class BertForSameSideClassification(BertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
    """

    # configClass = BERTSameSideConfig

    def __init__(self, config):
        super(BertForSameSideClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.loss_kwargs = dict()

        if self.num_labels == 1:
            # regression
            self.loss_cls = MSELoss
            # self.loss_cls = BCEWithLogitsLoss
        else:
            self.loss_cls = CrossEntropyLoss

        self.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None):

        # forward(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None)
        # outputs = self.bert(input_ids,
        #                     attention_mask=attention_mask,
        #                     token_type_ids=token_type_ids,
        #                     position_ids=position_ids,
        #                     head_mask=head_mask,
        #                     inputs_embeds=inputs_embeds)
        # input_embeds only in newer version of transformers>=2.1.1 (in current master but not in pip)
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # add hidden states and attention if they are here
        outputs = (logits, ) + outputs[2:]

        if labels is not None:
            loss_fct = self.loss_cls()

            if self.num_labels == 1:
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))
            outputs = (loss, ) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)


class BertForSameSideBCEClassification(BertPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
    """

    def __init__(self, config):
        super(BertForSameSideBCEClassification, self).__init__(config)
        self.num_labels = 1

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.loss_kwargs = dict()
        self.loss_cls = BCEWithLogitsLoss

        self.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None):

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        outputs = (logits, ) + outputs[2:]

        if labels is not None:
            loss_fct = self.loss_cls()
            loss = loss_fct(logits.view(-1), labels.view(-1))
            outputs = (loss, ) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

### Configs

In [25]:
args = {
    #: model_type: (bert|bert-ss|bert-ss-bce|...)
    'model_type':  'bert-ss-bce',
    'model_name': 'bert-base-uncased',
    #: task_name: (binary|binary-bce)
    'task_name': 'binary-bce',

    #: output dirs
    'data_dir': 'data/transformers/',
    # 'output_dir': 'outputs/transformers/',
    # 'output_dir': 'outputs/transformers/binary-label2-class',
    # 'output_dir': 'outputs/transformers/binary-label1-reg',
    # 'output_dir': 'outputs/transformers/binary-label1-class-bce1',
    'output_dir': 'outputs/transformers/binary-label1-class-bce',
    # 'log_dir': 'logs/transformers/',
    # 'log_dir': 'logs/transformers/binary-label2-class',
    # 'log_dir': 'logs/transformers/binary-label1-reg',
    # 'log_dir': 'logs/transformers/binary-label1-class-bce1',
    'log_dir': 'logs/transformers/binary-label1-class-bce',
    'cache_dir': 'cache/transformers/',

    'do_train': True,
    'do_eval': True,

    'fp16': False,
    'fp16_opt_level': 'O1',

    'max_seq_length': 512,
    #: truncate_end: (True|False) -- truncate longer inputs from start (True) or end (False)
    'truncate_end': False,
    'num_labels': 1,
    # 'num_labels': 2,
    #: output_mode: (regression|classification) -- regression := float, classification := labels (multiple)
    'output_mode': 'regression',
    #: train batch_size: batch/max_seq_len: 6/512, 16/256, 32/128
    'train_batch_size': 6,
    #: eval batch_size can probably be slightly larger?
    'eval_batch_size': 6,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 3,
    'weight_decay': 0,
    'learning_rate': 5e-6,
    'adam_epsilon': 1e-9,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 500,
    'evaluate_during_training': True,
    #: save_steps may need to be larger for smaller batch_sizes
    'save_steps': 1000,
    #: ?
    'eval_all_checkpoints': True,
    'overwrite_output_dir': False,
    #: cache it?
    'reprocess_input_data': False,
    'notes': 'SameSide argument classification task'
}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [26]:
from transformers import (
    WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer,
    XLMConfig, XLMForSequenceClassification, XLMTokenizer, XLNetConfig,
    XLNetForSequenceClassification, XLNetTokenizer, RobertaConfig,
    RobertaForSequenceClassification, RobertaTokenizer)

MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'bert-ss': (BertConfig, BertForSameSideClassification, BertTokenizer),
    'bert-ss-bce': (BertConfig, BertForSameSideBCEClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta':
    (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

In [27]:
config = config_class.from_pretrained(args['model_name'],
                                      num_labels=args['num_labels'],
                                      finetuning_task=args['task_name'])
tokenizer = tokenizer_class.from_pretrained(args['model_name'])

I1113 13:30:27.712184 140108908386112 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ekoerner/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I1113 13:30:27.716675 140108908386112 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "binary-bce",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

I1113 13:

In [28]:
model = model_class.from_pretrained(args['model_name'], num_labels=args['num_labels'])

I1113 13:30:29.048913 140108908386112 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/ekoerner/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I1113 13:30:29.053189 140108908386112 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

I1113 13:30:29.49

In [29]:
model.to(device)

BertForSameSideBCEClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_

### Data handling

In [30]:
from transformers.data import InputExample
# from transformers.data import InputFeatures
from transformers.data import DataProcessor


# TODO: binary? [0, 1] ?
class SameSideProcessor(DataProcessor):
    """Processor for the sameside data set"""

    def __init__(self, trainset, devset):
        self.trainset = trainset
        self.devset = devset

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self.trainset, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self.devset, "dev")

    def get_labels(self):
        """See base class."""
        return [False, True]

    def _create_examples(self, items, set_type):
        """Creates examples for the training and dev sets."""
        examples = []

        for (i, item) in enumerate(items):
            guid = "%s-%s" % (set_type, i)
            text_a = item[0]
            text_b = item[1]
            label = item[2]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples


class SameSideBinaryProcessor(DataProcessor):
    """Processor for the sameside data set, label is binary."""

    def __init__(self, trainset, devset):
        self.trainset = trainset
        self.devset = devset

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self.trainset, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self.devset, "dev")

    def get_labels(self):
        """See base class."""
        return [0, 1]

    def _create_examples(self, items, set_type):
        """Creates examples for the training and dev sets."""
        examples = []

        for (i, item) in enumerate(items):
            guid = "%s-%s" % (set_type, i)
            text_a = item[0]
            text_b = item[1]
            label = 0 if not item[2] else 1
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples


# different names compared to transformers.data.InputFeatures
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [31]:
def convert_example_to_feature(example,
                               label_map,
                               max_seq_length,
                               tokenizer,
                               output_mode,
                               cls_token_at_end,
                               cls_token,
                               sep_token,
                               pad_on_left,
                               pad_token=0,
                               sequence_a_segment_id=0,
                               sequence_b_segment_id=1,
                               cls_token_segment_id=1,
                               pad_token_segment_id=0,
                               mask_padding_with_zero=True,
                               truncate_end=True):

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a,
                           tokens_b,
                           max_seq_length - 3,
                           from_end=truncate_end)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids:   0   0   0   0  0     0   0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = tokens_a + [sep_token]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if tokens_b:
        tokens += tokens_b + [sep_token]
        segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

    if cls_token_at_end:
        tokens = tokens + [cls_token]
        segment_ids = segment_ids + [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_seq_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        input_mask = ([0 if mask_padding_with_zero else 1] *
                      padding_length) + input_mask
        segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        input_mask = input_mask + ([0 if mask_padding_with_zero else 1] *
                                   padding_length)
        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_id)


def convert_examples_to_features(examples,
                                 label_list,
                                 max_seq_length,
                                 tokenizer,
                                 output_mode,
                                 cls_token_at_end=False,
                                 pad_on_left=False,
                                 cls_token='[CLS]',
                                 sep_token='[SEP]',
                                 pad_token=0,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 cls_token_segment_id=1,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True,
                                 truncate_end=True):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    fn_convert = partial(convert_example_to_feature,
                         label_map=label_map,
                         max_seq_length=max_seq_length,
                         tokenizer=tokenizer,
                         output_mode=output_mode,
                         cls_token_at_end=cls_token_at_end,
                         cls_token=cls_token,
                         sep_token=sep_token,
                         pad_on_left=pad_on_left,
                         cls_token_segment_id=cls_token_segment_id,
                         pad_token_segment_id=pad_token_segment_id,
                         truncate_end=truncate_end)

    process_count = cpu_count() - 2

    with Pool(process_count) as p:
        features = list(
            tqdm(p.imap(fn_convert, examples, chunksize=100),
                 total=len(examples)))

    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length, from_end=True):
    """Truncates a sequence pair in place to the maximum length."""

    # from where to truncate (-1 (index) is from end, 0 is from the front)
    pop_pos = -1 if from_end else 0

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop(pop_pos)
        else:
            tokens_b.pop(pop_pos)

In [32]:
processors = {
    "binary": SameSideProcessor,
    "binary-bce": SameSideBinaryProcessor,
}

# not used?
output_modes = {"binary": "classification", "binary-bce": "regression"}

# GLUE_TASKS_NUM_LABELS = {"binary": 2, "binary-bce": 1}

---

In [33]:
# 1. Getting train and dev data
with Timer("1 - test/train split"):
    X_train, X_dev, y_train, y_dev = get_train_test_sets(within_traindev_df, ratio=0.1)

Time for [1 - test/train split]: 0:00:00.012222


In [None]:
def df2ds(X, y):
    """Convert pandas data frames to training data set"""
    # join label to items
    df = X.merge(y, left_index=True, right_index=True)
    # filter neccessary columns
    df = df[["argument1", "argument2", "is_same_side"]]
    # skip id and convert to list
    ds = [i[1:] for i in df.itertuples()]
    return ds


def df2ds_test(X):
    # TODO: or keep id?
    df = df[["argument1", "argument2"]]
    ds = [i[1:] for i in df.itertuples()]
    return ds

In [34]:
with Timer("2 - convert train/dev sets input format"):
    task = args['task_name']

    ds_train = df2ds(X_train, y_train)
    ds_dev = df2ds(X_dev, y_dev)

# processor = processors[task](ds_train, ds_dev)
# label_list = processor.get_labels()
# num_labels = len(label_list)

In [35]:
def load_and_cache_examples(ds_train, ds_dev, args, tokenizer, evaluate=False):
    task = args['task_name']
    processor = processors[task](ds_train, ds_dev)
    output_mode = args['output_mode']

    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(
        args['data_dir'],
        f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}")

    if os.path.exists(cached_features_file) and not args.get(
            'reprocess_input_data', False):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)

    else:
        logger.info("Creating features from dataset file at %s",
                    args['data_dir'])
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(
            args['data_dir']) if evaluate else processor.get_train_examples(
                args['data_dir'])

        features = convert_examples_to_features(
            examples,
            label_list,
            args['max_seq_length'],
            tokenizer,
            output_mode,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            # pad on the left for xlnet
            pad_on_left=bool(args['model_type'] in ['xlnet']),
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0,
            truncate_end=args['truncate_end'])

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset

---

In [36]:
# https://beta.mxnet.io/api/ndarray/_autogen/mxnet.ndarray.sigmoid.html
# https://stackoverflow.com/questions/43024745/applying-a-function-along-a-numpy-array


# def sigmoid(x):
#     return 1 / (1 + np.exp(-x))


from scipy.special import expit as sigmoid

In [37]:
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix, accuracy_score, f1_score
from scipy.stats import pearsonr


def get_mismatched(labels, preds, args, ds_train, ds_dev):
    mismatched = labels != preds
    processor = processors[args['task_name']](ds_train, ds_dev)
    examples = processor.get_dev_examples(args['data_dir'])
    wrong = [i for (i, v) in zip(examples, mismatched) if v]

    return wrong


def get_eval_report(labels, preds, args, ds_train, ds_dev):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='binary')
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "acc": acc,
        "f1": f1
    }, get_mismatched(labels, preds, args, ds_train, ds_dev)


def compute_metrics(preds, labels, args, ds_train, ds_dev):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds, args, ds_train, ds_dev)

In [38]:
def evaluate(model, tokenizer, args, ds_train, ds_dev, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args['output_dir']
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    results = {}
    EVAL_TASK = args['task_name']

    eval_dataset = load_and_cache_examples(ds_train,
                                           ds_dev,
                                           args,
                                           tokenizer,
                                           evaluate=True)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args['eval_batch_size'])

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                # XLM don't use segment_ids
                'token_type_ids':
                batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        
        preds_ = logits.detach().cpu().numpy()
        out_label_ids_ = inputs['labels'].detach().cpu().numpy()
        
        # if args['output_mode'] == "classification" and args['num_labels'] == 1:
        #     # preds_ = preds_.sigmoid().round().astype('int32')
        #     out_label_ids_ = out_label_ids_.astype('float32')
        
        if preds is None:
            preds = preds_
            out_label_ids = out_label_ids_
        else:
            preds = np.append(preds, preds_, axis=0)
            out_label_ids = np.append(out_label_ids, out_label_ids_, axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args['output_mode'] == "classification":
        if args['num_labels'] == 1:
            preds = np.squeeze(preds)
        else:
            preds = np.argmax(preds, axis=1)
    elif args['output_mode'] == "regression":
        preds = np.squeeze(preds)
    
    # TODO: ?
    if args['num_labels'] == 1:
        preds = sigmoid(preds).round().astype('int32')
        out_label_ids = out_label_ids.astype('int32')

    try:
        result, wrong = compute_metrics(preds, out_label_ids, args, ds_train,
                                        ds_dev)
    except:
        result = wrong = None

    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results, wrong

In [39]:
def get_train_output(model, tokenizer, args, ds_train, ds_dev, prefix="", evaluate=True):
    eval_dataset = load_and_cache_examples(ds_train,
                                           ds_dev,
                                           args,
                                           tokenizer,
                                           evaluate=evaluate)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args['eval_batch_size'])

    logger.info("***** Running model output gen {} *****".format(prefix))
    logger.info("  Evaluation mode = %s", evaluate)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0

    preds = None
    out_label_ids = None

    for batch in tqdm(eval_dataloader, desc="Get Model outputs"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                # XLM don't use segment_ids
                'token_type_ids':
                batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        
        preds_ = logits.detach().cpu().numpy()
        out_label_ids_ = inputs['labels'].detach().cpu().numpy()
        
        if preds is None:
            preds = preds_
            out_label_ids = out_label_ids_
        else:
            preds = np.append(preds, preds_, axis=0)
            out_label_ids = np.append(out_label_ids, out_label_ids_, axis=0)

    eval_loss = eval_loss / nb_eval_steps

    if args['output_mode'] == "classification":
        if args['num_labels'] == 1:
            preds = np.squeeze(preds)
        else:
            preds = np.argmax(preds, axis=1)
    elif args['output_mode'] == "regression":
        preds = np.squeeze(preds)
    
    if args['num_labels'] == 1:
        preds = sigmoid(preds).round().astype('int32')
        out_label_ids = out_label_ids.astype('int32')
        
    return preds, out_label_ids

In [40]:
def train(train_dataset, model, tokenizer, args, ds_train=None, ds_dev=None):
    tb_writer = SummaryWriter(args["log_dir"])

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args['train_batch_size'])

    t_total = len(train_dataloader) // args[
        'gradient_accumulation_steps'] * args['num_train_epochs']

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args['weight_decay']
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args['learning_rate'],
                      eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args['warmup_steps'],
                                     t_total=t_total)

    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args['fp16_opt_level'])

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d",
                args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")

    for epoch_nr in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration {}/{}".format(epoch_nr + 1, args['num_train_epochs']))
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids':
                batch[0],
                'attention_mask':
                batch[1],
                # XLM don't use segment_ids
                'token_type_ids':
                batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,
                'labels':
                batch[3]
            }
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]
            # print("\rLoss: %f" % loss, end='')  # has no "real" meaning for me?

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args['max_grad_norm'])

            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args['max_grad_norm'])

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()

                global_step += 1

                if args['logging_steps'] > 0 and global_step % args[
                        'logging_steps'] == 0:
                    # Log metrics
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args['evaluate_during_training']:
                        results, _ = evaluate(model, tokenizer, args, ds_train,
                                              ds_dev)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args['logging_steps'], global_step)
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args[
                        'save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args['output_dir'],
                        'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)

    try:
        tb_writer.close()
    except Exception as ex:
        logger.exception("SummaryWriter.close() error?")

    return global_step, tr_loss / global_step

---

In [None]:
# training
if args['do_train']:
    with Timer("3 - train (fine-tune) model"):
        train_dataset = load_and_cache_examples(ds_train, ds_dev, args, tokenizer)
        global_step, tr_loss = train(train_dataset, model, tokenizer, args, ds_train=ds_train, ds_dev=ds_dev)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

I1113 13:31:30.287596 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_train_bert-base-uncased_512_binary-bce
I1113 13:31:34.473645 140108908386112 <ipython-input-40-6ea5beef75dc>:46] ***** Running training *****
I1113 13:31:34.474561 140108908386112 <ipython-input-40-6ea5beef75dc>:47]   Num examples = 57512
I1113 13:31:34.475141 140108908386112 <ipython-input-40-6ea5beef75dc>:48]   Num Epochs = 3
I1113 13:31:34.475619 140108908386112 <ipython-input-40-6ea5beef75dc>:49]   Total train batch size  = 6
I1113 13:31:34.476155 140108908386112 <ipython-input-40-6ea5beef75dc>:51]   Gradient Accumulation steps = 1
I1113 13:31:34.476631 140108908386112 <ipython-input-40-6ea5beef75dc>:52]   Total optimization steps = 28758
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, description='Iteration 1/3', max=9586, style=ProgressStyle(description_wid…

Loss: 0.715316

I1113 13:34:33.967370 140108908386112 <ipython-input-35-795c8f7a6fbf>:19] Creating features from dataset file at data/transformers/


HBox(children=(IntProgress(value=0, max=6391), HTML(value='')))

I1113 13:34:38.966183 140108908386112 <ipython-input-35-795c8f7a6fbf>:42] Saving features into cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 13:34:41.675171 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 13:34:41.676196 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 13:34:41.676793 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 13:36:48.761387 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 13:36:48.762162 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.5758097324362385
I1113 13:36:48.762715 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.6160600481518199
I1113 13:36:48.763219 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1257
I1113 13:36:48.763686 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 1454
I1113 13:36:48.764185 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.14329328414893258
I1113 13:36:48.764647 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 1505
I1113 13:36:48.765102 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2175


Loss: 0.683322

I1113 13:39:54.404642 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 13:39:54.858559 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 13:39:54.859515 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 13:39:54.860332 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 13:42:02.216109 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 13:42:02.216779 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.6153966515412298
I1113 13:42:02.217303 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.6008444300097434
I1113 13:42:02.217789 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1582
I1113 13:42:02.218256 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 876
I1113 13:42:02.218717 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.2449906396017346
I1113 13:42:02.219170 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2083
I1113 13:42:02.219630 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 1850
I1113 13:42:02.221834 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-1000/config.json
I1113 13:42:02.515362 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tr

Loss: 0.672648

I1113 13:45:08.168343 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 13:45:08.502732 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 13:45:08.503665 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 13:45:08.504275 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 13:47:15.895884 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 13:47:15.896549 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.6490377092786731
I1113 13:47:15.897099 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.6033598585322724
I1113 13:47:15.897619 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1726
I1113 13:47:15.898122 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 517
I1113 13:47:15.898627 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.3374896793376939
I1113 13:47:15.899128 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2442
I1113 13:47:15.899634 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 1706


Loss: 0.455224

I1113 13:50:20.531043 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 13:50:20.939995 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 13:50:20.940941 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 13:50:20.941516 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 13:52:28.349564 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 13:52:28.349957 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.6634329525895791
I1113 13:52:28.350287 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.6465078060805258
I1113 13:52:28.350544 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1465
I1113 13:52:28.350802 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 686
I1113 13:52:28.351059 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.34537769598448387
I1113 13:52:28.351308 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2273
I1113 13:52:28.351567 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 1967
I1113 13:52:28.353807 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-2000/config.json
I1113 13:52:28.623474 140108908386112 modeling_utils.py:205] Model weights saved in outputs/t

Loss: 0.437874

I1113 13:55:34.327886 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 13:55:34.640254 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 13:55:34.641193 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 13:55:34.641788 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 13:57:42.396915 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 13:57:42.397838 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.6851822875919261
I1113 13:57:42.398407 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.664218958611482
I1113 13:57:42.398894 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1442
I1113 13:57:42.399367 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 570
I1113 13:57:42.399924 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.3940125949639098
I1113 13:57:42.400444 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2389
I1113 13:57:42.400894 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 1990


Loss: 0.332361

I1113 14:00:48.460575 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:00:48.779737 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:00:48.780706 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:00:48.781364 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:02:56.463873 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:02:56.464542 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.7122516038178689
I1113 14:02:56.465055 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.6634949679780421
I1113 14:02:56.465536 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1619
I1113 14:02:56.465999 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 220
I1113 14:02:56.466457 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.4859676503154316
I1113 14:02:56.466907 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2739
I1113 14:02:56.467357 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 1813
I1113 14:02:56.470917 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-3000/config.json
I1113 14:02:56.741516 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tr

Loss: 0.933976

I1113 14:06:03.093278 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:06:03.556858 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:06:03.557826 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:06:03.558418 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:08:11.848077 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:08:11.848754 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.745736191519324
I1113 14:08:11.849254 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.7319808675573148
I1113 14:08:11.849734 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1213
I1113 14:08:11.850200 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 412
I1113 14:08:11.850806 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.5140182096609405
I1113 14:08:11.851485 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2547
I1113 14:08:11.852203 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2219


Loss: 0.129133

I1113 14:11:17.880892 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:11:18.192754 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:11:18.193705 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:11:18.194291 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:13:26.021227 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:13:26.022006 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.7635737756219684
I1113 14:13:26.022550 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.7301303804250758
I1113 14:13:26.023038 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1388
I1113 14:13:26.023501 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 123
I1113 14:13:26.023991 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.5835348885290152
I1113 14:13:26.024448 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2836
I1113 14:13:26.024906 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2044
I1113 14:13:26.026957 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-4000/config.json
I1113 14:13:26.301554 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tr

Loss: 0.557779

I1113 14:16:29.310515 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:16:29.832846 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:16:29.833846 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:16:29.834431 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:18:37.205120 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:18:37.205505 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.7749960882491003
I1113 14:18:37.205813 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.7764303482587066
I1113 14:18:37.206076 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 935
I1113 14:18:37.206325 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 503
I1113 14:18:37.206579 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.5570886391720437
I1113 14:18:37.206821 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2456
I1113 14:18:37.207074 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2497


Loss: 0.751649

I1113 14:21:43.216516 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:21:43.529770 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:21:43.530723 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:21:43.531302 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:23:51.389078 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:23:51.389734 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.791894852135816
I1113 14:23:51.390305 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.7696570834776585
I1113 14:23:51.390797 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1210
I1113 14:23:51.391254 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 120
I1113 14:23:51.391705 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6280334838532095
I1113 14:23:51.392198 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2839
I1113 14:23:51.392649 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2222
I1113 14:23:51.394425 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-5000/config.json
I1113 14:23:51.676715 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tra

Loss: 0.544017

I1113 14:26:57.563247 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:26:58.009670 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:26:58.010641 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:26:58.011250 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:29:05.805401 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:29:05.806227 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.7967454232514474
I1113 14:29:05.806791 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.7800914169629254
I1113 14:29:05.807369 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1128
I1113 14:29:05.807887 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 171
I1113 14:29:05.808360 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.62802837545638
I1113 14:29:05.808825 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2788
I1113 14:29:05.809292 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2304


Loss: 0.099387

I1113 14:32:11.864367 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:32:12.186829 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:32:12.187832 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:32:12.188435 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:34:19.907371 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:34:19.908097 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.8097324362384604
I1113 14:34:19.908615 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8017606781871537
I1113 14:34:19.909089 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 973
I1113 14:34:19.909547 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 243
I1113 14:34:19.909998 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.640311671928455
I1113 14:34:19.910552 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2716
I1113 14:34:19.911040 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2459
I1113 14:34:19.913235 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-6000/config.json
I1113 14:34:20.188498 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tran

Loss: 0.144542

I1113 14:37:25.765193 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:37:26.198387 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:37:26.199345 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:37:26.199990 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:39:33.839329 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:39:33.840025 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.809263026130496
I1113 14:39:33.840650 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.7994076024354123
I1113 14:39:33.841128 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 1003
I1113 14:39:33.841592 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 216
I1113 14:39:33.842050 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6426198320662452
I1113 14:39:33.842503 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2743
I1113 14:39:33.843018 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2429


Loss: 0.956015

I1113 14:42:36.739593 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:42:37.048944 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:42:37.049927 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:42:37.050521 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:44:44.704776 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:44:44.705450 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.8170865279299014
I1113 14:44:44.706021 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8121484814398201
I1113 14:44:44.706511 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 905
I1113 14:44:44.706969 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 264
I1113 14:44:44.707424 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6505444524352674
I1113 14:44:44.707913 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2695
I1113 14:44:44.708376 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2527
I1113 14:44:44.726054 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-7000/config.json
I1113 14:44:45.004356 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tra

Loss: 0.680116

I1113 14:47:50.749013 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:47:51.185937 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:47:51.186916 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:47:51.187487 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:49:58.832590 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:49:58.833036 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.8203723986856517
I1113 14:49:58.833343 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8391255605381167
I1113 14:49:58.833606 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 438
I1113 14:49:58.833858 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 710
I1113 14:49:58.834136 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6388375371008707
I1113 14:49:58.834397 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2249
I1113 14:49:58.834656 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2994


Loss: 0.724564

I1113 14:53:04.610419 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:53:04.916894 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:53:04.917851 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:53:04.918436 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 14:55:12.699097 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 14:55:12.699748 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.820841808793616
I1113 14:55:12.700262 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8099585062240663
I1113 14:55:12.700738 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 992
I1113 14:55:12.701195 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 153
I1113 14:55:12.701649 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6694480770166332
I1113 14:55:12.702163 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2806
I1113 14:55:12.702630 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2440
I1113 14:55:12.704645 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-8000/config.json
I1113 14:55:12.992322 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tran

Loss: 0.297352

I1113 14:58:19.013619 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 14:58:19.442545 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 14:58:19.443472 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 14:58:19.444180 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 15:00:27.203311 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 15:00:27.204005 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.830855891096855
I1113 15:00:27.204520 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8328436678521726
I1113 15:00:27.205000 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 739
I1113 15:00:27.205464 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 342
I1113 15:00:27.205920 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6681023210384485
I1113 15:00:27.206370 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2617
I1113 15:00:27.206819 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2693


Loss: 0.897775

I1113 15:03:30.032196 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 15:03:30.371969 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 15:03:30.372946 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 15:03:30.373524 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 15:05:38.406175 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 15:05:38.406920 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.8336723517446409
I1113 15:05:38.407479 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8413669601552006
I1113 15:05:38.408211 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 613
I1113 15:05:38.408900 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 450
I1113 15:05:38.409545 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6676495796281259
I1113 15:05:38.410104 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2509
I1113 15:05:38.410595 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2819
I1113 15:05:38.412705 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-9000/config.json
I1113 15:05:38.691860 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tra

Loss: 0.199937

I1113 15:08:43.750803 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 15:08:44.188302 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 15:08:44.189280 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 15:08:44.189868 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 15:10:52.379576 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 15:10:52.380443 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.8305429510248787
I1113 15:10:52.380985 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8379955123410621
I1113 15:10:52.381477 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 631
I1113 15:10:52.382013 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 452
I1113 15:10:52.382489 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6616756537430034
I1113 15:10:52.382945 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2507
I1113 15:10:52.383394 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2801


Loss: 0.011357

Epoch:  33%|███▎      | 1/3 [1:39:50<3:19:40, 5990.06s/it]

Loss: 0.002953


HBox(children=(IntProgress(value=0, description='Iteration 2/3', max=9586, style=ProgressStyle(description_wid…

Loss: 0.789869

I1113 15:13:58.778885 140108908386112 <ipython-input-35-795c8f7a6fbf>:14] Loading features from cached file data/transformers/cached_dev_bert-base-uncased_512_binary-bce
I1113 15:13:59.167805 140108908386112 <ipython-input-38-81483096b16f>:22] ***** Running evaluation  *****
I1113 15:13:59.168775 140108908386112 <ipython-input-38-81483096b16f>:23]   Num examples = 6391
I1113 15:13:59.169410 140108908386112 <ipython-input-38-81483096b16f>:24]   Batch size = 6


HBox(children=(IntProgress(value=0, description='Evaluating', max=1066, style=ProgressStyle(description_width=…

I1113 15:16:07.525209 140108908386112 <ipython-input-38-81483096b16f>:90] ***** Eval results  *****
I1113 15:16:07.526284 140108908386112 <ipython-input-38-81483096b16f>:92]   acc = 0.8363323423564387
I1113 15:16:07.527038 140108908386112 <ipython-input-38-81483096b16f>:92]   f1 = 0.8412264723740133
I1113 15:16:07.527717 140108908386112 <ipython-input-38-81483096b16f>:92]   fn = 661
I1113 15:16:07.528382 140108908386112 <ipython-input-38-81483096b16f>:92]   fp = 385
I1113 15:16:07.529030 140108908386112 <ipython-input-38-81483096b16f>:92]   mcc = 0.6754835332248185
I1113 15:16:07.529674 140108908386112 <ipython-input-38-81483096b16f>:92]   tn = 2574
I1113 15:16:07.530289 140108908386112 <ipython-input-38-81483096b16f>:92]   tp = 2771
I1113 15:16:07.544936 140108908386112 configuration_utils.py:71] Configuration saved in outputs/transformers/binary-label1-class-bce/checkpoint-10000/config.json
I1113 15:16:07.838395 140108908386112 modeling_utils.py:205] Model weights saved in outputs/tr

Loss: 0.144899

In [None]:
# saving
if args['do_train']:
    if not os.path.exists(args['output_dir']):
        os.makedirs(args['output_dir'])
    
    logger.info("Saving model checkpoint to %s", args['output_dir'])

    model_to_save = model.module if hasattr(
        model,
        'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(args['output_dir'])
    tokenizer.save_pretrained(args['output_dir'])
    torch.save(args, os.path.join(args['output_dir'], 'training_args.bin'))

In [None]:
with open(os.path.join(args['output_dir'], 'done.flag'), "w") as fp:
    fp.write("Done.")

In [None]:
results, wrong = evaluate(model, tokenizer, args, ds_train, ds_dev)

In [None]:
preds, out_label_ids = get_train_output(model, tokenizer, args, ds_train, ds_dev, prefix="", evaluate=True)

In [None]:
np.unique(preds, return_counts=True)

In [None]:
labels = out_label_ids
confusion_matrix(labels, preds).ravel()

In [None]:
vals = np.array([-1.0, -0.1, 0.1, 0.4, 0.5, 0.6, 1.0, 2.0])
vals_s = sigmoid(vals)
vals_s_r = vals_s.round()

vals, vals_s, vals_s_r

In [None]:
jfkdöljfkdlasöjfkld
# abort here

---

_may need to see how to adapt MxNet structure to PyTorch workflow?_

In [None]:
class MyBERTDataset(SimpleDataset):
    def __init__(self, X, y=None):
        self._X = X
        self._y = y
        super(MyBERTDataset, self).__init__(self._convert())

    def _convert(self):
        allsamples = list()

        if self._y is not None:
            df = self._X.merge(self._y, left_index=True, right_index=True)
            for _, row in df.iterrows():
                # allsamples.append([
                #     row['argument1'], row['argument2'],
                #     "1" if str(row['is_same_side']) == "True" else "0"
                # ])
                allsamples.append([
                    row['argument1'], row['argument2'],
                    1 if str(row['is_same_side']) == "True" else 0
                ])

        else:
            for _, row in self._X.iterrows():
                allsamples.append([row['argument1'], row['argument2'], None])

        return allsamples

###### my own `BERTDatasetTransform` for extracting chunks from arguments or last part etc.

```python
transform = dataset.BERTDatasetTransform(bert_tokenizer, 512,
                                         labels=['0', '1'],
                                         label_dtype='int32',
                                         pad=True,
                                         pair=True)
```

http://localhost:9001/edit/bert/dataset.py @454
```python
# substitute with my own (e. g. last part, many parts etc.)
def __init__(...):
    self._bert_xform = BERTSentenceTransform(tokenizer, max_seq_length, pad=pad, pair=pair)
```
https://gluon-nlp.mxnet.io/master/_modules/gluonnlp/data/transforms.html#BERTSentenceTransform
```python
# substitute with my own (e. g. only last part (trim from start))
self._truncate_seq_pair(tokens_a, tokens_b, self._max_seq_length - 3)
```

https://mxnet.incubator.apache.org/_modules/mxnet/gluon/data/dataset.html#Dataset.transform

In [None]:
from gluonnlp.data import BERTSentenceTransform


class FirstAndLastPartBERTSentenceTransform(BERTSentenceTransform):
    def __init__(self, tokenizer, max_seq_length, pad=True, pair=True):
        super(FirstAndLastPartBERTSentenceTransform,
              self).__init__(tokenizer, max_seq_length, pad=pad, pair=pair)

    def __call__(self, line):
        # convert to unicode
        text_a = line[0]
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer(text_a)
        tokens_a_epi = tokens_a.copy()
        tokens_b = None
        tokens_b_epi = None

        if self._pair:
            tokens_b = self._tokenizer(text_b)
            tokens_b_epi = tokens_b.copy()

        if tokens_b:
            self._truncate_seq_pair_prolog(tokens_a, tokens_b,
                                           self._max_seq_length - 3)
            self._truncate_seq_pair_epilog(tokens_a_epi, tokens_b_epi,
                                           self._max_seq_length - 3)
        else:
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]
            if len(tokens_a_epi) > self._max_seq_length - 2:
                tokens_a_epi = tokens_a_epi[0:(self._max_seq_length - 2)]

        vocab = self._tokenizer.vocab
        tokens, tokens_epi = [], []
        tokens.append(vocab.cls_token)
        tokens_epi.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens_epi.extend(tokens_a_epi)
        tokens.append(vocab.sep_token)
        tokens_epi.append(vocab.sep_token)
        segment_ids = [0] * len(tokens)
        segment_ids_epi = [0] * len(tokens_epi)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens_epi.extend(tokens_b_epi)
            tokens.append(vocab.sep_token)
            tokens_epi.append(vocab.sep_token)
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))
            segment_ids_epi.extend([1] * (len(tokens) - len(segment_ids_epi)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
        input_ids_epi = self._tokenizer.convert_tokens_to_ids(tokens_epi)
        valid_length = len(input_ids)
        valid_length_epi = len(input_ids_epi)

        if self._pad:
            padding_length = self._max_seq_length - valid_length
            padding_length_epi = self._max_seq_length - valid_length_epi
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            input_ids_epi.extend([vocab[vocab.padding_token]] *
                                 padding_length_epi)
            segment_ids.extend([0] * padding_length)
            segment_ids_epi.extend([0] * padding_length_epi)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32'), np.array(input_ids_epi, dtype='int32'),\
            np.array(valid_length_epi, dtype='int32'), np.array(segment_ids_epi, dtype='int32')

    def _truncate_seq_pair_prolog(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _truncate_seq_pair_epilog(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length.
        Removes from end of token list."""
        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop(0)
            else:
                tokens_b.pop(0)

In [None]:
class FirstAndLastPartBERTDatasetTransform(dataset.BERTDatasetTransform):
    def __init__(self,
                 tokenizer,
                 max_seq_length,
                 labels=None,
                 pad=True,
                 pair=True,
                 label_dtype='float32'):
        super(FirstAndLastPartBERTDatasetTransform,
              self).__init__(tokenizer,
                             max_seq_length,
                             labels=labels,
                             pad=pad,
                             pair=pair,
                             label_dtype=label_dtype)
        self._bert_xform = FirstAndLastPartBERTSentenceTransform(
            tokenizer, max_seq_length, pad=pad, pair=pair)

    def __call__(self, line):
        input_ids, valid_length, segment_ids, input_ids_epi, valid_length_epi, segment_ids_epi = self._bert_xform(
            line[:-1])

        label = line[-1]

        # if label is None than we are predicting unknown data
        if label is None:
            # early abort
            return input_ids, valid_length, segment_ids, input_ids_epi, valid_length_epi, segment_ids_epi
            
        if self.labels:  # for classification task
            label = self._label_map[label]
        label = np.array([label], dtype=self.label_dtype)

        return input_ids, valid_length, segment_ids, input_ids_epi, valid_length_epi, segment_ids_epi, label

In [None]:
from mxnet.gluon import Block
from mxnet.gluon import nn


class BERTProEpiClassifier(Block):
    """Model for sentence (pair) classification task with BERT.

    The model feeds token ids and token type ids into BERT to get the
    pooled BERT sequence representation, then apply a Dense layer for
    classification. Does this also for an adversarial classifier.

    Parameters
    ----------
    bert: BERTModel
        Bidirectional encoder with transformer.
    num_classes : int, default is 2
        The number of target classes.
    dropout : float or None, default 0.0.
        Dropout probability for the bert output.
    prefix : str or None
        See document of `mx.gluon.Block`.
    params : ParameterDict or None
        See document of `mx.gluon.Block`.
    """

    def __init__(self,
                 bert,
                 num_classes=2,
                 dropout=0.0,
                 prefix=None,
                 params=None):
        super(BERTProEpiClassifier, self).__init__(prefix=prefix, params=params)
        self.bert = bert
        with self.name_scope():
            self.classifier = nn.HybridSequential(prefix=prefix)
            if dropout:
                self.classifier.add(nn.Dropout(rate=dropout))
            self.classifier.add(nn.Dense(units=num_classes))

    def forward(self,
                inputs,
                token_types,
                valid_length=None,
                inputs_epi=None,
                token_types_epi=None,
                valid_length_epi=None):  # pylint: disable=arguments-differ
        """Generate the unnormalized scores for the given the input sequences.
        From both classifiers (classifier + adversarial_classifier).

        Parameters
        ----------
        inputs : NDArray, shape (batch_size, seq_length)
            Input words for the sequences.
        token_types : NDArray, shape (batch_size, seq_length)
            Token types for the sequences, used to indicate whether the word belongs to the
            first sentence or the second one.
        valid_length : NDArray or None, shape (batch_size)
            Valid length of the sequence. This is used to mask the padded tokens.
        inputs_epi : NDArray or None, shape (batch_size, seq_length)
            Input words for the sequences. If None then same as inputs.
        token_types_epi : NDArray or None, shape (batch_size, seq_length)
            Token types for the sequences, used to indicate whether the word belongs to the
            first sentence or the second one. If None then same as token_types.
        valid_length_epi : NDArray or None, shape (batch_size)
            Valid length of the sequence. This is used to mask the padded tokens.

        Returns
        -------
        outputs : NDArray
            Shape (batch_size, num_classes), outputs of classifier.
        """
        # if inputs_epi is None and token_types_epi is None:
        #     inputs_epi = inputs
        #     token_types_epi = token_types
        #     valid_length_epi = valid_length

        _, pooler_out = self.bert(inputs, token_types, valid_length)
        _, pooler_out_epi = self.bert(inputs_epi, token_types_epi, valid_length_epi)
        pooler_concat = mx.nd.concat(pooler_out, pooler_out_epi, dim=1)
        return self.classifier(pooler_concat)

In [None]:
def setup_bert():
    # change `ctx` to `mx.cpu()` if no GPU is available.
    ctx = mx.gpu(0)
    # ctx = [mx.gpu(i) for i in range(2)]
    # ctx =  mx.gpu() if mx.context.num_gpus() else mx.cpu()
    # ctx = mx.cpu()

    bert_base, vocabulary = nlp.model.get_model(
        'bert_12_768_12',
        dataset_name='book_corpus_wiki_en_uncased',
        pretrained=True,
        ctx=ctx,
        use_pooler=True,
        use_decoder=False,
        use_classifier=False)
    print(bert_base)

    #model = BERTProEpiClassifier(bert_base, num_classes=2, dropout=0.1)
    model = BERTProEpiClassifier(bert_base, num_classes=1, dropout=0.1)
    # only need to initialize the classifier layer.
    model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
    model.hybridize(static_alloc=True)

    # softmax cross entropy loss for classification
    #loss_function = gluon.loss.SoftmaxCELoss()
    loss_function = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    loss_function.hybridize(static_alloc=True)

    metric = mx.metric.Accuracy()

    # use the vocabulary from pre-trained model for tokenization
    bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
    # maximum sequence length
    # max_len = 128  # + batch_size: 32
    # 384 - 12
    max_len = 512  # + batch_size: 6 ?
    # the labels for the two classes
    #all_labels = ["0", "1"]
    all_labels = [0, 1]
    # whether to transform the data as sentence pairs.
    # for single sentence classification, set pair=False
    transform = FirstAndLastPartBERTDatasetTransform(bert_tokenizer,
                                                     max_len,
                                                     labels=all_labels,
                                                     label_dtype='int32',
                                                     pad=True,
                                                     pair=True)

    return model, vocabulary, ctx, bert_tokenizer, transform, loss_function, metric, all_labels

In [None]:
def transform_dataset(X, y, transform):
    data_train_raw = MyBERTDataset(X, y)
    data_train = data_train_raw.transform(transform)
    return data_train_raw, data_train


def predict_out_to_ys(all_predictions, all_labels):
    y_true, y_pred = list(), list()

    for _, y_true_many, y_pred_many in all_predictions:
        y_true_many = y_true_many.T[0].asnumpy()
        # https://mxnet.incubator.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SoftmaxCrossEntropyLoss
        # pred: the prediction tensor, where the batch_axis dimension ranges over batch size and axis dimension ranges over the number of classes.
        #y_pred_many = np.argmax(y_pred_many, axis=1).asnumpy()
        y_pred_many = y_pred_many.asnumpy()

        y_true.extend(list(y_true_many))
        y_pred.extend(list(y_pred_many))
        # TODO: convert label_id to label?
        # y_pred.extend(all_labels[c] for c in list(y_pred_many))

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    return y_true, y_pred

Multi-GPU?
- https://gluon.mxnet.io/chapter07_distributed-learning/multiple-gpus-gluon.html

In [None]:
def train(model,
          data_train,
          ctx,
          metric,
          loss_function,
          batch_size=32,
          lr=5e-6,
          num_epochs=3,
          sw=None,
          checkpoint_dir="data",
          use_checkpoints=True):
    with Timer("setup training"):
        train_sampler = nlp.data.FixedBucketSampler(
            lengths=[int(item[1]) for item in tqdm(data_train)],
            batch_size=batch_size,
            shuffle=True)
        bert_dataloader = mx.gluon.data.DataLoader(data_train,
                                                   batch_sampler=train_sampler)

        trainer = gluon.Trainer(model.collect_params(), 'adam', {
            'learning_rate': lr,
            'epsilon': 1e-9
        })

        # collect all differentiable parameters
        # grad_req == 'null' indicates no gradients are calculated (e.g. constant parameters)
        # the gradients for these params are clipped later
        params = [
            p for p in model.collect_params().values() if p.grad_req != 'null'
        ]

    log_interval = 500
    global_step = 0
    with Timer("training"):
        stats = list()
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                epoch_checkpoint_savefile = "bert.model.checkpoint{}.params".format(
                    epoch_id)
                if checkpoint_dir is not None:
                    epoch_checkpoint_savefile = os.path.join(
                        checkpoint_dir, epoch_checkpoint_savefile)
                if os.path.exists(epoch_checkpoint_savefile):
                    model.load_parameters(epoch_checkpoint_savefile, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                step_loss = 0
                global_step = epoch_id * len(bert_dataloader)
                t_p = time.time()  # time keeping
                for batch_id, (token_ids, valid_length, segment_ids,
                               token_ids_epi, valid_length_epi,
                               segment_ids_epi,
                               label) in enumerate(tqdm(bert_dataloader)):
                    global_step += 1
                    with mx.autograd.record():
                        # load data to GPU
                        token_ids = token_ids.as_in_context(ctx)
                        valid_length = valid_length.as_in_context(ctx)
                        segment_ids = segment_ids.as_in_context(ctx)
                        token_ids_epi = token_ids_epi.as_in_context(ctx)
                        valid_length_epi = valid_length_epi.as_in_context(ctx)
                        segment_ids_epi = segment_ids_epi.as_in_context(ctx)
                        label = label.as_in_context(ctx)

                        # forward computation
                        out = model(token_ids, segment_ids,
                                    valid_length.astype('float32'),
                                    token_ids_epi, segment_ids_epi,
                                    valid_length_epi.astype('float32'))
                        label = label.astype('float32')
                        ls = loss_function(out, label).mean()

                    # backward computation
                    ls.backward()

                    # gradient clipping
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(1)

                    step_loss += ls.asscalar()
                    out = out.sigmoid().round().astype('int32')
                    label = label.astype('int32')
                    metric.update([label], [out])
                    stats.append((metric.get()[1], ls.asscalar()))

                    if sw:
                        sw.add_scalar(tag='T-ls', value=ls.asscalar(), global_step=global_step)
                        sw.add_scalar(tag='T-acc', value=metric.get()[1], global_step=global_step)

                    if (batch_id + 1) % (log_interval) == 0:
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(
                                epoch_id, batch_id + 1, len(bert_dataloader),
                                step_loss / log_interval,
                                trainer.learning_rate,
                                metric.get()[1],
                                datetime.timedelta(seconds=(time.time() -
                                                            t_p))))
                        t_p = time.time()
                        step_loss = 0

            if use_checkpoints:
                model.save_parameters(epoch_checkpoint_savefile)

    return stats

In [None]:
def train_multi(model,
                data_train,
                ctx,
                metric,
                loss_function,
                batch_size=32,
                lr=5e-6,
                num_epochs=3,
                checkpoint_dir="data",
                use_checkpoints=True):
    with Timer("setup training"):
        train_sampler = nlp.data.FixedBucketSampler(
            lengths=[int(item[1]) for item in tqdm(data_train)],
            batch_size=batch_size,
            shuffle=True)
        bert_dataloader = mx.gluon.data.DataLoader(data_train,
                                                   batch_sampler=train_sampler)

        trainer = gluon.Trainer(model.collect_params(),
                                'adam', {
                                    'learning_rate': lr,
                                    'epsilon': 1e-9
                                },
                                update_on_kvstore=False)

        # collect all differentiable parameters
        # grad_req == 'null' indicates no gradients are calculated (e.g. constant parameters)
        # the gradients for these params are clipped later
        params = [
            p for p in model.collect_params().values() if p.grad_req != 'null'
        ]

    log_interval = 500
    with Timer("training"):
        stats = list()
        for epoch_id in range(num_epochs):
            if use_checkpoints:
                epoch_checkpoint_savefile = "bert.model.checkpoint{}.params".format(
                    epoch_id)
                if checkpoint_dir is not None:
                    epoch_checkpoint_savefile = os.path.join(
                        checkpoint_dir, epoch_checkpoint_savefile)
                if os.path.exists(epoch_checkpoint_savefile):
                    model.load_parameters(epoch_checkpoint_savefile, ctx=ctx)
                    print("loaded checkpoint for epoch {}".format(epoch_id))
                    continue

            with Timer("epoch {}".format(epoch_id)):
                metric.reset()
                step_loss = 0
                t_p = time.time()  # time keeping
                for batch_id, (token_ids, valid_length, segment_ids,
                               token_ids_epi, valid_length_epi,
                               segment_ids_epi,
                               label) in enumerate(bert_dataloader):
                    with mx.autograd.record():
                        # load data to GPU
                        token_ids = gluon.utils.split_and_load(
                            token_ids, ctx, even_split=False)
                        valid_length = gluon.utils.split_and_load(
                            valid_length, ctx, even_split=False)
                        segment_ids = gluon.utils.split_and_load(
                            segment_ids, ctx, even_split=False)
                        token_ids_epi = gluon.utils.split_and_load(
                            token_ids_epi, ctx, even_split=False)
                        valid_length_epi = gluon.utils.split_and_load(
                            valid_length_epi, ctx, even_split=False)
                        segment_ids_epi = gluon.utils.split_and_load(
                            segment_ids_epi, ctx, even_split=False)
                        label = gluon.utils.split_and_load(label,
                                                           ctx,
                                                           even_split=False)

                        # forward computation
                        out = [
                            model(t1, s1, v1.astype('float32'), t2, s2,
                                  v2.astype('float32'))
                            for t1, s1, v1, t2, s2, v2 in zip(
                                token_ids, segment_ids, valid_length,
                                token_ids_epi, segment_ids_epi,
                                valid_length_epi)
                        ]
                        ls = [
                            loss_function(o, l.astype('float32')).mean()
                            for o, l in zip(out, label)
                        ]

                    # backward computation
                    for l in ls:
                        l.backward()

                    # gradient clipping
                    trainer.allreduce_grads()
                    nlp.utils.clip_grad_global_norm(params, 1)
                    trainer.update(1)

                    for l in ls:
                        step_loss += l.asscalar()
                    for o, l in zip(out, label):
                        metric.update([l.astype('int32')],
                                      [o.sigmoid().round().astype('int32')])
                    stats.append((metric.get()[1], [l.asscalar() for l in ls]))
                    if (batch_id + 1) % (log_interval) == 0:
                        print(
                            '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f} - time {}'
                            .format(
                                epoch_id, batch_id + 1, len(bert_dataloader),
                                step_loss / log_interval,
                                trainer.learning_rate,
                                metric.get()[1],
                                datetime.timedelta(seconds=(time.time() -
                                                            t_p))))
                        t_p = time.time()
                        step_loss = 0

            if use_checkpoints:
                model.save_parameters(epoch_checkpoint_savefile)

    return stats

In [None]:
def predict(model, data_predict, ctx, metric, loss_function, batch_size=32, sw=None):
    bert_dataloader = mx.gluon.data.DataLoader(data_predict,
                                               batch_size=batch_size)

    all_predictions = list()

    with Timer("prediction"):
        metric.reset()
        cum_loss = 0
        for batch_id, (token_ids, valid_length, segment_ids, token_ids_epi,
                       valid_length_epi, segment_ids_epi,
                       label) in enumerate(tqdm(bert_dataloader)):
            global_step = batch_id
            # load data to GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            token_ids_epi = token_ids_epi.as_in_context(ctx)
            valid_length_epi = valid_length_epi.as_in_context(ctx)
            segment_ids_epi = segment_ids_epi.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # forward computation
            out = model(token_ids, segment_ids, valid_length.astype('float32'),
                        token_ids_epi, segment_ids_epi,
                        valid_length_epi.astype('float32'))
            label = label.astype('float32')
            ls = loss_function(out, label).mean()

            out = out.sigmoid().round().astype('int32')
            label = label.astype('int32')
            metric.update([label], [out])
            cum_loss += ls.asscalar()  # .sum() ?

            if sw:
                sw.add_scalar(tag='P-ls', value=ls.asscalar(), global_step=global_step)
                sw.add_scalar(tag='P-acc', value=metric.get()[1], global_step=global_step)

            all_predictions.append((batch_id, label, out))

    return all_predictions, cum_loss

In [None]:
def predict_unknown(model, data_predict, ctx, label_map=None, batch_size=32):
    bert_dataloader = mx.gluon.data.DataLoader(data_predict,
                                               batch_size=batch_size)

    predictions = list()

    with Timer("prediction"):
        for batch_id, (token_ids, valid_length, segment_ids, token_ids_epi,
                       valid_length_epi,
                       segment_ids_epi) in enumerate(tqdm(bert_dataloader)):
            global_step = batch_id
            # load data to GPU
            token_ids = token_ids.as_in_context(ctx)
            valid_length = valid_length.as_in_context(ctx)
            segment_ids = segment_ids.as_in_context(ctx)
            token_ids_epi = token_ids_epi.as_in_context(ctx)
            valid_length_epi = valid_length_epi.as_in_context(ctx)
            segment_ids_epi = segment_ids_epi.as_in_context(ctx)

            # forward computation
            out = model(token_ids, segment_ids, valid_length.astype('float32'),
                        token_ids_epi, segment_ids_epi,
                        valid_length_epi.astype('float32'))

            # to binary: 0/1
            out = out.sigmoid().round().astype('int32')
            # to numpy (not mxnet)
            out = out.asnumpy()
            # get mapping type
            if label_map:
                out = [label_map[c] for c in list(out)]

            predictions.extend(out)

    # list to numpy array
    predictions = np.array(predictions)

    return predictions

In [None]:
def print_infos(vocabulary, data_train_raw, data_train):
    sample_id = 0

    # sentence a
    print(data_train_raw[sample_id][0])
    # sentence b
    print(data_train_raw[sample_id][1])
    # 1 means equivalent, 0 means not equivalent
    print(data_train_raw[sample_id][2])

    print('vocabulary used for tokenization = \n%s' % vocabulary)
    print('[PAD] token id = %s' % (vocabulary['[PAD]']))
    print('[CLS] token id = %s' % (vocabulary['[CLS]']))
    print('[SEP] token id = %s' % (vocabulary['[SEP]']))

    print('token ids = \n%s' % data_train[sample_id][0])
    print('valid length = \n%s' % data_train[sample_id][1])
    print('segment ids = \n%s' % data_train[sample_id][2])
    print('epi token ids = \n%s' % data_train[sample_id][3])
    print('epi valid length = \n%s' % data_train[sample_id][4])
    print('epi segment ids = \n%s' % data_train[sample_id][5])
    print('label = \n%s' % data_train[sample_id][6])


def plot_train_stats(stats):
    if not stats:
        print("no stats to plot")
        return

    x = np.arange(len(stats))  # arange/linspace

    acc_dots, loss_dots = zip(*stats)
    # if isinstance(loss_dots, tuple):
    #     loss_dots, loss_dots2 = zip(*loss_dots)

    plt.subplot(2, 1, 1)
    plt.plot(x, acc_dots)  # Linie: '-', 'o-', '.-'
    plt.title('Training BERTClassifier')
    plt.ylabel('Accuracy')

    plt.subplot(2, 1, 2)
    plt.plot(x, loss_dots)
    plt.xlabel('Batches')
    plt.ylabel('Loss')

    plt.show()