In [16]:
import os
import aiohttp

from datasets import load_dataset
from datasets import Dataset as DT
import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format='retina'

import math
from collections import defaultdict
from textwrap import wrap
import numpy as np

import pandas as pd
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import (AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.preprocessing import OrdinalEncoder

import nltk.data
from nltk.tokenize import sent_tokenize
from nltk.corpus import alpino

from imblearn.over_sampling import RandomOverSampler

In [17]:
#!sudo kill -5957 pid
# torch.cuda.empty_cache()
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

cuda:0


In [None]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [18]:
DATA_DIR = "/home/leonardovida/dev/hist-aware/notebooks/data/labeled"
PRE_TRAINED_MODEL_NAME = 'wietsedv/bert-base-dutch-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

## Load processed labeled data

In [19]:
df = pd.read_csv(os.path.join(DATA_DIR, "labeled_split_sents_energy_1960_1990.csv"))
oil = pd.read_csv(os.path.join(DATA_DIR, "labeled_split_sents_oil_1960_1990.csv"))
gas = pd.read_csv(os.path.join(DATA_DIR, "labeled_split_sents_gas_1960_1990.csv"))
coal = pd.read_csv(os.path.join(DATA_DIR, "labeled_split_sents_coal_1960_1990.csv"))

## Create Dataset

### Create dataset with Dataset class - Base

In [20]:
# Token max length
MAX_LEN = 512

# Parameters
batch_size = 16
max_epochs = 100

# Select df
df = df
# Convert labels to integers
s = df["labels"]
df["labels"] = pd.to_numeric(s,downcast='integer')
df.head(1)

Unnamed: 0.1,Unnamed: 0,text_split,labels,text,date
0,0,Het PEB heeft een contract tot levering van aa...,1,Het PEB heeft een contract tot levering van aa...,1977-09-29


In [21]:
from sklearn.model_selection import train_test_split
# Divide into train and val
train_texts, val_texts, train_labels, val_labels = train_test_split(list(df.text_split), list(df.labels), test_size=.3)
# Divide into val and test
test_texts, val_texts, test_labels, val_labels = train_test_split(val_texts, val_labels, test_size=.5)

In [22]:
# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
# TODO: add_special_tokens=True ? 
# TODO: padding='max_length'?
# return_token_type_ids=False,
# return_attention_mask=True,
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt', max_length=MAX_LEN)
valid_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt', max_length=MAX_LEN)

### Create dataset with Dataset class - Advanced
uses __getitem__ to create encodings, instead of doing it later --> easier for DataLoader

In [24]:
class HADataset(torch.utils.data.Dataset):
    # Characterizes a dataset for Pytorch
    def __init__(self, articles, labels, tokenizer, max_len):
        # Initialization
        self.articles = articles
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        # Total number of articles
        return len(self.articles)

    def __getitem__(self, item):
        # Generates one sample of the data/article
        article = str(self.articles[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            article,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            #return_token_type_ids=False,
            #return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        return {
          'article_text': article,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(label, dtype=torch.long)
        }

In [25]:
train_dataset = HADataset(list(train_texts), list(train_labels), tokenizer, MAX_LEN)
val_dataset = HADataset(list(val_texts), list(val_labels), tokenizer, MAX_LEN)
test_dataset = HADataset(list(test_texts), list(test_labels), tokenizer, MAX_LEN)

### Create DataLoader

Is it actually necessary or does the Trainer already take care of this?

In [26]:
def create_data_loader(df, tokenizer, batch_size, num_workers, MAX_LEN):
    ds = HADataset(
        articles=df.text_split.to_numpy(),
        labels=torch.LongTensor(df.labels),#.to_numpy(),
        tokenizer=tokenizer,
        max_len=MAX_LEN
      )
    
    return DataLoader(
        ds,
        batch_size,
        num_workers
      )

In [27]:
# Create dataloaders
num_workers = 0
train_data_loader = create_data_loader(df, tokenizer, batch_size, num_workers, MAX_LEN)
val_data_loader = create_data_loader(df, tokenizer, batch_size, num_workers, MAX_LEN)
test_data_loader = create_data_loader(df, tokenizer, batch_size, num_workers, MAX_LEN)

## Train

### Automatic Training

In [28]:
training_args = TrainingArguments(
    output_dir = "~/dev/hist-aware/notebooks/models/bert-fine-tuning-existing",
    overwrite_output_dir = False,
#   
    num_train_epochs=3,              # total number of training epochs
    evaluation_strategy="steps",
    per_device_train_batch_size=6, # default is 8
    per_device_eval_batch_size=4, # default is 8
    eval_steps=200,
    warmup_steps=200,                # number of warmup steps for learning rate scheduler    
    weight_decay=0.01,
    learning_rate=2e-5,  # config
    
    logging_dir="~/dev/hist-aware/notebooks/logging",
    logging_steps=20,
    load_best_model_at_end=True,  
    seed=2020,
    label_names=["labels"], # check this
    disable_tqdm=False
)

In [29]:
# ------------------
# IGNORING WARNINGS!
# ------------------

import warnings
warnings.filterwarnings('ignore')

model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=3).to("cuda")

def get_model():
    return BertForSequenceClassification.from_pretrained(
        PRE_TRAINED_MODEL_NAME,
        config=config,
    )

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    #precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        #'f1': f1,
        #'precision': precision,
        #'recall': recall
    }


trainer = Trainer(
    model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

trainer.train()

# Defaut objective is the sum of all metrics
# when metrics are provided, so we have to maximize it.
#trainer.hyperparameter_search(
#    model_init=model_init,
#    direction="maximize",
#    backend="ray",
#    n_trials=100, # deafult 100
#    n_jobs=2  # number of parallel jobs, if multiple GPUs
#)

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wietsedv/bert-

Step,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
200,0.9283,1.051296,0.519993,158.3278,37.119
400,1.0743,0.986892,0.530883,94.1627,62.413


KeyboardInterrupt: 

### Evaluate

In [None]:
trainer.evaluate()

In [None]:
# saving the fine tuned model & tokenizer
model.save_pretrained("/home/leonardovida/dev/hist-aware/notebooks/models/bert-fine-tuning-existing")
tokenizer.save_pretrained(model_path)

### Get predictions

In [37]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]

## Tune model - RUN ONLY WITH 1TB of disk space

In [None]:
from typing import Callable, Dict
from transformers import EvalPrediction
from transformers import glue_compute_metrics, glue_output_modes


def build_compute_metrics_fn(
        task_name: str) -> Callable[[EvalPrediction], Dict]:
    """Function from transformers/examples/text-classification/run_glue.py"""
    output_mode = glue_output_modes[task_name]

    def compute_metrics_fn(p: EvalPrediction):
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        metrics = glue_compute_metrics(task_name, preds, p.label_ids)
        return metrics

    return compute_metrics_fn

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    #precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        #'f1': f1,
        #'precision': precision,
        #'recall': recall
    }

def tune_transformer(num_samples=8,
                     gpus_per_trial=0,
                     smoke_test=False,
                    num_labels=3):
    ray.init(log_to_driver=True, ignore_reinit_error=True)
    data_dir_name = "./tune_data" if not smoke_test else "./tune_test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)
        
    model_name = "wietsedv/bert-base-dutch-cased"
    task_name = "energy"
    
    task_data_dir = os.path.join(data_dir, task_name.upper())
    
    num_labels = num_labels
    
    config = AutoConfig.from_pretrained(
        model_name, num_labels=num_labels)
    
    # Download and cache tokenizer, model, and features
    print("Downloading and caching Tokenizer")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Triggers tokenizer download to cache
    print("Downloading and caching pre-trained model")
    AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
    )
    
    def get_model():
        return AutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=config,
        )
    
    training_args = TrainingArguments(
        output_dir=".",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="/logs",
        evaluate_during_training=True
    )
    
    training_args._n_gpu = gpus_per_trial

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics
    )

    tune_config = {
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 8,
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
    }
    
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_acc",
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "learning_rate": tune.uniform(1e-5, 5e-5),
            "per_device_train_batch_size": [4, 8, 16, 32],
        })

    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs"
        },
        metric_columns=[
            "eval_acc", "eval_loss", "epoch", "training_iteration"
        ])

    trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={
            "cpu": 4,
            "gpu": gpus_per_trial
        },
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        log_to_file=True)
    
tune_transformer(
    num_samples=8,
    gpus_per_trial=2)    

2021-03-03 18:11:44,007	INFO worker.py:664 -- Calling ray.init() again after it has already been called.


Downloading and caching Tokenizer
Downloading and caching pre-trained model


Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

== Status ==
Memory usage on this node: 8.6/78.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 2/10 CPUs, 2/2 GPUs, 0.0/46.58 GiB heap, 0.0/16.06 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /home/leonardovida/ray_results/tune_transformer_pbt
Number of trials: 1/8 (1 RUNNING)
+------------------------+----------+-------+-----------+-------------+----------------+--------------+
| Trial name             | status   | loc   |   w_decay |          lr |   train_bs/gpu |   num_epochs |
|------------------------+----------+-------+-----------+-------------+----------------+--------------|
| _objective_efc70_00000 | RUNNING  |       |  0.112362 | 4.80286e-05 |              8 |            4 |
+------------------------+----------+-------+-----------+-------------+----------------+--------------+




[2m[36m(pid=26218)[0m 2021-03-03 18:12:07.197985: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
[2m[36m(pid=26218)[0m 2021-03-03 18:12:07.198066: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2m[36m(pid=26218)[0m Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
[2m[36m(pid=26218)[0m - This IS expected if you are initializing BertForSequenceClassification from the check

[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26218)[0m <IPython.core.d

2021-03-03 18:15:18,156	ERROR trial_runner.py:616 -- Trial _objective_efc70_00000: Error processing event.
Traceback (most recent call last):
  File "/home/leonardovida/.cache/pypoetry/virtualenvs/histaware-NidRwJ64-py3.8/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 594, in _process_trial
    decision = self._process_trial_result(trial, result)
  File "/home/leonardovida/.cache/pypoetry/virtualenvs/histaware-NidRwJ64-py3.8/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 631, in _process_trial_result
    self._validate_result_metrics(result)
  File "/home/leonardovida/.cache/pypoetry/virtualenvs/histaware-NidRwJ64-py3.8/lib/python3.8/site-packages/ray/tune/trial_runner.py", line 742, in _validate_result_metrics
    raise ValueError(
ValueError: Trial returned a result which did not include the specified metric(s) `eval_acc` that `PopulationBasedTraining` expects. Make sure your calls to `tune.report()` include the metric, or set the TUNE_DISABLE_STRICT_METRIC_

Result for _objective_efc70_00000:
  {}
  
== Status ==
Memory usage on this node: 12.7/78.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 0/10 CPUs, 0/2 GPUs, 0.0/46.58 GiB heap, 0.0/16.06 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /home/leonardovida/ray_results/tune_transformer_pbt
Number of trials: 2/8 (1 ERROR, 1 PENDING)
+------------------------+----------+-------+-----------+-------------+----------------+--------------+
| Trial name             | status   | loc   |   w_decay |          lr |   train_bs/gpu |   num_epochs |
|------------------------+----------+-------+-----------+-------------+----------------+--------------|
| _objective_efc70_00001 | PENDING  |       |  0.219598 | 3.39463e-05 |              8 |            2 |
| _objective_efc70_00000 | ERROR    |       |  0.112362 | 4.80286e-05 |              8 |            4 |
+------------------------+----------+-------+-----------+-------------+----------------+--------------+
Numbe



== Status ==
Memory usage on this node: 9.1/78.7 GiB
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Resources requested: 2/10 CPUs, 2/2 GPUs, 0.0/46.58 GiB heap, 0.0/16.06 GiB objects (0/1.0 accelerator_type:RTX)
Result logdir: /home/leonardovida/ray_results/tune_transformer_pbt
Number of trials: 2/8 (1 ERROR, 1 RUNNING)
+------------------------+----------+-------+-----------+-------------+----------------+--------------+
| Trial name             | status   | loc   |   w_decay |          lr |   train_bs/gpu |   num_epochs |
|------------------------+----------+-------+-----------+-------------+----------------+--------------|
| _objective_efc70_00001 | RUNNING  |       |  0.219598 | 3.39463e-05 |              8 |            2 |
| _objective_efc70_00000 | ERROR    |       |  0.112362 | 4.80286e-05 |              8 |            4 |
+------------------------+----------+-------+-----------+-------------+----------------+--------------+
Number of errored trials: 1
+--------------------

[2m[36m(pid=26217)[0m 2021-03-03 18:15:25.722540: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
[2m[36m(pid=26217)[0m 2021-03-03 18:15:25.722590: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2m[36m(pid=26217)[0m Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
[2m[36m(pid=26217)[0m - This IS expected if you are initializing BertForSequenceClassification from the check

[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.display.HTML object>
[2m[36m(pid=26217)[0m <IPython.core.d