# Finetuning bert model

This notebook fine-tunes a BERT model. Make sure you have run `create_vector_files.py` and that the generated vector files are placed in the `build` directory.

Best hyperparameters:

* Default
  * seed: 42
  * model_name: `bert-base-cased`
  * epochs: 3
  * batch_size: 16
  * learning_rate: 5e-5
* Large
  * seed: 1996
  * model_name: `bert-base-cased`
  * epochs: 3
  * batch_size: 16
  * learning_rate: 5e-5
* Hatespeech
  * seed: 42
  * model_name: `bert-base-cased`
  * epochs: 2
  * batch_size: 16
  * learning_rate: 2e-5

In [None]:
# Make sure you choose the right base directory!

# Code must be in ./src
# Vector files (use create_vector_files.py on your base system) must be in ./build/vectors/<name>
# Dataset splitting information must be in ./datasets/<name>/splitting.txt

# Note that you only need the splitting information, not the dataset itself.

# Switch into the project root so the relative paths above resolve.
# NOTE(review): assumes Google Drive is already mounted under ./drive - confirm.
%cd ./drive/My\ Drive/Colab\ Notebooks/PythonLearner

/content/drive/My Drive/Colab Notebooks/PythonLearner


In [None]:
#@title Basic installs and imports
!pip install transformers==3.3.1
!pip install emoji==0.6.0

from src.twitter import TweetDataset
from src.ml import BinaryNeuralLearner, MappingDataset, \
    random_split_percentage, load_all_from_dataset, \
    accuracy, custom_accuracy, split_from_file

from src.ml.model import RNN
from src.util import SuspendManager, format_time

import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from torch.utils.data import random_split
from tqdm import tqdm
import numpy as np
import time
from src.features import preprocess_bert

from transformers import \
  BertTokenizer, BertModel, \
  BertForSequenceClassification, \
  AdamW, get_linear_schedule_with_warmup


Collecting transformers==3.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |▎                               | 10kB 17.9MB/s eta 0:00:01[K     |▋                               | 20kB 23.8MB/s eta 0:00:01[K     |█                               | 30kB 17.3MB/s eta 0:00:01[K     |█▎                              | 40kB 11.9MB/s eta 0:00:01[K     |█▌                              | 51kB 8.3MB/s eta 0:00:01[K     |█▉                              | 61kB 8.6MB/s eta 0:00:01[K     |██▏                             | 71kB 8.8MB/s eta 0:00:01[K     |██▌                             | 81kB 9.3MB/s eta 0:00:01[K     |██▉                             | 92kB 8.9MB/s eta 0:00:01[K     |███                             | 102kB 9.6MB/s eta 0:00:01[K     |███▍                            | 112kB 9.6MB/s eta 0:00:01[K     |███▊                            | 



In [None]:
#@markdown The seed determines the random numbers used for initializing the
#@markdown model. The splitting of the dataset is independent of this value.
seed_value =  1996#@param {type:"number"}

# Seed NumPy and PyTorch with the same value before anything random happens.
np.random.seed(seed_value)
torch.manual_seed(seed_value)

#@markdown The name of the dataset that should be used
dataset_name = "hatespeech" #@param ["default", "large", "hatespeech"]

#@markdown Model name and information
model_name = "bert-base-cased" #@param ["bert-base-cased", "bert-base-uncased", "bert-large-cased", "bert-large-uncased"]
use_cuda = True #@param {type:"boolean"}

# The split definition lives next to the dataset, independent of the vectors.
SPLITTING_FILE = "/".join(("datasets", dataset_name, "splitting.txt"))

# Raw tweet texts and their labels, stored as separate files per dataset.
# NOTE(review): torch.load unpickles its input - only load vector files that
# you generated yourself.
tweet_texts = torch.load("/".join(("build", "vectors", dataset_name, "x_texts.pt")))
label = torch.load("/".join(("build", "vectors", dataset_name, "y_data.pt")))

print("Size of Dataset: {}".format(len(tweet_texts)))

Size of Dataset: 3778


In [None]:
# @title Preprocessing of the bert model
# @markdown Settings for the preprocessing phase of the bert model

replace_url = True        #@param {type:"boolean"}
replace_mention = True    #@param {type:"boolean"}
replace_hashtags = False  #@param {type:"boolean"}
replace_emoticons = True  #@param {type:"boolean"}
replace_smileys = False   #@param {type:"boolean"}

# Show a handful of raw tweets so the effect of the preprocessing is visible.
print("Original Text")
for raw_text in tweet_texts[-5:]:
  print(raw_text)

# A replacement token is only passed on when its switch is enabled; None
# is passed otherwise.
bert_preprocess_kwargs = dict(
    url_token="[unused1]" if replace_url else None,
    mention_token="[unused2]" if replace_mention else None,
    hashtag_token="[unused3]" if replace_hashtags else None,
    replace_emojis=replace_emoticons,
    replace_smileys=replace_smileys,
)
tweet_texts = [preprocess_bert(raw_text, **bert_preprocess_kwargs) for raw_text in tweet_texts]

tokenizer = BertTokenizer.from_pretrained(model_name)

# Register every placeholder that is actually in use with the tokenizer.
# The tokenized output below shows them kept as single tokens.
tokenlist = [
    placeholder
    for enabled, placeholder in (
        (replace_url, "[unused1]"),
        (replace_mention, "[unused2]"),
        (replace_hashtags, "[unused3]"),
    )
    if enabled
]
if tokenlist:
  tokenizer.add_tokens(tokenlist, special_tokens=True)

print(" ----- ")
print("After text processing")
for processed_text in tweet_texts[-5:]:
  print(tokenizer.tokenize(processed_text))

Original Text
Leave the Past where it belongs...

Thought of the of the night. #COTAMUSHE @ Windhoek Noord,… https://t.co/NzlEYOYw4F
Bharat Stage-III vehicle ban brings in huge discounts on two-wheelers - India Today https://t.co/7Nn3PNFLUS
[Georges Hall] Forretress (F) (IV: 75%) until 07:54:16PM at 15 Orison St https://t.co/ml736uYzOh https://t.co/ikHTtcs6dS #sixty
Wow... 2017s @waikato Distinguished Awards are looking amazing #DAA #waikatoproud #alumni @AlumniWaikato https://t.co/YVFi7rDlXH
I've had a few people ask what effects I use to create Lil' Dva's voice...

Uhhh... none.

I'm seriously. That's all me. No modulation.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…


 ----- 
After text processing
['Leave', 'the', 'Past', 'where', 'it', 'belongs', '.', '.', '.', 'Thought', 'of', 'the', 'of', 'the', 'night', '.', '#', 'CO', '##TA', '##MU', '##S', '##H', '##E', '@', 'Wind', '##hoe', '##k', 'No', '##ord', ',', '…', '[unused1]']
['B', '##hara', '##t', 'Stage', '-', 'III', 'vehicle', 'ban', 'brings', 'in', 'huge', 'discount', '##s', 'on', 'two', '-', 'wheel', '##ers', '-', 'India', 'Today', '[unused1]']
['[', 'Georges', 'Hall', ']', 'For', '##ret', '##ress', '(', 'F', ')', '(', 'IV', ':', '75', '%', ')', 'until', '07', ':', '54', ':', '16', '##PM', 'at', '15', 'Or', '##ison', 'St', '[unused1]', '[unused1]', '#', 'sixty']
['Wow', '.', '.', '.', '2017', '##s', '[unused2]', 'Distinguished', 'Awards', 'are', 'looking', 'amazing', '#', 'D', '##AA', '#', 'wa', '##ika', '##top', '##rou', '##d', '#', 'alumni', '[unused2]', '[unused1]']
['I', "'", 've', 'had', 'a', 'few', 'people', 'ask', 'what', 'effects', 'I', 'use', 'to', 'create', 'Lil', "'", 'D', '##va', "'

In [None]:
print('Creating Bert Embeddings..')

# Despite the name, `embeddings` holds the tokenizer output (ids and masks),
# padded and truncated, returned as PyTorch tensors.
embeddings = tokenizer.batch_encode_plus(
    tweet_texts,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# One dataset row = (input ids, attention mask, token type ids, float label).
dataset = TensorDataset(
    *(embeddings[field] for field in ("input_ids", "attention_mask", "token_type_ids")),
    torch.FloatTensor(label),
)

# The train/dev/test assignment comes from the splitting file.
train_set, dev_set, test_set = split_from_file(dataset, SPLITTING_FILE)
print("Set distribution: {}/{}/{}".format(*map(len, (train_set, dev_set, test_set))))

Creating Bert Embeddings..
Set distribution: 3022/377/379


In [None]:
# @title Learning phase
# @markdown The BERT paper recommends the following hyperparameters:
# @markdown * **Batch size**: 16, 32 
# @markdown * **Learning rate**: 5e-5, 3e-5, 2e-5
# @markdown * **Number of epochs**: 2, 3, 4

from tqdm.notebook import tqdm as tqdm_notebook

# num_labels=1 gives the classification head a single output. With float
# labels the library then computes a regression (mean-square) loss, see the
# transformers documentation of BertForSequenceClassification.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Only use the GPU when it was requested *and* is actually present, so the
# notebook also runs on a CPU-only runtime instead of crashing.
cuda_ready = use_cuda and torch.cuda.is_available()
if use_cuda and not cuda_ready:
    print("CUDA was requested but is not available - falling back to CPU.")
device = torch.device("cuda" if cuda_ready else "cpu")
model.to(device)

epochs =  2# @param {type:"integer"}
batch_size = 16 # @param {type:"integer"}
learning_rate = 2e-5 # @param {type:"number"}

optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=1e-8
                  )


train_dataloader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True
)
dev_dataloader = DataLoader(
    dev_set,
    batch_size=batch_size,
    drop_last=False
)

# The schedule is stepped once per optimizer step, so it must span every
# batch of every epoch. No warm-up steps are used.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Kept for compatibility with the original cell; not used further below.
training_stats = []
t0 = time.time()

for epoch in range(epochs):
    ti = time.time()

    total_train_loss = 0

    # ---- Training pass -------------------------------------------------
    model.train()
    loop = tqdm_notebook(train_dataloader, desc="Epoch {} / {}".format(epoch + 1, epochs))
    for batch in loop:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        token_type = batch[2].to(device)
        labels = batch[3].to(device)

        model.zero_grad()
        # Index the result instead of tuple-unpacking it: this works for the
        # plain tuple returned here as well as for ModelOutput objects.
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type,
                        labels=labels)
        loss = outputs[0]

        atomic_loss = loss.item()
        total_train_loss += atomic_loss
        loop.set_postfix(loss=atomic_loss)

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    training_time = format_time(time.time() - ti)
    loop.write("Average training loss: {0:.2f}".format(avg_train_loss))
    loop.write("Training epoch took: {:}".format(training_time))
    loop.write("")

    # ---- Evaluation on the development set -----------------------------
    with torch.no_grad():
        model.eval()
        ti = time.time()

        # Loss and accuracy are per-batch means. Weight them by the batch
        # size so that the (usually smaller) last batch does not distort the
        # reported per-sample averages.
        total_samples = 0
        total_loss = 0.0
        total_accuracy = 0.0
        for batch in tqdm_notebook(dev_dataloader, "Evaluating"):
            dev_input_ids = batch[0].to(device)
            dev_attention_mask = batch[1].to(device)
            dev_token_type = batch[2].to(device)
            dev_labels = batch[3].to(device)

            outputs = model(dev_input_ids,
                            attention_mask=dev_attention_mask,
                            token_type_ids=dev_token_type,
                            labels=dev_labels)
            loss = outputs[0]
            # Flatten the logits so they line up with the 1-D label tensor.
            logits = torch.reshape(outputs[1], (-1,))

            batch_samples = dev_labels.size(0)
            total_samples += batch_samples
            total_loss += loss.item() * batch_samples
            total_accuracy += accuracy(logits, dev_labels) * batch_samples

        dev_time = format_time(time.time() - ti)
        if total_samples > 0:
            loop.write("Development loss: {0:.4f}".format(total_loss / total_samples))
            loop.write("Development accuracy: {0:.4f}".format(total_accuracy / total_samples))
        else:
            loop.write("Development set is empty - skipping evaluation.")
        loop.write("Development epoch took: {:}".format(dev_time))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

HBox(children=(FloatProgress(value=0.0, description='Epoch 1 / 2', max=189.0, style=ProgressStyle(description_…


Average training loss: 0.19
Training epoch took: 0:02:10



HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=24.0, style=ProgressStyle(description_wi…


Development loss: 0.1303
Development accuracy: 0.8183
Development epoch took: 0:00:06


HBox(children=(FloatProgress(value=0.0, description='Epoch 2 / 2', max=189.0, style=ProgressStyle(description_…


Average training loss: 0.11
Training epoch took: 0:02:12



HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=24.0, style=ProgressStyle(description_wi…


Development loss: 0.1371
Development accuracy: 0.8209
Development epoch took: 0:00:06


In [None]:
# @title Evaluate results

# Final evaluation of the fine-tuned model on the held-out test split.
test_dataloader = DataLoader(
    test_set,
    batch_size=batch_size,
    drop_last=False
)

with torch.no_grad():
    model.eval()

    # Loss and accuracy are per-batch means. Weight them by the batch size
    # so that the (usually smaller) last batch does not distort the reported
    # per-sample averages.
    total_samples = 0
    total_loss = 0.0
    total_accuracy = 0.0
    for batch in tqdm_notebook(test_dataloader, "Evaluating"):
        test_input_ids = batch[0].to(device)
        test_attention_mask = batch[1].to(device)
        test_token_type = batch[2].to(device)
        test_labels = batch[3].to(device)

        # Index the result instead of tuple-unpacking it: this works for a
        # plain tuple as well as for ModelOutput objects.
        outputs = model(test_input_ids,
                        attention_mask=test_attention_mask,
                        token_type_ids=test_token_type,
                        labels=test_labels)
        loss = outputs[0]
        # Flatten the logits so they line up with the 1-D label tensor.
        logits = torch.reshape(outputs[1], (-1,))

        batch_samples = test_labels.size(0)
        total_samples += batch_samples
        total_loss += loss.item() * batch_samples
        total_accuracy += accuracy(logits, test_labels) * batch_samples

# Plain print() instead of `loop.write`: `loop` is the progress bar of the
# training cell and does not exist if no training epoch was run.
if total_samples > 0:
    print("Test loss: {0:.4f}".format(total_loss / total_samples))
    print("Test accuracy: {0:.4f}".format(total_accuracy / total_samples))
else:
    print("Test set is empty - nothing to evaluate.")

HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=24.0, style=ProgressStyle(description_wi…


Test loss: 0.1421
Test accuracy: 0.8217


In [None]:
# @title Save results

import os

import ipywidgets as widgets
from IPython.display import display

# Text field for the target directory, a button that triggers the export and
# an output area that shows the messages (and errors) of the export.
textbox = widgets.Text(value='build/model-result')
button = widgets.Button(description="Save results")
output = widgets.Output()


def on_button_clicked(b):
  """Save the fine-tuned model and its tokenizer to the chosen directory.

  Args:
    b: The button instance that was clicked (passed by ipywidgets, unused).

  Everything printed or raised in here is routed into the `output` widget,
  because output produced inside a widget callback is otherwise not
  reliably shown below the cell.
  """
  output.clear_output()
  with output:
    output_dir = textbox.value.strip()
    if not output_dir:
      print("Please enter an output directory.")
      return

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    # If the model exposes a `.module` attribute (a wrapper around the real
    # model), save the wrapped model rather than the wrapper.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Model Saved")


button.on_click(on_button_clicked)
# Show each widget exactly once; the output area sits below the button.
display(textbox, button, output)

Text(value='build/model-result')

Output()

Button(description='Save results', style=ButtonStyle())

Output()