# FinBERT Masked Language Model (MLM) Training

### Mounting Google Drive

In [None]:
# Colab only: authenticate and mount Google Drive at /content/drive.
# The training CSV and the checkpoint save location used later in this
# notebook both live under /content/drive/MyDrive, so run this cell first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install/Load Packages & Libraries

In [None]:
!pip install datasets -q
!pip install -q transformers
!pip install --quiet --upgrade accelerate
!pip install evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, re
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
import evaluate
import math

## NN packages
import tensorflow as tf
from tensorflow import keras

# NLP packages
import torch
import transformers
from transformers import AutoModelForMaskedLM, TFAutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

### Loading in Datasets

In [None]:
# Load the cleaned training CSV from the mounted Drive as a DatasetDict with a
# single "train" split. The file is decoded as ISO-8859-1 (Latin-1).
dataset = load_dataset('csv', data_files={'train': '/content/drive/MyDrive/w266/data/clean_train_data.csv'}, encoding = "ISO-8859-1")


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Inspect the loaded DatasetDict (only a "train" split was loaded; there is no test split here).
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 16990
    })
})

### FinBERT Fine-Tuning
This is the FinBERT checkpoint that we will continue to pretrain on our own dataset.

#### Tokenizer Setup

In [None]:
# Hugging Face Hub checkpoint to continue pretraining from.
checkpoint = 'yiyanghkust/finbert-pretrain'
# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# from_pt=True: build the TensorFlow masked-LM model from the checkpoint's PyTorch weights.
model = TFAutoModelForMaskedLM.from_pretrained(checkpoint, from_pt=True)

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [None]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer's output for the batch. When the tokenizer is a fast
        tokenizer, a ``"word_ids"`` entry is added holding ``word_ids(i)`` for
        every row ``i`` of the batch.
    """
    encoded = tokenizer(examples["text"])

    # word_ids() is only queried when the tokenizer reports itself as fast.
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    per_row_word_ids = []
    for row in range(row_count):
        per_row_word_ids.append(encoded.word_ids(row))
    encoded["word_ids"] = per_row_word_ids
    return encoded

In [None]:
# Tokenize every row in batches and drop the original CSV columns so that only
# the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["Unnamed: 0","text", "label"]
)
# Display the resulting DatasetDict.
tokenized_dataset

Map:   0%|          | 0/16990 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 16990
    })
})

In [None]:
# Number of tokens per training chunk; read by group_texts when it re-splits
# the concatenated token stream.
chunk_size = 100

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (for instance ``input_ids``, ``attention_mask``, ``word_ids``).
            Must contain an ``"input_ids"`` column.
        block_size: Number of items per output chunk. When omitted, the
            notebook-level ``chunk_size`` is used, so existing
            ``map(group_texts, batched=True)`` calls behave as before.

    Returns:
        dict with the same keys as ``examples``, each holding a list of chunks
        of exactly ``block_size`` items, plus a ``"labels"`` column that is an
        independent copy of the ``"input_ids"`` chunks. Items left over after
        the last full chunk are dropped.
    """
    if block_size is None:
        block_size = chunk_size

    # Flatten every column in a single pass. The previous ``sum(lists, [])``
    # re-copied the accumulated list on each addition (quadratic in batch size).
    concatenated_examples = {
        column: [item for sequence in sequences for item in sequence]
        for column, sequences in examples.items()
    }

    # The length of the first column drives the chunk boundaries for all columns.
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the trailing remainder that does not fill a whole chunk.
    total_length = (total_length // block_size) * block_size

    # Split each flattened column into consecutive chunks of block_size items.
    result = {
        column: [
            flat[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for column, flat in concatenated_examples.items()
    }

    # Copy each chunk so that later edits to labels cannot alias input_ids.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [None]:
# Regroup the tokenized examples into fixed-length chunks and add a "labels" column.
# group_texts runs once per map batch, so tokens left over at the end of each
# batch (fewer than chunk_size) are discarded.
lm_datasets = tokenized_dataset.map(group_texts, batched=True)
# Display the resulting DatasetDict.
lm_datasets

Map:   0%|          | 0/16990 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 4462
    })
})

In [None]:
# Optional cleanup (disabled): uncomment to drop the intermediate datasets
# once lm_datasets has been built and they are no longer needed.
# del dataset
# del tokenized_dataset

### Further Training the FinBERT Model

In [None]:
# Collator used to build masked-language-model batches with the notebook's tokenizer;
# mlm_probability=0.15 is the masking probability handed to the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
# Creating TF datasets
# Wrap the chunked "train" split as a shuffled dataset with batches of 32,
# using data_collator to assemble each batch.

tf_train_dataset = model.prepare_tf_dataset(
    lm_datasets["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Adam optimizer with a learning rate of 5e-5. No loss is passed to compile().
# NOTE(review): Transformers TF models are expected to fall back to their
# built-in loss in that case — confirm for the installed version.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005))

In [None]:
# Training the model
# Five epochs over the training dataset; no validation data is supplied.
model.fit(tf_train_dataset, validation_data=None, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3410485720>

In [None]:
# Saving checkpoint
# Model, config and tokenizer files are all written to the same Drive directory.
checkpoint_save_location = '/content/drive/MyDrive/w266/FinalProject/checkpoints/FinBERT-base-MLM'
# Write the further-trained model to that directory.
model.save_pretrained(checkpoint_save_location)
# NOTE(review): this loads the config of the ORIGINAL hub checkpoint and saves it
# into the same directory, presumably replacing the config written by the line
# above — confirm this is intended.
model.config.__class__.from_pretrained(checkpoint).save_pretrained(checkpoint_save_location)
# Save the tokenizer files alongside the model.
tokenizer.save_pretrained(checkpoint_save_location)


('/content/drive/MyDrive/w266/FinalProject/checkpoints/FinBERT-base-MLM/tokenizer_config.json',
 '/content/drive/MyDrive/w266/FinalProject/checkpoints/FinBERT-base-MLM/special_tokens_map.json',
 '/content/drive/MyDrive/w266/FinalProject/checkpoints/FinBERT-base-MLM/vocab.txt',
 '/content/drive/MyDrive/w266/FinalProject/checkpoints/FinBERT-base-MLM/added_tokens.json',
 '/content/drive/MyDrive/w266/FinalProject/checkpoints/FinBERT-base-MLM/tokenizer.json')