### Mounting Google Drive

In [None]:
# Authenticate with Google and mount Drive at /content/drive so that the
# data and model-weight paths used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install/Load Packages & Libraries

In [None]:
# Install the Hugging Face libraries used below (quiet mode).
# NOTE(review): versions are unpinned, so a fresh run may resolve different
# releases than the ones that produced the saved outputs — consider pinning.
!pip install datasets -q
!pip install -q transformers
!pip install --quiet --upgrade accelerate
!pip install evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, re
import time
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset
import evaluate
import math

## NN packages
import tensorflow as tf
from tensorflow import keras

# NLP packages
import torch
from transformers import AutoModelForMaskedLM, TFAutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer
from transformers import create_optimizer

In [None]:
import transformers

### Loading in Datasets

In [None]:
# Map each split name to its cleaned CSV file on the mounted Drive.
csv_splits = {
    'train': '/content/drive/MyDrive/Colab Notebooks/w266/data/clean_train_data.csv',
    'eval': '/content/drive/MyDrive/Colab Notebooks/w266/data/clean_test_data.csv',
}

# Load both splits in one call; the files are read as ISO-8859-1.
dataset = load_dataset('csv', data_files=csv_splits, encoding="ISO-8859-1")


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4d94bd9b0ace3877/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4d94bd9b0ace3877/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the loaded DatasetDict (train and eval splits) to check its columns and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 16990
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'text', 'label'],
        num_rows: 4117
    })
})

### Metric Evaluations

In [None]:
def compute_metrics(eval_pred):
  """Compute accuracy and micro-averaged F1 for one evaluation pass.

  Args:
    eval_pred: object exposing ``predictions`` and ``label_ids``.
      ``predictions`` may hold per-class scores (classes on the last axis)
      or already-decoded class ids.

  Returns:
    dict with keys ``'accuracy'`` and ``'f1_score'``.

  Note:
    A later cell defines another ``compute_metrics``; when the notebook is
    run top to bottom that later definition replaces this one.
  """
  labels = np.asarray(eval_pred.label_ids)
  predictions = np.asarray(eval_pred.predictions)

  # Per-class scores carry one more axis than the labels; reduce them to
  # class ids before scoring. Already-decoded ids pass through untouched.
  if predictions.ndim > labels.ndim:
    predictions = predictions.argmax(axis=-1)

  # No nested trainer.predict(...) call here: this function is itself invoked
  # by the evaluation/prediction loop, so predicting again from inside it
  # would recurse. The metrics describe whichever split was passed in.

  # sklearn expects (y_true, y_pred) in that order.
  accuracy = accuracy_score(labels, predictions)
  f1 = f1_score(labels, predictions, average='micro')

  return {
        'accuracy': accuracy,
        'f1_score': f1,
        }

In [None]:
# Load the "accuracy" metric from the `evaluate` library; compute_metrics below calls metric.compute(...)
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the highest-scoring
    class along the last axis is taken as the prediction for each row.
    Returns whatever ``metric.compute`` returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_ids)

In [None]:
from sklearn.metrics import accuracy_score

def compute_accuracy(pred):
    """Return ``{"accuracy": ...}`` for a prediction object.

    ``pred`` must expose ``label_ids`` (reference labels) and ``predictions``
    (scores whose last axis is arg-maxed to obtain the predicted class).
    """
    references = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    score = accuracy_score(references, predicted_ids)
    return dict(accuracy=score)

### BERT-base (uncased) Fine-Tuning
This is the vanilla version of BERT, which we will continue to pretrain on our specific dataset.

#### Tokenizer Setup

In [None]:
# The tokenizer and the masked-LM model are loaded from the same checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForMaskedLM.from_pretrained(checkpoint)

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with the module-level tokenizer.

    When the tokenizer is a fast one, a ``"word_ids"`` entry is added holding
    ``word_ids(i)`` for every row ``i`` of the batch. Returns the encoding.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        return encoded

    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded

In [None]:
# Tokenize both splits in batches and drop the raw CSV columns,
# leaving only the columns produced by tokenize_function.
raw_columns = ["Unnamed: 0", "text", "label"]
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=raw_columns)
tokenized_dataset

Map:   0%|          | 0/16990 [00:00<?, ? examples/s]

Map:   0%|          | 0/4117 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 16990
    })
    eval: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 4117
    })
})

In [None]:
# Length (in tokens) of each fixed-size chunk produced by group_texts below
chunk_size = 100

In [None]:
def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size chunks.

    Args:
        examples: mapping of column name -> list of per-row sequences
            (must include ``"input_ids"``).

    Returns:
        dict with the same columns, each a list of ``chunk_size``-long
        slices of the concatenated column, plus a ``"labels"`` column that
        is a (shallow) copy of the chunked ``"input_ids"``. A trailing
        remainder shorter than ``chunk_size`` is discarded.
    """
    # Flatten every column in a single pass. (sum(list_of_lists, []) builds
    # a new list on each addition, which is quadratic in the batch length.)
    concatenated_examples = {
        k: [item for row in examples[k] for item in row] for k in examples.keys()
    }

    # Length of the concatenated batch, taken from the first column
    # (assumes all columns are aligned item-for-item).
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size

    # Split every column into consecutive chunk_size-long slices
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }

    # The collator masks input_ids later, so labels start out as a copy of them
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk both tokenized splits into fixed-length blocks (batched so that
# group_texts can concatenate many rows at once), then display the result.
lm_datasets = tokenized_dataset.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/16990 [00:00<?, ? examples/s]

Map:   0%|          | 0/4117 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 4527
    })
    eval: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1102
    })
})

In [None]:
# del dataset
# del tokenized_dataset

### Fine-Tuning BERT Model

In [None]:
# Collator for masked language modeling (mlm=True) with a masking probability of 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
# Creating TF datasets

# Settings shared by both splits; only the source split and shuffling differ.
shared_dataset_kwargs = dict(collate_fn=data_collator, batch_size=32)

# Training batches are shuffled.
tf_train_dataset = model.prepare_tf_dataset(
    lm_datasets["train"], shuffle=True, **shared_dataset_kwargs
)

# Evaluation batches keep their original order.
tf_eval_dataset = model.prepare_tf_dataset(
    lm_datasets["eval"], shuffle=False, **shared_dataset_kwargs
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Creating lr schedule and compiling model

# Number of epochs the schedule is sized for; keep in sync with the
# `epochs` argument passed to model.fit.
num_epochs = 5

# The schedule advances once per batch across the whole fit call, so it must
# span every epoch, not just one pass over the training set.
num_train_steps = len(tf_train_dataset) * num_epochs

# Warm up over the first 10% of training. The warmup has to be shorter than
# the total step count; otherwise the peak learning rate is never reached
# and the decay phase is left with a non-positive length.
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss is passed to compile.
model.compile(optimizer=optimizer)

In [None]:
# Evaluating perplexity before training
# The evaluate result is passed straight to math.exp, i.e. perplexity = exp(eval loss).
# NOTE(review): the collator is configured with mlm=True, so masks are presumably
# re-sampled on each pass and this number may vary slightly between runs — confirm.
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 64.02


In [None]:
# Training the model for 5 epochs, using the eval split as validation data
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x78f9066f1de0>

In [None]:
# Evaluating perplexity after training
# Same computation as before training (exp of the eval loss), so the two numbers are comparable.
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 11.95


#### Saving model

In [None]:
# Checkpointing BERT-base-MLM model
# (written to Drive so the weights persist beyond this Colab session)
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM')

# Checkpointing BERT-base-MLM tokenizer
# (saved to its own directory, separate from the model weights)
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer')

('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer/tokenizer.json')