## Ekhator Uwaila's Notebook

### BERT fine-tuning for document classification

In [1]:
import os
import re
import numpy as np
from sklearn.metrics import accuracy_score

import transformers
from transformers import BertTokenizer, BertModel

import torch
from torch import cuda
from tqdm import tqdm_notebook as tqdm
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cuda'

- use X.txt and YL1.txt

In [2]:
X = [line.strip() for line in open('X.txt').readlines()]
y = train_data = [int(line.strip()) for line in open('YL1.txt').readlines()]

len(X), len(y), max(y)

(46985, 46985, 6)

In [3]:
def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return text

In [4]:
cleaned_X = [clean_text(text_sequence) for text_sequence in X]

In [5]:
len(cleaned_X)

46985

In [6]:
len(set(y))

7

### An easy train/test split

In [7]:
train_X = cleaned_X[:46000]
train_y = np.array(y[:46000])
test_X = cleaned_X[46000:]
test_y = np.array(y[46000:])

len(train_X), len(train_y), len(test_X), len(test_y)

(46000, 46000, 985, 985)

### Torch Datasets

- takes in inputs and outputs/labels
- interfaces with tokenizer
- handles batching

In [8]:
class MultiLabelDataset(torch.utils.data.Dataset):

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

### Finetune BERT on the dataset

- first "layer" is a pre-trained BERT model
- you can add whatever layers you want after that

In [9]:
class BERTClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()

        self.l1 = BertModel.from_pretrained("bert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(768, 256)
        self.classifier = torch.nn.Linear(768, NUM_OUT)
#         self.dropout = torch.nn.Dropout(0.5)
        self.softmax = torch.nn.Softmax(dim=1)#u may not need to include softmax for multiclass as the cross entopy loss function has softmax included

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.Tanh()(pooler)
#         pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output

In [10]:
def loss_fn(outputs, targets):
    #return torch.nn.BCELoss()(outputs, targets)
    return torch.nn.CrossEntropyLoss()(outputs, targets.long())

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss

def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()#we may also remove this for the multiclass, as sigmoid is mostly used for binary
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)




    #     for data in tqdm(testing_loader):
    #         targets = data['targets']
    #         ids = data['ids'].to(device, dtype = torch.long)
    #         mask = data['mask'].to(device, dtype = torch.long)
    #         token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    #         outputs = model(ids, mask, token_type_ids)
    #          # Assuming `targs` is already converted to tensor during data loading
    #         fin_targets.append(targets)
    #         fin_outputs.append(outputs)
    # # Concatenate the lists of tensors along the batch dimension
    # fin_targets = torch.cat(fin_targets, dim=0)
    # fin_outputs = torch.cat(fin_outputs, dim=0)
    # return fin_outputs, fin_targets
            #outputs = torch.sigmoid(outputs).cpu().detach()#we may also remove this for the multiclass, as sigmoid is mostly used for binary
            #fin_outputs.extend(outputs)
            #fin_targets.extend(targets)
    #return torch.stack(fin_outputs), torch.stack(fin_targets)

### The Tokenizer

- Converts a raw string to the ids, masks, and token_type_ids

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# what does the tokenizer do?
print(train_X[5])

tokenizer.encode_plus(
            train_X[5],
            None,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            truncation=True,
            return_token_type_ids=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

 Objective In order to increase classification accuracy of tea category identification TCI system this paper proposed a novel approach. Method The proposed methods first extracted 64 color histogram to obtain color information and 16 wavelet packet entropy to obtain the texture information. With the aim of reducing the 80 features principal component analysis was harnessed. The reduced features were used as input to generalized eigenvalue proximal support vector machine GEPSVM . Winner takes all WTA was used to handle the multiclass problem. Two kernels were tested linear kernel and Radial basis function RBF kernel. Ten repetitions of 10 fold stratified cross validation technique were used to estimate the out of sample errors. We named our method as GEPSVM RBF WTA and GEPSVM WTA. Result The results showed that PCA reduced the 80 features to merely five with explaining 99.90 of total variance. The recall rate of GEPSVM RBF WTA achieved the highest overall recall rate of 97.9 . Conclusio



{'input_ids': [101, 7863, 1999, 2344, 2000, 3623, 5579, 10640, 1997, 5572, 4696, 8720, 22975, 2072, 2291, 2023, 3259, 3818, 1037, 3117, 3921, 1012, 4118, 1996, 3818, 4725, 2034, 15901, 4185, 3609, 2010, 3406, 13113, 2000, 6855, 3609, 2592, 1998, 2385, 4400, 7485, 14771, 23077, 2000, 6855, 1996, 14902, 2592, 1012, 2007, 1996, 6614, 1997, 8161, 1996, 3770, 2838, 4054, 6922, 4106, 2001, 17445, 2098, 1012, 1996, 4359, 2838, 2020, 2109, 2004, 7953, 2000, 18960, 1041, 29206, 10175, 5657, 4013, 9048, 9067, 2490, 9207, 3698, 16216, 4523, 2615, 2213, 1012, 3453, 3138, 2035, 21925, 2001, 2109, 2000, 5047, 1996, 4800, 26266, 3291, 1012, 2048, 16293, 2015, 2020, 7718, 7399, 16293, 1998, 15255, 3978, 3853, 21144, 2546, 16293, 1012, 2702, 23318, 2015, 1997, 2184, 10671, 2358, 8609, 7810, 2892, 27354, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Training setup

- hyperparameters
- setup dataset
- setup parameters
- setup dataloader

In [None]:
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 3
NUM_OUT = 7
LEARNING_RATE = 2e-05

#train_2y = torch.tensor(train_y, dtype=torch.long, device=device)
#test_2y = torch.tensor(test_y, dtype=torch.long, device=device)

#training_data = MultiLabelDataset(train_X, train_2y, tokenizer, MAX_LEN)
#test_data = MultiLabelDataset(test_X, test_2y, tokenizer, MAX_LEN)

training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

### Train,  Evaluate

- model.to -> send to GPU, if available (anything computed should be put onto the GPU)
- setup optimizer - could use Stochastic Gradient Descent, but ADAM tends to work better
- for each epoch, train, show the loss, evaluate on the test data

In [None]:
model = BERTClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    #targets = torch.max(targs, dim=1)
    print('arracy on test set {}'.format(accuracy_score(guesses.indices, targs)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/719 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 0, Loss:  1.3328043222427368


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/16 [00:00<?, ?it/s]

arracy on test set 0.8030456852791878


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/719 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 1, Loss:  1.289966344833374


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/16 [00:00<?, ?it/s]

arracy on test set 0.8050761421319796


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/719 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 2, Loss:  1.373113751411438


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/16 [00:00<?, ?it/s]

arracy on test set 0.8243654822335026


### Using 32 as batch size:

In [12]:
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 3
NUM_OUT = 7
LEARNING_RATE = 2e-05

#train_2y = torch.tensor(train_y, dtype=torch.long, device=device)
#test_2y = torch.tensor(test_y, dtype=torch.long, device=device)

#training_data = MultiLabelDataset(train_X, train_2y, tokenizer, MAX_LEN)
#test_data = MultiLabelDataset(test_X, test_2y, tokenizer, MAX_LEN)

training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

In [13]:
model = BERTClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    #targets = torch.max(targs, dim=1)
    print('arracy on test set {}'.format(accuracy_score(guesses.indices, targs)))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/1438 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 0, Loss:  1.3061795234680176


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/31 [00:00<?, ?it/s]

arracy on test set 0.798984771573604


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/1438 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 1, Loss:  1.2915265560150146


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/31 [00:00<?, ?it/s]

arracy on test set 0.8131979695431472


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/1438 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 2, Loss:  1.290584921836853


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/31 [00:00<?, ?it/s]

arracy on test set 0.8304568527918782


### Using 16 as batch size:

In [14]:
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
NUM_OUT = 7
LEARNING_RATE = 2e-05

#train_2y = torch.tensor(train_y, dtype=torch.long, device=device)
#test_2y = torch.tensor(test_y, dtype=torch.long, device=device)

#training_data = MultiLabelDataset(train_X, train_2y, tokenizer, MAX_LEN)
#test_data = MultiLabelDataset(test_X, test_2y, tokenizer, MAX_LEN)

training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

In [15]:
model = BERTClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    #targets = torch.max(targs, dim=1)
    print('arracy on test set {}'.format(accuracy_score(guesses.indices, targs)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 0, Loss:  1.4351199865341187


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/62 [00:00<?, ?it/s]

arracy on test set 0.7786802030456853


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 1, Loss:  1.352996587753296


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/62 [00:00<?, ?it/s]

arracy on test set 0.8081218274111676


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 2, Loss:  1.4243149757385254


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/62 [00:00<?, ?it/s]

arracy on test set 0.8071065989847716


In [None]:
# not needed for training or evaluation, but useful for mapping examples
labels = {
    0:'Computer Science',
    1:'Electrical Engineering',
    2:'Psychology',
    3:'Mechanical Engineering',
    4:'Civil Engineering',
    5:'Medical Science',
    6:'Biochemistry'
}

len(labels)

7

### Finetune ROBERTA on the dataset

In [None]:
from transformers import RobertaTokenizer, RobertaModel



In [None]:
class ROBERTAClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(ROBERTAClass, self).__init__()

        self.l1 = RobertaModel.from_pretrained('roberta-base')
#         self.pre_classifier = torch.nn.Linear(768, 256)
        self.classifier = torch.nn.Linear(768, NUM_OUT)
#         self.dropout = torch.nn.Dropout(0.5)
        self.softmax = torch.nn.Softmax(dim=1)#u may not need to include softmax for multiclass as the cross entopy loss function has softmax included

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.Tanh()(pooler)
#         pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output

In [None]:
def loss_fn(outputs, targets):
    #return torch.nn.BCELoss()(outputs, targets)
    return torch.nn.CrossEntropyLoss()(outputs, targets.long())

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss

def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()#we may also remove this for the multiclass, as sigmoid is mostly used for binary
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

### Tokenizer

In [None]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

### Training Setup

In [None]:
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 3
NUM_OUT = 7
LEARNING_RATE = 2e-05

#train_2y = torch.tensor(train_y, dtype=torch.long, device=device)
#test_2y = torch.tensor(test_y, dtype=torch.long, device=device)

#training_data = MultiLabelDataset(train_X, train_2y, tokenizer, MAX_LEN)
#test_data = MultiLabelDataset(test_X, test_2y, tokenizer, MAX_LEN)
training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

### Train/Evaluate

In [None]:
model = ROBERTAClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    #targets = torch.max(targs, dim=1)
    print('arracy on test set {}'.format(accuracy_score(guesses.indices, targs)))

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/719 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 0, Loss:  1.2651734352111816


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/16 [00:00<?, ?it/s]

arracy on test set 0.7796954314720812


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/719 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 1, Loss:  1.394126296043396


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/16 [00:00<?, ?it/s]

arracy on test set 0.7969543147208121


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/719 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


Epoch: 2, Loss:  1.389917254447937


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(testing_loader):


  0%|          | 0/16 [00:00<?, ?it/s]

arracy on test set 0.7928934010152284


### Questions

- What does the BERT Tokenizer do?<br>
Bert tokenizer uses the subword/wordpiece tokenization method to tokenize texts. It splits the words into smaller words or characters, and then uses special tokens like CLS, SEP, MASK, etc. to mark the beginning/end of sentences, indicate paddings and unknown tokens. It outputs ids/encoddings for the subwords, token_type ids which helps to segment sentence/sequence pairs, and an attention mask.
- What loss function did you use? Why did you choose that loss function?<br>
I used the Crossentropy loss function because our dataset involves multiclass classification
- (Edit) Try different batch sizes (e.g., 8 vs 16 vs 32). How does that affect your results?<br>
Roberta gave an accuracy of 79% for 64 batch size<br>
Bert gave an accuracy of 82.4% for 64 batch size<br>
Bert gave an accuracy of 83% for 32 batch size<br>
Bert gave an accuracy of 80% for 16 batch size<br>
The accuracy score reduced as i increased the batch size.

- Try another huggingface model (e.g., ELECTRA or RoBERTa) and compare results (you will need to change the name of the model and the tokenzier)
How do the results compare to BERT?<br>
I used RoBERTa and got an accuracy of 79%, whereas i got an accuracy of 82% using BERT. RoBERTa performed worse than BERT.
- What is the power of fine-tuning (as opposed to pre-training)?<br>
Fine tuning has faster training times, since we are training with a smaller more specific training set.<br>
Fine tuning is also better for specific/specialized tasks, it would perform better.<br>
Fine tuning is also resource efficient, in the sense that it requires less powerful hardwares or GPUs than pretraining