# RoBERTa base Fine-tuned on Custom Amazon Dataset - TRAINING FILE

In [1]:
import wandb
wandb.login() # add your wanddb api key when prompt

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import json
import pandas as pd
import collections
import os
import numpy as np
import random

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Load Dataset

In [3]:
path_meta = "/kaggle/input/cell-phones-final/Cell_Phones_and_Accessories_final.json"

with open(path_meta,'r') as file:
    data = json.load(file)
len(data)

10375

In [5]:
data[0]

{'id': 'B0015X7RSO',
 'title': 'HTC Sprint Touch P3450 Smartphone Black Swivel Belt Clip Holster',
 'context': 'It is a Generic product. Don apos t ever leave your phone behind again. By attaching our holster you can clip your phone onto your belt or just anywhere and feel free while your phone is secured. Brand new non OEM Custom made to fit your HTC Touch perfectly. Includes a swivel belt clip. Categories of product are Cell Phones Accessories Cases Holsters Sleeves',
 'qas': [{'answer': {'answer_start': 8, 'text': 'Generic'},
   'question': 'What type of product is it?'},
  {'answer': {'answer_start': 29, 'text': 'apos'},
   'question': 'What is the name of the name of the person who is responsible for leaving a phone behind?'},
  {'answer': {'answer_start': 89, 'text': 'holster'},
   'question': 'What is the name of the accessory that allows you to clip your phone onto your belt?'},
  {'answer': {'answer_start': 210, 'text': 'OEM'},
   'question': 'What is the name of the company t

## Tokenizing the dataset

In [6]:
from transformers import AutoTokenizer
import transformers

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [7]:
random.shuffle(data)
train_data = data[:int(0.8*len(data))]
val_data = data[int(0.8*len(data)):int(0.9*len(data))]
test_data = data[int(0.9*len(data)):]
len(train_data),len(val_data),len(test_data)

(8300, 1037, 1038)

In [8]:
# this function extracts the contexts, questions and answers (text and start position) from the dataset. also adds answers
# end position to the answer dictionary
def data_prep(data):
    contexts = []
    questions = []
    answers = []
    for prod in data:
        context = prod['context']
        for i in range(len(prod['qas'])):
            question = prod['qas'][i]['question']
            answer = prod['qas'][i]['answer']
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
    for cont,ans in zip(contexts,answers):
        ans_txt = ans['text']
        ans_start = int(ans['answer_start'])
        ans_end = int(ans_start + len(ans_txt))
        if cont[ans_start:ans_end] == ans_txt:
            ans['answer_end'] = ans_end
    return contexts,questions,answers

train_contexts,train_questions,train_answers = data_prep(train_data)
val_contexts,val_questions,val_answers = data_prep(val_data)
test_contexts,test_questions,test_answers = data_prep(test_data)

In [9]:
# this function tokenizes context+questions sepreately and answers spans seprately.
def encode_prep(questions,contexts,answers):
    encode = tokenizer(contexts,questions,truncation=True,padding=True)
    start_pos = []
    end_pos = []
    for i in range(len(answers)):
        start_pos.append(encode.char_to_token(i, answers[i]['answer_start']))
        end_pos.append(encode.char_to_token(i, answers[i]['answer_end']))
        if start_pos[-1] is None:
            start_pos[-1] = tokenizer.model_max_length
        c = 1
        while end_pos[-1] is None:
            end_pos[-1] = encode.char_to_token(i, answers[i]['answer_end']-c)
            c +=1
    encode.update({'start_positions': start_pos, 'end_positions': end_pos})
    return encode

train_embedding = encode_prep(train_questions,train_contexts,train_answers)
val_embedding = encode_prep(val_questions,val_contexts,val_answers)
test_embedding = encode_prep(test_questions,test_contexts,test_answers)

In [10]:
print(len(train_embedding['input_ids']),len(val_embedding['input_ids']),len(test_embedding['input_ids']))

68710 8554 8478


## RoBERTa for Question Answering

In [11]:
# making dataset instance for train, val and test to feed the model
class prodDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = prodDataset(train_embedding)
val_dataset = prodDataset(val_embedding)
test_dataset = prodDataset(test_embedding)

In [12]:
from transformers import RobertaForQuestionAnswering

model = RobertaForQuestionAnswering.from_pretrained("roberta-base")

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

### Training

In [13]:
wandb.init(
    # set the wandb project where this run will be logged
    project="RoBERTa for Question Answering",
    # track hyperparameters and run metadata
    config={
    "learning_rate": 3e-5,
    "architecture": "RoBERTa",
    "dataset": "Custom Amazon Dataset",
    "epochs": 5
    }
)



[34m[1mwandb[0m: Currently logged in as: [33mvinayakpanchal99[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [15]:
# this fuction calculates f1 score
def f1_score_metric(true_vals, pred_vals):
    common = collections.Counter(true_vals) & collections.Counter(pred_vals)
    num_same = sum(common.values())
    if len(true_vals) == 0 or len(pred_vals) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(true_vals == pred_vals)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_vals)
    recall = 1.0 * num_same / len(true_vals)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [16]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5,weight_decay=0.01)
epochs = 5
cs_1=-1
cs_2=-1
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader,leave=True)
    losses,acc = [],[]
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        start = batch['start_positions'].to(device)
        end = batch['end_positions'].to(device)
        atn_mask = batch['attention_mask'].to(device)

        op = model(input_ids, attention_mask=atn_mask,start_positions=start,end_positions=end)

        loss = op[0]
        loss.backward()
        optimizer.step()
        start_pred = torch.argmax(op['start_logits'], dim=1)
        end_pred = torch.argmax(op['end_logits'], dim=1)
        
        ac_st = accuracy_score(start.cpu().flatten(),start_pred.cpu().flatten())
        acc.append(ac_st)
        ac_en = accuracy_score(end.cpu().flatten(),end_pred.cpu().flatten())
        acc.append(ac_en)

        losses.append(loss.item())
        wandb.define_metric("custom_step_1")
        cs_1+=1
        log_dic1 = {'Train Accuracy':(ac_st+ac_en)/2,'custom_step_1':cs_1,'Training Loss':loss.item()}
        wandb.log(data=log_dic1)
        loop.set_description(f'Epoch {epoch}')
    print(f"Train Loss: {np.array(losses).mean():.4f} | Train Accuracy: {np.array(acc).mean():.4f}")
    wandb.log({'Train Accuracy per epoch':np.array(acc).mean(),'Training Loss per epoch':np.array(losses).mean()})
    
    acc_val = []
    val_loss = []
    f1_micro_avg = []
    f1_macro_avg = []
    EM_ov = []
    y_test_start = []
    y_pred_start = []
    y_test_end = []
    y_pred_end = []
    
    model.eval()

    val_loader = DataLoader(val_dataset)

    loop = tqdm(val_loader)
    for batch in loop:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            start = batch['start_positions'].to(device)
            end = batch['end_positions'].to(device)
            atn_mask = batch['attention_mask'].to(device)

            op_eval = model(input_ids, attention_mask=atn_mask,start_positions=start,end_positions=end)
            
            y_test_start.append(start.cpu())
            y_test_end.append(end.cpu())

            start_pred = torch.argmax(op_eval['start_logits'], dim=1)
            end_pred = torch.argmax(op_eval['end_logits'], dim=1)
            
            v_ls = op_eval[0]
            val_loss.append(v_ls.item())

            
            ac_st_val = accuracy_score(start.cpu().flatten(),start_pred.cpu().flatten())
            acc_val.append(ac_st_val)
            ac_en_val = accuracy_score(end.cpu().flatten(),end_pred.cpu().flatten())
            acc_val.append(ac_en_val)
            
            y_pred_start.append(start_pred.cpu())
            y_pred_end.append(end_pred.cpu()) 
            
            wandb.define_metric("custom_step_2")
            cs_2+=1
            log_dic2 = {'Validation Accuracy':(ac_st_val+ac_en_val)/2,"custom_step_2":cs_2,
                        'Validation Loss':v_ls.cpu()}
            wandb.log(data=log_dic2)
    
    answers_pred_ls = []
    answers_true_ls = []
    f1_score = []
    #computing f1 scores per epoch here
    for start_pr,end_pr,start_t,end_t,i in zip(y_pred_start,y_pred_end,y_test_start,y_test_end,range(len(y_test_start))):
        pred_se = val_embedding['input_ids'][i][start_pr:end_pr+1]
        true_se = val_embedding['input_ids'][i][start_t:end_t+1]
        answers_pred_ls.append(tokenizer.decode(pred_se))
        answers_true_ls.append(tokenizer.decode(true_se))
        f1_score.append(f1_score_metric(true_se,pred_se))

    EM = 0
    # computing Exact match per epoch here
    for pred_val, true_val in zip(answers_pred_ls,answers_true_ls):
        if pred_val == true_val:
            EM+=1
    EM_ov = EM/len(answers_true_ls)
    val_loss_cl = [x for x in val_loss if str(x) != 'nan']
    
    wandb.log({'Validation Accuracy per epoch':np.array(acc_val).mean(),'Validation Loss per epoch':np.array(val_loss_cl).mean(),
               'Exact Match per epoch':EM_ov, 'F1 score per epoch': np.array(f1_score).mean()})
    
    print(f"Val Accuracy: {np.array(acc_val).mean():.4f} | Val Loss: {np.array(val_loss_cl).mean():.4f} | Exact Match: {EM_ov:.2f} \
    | F1 score: {np.array(f1_score).mean():.4f}")
    
    if epoch>=2:
        torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'Train Accuracy per epoch':np.array(acc).mean(),
            'Training Loss per epoch':np.array(losses).mean(),
            'Validation Accuracy per epoch':np.array(acc_val).mean(),
            'Validation Loss per epoch':np.array(val_loss_cl).mean(),
            'Exact Match per epoch':EM_ov, 
            'F1 score per epoch': np.array(f1_score).mean()}, os.path.join('/kaggle/working', 'epoch-{}.pt'.format(epoch)))


Epoch 0: 100%|██████████| 8589/8589 [1:02:49<00:00,  2.28it/s]


Train Loss: 0.6726 | Train Accuracy: 0.8078


100%|██████████| 8554/8554 [03:27<00:00, 41.31it/s]


Val Accuracy: 0.8747 | Val Loss: 0.4155 | Exact Match: 0.81     | F1 score: 0.9029


Epoch 1: 100%|██████████| 8589/8589 [1:02:52<00:00,  2.28it/s]


Train Loss: 0.3692 | Train Accuracy: 0.8868


100%|██████████| 8554/8554 [03:26<00:00, 41.44it/s]


Val Accuracy: 0.8916 | Val Loss: 0.3610 | Exact Match: 0.83     | F1 score: 0.9208


Epoch 2: 100%|██████████| 8589/8589 [1:02:47<00:00,  2.28it/s]


Train Loss: 0.2771 | Train Accuracy: 0.9117


100%|██████████| 8554/8554 [03:26<00:00, 41.46it/s]


Val Accuracy: 0.9014 | Val Loss: 0.3452 | Exact Match: 0.85     | F1 score: 0.9277


Epoch 3: 100%|██████████| 8589/8589 [1:02:49<00:00,  2.28it/s]


Train Loss: 0.2218 | Train Accuracy: 0.9278


100%|██████████| 8554/8554 [03:26<00:00, 41.37it/s]


Val Accuracy: 0.8944 | Val Loss: 0.3955 | Exact Match: 0.83     | F1 score: 0.9138


Epoch 4: 100%|██████████| 8589/8589 [1:02:47<00:00,  2.28it/s]


Train Loss: 0.2007 | Train Accuracy: 0.9349


100%|██████████| 8554/8554 [03:27<00:00, 41.22it/s]


Val Accuracy: 0.9069 | Val Loss: 0.3519 | Exact Match: 0.86     | F1 score: 0.9307


### Model save

In [17]:
model_path = '/kaggle/working/models/RoBERTa-custom-amazon-cellphones'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/kaggle/working/models/RoBERTa-custom-amazon-cellphones/tokenizer_config.json',
 '/kaggle/working/models/RoBERTa-custom-amazon-cellphones/special_tokens_map.json',
 '/kaggle/working/models/RoBERTa-custom-amazon-cellphones/vocab.json',
 '/kaggle/working/models/RoBERTa-custom-amazon-cellphones/merges.txt',
 '/kaggle/working/models/RoBERTa-custom-amazon-cellphones/added_tokens.json',
 '/kaggle/working/models/RoBERTa-custom-amazon-cellphones/tokenizer.json')

### Test

In [18]:
model.eval()

test_loader = DataLoader(test_dataset)

loop = tqdm(test_loader)

acc = []
y_test_start = []
y_pred_start = []
y_test_end = []
y_pred_end = []

for batch in loop:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        start = batch['start_positions'].to(device)
        end = batch['end_positions'].to(device)
        atn_mask = batch['attention_mask'].to(device)
        
        op = model(input_ids, attention_mask=atn_mask)
        
        y_test_start.append(start.item())
        y_test_end.append(end.item())
        
        start_pred = torch.argmax(op['start_logits'], dim=1)
        end_pred = torch.argmax(op['end_logits'], dim=1)
        
        y_pred_start.append(start_pred.item())
        y_pred_end.append(end_pred.item()) 

        acc.append(((start_pred == start).sum()/len(start_pred)).item())
        acc.append(((end_pred == end).sum()/len(end_pred)).item())

test_acc = np.array(acc).mean()



100%|██████████| 8478/8478 [03:03<00:00, 46.08it/s]


### Performance Metrics

### F1 Score and Accuracies

In [19]:
answers_pred_lst = []
answers_true_lst = []
f1_score_test = []

for start_pr,end_pr,start_t,end_t,i in zip(y_pred_start,y_pred_end,y_test_start,y_test_end,range(len(y_test_start))):
    pred_se_test = test_embedding['input_ids'][i][start_pr:end_pr+1]
    true_se_test = test_embedding['input_ids'][i][start_t:end_t+1]
    answers_pred_lst.append(tokenizer.decode(pred_se_test))
    answers_true_lst.append(tokenizer.decode(true_se_test))
    f1_score_test.append(f1_score_metric(true_se_test,pred_se_test))
    

In [20]:
print("Test Accuracy: ", test_acc*100)
print("F1 score test set: ", np.array(f1_score_test).mean())

Test Accuracy:  91.12998348667138
F1 score test set:  0.9326065683066194


### Exact Match

In [21]:
EM = 0
for pred_val, true_val in zip(answers_pred_lst,answers_true_lst):
    if pred_val == true_val:
        EM+=1
EM_ov = EM/len(answers_true_lst)
print("Exact Match: ", EM_ov)

Exact Match:  0.8627034677990092


## Example Predictions

In [23]:
print("T/F\tstart\tend\n")
for i in range(len(y_test_start[10:20])):
    print(f"true\t{y_test_start[10+i]}\t{y_test_end[10+i]}\n"
          f"pred\t{y_pred_start[10+i]}\t{y_pred_end[10+i]}\n")

T/F	start	end

true	147	150
pred	147	150

true	155	156
pred	155	156

true	169	171
pred	169	171

true	184	193
pred	184	193

true	4	4
pred	4	4

true	7	9
pred	7	9

true	22	27
pred	22	27

true	4	5
pred	34	36

true	8	9
pred	8	9

true	34	36
pred	34	36

