# DistilBERT-base Baseline Fine-tuned on Custom Amazon Dataset - TRAINING FILE

In [1]:
!pip install transformers
!pip install evaluate

[0mCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import json
import pandas as pd
import os
import numpy as np
import random
from sklearn.metrics import f1_score

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Load Dataset

In [3]:
path_meta = "/kaggle/input/cell-phones-final/Cell_Phones_and_Accessories_final.json"

with open(path_meta,'r') as file:
    data = json.load(file)
len(data)

10375

In [4]:
data[1]

{'id': 'B00579L6K2',
 'title': 'EMPIRE Hot Pink and Black Zebra Stripes Rubberized Design Hard Case Cover + Car Charger (CLA) for Verizon HTC Droid Incredible 2',
 'context': 'It is a EMPIRE product. The HTC Droid Incredible Hot Pink and Black Zebra Stripes case cover provides excellent protection from dust scratches and unwanted blemishes. The HTC Droid Incredible Hot Pink and Black Zebra Stripes case cover also allows for full functionality of your phone with openings for all buttons ports jaHZs and speakers. The HTC Droid Incredible car charger will charge your phone and provides unlimited talk time while in the car. Features an enhanced internal circuitry chip to manage charging status as well as an integrated electrical fuse to prevent damage to your HTC Droid Incredible and it s battery from over and under charging. Safe case removal tool included. b EMPIRE TM is a registered trademark with the USPTO. b Premium high quality snap on hard cover case protector. Made to fit phone per

## Tokenizing the dataset

In [5]:
from transformers import AutoTokenizer
import transformers

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
random.shuffle(data)
train_data = data[:int(0.8*len(data))]
val_data = data[int(0.8*len(data)):]
len(train_data),len(val_data)

(8300, 2075)

In [7]:
# this function extracts the contexts, questions and answers (text and start position) from the dataset. also adds answers
# end position to the answer dictionary
def data_prep(data):
    contexts = []
    questions = []
    answers = []
    for prod in data:
        context = prod['context']
        for i in range(len(prod['qas'])):
            question = prod['qas'][i]['question']
            answer = prod['qas'][i]['answer']
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
    for cont,ans in zip(contexts,answers):
        ans_txt = ans['text']
        ans_start = int(ans['answer_start'])
        ans_end = int(ans_start + len(ans_txt))
        if cont[ans_start:ans_end] == ans_txt:
            ans['answer_end'] = ans_end
    return contexts,questions,answers

train_contexts,train_questions,train_answers = data_prep(train_data)
val_contexts,val_questions,val_answers = data_prep(val_data)

In [8]:
# this function tokenizes context+questions sepreately and answers spans seprately.
def encode_prep(questions,contexts,answers):
    encode = tokenizer(contexts,questions,truncation=True,padding=True)
    start_pos = []
    end_pos = []
    for i in range(len(answers)):
        start_pos.append(encode.char_to_token(i, answers[i]['answer_start']))
        end_pos.append(encode.char_to_token(i, answers[i]['answer_end']))
        if start_pos[-1] is None:
            start_pos[-1] = tokenizer.model_max_length
        c = 1
        while end_pos[-1] is None:
            end_pos[-1] = encode.char_to_token(i, answers[i]['answer_end']-c)
            c +=1
    encode.update({'start_positions': start_pos, 'end_positions': end_pos})
    return encode

train_embedding = encode_prep(train_questions,train_contexts,train_answers)
val_embedding = encode_prep(val_questions,val_contexts,val_answers)

In [13]:
print(len(train_embedding['input_ids']),len(val_embedding['input_ids']))

68801 16941


## Distil BERT baseline for Question Answering

In [14]:
# making dataset instance for train, val and test to feed the model
class prodDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = prodDataset(train_embedding)
val_dataset = prodDataset(val_embedding)

In [15]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to

### Training

In [16]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
epochs = 5
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader,leave=True)
    losses,acc = [],[]
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        start = batch['start_positions'].to(device)
        end = batch['end_positions'].to(device)
        atn_mask = batch['attention_mask'].to(device)

        op = model(input_ids, attention_mask=atn_mask,start_positions=start,end_positions=end)

        loss = op[0]
        loss.backward()
        optim.step()
        start_pred = torch.argmax(op['start_logits'], dim=1)
        end_pred = torch.argmax(op['end_logits'], dim=1)

        acc.append(((start_pred == start).sum()/len(start_pred)).item())
        acc.append(((end_pred == end).sum()/len(end_pred)).item())

        losses.append(loss.item())
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    print(f"Train Loss: {np.array(losses).mean():.4f} | Train Accuracy: {np.array(acc).mean():.4f}")

Epoch 0: 100%|██████████| 2151/2151 [30:11<00:00,  1.19it/s, loss=0.00141]


Train Loss: 1.0954 | Train Accuracy: 0.7083


Epoch 1: 100%|██████████| 2151/2151 [30:10<00:00,  1.19it/s, loss=0.288] 


Train Loss: 0.4987 | Train Accuracy: 0.8517


Epoch 2: 100%|██████████| 2151/2151 [30:10<00:00,  1.19it/s, loss=0.00217]


Train Loss: 0.3446 | Train Accuracy: 0.8922


Epoch 3: 100%|██████████| 2151/2151 [30:09<00:00,  1.19it/s, loss=0.00183]


Train Loss: 0.2475 | Train Accuracy: 0.9214


Epoch 4: 100%|██████████| 2151/2151 [30:10<00:00,  1.19it/s, loss=0.000289]

Train Loss: 0.1869 | Train Accuracy: 0.9381





### Model save

In [17]:
model_path = '/kaggle/working/models/distilbert-custom-amazon-cellphones'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/kaggle/working/models/distilbert-custom-amazon-cellphones/tokenizer_config.json',
 '/kaggle/working/models/distilbert-custom-amazon-cellphones/special_tokens_map.json',
 '/kaggle/working/models/distilbert-custom-amazon-cellphones/vocab.txt',
 '/kaggle/working/models/distilbert-custom-amazon-cellphones/added_tokens.json',
 '/kaggle/working/models/distilbert-custom-amazon-cellphones/tokenizer.json')

### Validation

In [19]:
model.eval()

val_loader = DataLoader(val_dataset)

loop = tqdm(val_loader)

acc = []
y_test_start = []
y_pred_start = []
y_test_end = []
y_pred_end = []

for batch in loop:
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        start = batch['start_positions'].to(device)
        end = batch['end_positions'].to(device)
        atn_mask = batch['attention_mask'].to(device)
        
        op = model(input_ids, attention_mask=atn_mask)
        
        y_test_start.append(start.item())
        y_test_end.append(end.item())
        
        start_pred = torch.argmax(op['start_logits'], dim=1)
        end_pred = torch.argmax(op['end_logits'], dim=1)
        
        y_pred_start.append(start_pred.item())
        y_pred_end.append(end_pred.item()) 

        acc.append(((start_pred == start).sum()/len(start_pred)).item())
        acc.append(((end_pred == end).sum()/len(end_pred)).item())

f1_start_micro = f1_score(y_true=y_test_start,y_pred=y_pred_start,average='micro')
f1_end_micro = f1_score(y_true=y_test_end,y_pred=y_pred_end,average='micro')
f1_start_macro = f1_score(y_true=y_test_start,y_pred=y_pred_start,average='macro')
f1_end_macro = f1_score(y_true=y_test_end,y_pred=y_pred_end,average='macro')
test_acc = np.array(acc).mean()

100%|██████████| 16941/16941 [03:11<00:00, 88.37it/s]


### Performance Metrics

### F1 Score and Accuracies

In [20]:
print("Val Accuracy: ", test_acc*100)
print("F1 score start position micro: ", f1_start_micro)
print("F1 score end position micro: ", f1_end_micro)
print("F1 score start position macro: ", f1_start_macro)
print("F1 score end position macro: ", f1_end_macro)

Test Accuracy:  88.06741042441413
F1 score start position micro:  0.8853668614603625
F1 score end position micro:  0.8759813470279204
F1 score start position macro:  0.8355015309235457
F1 score end position macro:  0.8272682024211889


In [35]:
answers_pred_ls = []
answers_true_ls = []
for start_pr,end_pr,start_t,end_t,i in zip(y_pred_start,y_pred_end,y_test_start,y_test_end,range(len(y_test_start))):
    answers_pred_ls.append(tokenizer.decode(val_embedding['input_ids'][i][start_pr:end_pr+1]))
    answers_true_ls.append(tokenizer.decode(val_embedding['input_ids'][i][start_t:end_t+1]))

### Exact Match

In [37]:
EM = 0
for pred_val, true_val in zip(answers_pred_ls,answers_true_ls):
    if pred_val == true_val:
        EM+=1
EM_ov = EM/len(answers_true_ls)
print("Exact Match: ", EM_ov)

Exact Match:  0.8216752257836019


### Test Examples

In [41]:
print("T/F\tstart\tend\n")
for i in range(len(y_test_start[:6])):
    print(f"true\t{y_test_start[i]}\t{y_test_end[i]}\n"
          f"pred\t{y_pred_start[i]}\t{y_pred_end[i]}\n")

T/F	start	end

true	4	4
pred	4	4

true	16	18
pred	16	18

true	28	31
pred	28	31

true	45	45
pred	45	45

true	59	59
pred	59	60

true	76	85
pred	76	79



In [42]:
val_answers[-1]

{'answer_start': 878,
 'text': 'Cell Phones Accessories Cases Holsters Sleeves',
 'answer_end': 924}

In [43]:
answer_tokens = val_embedding['input_ids'][-1][start_pred:end_pred+1]
tokenizer.decode(answer_tokens)

'cell phones accessories cases holsters sleeves'