## BART BASE FINE-TUNING ON CUSTOM DATASET - TRAINING FILE

In [1]:
import wandb
wandb.login() # add your api key

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
import json
import pandas as pd
import os
import gzip
import numpy as np
import random
import collections
from transformers import AutoTokenizer

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


## Load Dataset

In [4]:
path_meta = "/kaggle/input/cell-phones-final/Cell_Phones_and_Accessories_final.json"

with open(path_meta,'r') as file:
    data = json.load(file)
len(data)

10375

In [5]:
data[0]

{'id': 'B0015X7RSO',
 'title': 'HTC Sprint Touch P3450 Smartphone Black Swivel Belt Clip Holster',
 'context': 'It is a Generic product. Don apos t ever leave your phone behind again. By attaching our holster you can clip your phone onto your belt or just anywhere and feel free while your phone is secured. Brand new non OEM Custom made to fit your HTC Touch perfectly. Includes a swivel belt clip. Categories of product are Cell Phones Accessories Cases Holsters Sleeves',
 'qas': [{'answer': {'answer_start': 8, 'text': 'Generic'},
   'question': 'What type of product is it?'},
  {'answer': {'answer_start': 29, 'text': 'apos'},
   'question': 'What is the name of the name of the person who is responsible for leaving a phone behind?'},
  {'answer': {'answer_start': 89, 'text': 'holster'},
   'question': 'What is the name of the accessory that allows you to clip your phone onto your belt?'},
  {'answer': {'answer_start': 210, 'text': 'OEM'},
   'question': 'What is the name of the company t

In [6]:
tokenizer= AutoTokenizer.from_pretrained("facebook/bart-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
data_short = data[:5000]
random.shuffle(data)
train_data = data_short[:int(0.8*len(data_short))]
val_data = data_short[int(0.8*len(data_short)):int(0.996*len(data_short))]
test_data = data_short[int(0.996*len(data_short)):]
len(train_data),len(val_data),len(test_data)

(4000, 980, 20)

In [14]:
# storing test data seprately
with open("/kaggle/working/Cell_Phones_and_Accessories_testflie.json",'w') as file:
    json.dump(test_data,file,indent=4)

## Tokeinzing the data

In [15]:
# function extracts answers, questions, and contexts from the dataset
def data_prep(data):
    contexts = []
    questions = []
    answers = []
    for prod in data:
        context = prod['context']
        for i in range(len(prod['qas'])):
            question = prod['qas'][i]['question']
            answer = prod['qas'][i]['answer']['text']
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
    return contexts,questions,answers

train_contexts,train_questions,train_answers = data_prep(train_data)
val_contexts,val_questions,val_answers = data_prep(val_data)

In [16]:
# function tokenizes the questions and contexts together and labels seprately
def encode_prep(questions,contexts,answers):
    encode_qa = tokenizer(questions,contexts,truncation=True,padding="max_length",max_length = 512,pad_to_max_length=True,
                          add_special_tokens=True)
    encode_ans = tokenizer(answers,truncation=True,padding="max_length",max_length = 25,pad_to_max_length=True,
                          add_special_tokens=True)
    labels = encode_ans["input_ids"]
    encode_qa.update({'labels':labels,"decoder_attention_mask":encode_ans["attention_mask"]})

    return encode_qa

train_embedding = encode_prep(train_questions,train_contexts,train_answers)
val_embedding = encode_prep(val_questions,val_contexts,val_answers)

In [17]:
print(len(train_embedding['input_ids']),len(val_embedding['input_ids']))

33250 8291


## BART base of Question Answering

In [18]:
# creating instance of dataset to feed the model
class prodDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = prodDataset(train_embedding)
val_dataset = prodDataset(val_embedding)

In [19]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,Seq2SeqTrainingArguments, Seq2SeqTrainer

In [23]:
batch_size = 8
model_name = "bart-base-qa"
model_dir = "/kaggle/working/model"

args = Seq2SeqTrainingArguments(
    model_dir,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy='epoch',
    report_to="wandb"
)

In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

## Training

In [25]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [26]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mvinayakpanchal99[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.5924,0.069568
2,0.0736,0.058693
3,0.06,0.055943




TrainOutput(global_step=6237, training_loss=0.2420275273643397, metrics={'train_runtime': 4325.1977, 'train_samples_per_second': 23.063, 'train_steps_per_second': 1.442, 'total_flos': 3.041060585472e+16, 'train_loss': 0.2420275273643397, 'epoch': 3.0})

## Model Save

In [27]:
model.save_pretrained("/kaggle/working/model/bartbase")
tokenizer.save_pretrained("/kaggle/working/model/bartbase")

('/kaggle/working/model/bartbase/tokenizer_config.json',
 '/kaggle/working/model/bartbase/special_tokens_map.json',
 '/kaggle/working/model/bartbase/vocab.json',
 '/kaggle/working/model/bartbase/merges.txt',
 '/kaggle/working/model/bartbase/added_tokens.json',
 '/kaggle/working/model/bartbase/tokenizer.json')