#### We finetune this LM : roberta-base-squad2 on dataset.txt having question and answer

In [2]:
from transformers import RobertaForQuestionAnswering, RobertaTokenizer, RobertaTokenizerFast
import pickle
import random
import torch
from torch.utils.data import Dataset

In [3]:
# Get data from LM Dataset.txt
questions = []
answers = []
with open('LM_dataset.txt', 'r') as file:
    lines = file.readlines()
    for i, line in enumerate(lines):
        if i % 3 == 0:
            questions.append(line.strip())
        elif i % 3 == 1:
            answers.append(line.strip()[8:])
        else:
            continue

In [4]:
# Read graph for context
with open('graph.pkl', 'rb') as f:
    graph = pickle.load(f)

In [5]:
# Helper function to create context 
def get_context_from_qa(graph, ques, ans):
    ques = ques.lower()
    ans = ans.lower()
    context = [ans]
    for node in graph.nodes():
        if node in ques and node is not None:
            context.append(node)
    random.shuffle(context)
    return ' '.join(context)

In [6]:
# Create dataset in standard QA format
dataset = []
for i in range(len(questions)):
    sample = {}
    sample['question'] = str(questions[i])
    sample['context'] = get_context_from_qa(graph, questions[i], answers[i])
    text = str(answers[i].lower())
    answer= {}
    answer['text'] = [text]
    answer['answer_start'] = [sample['context'].find(text)] 
    sample['answer'] = answer 
    dataset.append(sample)

In [7]:
dataset

[{'question': 'In which year was the movie "Interstellar" with Matthew McConaughey released?',
  'context': 'interstellar 2014 it matthew mcconaughey',
  'answer': {'text': ['2014'], 'answer_start': [13]}},
 {'question': 'Identify a 2015 film starring Jennifer Lawrence.',
  'context': 'jennifer lawrence the hunger games: mockingjay - part 2 2015 ti',
  'answer': {'text': ['the hunger games: mockingjay - part 2'],
   'answer_start': [18]}},
 {'question': 'Name a movie released in 2013 with Christian Bale in the lead role.',
  'context': 'it ti 2013 american hustle christian bale',
  'answer': {'text': ['american hustle'], 'answer_start': [11]}},
 {'question': 'Can you mention a 2016 release that shares its lead actor with "Deadpool"?',
  'context': 'deadpool 2016 deadpool it ti',
  'answer': {'text': ['deadpool'], 'answer_start': [0]}},
 {'question': 'Which film stars Leonardo DiCaprio and was released in 2015?',
  'context': '2015 rio leonardo dicaprio the revenant',
  'answer': {'text

In [8]:
# Generate custom dataset with tokenizers to get tensors
class CustomQADataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context = self.data[idx]['context']
        question = self.data[idx]['question']
        answer = self.data[idx]['answer']

        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            return_tensors='pt',
            max_length=128,
            truncation=True,
            padding='max_length',
            return_offsets_mapping=True
        )

        start_positions = answer['answer_start'][0]
        end_positions = answer['answer_start'][0] + len(answer['text'][0])

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'start_positions': start_positions,
            'end_positions': end_positions
        }

In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained('deepset/roberta-base-squad2')

In [10]:
final_custom_dataset = CustomQADataset(dataset, tokenizer)

### Dataset is ready .. lets begin finetuning LM

In [11]:
from torch.utils.data import DataLoader

In [12]:
model = RobertaForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')

In [13]:
train_loader = DataLoader(final_custom_dataset, batch_size=16, shuffle=True)

In [14]:
# iter_train_loader = iter(train_loader)

# # Get the next batch
# for batch in iter_train_loader:
#     for key in batch:
#         print(batch[key])

In [16]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.MSELoss()

num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: batch[key].to(model.device) for key in batch}
        outputs = model(**inputs)
        start_pred = torch.tensor(torch.argmax(outputs.start_logits, dim = 1).to(torch.float), requires_grad=True)
        end_pred = torch.tensor(torch.argmax(outputs.start_logits, dim = 1).to(torch.float), requires_grad=True)
        start_loss = criterion(start_pred, batch['start_positions'].to(torch.float))
        end_loss = criterion(end_pred, batch['end_positions'].to(torch.float))
        loss = start_loss + end_loss
        loss.backward(retain_graph=True)
        optimizer.step()
        print(loss)
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

  start_pred = torch.tensor(torch.argmax(outputs.start_logits, dim = 1).to(torch.float), requires_grad=True)
  end_pred = torch.tensor(torch.argmax(outputs.start_logits, dim = 1).to(torch.float), requires_grad=True)


tensor(1010.1875, grad_fn=<AddBackward0>)
tensor(1308.2500, grad_fn=<AddBackward0>)
tensor(1082.1250, grad_fn=<AddBackward0>)
tensor(827.5625, grad_fn=<AddBackward0>)
tensor(730.8125, grad_fn=<AddBackward0>)
tensor(681.6250, grad_fn=<AddBackward0>)
tensor(820.0625, grad_fn=<AddBackward0>)
tensor(1749.7500, grad_fn=<AddBackward0>)
tensor(948.7500, grad_fn=<AddBackward0>)
tensor(1548.7500, grad_fn=<AddBackward0>)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tensor(1303.1875, grad_fn=<AddBackward0>)
tensor(2364.3125, grad_fn=<AddBackward0>)
tensor(793.6875, grad_fn=<AddBackward0>)
tensor(1104.5000, grad_fn=<AddBackward0>)
tensor(1339.1250, grad_fn=<AddBackward0>)
tensor(600.8125, grad_fn=<AddBackward0>)
tensor(987.0625, grad_fn=<AddBackward0>)
tensor(680.3125, grad_fn=<AddBackward0>)
tensor(1108.1875, grad_fn=<AddBackward0>)
tensor(1384.9375, grad_fn=<AddBackward0>)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tensor(1704.1875, grad_fn

tensor(1693.1875, grad_fn=<AddBackward0>)
tensor(1395.1875, grad_fn=<AddBackward0>)
tensor(1059.9375, grad_fn=<AddBackward0>)
tensor(658.7500, grad_fn=<AddBackward0>)
tensor(1011.2500, grad_fn=<AddBackward0>)
tensor(1282.6875, grad_fn=<AddBackward0>)
tensor(1317.5625, grad_fn=<AddBackward0>)
tensor(642.2500, grad_fn=<AddBackward0>)
tensor(1614.8125, grad_fn=<AddBackward0>)
tensor(1104., grad_fn=<AddBackward0>)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tensor(672.9375, grad_fn=<AddBackward0>)
tensor(957.8125, grad_fn=<AddBackward0>)
tensor(1181.3125, grad_fn=<AddBackward0>)
tensor(1260.8125, grad_fn=<AddBackward0>)
tensor(839.7500, grad_fn=<AddBackward0>)
tensor(1790.6875, grad_fn=<AddBackward0>)
tensor(861.4375, grad_fn=<AddBackward0>)
tensor(733.2500, grad_fn=<AddBackward0>)
tensor(1456.5625, grad_fn=<AddBackward0>)
tensor(864.6875, grad_fn=<AddBackward0>)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tensor(1234., grad_fn=<AddBa

In [17]:
# Save finetuned model
model.save_pretrained('fine_tuned_roberta_squad2')
tokenizer.save_pretrained('fine_tuned_roberta_squad2')

('fine_tuned_roberta_squad2/tokenizer_config.json',
 'fine_tuned_roberta_squad2/special_tokens_map.json',
 'fine_tuned_roberta_squad2/vocab.json',
 'fine_tuned_roberta_squad2/merges.txt',
 'fine_tuned_roberta_squad2/added_tokens.json',
 'fine_tuned_roberta_squad2/tokenizer.json')