In [None]:
# `base` is the folder that holds the BanglaRQA data files.
# The validation file is opened as os.path.join(base, validation_file_name),
# so point `base` at the directory where you uploaded/saved the data.
base = '../input/banglarqa'       # sample value

# `test_file_name` is the file name of the BanglaRQA test set.
# NOTE(review): nothing in the visible cells reads this variable — confirm it is still needed.
test_file_name = 'Test.json' # sample value

# `validation_file_name` is the file name of the BanglaRQA validation set;
# it is joined with `base` to locate the file.
validation_file_name = 'Validation.json'      # sample value


# `model_name` is handed to torch.load() as-is (it is NOT joined with `base`),
# so it must be the full path to the saved checkpoint you want to evaluate,
# not just a bare file name.
model_name = '../input/epoch2/model_weights_epoch_5.pth'         # sample value

In [None]:
!pip install transformers[sentencepiece]

In [None]:
pip install git+https://github.com/csebuetnlp/normalizer

In [None]:
import torch

In [None]:
# Instantiate the pretrained BanglaT5 seq2seq model together with its
# (slow, sentencepiece-based) tokenizer. Both come from the same hub id.

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from normalizer import normalize # pip install git+https://github.com/csebuetnlp/normalizer

PRETRAINED_NAME = "csebuetnlp/banglat5"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_NAME)

In [None]:
# Restore the saved checkpoint (weights + optimizer state) into `model`,
# move the model to the available device and put it in evaluation mode.
try:
    from transformers import AdamW
except ImportError:
    # transformers deprecated its own AdamW in favour of the PyTorch one;
    # fall back so the cell still runs where the export is no longer present.
    from torch.optim import AdamW

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The optimizer is rebuilt only so that its saved state can be restored from
# the checkpoint; no optimisation step is taken in the cells visible here.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Load the checkpoint for evaluation. map_location remaps the stored tensors
# onto the detected device, so a checkpoint written on a GPU machine can still
# be opened when only a CPU is available.
checkpoint = torch.load(model_name, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# move model over to detected device
model.to(device)

# Inference mode (disables dropout etc.). The trailing print() keeps the
# return value of model.eval() from being the cell's last expression, so the
# module repr is not echoed as cell output.
model.eval()
print()

In [None]:
import numpy

def get_answer(context):
    """Run the module-level ``model`` on ``context`` and return the decoded text.

    The input is tokenized with the module-level ``tokenizer`` (truncated and
    padded to 1024 tokens) and moved to ``device``. Generation uses
    ``num_beams=1`` and ``max_length=256``. Every generated sequence is
    decoded with special tokens stripped, and the decoded pieces are
    concatenated without a separator.

    Args:
        context: Text passed to the tokenizer as the model input.

    Returns:
        str: The concatenation of all decoded generated sequences.
    """
    encoded = tokenizer(
        context,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=1024,
        return_attention_mask=True,
        return_tensors="pt",
    ).to(device)

    # Decoding settings kept together so they are easy to read at a glance.
    generation_settings = dict(
        num_beams=1,
        max_length=256,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True,
    )
    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_settings,
    )

    pieces = []
    for sequence in output_ids:
        pieces.append(
            tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
    return "".join(pieces)

### BanglaRQA Validation

In [None]:
# Load the BanglaRQA validation file and keep the list stored under its
# top-level 'data' key in `data_val`.

import json
import os

# The context manager closes the handle once parsing is done (the file was
# previously left open). The encoding is given explicitly because the file
# holds Bangla text and the platform default encoding is not always UTF-8.
with open(os.path.join(base, validation_file_name), encoding='utf-8') as f:
    data_val = json.load(f)['data']

In [None]:
# Build parallel lists of model inputs (`context_val`) and reference targets
# (`question_answer_pairs_val`). Every question yields two samples:
#   1. input 'c: <context>'               -> target 'q: <question> a: <answer>'
#   2. input 'a: <answer>। c: <context>'  -> target 'q: <question>'
context_val = []
question_answer_pairs_val = []

for passage in data_val:
    for qa_item in passage['qas']:
        passage_text = normalize(passage['context'])
        question_text = normalize(qa_item['question_text'])
        # Only the first listed answer text is used.
        answer_text = normalize(qa_item['answers']['answer_text'][0])

        context_val.append('c: ' + passage_text)
        question_answer_pairs_val.append('q: ' + question_text + ' a: ' + answer_text)

        context_val.append('a: ' + answer_text + '। c: ' + passage_text)
        question_answer_pairs_val.append('q: ' + question_text)
    # NOTE(review): this unconditional break ends the outer loop after the
    # first passage, so only that passage's questions are collected. Looks
    # like a leftover quick-test limit — confirm before relying on the counts.
    break

In [None]:
# Sanity check: both lists are filled in lockstep, so the two counts printed
# here (inputs first, then targets) should match.
for collected in (context_val, question_answer_pairs_val):
    print(len(collected))

In [None]:
# Generate a prediction for every prepared validation input and print the
# input, its reference target and the model output, separated by a blank line.
sample_total = len(context_val)
print('No of sample in Validation set: ', sample_total)

# Gradients are not needed for inference.
with torch.no_grad():
    for idx, model_input in enumerate(context_val):
        pred = get_answer(model_input)
        print(model_input)
        print(question_answer_pairs_val[idx])
        print(pred)
        print()

In [None]:
# Read the sports article CSV, discard rows with missing values and the two
# columns that are not needed, then turn each article into a 'c: <text>'
# model input.
import pandas

df = (
    pandas.read_csv('/kaggle/input/sports/Sports_40k_kom.csv')
    .dropna()
    .drop(['Unnamed: 0', 'class'], axis=1)
)

articles = ['c: ' + normalize(article_text) for article_text in df['article']]

# Show the first two prepared inputs as a quick visual check.
print(articles[0])
print()
print(articles[1])

In [None]:
# Generate one output string per prepared sports article and collect the
# results, in article order, in `qa_pairs`.
sample_total = len(articles)
# The label names the set actually being counted here (the sports articles,
# not the validation set).
print('No of sample in Sports article set: ', sample_total)
qa_pairs = []

# Gradients are not needed for inference.
with torch.no_grad():
    for article in articles:
        # Append as we go so results produced so far remain in `qa_pairs`
        # if the run is interrupted part-way.
        qa_pairs.append(get_answer(article))

In [None]:
# Spot-check: first prepared article followed by the text generated for it.
for column in (articles, qa_pairs):
    print(column[0])

In [None]:
# Pair every prepared article with its generated text and write the table to
# 'submission3.csv' without the index column.
submission_columns = {'path': articles, 'sentence': qa_pairs}
my_submission = pandas.DataFrame(submission_columns)
my_submission.to_csv('submission3.csv', index=False)

In [None]:
# Bare last expression: shows the submission frame as this cell's output.
my_submission