#### About
Extractive Question answering on Squad Dataset via ROBERTA in PyTorch.

Dataset Link - https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
#mandatory import
import json
import os
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
#!pip install transformers --quiet
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast,RobertaForQuestionAnswering
from transformers import AdamW
from tqdm import tqdm

In [2]:
os.chdir('/content/drive/MyDrive/Datasets')

In [3]:
train_json_path = "train-v1.1.json"
val_json_path = "dev-v1.1.json"

In [4]:
#converting data to datapath
#fetching id, context, answers

train_file= json.loads(open(train_json_path).read())
train_file.keys()


dict_keys(['data', 'version'])

In [5]:
train_file['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [6]:
train_file['data'][0]['title']

'University_of_Notre_Dame'

In [7]:
train_file['data'][0]['paragraphs'][0].keys()

dict_keys(['context', 'qas'])

In [8]:
train_file['data'][0]['paragraphs'][0]['context']

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [9]:
train_file['data'][0]['paragraphs'][0]['qas'][0].keys()

dict_keys(['answers', 'question', 'id'])

In [10]:

print(train_file['data'][0]['paragraphs'][0]['qas'][0]['answers'][0])
print(train_file['data'][0]['paragraphs'][0]['qas'][0]['question'])
print(train_file['data'][0]['paragraphs'][0]['qas'][0]['id'])

{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
5733be284776f41900661182


From the nested dict, We only need id, question, context and answers

In [11]:
hierarchy = ['data','paragraphs','qas','answers']
level_1 = pd.io.json.json_normalize(train_file, hierarchy)
level_2 = pd.io.json.json_normalize(train_file,hierarchy[:-1])
level_3 = pd.io.json.json_normalize(train_file,hierarchy[:-2])

#combining into single df

idx = np.repeat(level_3['context'].values, level_3.qas.str.len())
level_2['context'] = idx
df = level_2[['id','question','answers','context']].set_index('id').reset_index()

df['context_id'] = df['context'].factorize()[0]


  level_1 = pd.io.json.json_normalize(train_file, hierarchy)
  level_2 = pd.io.json.json_normalize(train_file,hierarchy[:-1])
  level_3 = pd.io.json.json_normalize(train_file,hierarchy[:-2])


In [12]:
df.head()

Unnamed: 0,id,question,answers,context,context_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"[{'answer_start': 515, 'text': 'Saint Bernadet...","Architecturally, the school has a Catholic cha...",0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"[{'answer_start': 188, 'text': 'a copper statu...","Architecturally, the school has a Catholic cha...",0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"[{'answer_start': 279, 'text': 'the Main Build...","Architecturally, the school has a Catholic cha...",0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"[{'answer_start': 381, 'text': 'a Marian place...","Architecturally, the school has a Catholic cha...",0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"[{'answer_start': 92, 'text': 'a golden statue...","Architecturally, the school has a Catholic cha...",0


In [13]:
def unpack(df, column, fillna=None):
    ret = None
    if fillna is None:
        tmp = pd.DataFrame((d for idx, d in df[column].iteritems()))
        ret = pd.concat([df.drop(column,axis=1), tmp], axis=1)
    else:
        tmp = pd.DataFrame((d for idx, d in 
        df[column].iteritems())).fillna(fillna)
        ret = pd.concat([df.drop(column,axis=1), tmp], axis=1)
    return ret

In [14]:
df = unpack(df,'answers')
df = unpack(df,0)

In [15]:
df

Unnamed: 0,id,question,context,context_id,answer_start,text
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",0,515,Saint Bernadette Soubirous
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",0,188,a copper statue of Christ
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",0,279,the Main Building
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",0,381,a Marian place of prayer and reflection
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",0,92,a golden statue of the Virgin Mary
...,...,...,...,...,...,...
87594,5735d259012e2f140011a09d,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",18890,229,Oregon
87595,5735d259012e2f140011a09e,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",18890,414,Rangoon
87596,5735d259012e2f140011a09f,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",18890,476,Minsk
87597,5735d259012e2f140011a0a0,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",18890,199,1975


In [16]:
train_df = df.drop(columns=['id','context_id'])
train_df

Unnamed: 0,question,context,answer_start,text
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary
...,...,...,...,...
87594,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",229,Oregon
87595,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",414,Rangoon
87596,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",476,Minsk
87597,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",199,1975


In [17]:
train_df.dropna()

Unnamed: 0,question,context,answer_start,text
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary
...,...,...,...,...
87594,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",229,Oregon
87595,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",414,Rangoon
87596,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",476,Minsk
87597,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",199,1975


In [18]:
train_df.drop_duplicates()

Unnamed: 0,question,context,answer_start,text
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary
...,...,...,...,...
87594,In what US state did Kathmandu first establish...,"Kathmandu Metropolitan City (KMC), in order to...",229,Oregon
87595,What was Yangon previously known as?,"Kathmandu Metropolitan City (KMC), in order to...",414,Rangoon
87596,With what Belorussian city does Kathmandu have...,"Kathmandu Metropolitan City (KMC), in order to...",476,Minsk
87597,In what year did Kathmandu create its initial ...,"Kathmandu Metropolitan City (KMC), in order to...",199,1975


In [19]:
#similarly reading val_json
val_file = json.loads(open(val_json_path).read())
hierarchy = ['data','paragraphs','qas','answers']
val_level_1 = pd.io.json.json_normalize(val_file, hierarchy)
val_level_2 = pd.io.json.json_normalize(val_file,hierarchy[:-1])
val_level_3 = pd.io.json.json_normalize(val_file,hierarchy[:-2])

#combining into single df

val_idx = np.repeat(val_level_3['context'].values, val_level_3.qas.str.len())
val_level_2['context'] = val_idx
val_df = val_level_2[['id','question','answers','context']].set_index('id').reset_index()

val_df['context_id'] = val_df['context'].factorize()[0]
val_df = unpack(val_df,'answers')
#val_df = unpack(val_df,1)


  val_level_1 = pd.io.json.json_normalize(val_file, hierarchy)
  val_level_2 = pd.io.json.json_normalize(val_file,hierarchy[:-1])
  val_level_3 = pd.io.json.json_normalize(val_file,hierarchy[:-2])


In [20]:
val_df = unpack(val_df,0)


In [21]:
val_df.head()

Unnamed: 0,id,question,context,context_id,1,2,3,4,5,answer_start,text
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,0,"{'answer_start': 177, 'text': 'Denver Broncos'}","{'answer_start': 177, 'text': 'Denver Broncos'}",,,,177,Denver Broncos
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,0,"{'answer_start': 249, 'text': 'Carolina Panthe...","{'answer_start': 249, 'text': 'Carolina Panthe...",,,,249,Carolina Panthers
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,0,"{'answer_start': 355, 'text': 'Levi's Stadium'}","{'answer_start': 355, 'text': 'Levi's Stadium ...",,,,403,"Santa Clara, California"
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,0,"{'answer_start': 177, 'text': 'Denver Broncos'}","{'answer_start': 177, 'text': 'Denver Broncos'}",,,,177,Denver Broncos
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,0,"{'answer_start': 488, 'text': 'gold'}","{'answer_start': 521, 'text': 'gold'}",,,,488,gold


In [22]:
val_df = val_df.drop(columns=['id','context_id',1,2,3,4,5])
val_df.head()

Unnamed: 0,question,context,answer_start,text
0,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,177,Denver Broncos
1,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,249,Carolina Panthers
2,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,403,"Santa Clara, California"
3,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,177,Denver Broncos
4,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,488,gold


In [23]:
val_df.dropna()
val_df.drop_duplicates()

Unnamed: 0,question,context,answer_start,text
0,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,177,Denver Broncos
1,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,249,Carolina Panthers
2,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,403,"Santa Clara, California"
3,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,177,Denver Broncos
4,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,488,gold
...,...,...,...,...
10565,What is the metric term less used than the New...,"The pound-force has a metric counterpart, less...",82,kilogram-force
10566,What is the kilogram-force sometimes reffered ...,"The pound-force has a metric counterpart, less...",114,kilopond
10567,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...",274,slug
10568,What seldom used term of a unit of force equal...,"The pound-force has a metric counterpart, less...",712,kip


#### Abbreviations
1. Tokens - They are generated by one of word, character, subword or byte piece tokenisation.
2. Input IDs - They are token indices. List of input_ids constituent a sequence which's used in batches by the model to generate the output.
3. Sequence IDs - They are IDs that tell us which are questions and which are answers. OOV tokens are None, 1 for context and 0 for answers.
4. answer_start - Context is the entire doc and answer_start is the word_index at which answer exists.
5. attention_mask - They tell us which to ignore i.e padding
6. Gold text - It refers to the answer we expect in return

#### About ROBERTA
1. Roberta uses BPE derived from GPT2. In BPE, we replace aa in aaraam with A.
2. Roberta stands for Robustly Optimized BERT Pre training Approach.

In [24]:

#updating the dataframe with end_positions
def updated_end_indices(dataframe):
    #creating new attribute
    dataframe['answer_end']=0
    for i in range(len(dataframe)):
        gold_text = dataframe['text'][i]
        start_idx = dataframe['answer_start'][i]
        end_idx = start_idx + len(gold_text)
        context = dataframe['context'][i]
        #sometimes these can be off by a character or two
        if context[start_idx:end_idx] == gold_text:
            dataframe['answer_end'][i] = end_idx
        else:
            for n in [1,2]:
            #check the offset
                if context[start_idx-n:end_idx-n] == gold_text:
                    dataframe['answer_start'][i] = int(start_idx-n)
                    dataframe['answer_end'][i] = int(end_idx - n)
    return train_df


In [25]:
train_df = updated_end_indices(train_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['answer_end'][i] = end_idx


In [26]:
train_df.head()

Unnamed: 0,question,context,answer_start,text,answer_end
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,541
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,213
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,296
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,420
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,126


In [27]:
val_df = updated_end_indices(val_df)
val_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['answer_end'][i] = end_idx


Unnamed: 0,question,context,answer_start,text,answer_end
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",515,Saint Bernadette Soubirous,541
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",188,a copper statue of Christ,213
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",279,the Main Building,296
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",381,a Marian place of prayer and reflection,420
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",92,a golden statue of the Virgin Mary,126


In [28]:
#encoding texts
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
train_encodings = tokenizer(train_df['context'].values.tolist(), train_df['question'].values.tolist(), truncation=True, padding=True,max_length=128) #remove max len when ram is higher
val_encodings = tokenizer(val_df['context'].values.tolist(),val_df['question'].values.tolist(), truncation=True, padding=True,max_length=128)


In [29]:
#adding token positions
def update_token_positions(encodings, dataframe):
    start_positions = []
    end_positions = []
    for i in range(len(dataframe)):
        start_positions.append(encodings.char_to_token(i,dataframe['answer_start'][i]))
        end_positions.append(encodings.char_to_token(i,dataframe['answer_end'][i]))
    
    # if start position is None, The answer passage has been truncated
    if start_positions[-1] is None:
        start_positions[-1] = tokenizer.model_max_length

    # end position not found, so shift back
    offset=1
    while end_positions[-1] is None:
        end_positions[-1] = encodings.char_to_token(i,dataframe['answer_end'][i]-offset)
        offset+=1

    # updating encoding dict with our start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


In [30]:
update_token_positions(train_encodings,train_df)
update_token_positions(val_encodings,val_df)


In [31]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [32]:
torch.tensor(train_encodings['input_ids'][0])

tensor([    0, 37848, 37471, 28108,     6,     5,   334,    34,    10,  4019,
         2048,     4,   497,  1517,     5,  4326,  6919,    18,  1637, 31346,
           16,    10,  9030,  9577,     9,     5,  9880,  2708,     4, 29261,
           11,   760,     9,     5,  4326,  6919,     8,  2114,    24,     6,
           16,    10,  7621,  9577,     9,  4845,    19,  3701,    62, 33161,
           19,     5,  7875,    22, 39043,  1459,  1614,  1464, 13292,  4977,
          845,  4130,     7,     5,  4326,  6919,    16,     5, 26429,  2426,
            9,     5, 25095,  6924,     4, 29261,   639,     5, 32394,  2426,
           16,     5,  7461, 26187,     6,    10, 19035,   317,     9,  9621,
            8, 12456,     4,    85,    16,    10, 24633,     9,     5, 11491,
        26187,    23,   226,  2126, 10067,     6,  1470,   147,     2,     2,
         3972,  2661,   222,     5,  9880,  2708,  2346,  2082,    11,   504,
         4432,    11,   226,  2126, 10067,  1470,   116,     2])

In [33]:
#creating dataset
class QNA_Dataset(Dataset):
    def __init__(self,encodings):
        self.encodings = encodings
    
    def __len__(self):
        return(len(self.encodings.input_ids))
    
    def __getitem__(self, idx):
        keys = self.encodings.keys()
        item = {}
        for key in keys:
            try:
              item[key] = torch.tensor(self.encodings[key][idx])
            except:
              item[key] = torch.tensor(0)
        
        return item
         
    

    

In [34]:
train_dataset = QNA_Dataset(train_encodings)
val_dataset = QNA_Dataset(val_encodings)

In [35]:
train_dataset.__getitem__(5)

{'input_ids': tensor([    0,  1620,    23,   144,    97,  6630,     6, 10579,  9038,    18,
           521,   422,    10,   346,     9,   340,   433,  6639,     4,    20,
          1117,  1294,    12,  2962,  6639,   680,   130,  9911,     6,   258,
            10,  3188,     8,  2384,  1992,     6,     8,   484, 15829,     8,
         28059,     4,  1456,  8215,    25,    10,    65,    12,  8596,  8812,
            11,   772,   504,  5067,     6,     5,  1811,  1168, 11599,  4320,
            16,  1167,  2330,  3708,     8,  1449,     7,    28,     5,  7763,
         11152, 25161,  5362,    11,     5,   315,   532,     4,    20,    97,
          4320,     6,    20, 45011,  1371,     6,    16,   703,  2330,    10,
            76,     8,  7235,    15,  1294, 13144,     8, 14129,     4,    20,
         25336,    76,  6298,    16,  1027,  6333,     4,    20,  9911,    33,
         15958,     2,     2,  1779,   222,     5,  1811,  1168, 11599, 10202,
             9, 10579,   385,  4344,  1

In [36]:
train_loader = DataLoader(train_dataset, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)

In [37]:
for batch in train_loader:
  print(batch)
  break


{'input_ids': tensor([[    0, 37848, 37471,  ...,  1470,   116,     2],
        [    0, 37848, 37471,  ...,  6919,   116,     2],
        [    0, 37848, 37471,  ...,  3184,   116,     2],
        ...,
        [    0,   133, 20561,  ...,  2534,   116,     2],
        [    0,   133, 20561,  ...,    76,   116,     2],
        [    0,   133, 20561,  ...,  9038,   116,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'start_positions': tensor([  0,  41,  63,  85,  21,  51,  88,   0,  26,   0,  24,  31,  51,  75,
          0,  88,   8,  23,  55,  28,  93,  15,  31,   0,  75,   0,   0,  21,
          0,   2,  97,  57,  74,  71,  26,  29,  38,  69,  82,  21,  32,  47,
        103,   0,   9,  79,   0,  38,   0,   1,  69,  84,   0,  21,   2,  35,
         32,  51,  16,   1,   0,   0,   0,  19]), 'end_

In [38]:
#loading model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')
model = model.to(device)
optim = AdamW(model.parameters(),lr=1e-4)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [39]:
#training model
total_accuracy= []
for epoch in range(1):
    model.train()
    loader_progress = tqdm(train_loader)
    for batch in loader_progress:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        #training
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loader_progress.set_description("Epoch".format(epoch))
        loader_progress.set_postfix(loss=loss.item())
        
        #validation
        val_loader_progress = tqdm(val_loader)
        for batch in val_loader_progress:
          with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

            start_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_pred = torch.argmax(outputs['end_logits'], dim=1)

            #calculating acc
            total_accuracy.append(((start_pred == start_positions).sum()/len(start_pred)).item())
            total_accuracy.append(((end_pred == end_positions).sum()/len(end_pred)).item())
        accuracy = sum(total_accuracy)/len(total_accuracy)
        loader_progress.set_description("Epoch".format(epoch))
        loader_progress.set_postfix(accuracy=accuracy)




Epoch:   0%|          | 0/1369 [00:04<?, ?it/s, loss=4.9]
  0%|          | 0/1369 [00:00<?, ?it/s][A
  0%|          | 1/1369 [00:00<11:44,  1.94it/s][A
  0%|          | 2/1369 [00:01<11:37,  1.96it/s][A
  0%|          | 3/1369 [00:01<11:19,  2.01it/s][A
  0%|          | 4/1369 [00:01<11:14,  2.02it/s][A
  0%|          | 5/1369 [00:02<11:18,  2.01it/s][A
  0%|          | 6/1369 [00:03<11:43,  1.94it/s][A
  1%|          | 7/1369 [00:03<11:50,  1.92it/s][A
  1%|          | 8/1369 [00:04<11:54,  1.90it/s][A
  1%|          | 9/1369 [00:04<11:49,  1.92it/s][A
  1%|          | 10/1369 [00:05<11:43,  1.93it/s][A
  1%|          | 11/1369 [00:05<11:47,  1.92it/s][A
  1%|          | 12/1369 [00:06<11:47,  1.92it/s][A
  1%|          | 13/1369 [00:06<11:39,  1.94it/s][A
  1%|          | 14/1369 [00:07<11:55,  1.89it/s][A
  1%|          | 15/1369 [00:07<11:55,  1.89it/s][A
  1%|          | 16/1369 [00:08<11:53,  1.90it/s][A
  1%|          | 17/1369 [00:08<11:49,  1.91it/s][A
  1%|▏

AttributeError: ignored

In [None]:
#save the tokenizer and model
model.save_pretrained('roberta-trained_model/')
tokenizer.save_pretrained('roberta-trained_model/')

In [40]:
#custom eval
question = "Who is Adam"
context = """Adam is someone we don't know but yet is a renowned professional in the domain of Machine learning """

In [49]:
def answer(context,question):
  #generate encodings
  encodings = tokenizer.encode_plus(text=question, text_pair = context)
  input_ids = encodings['input_ids']
  #searching input id for [SEP] token
  sep_index = input_ids.index(tokenizer.sep_token_id)
  num_segments1 = sep_index+1
  num_segments2 = len(input_ids) - num_segments1

  # construicting segmentid via 0 and 1
  segment_ids = [0]* num_segments1 + [1]* num_segments2

  #each input token should have segment id
  assert len(segment_ids) == len(input_ids)

  start_scores ,end_scores =model(torch.tensor([input_ids]).to(device),token_type_ids=torch.tensor([segment_ids]).to(device))
  print(start_scores,end_scores)
  answer_start = torch.argmax(start_scores)
  answer_end = torch.argmax(end_scores)
  #gettinmg string of input tokens

  tokens = tokenizer.convert_ids_to_tokens(input_ids)
  answer = tokens[answer_start]
  #selecting rest
  for i in range(answer_start+1, answer_end+1):
    if tokens[i][0:2] =="##":
      answer+=tokens[i][2:]
    else:
      answer+=' '+tokens[i]
  
  return answer

In [None]:
answer(context,question)