Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
print('TF version', tf.__version__)

  from .autonotebook import tqdm as notebook_tqdm


TF version 2.12.0


In [2]:
MAX_LEN = 512

tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab = 'D:/capstone/roberta-base-squad2/vocab.json',
    merges = 'D:/capstone/roberta-base-squad2/merges.txt',
    lowercase = True,
    add_prefix_space = True
)

Loading the SQuAD dataset which has been converted to a csv file:

In [3]:
train = pd.read_csv('train.csv').fillna('')
train.head()

Unnamed: 0.1,Unnamed: 0,title,question,id,answers,answer_start,context
0,0,assessee for income tax,what is the responsibilty of the assessee ?,id_0_0_1,the assessee shall be responsible for administ...,0,the assessee shall be responsible for administ...
1,1,assessee for income tax,who is responsible for administering or invest...,id_0_0_2,the assessee shall be responsible for administ...,0,the assessee shall be responsible for administ...
2,2,assessee for income tax,what is the per cent of the total value of the...,id_0_0_3,not more than ten per cent. of the total value...,373,the assessee shall be responsible for administ...
3,3,assessee for income tax,what is the maximum per cent of the total valu...,id_0_0_5,ten per cent. of the total value of the assets...,387,the assessee shall be responsible for administ...
4,4,"income-tax (20th amendment)rules, 2022","when did the income-tax (20th amendment)rules,...",id_1_0_1,they shall come into force from 1st day of jul...,74,these rules may be called the income-tax (20th...


In [4]:
rec = train.shape[0]  # Number of records in the training set
inputs = np.ones((rec, MAX_LEN), dtype = 'int32') # Input vector
attention_mask = np.zeros((rec, MAX_LEN), dtype = 'int32') # Attention Mask
token_type_ids = np.zeros((rec, MAX_LEN), dtype = 'int32') # Tokens produced
start_tokens = np.zeros((rec, MAX_LEN), dtype = 'int32') # Start logit for answer
end_tokens = np.zeros((rec, MAX_LEN), dtype = 'int32') # End logit for answer

for i in range(rec):

  context = ' '+' '.join(train.loc[i, 'context'].split())
  answer = ' '+' '.join(train.loc[i, 'answers'].split())
  question = ' '+' '.join(train.loc[i, 'question'].split())

  start_idx = train.loc[i, 'answer_start']
  chars = np.zeros((len(context)))
  chars[start_idx:start_idx + len(answer)] = 1
  if context[start_idx - 1] == ' ':
    chars[start_idx - 1] = 1
  
  enc1 = tokenizer.encode(context)
  enc2 = tokenizer.encode(question)

  # For resource limitations only.

  if len(enc1) + len(enc2) + 4 < MAX_LEN:

    #creating offsets
    offsets = []
    start_idx = 0

    for t in enc1.ids:
      w = tokenizer.decode([t])
      offsets.append((start_idx, start_idx + len(w)))
      start_idx += len(w)
    
    # Those which are a part of the answer

    tokens = []
    for j, (a, b) in enumerate(offsets):
      sum_ = np.sum(chars[a:b])
      if sum_ > 0:
        tokens.append(j)

    # The input for roberta is in the form <s> Question </s></s> Context </s>
    
    inputs[i, :len(enc1.ids) + len(enc2.ids) + 4] = [0] + enc2.ids + [2,2] + enc1.ids + [2]

    attention_mask[i, :len(enc1.ids) + len(enc2.ids) + 4] = 1

    if len(tokens) > 0:
      start_tokens[i, tokens[0] + 1] = 1
      end_tokens[i, tokens[-1] + 1] = 1


In [6]:
def build_model():
  ids = tf.keras.layers.Input((MAX_LEN,), dtype = tf.int32)
  att = tf.keras.layers.Input((MAX_LEN,), dtype = tf.int32)
  tok = tf.keras.layers.Input((MAX_LEN,), dtype = tf.int32)

  config = RobertaConfig.from_pretrained('D:/capstone/roberta-base-squad2/config.json')
  bert_model = TFRobertaModel.from_pretrained('D:/capstone/roberta-base-squad2/tf_model.h5', config = config)
  x = bert_model(ids, attention_mask=att, token_type_ids = tok)

  # For start logit

  x1 = tf.keras.layers.Dropout(0.1)(x[0])
  x1 = tf.keras.layers.Conv1D(1,1)(x1)
  x1 = tf.keras.layers.Flatten()(x1)
  x1 = tf.keras.layers.Activation('softmax')(x1)

  # For end logit

  x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
  x2 = tf.keras.layers.Conv1D(1,1)(x2)
  x2 = tf.keras.layers.Flatten()(x2)
  x2 = tf.keras.layers.Activation('softmax')(x2)

  # Initalising the model

  model = tf.keras.models.Model(inputs = [ids, att, tok], outputs = [x1, x2])
  optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-5)
  model.compile(loss='categorical_crossentropy', optimizer = optimizer)

  return model

In [7]:
model = build_model()

loading configuration file D:/capstone/roberta-base-squad2/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.27.4",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file D:/capstone/roberta-base-squad2/tf_model.h5
Some layers from the model checkpoint at D:/capstone/roberta-base-squad2/tf_model.h5 were not used when initializing TFRobertaModel: ['qa

Fitting the model with our dataset

In [9]:
# history = model.fit([inputs, attention_mask, token_type_ids], [start_tokens, end_tokens], epochs = 3, batch_size = 4, validation_split = 0.1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
model.save('roberta_model') #saving the model after fitting 

In [8]:
model.load_weights("roberta_model") #loading the weights after fitting the model

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1ae77638210>

In [9]:
print(enc1.ids)

[1030, 7668, 8941, 7, 5730, 126, 5730, 16, 5, 762, 50, 45261, 9, 70, 3034, 1538, 2189, 8941, 7, 5, 11827, 4, 5, 7404, 13, 14999, 9, 5730, 16, 17171, 149, 2810, 24404, 102, 9, 5, 1760, 4, 5, 7089, 13, 2502, 13, 5730, 16, 14255, 11, 2178, 15900, 9, 5, 1492, 4, 5, 4620, 14255, 13, 5730, 2502, 32, 2766, 102, 8, 2766, 6621, 13, 9473, 811, 8, 1093, 2286, 73, 1342, 2192, 4, 17977, 9, 5730, 34, 57, 17171, 13, 1402, 5538, 1065, 17966, 11543, 923, 11, 2178, 15900, 428, 9, 5, 1492, 4]


In [10]:
con = 'Google LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, search engine, cloud computing, software, and hardware. It is considered one of the Big Four technology companies, alongside Amazon, Apple, and Facebook.'
que = 'What is Google?'

inp_id = np.zeros((1,MAX_LEN),dtype='int32')
attn_mask_input = np.zeros((1,MAX_LEN),dtype='int32')
token_type_id_input = np.zeros((1,MAX_LEN),dtype='int32')
inpenc = tokenizer.encode(con)
queenc = tokenizer.encode(que)
print(len(que))
#chars_1 = np.zeros((len(con)))
#chars_1[idx:idx + len(text2)] = 1

offset = []
id_ = 0
for t in inpenc.ids:
  w = tokenizer.decode([t])
  offset.append((id_, id_ + len(w)))
  id_ += len(w)
print(offset)

inp_id[0,:len(inpenc.ids)+len(queenc.ids) + 4] = [0] + queenc.ids + [2,2] + inpenc.ids + [2]
attn_mask_input[0,:len(inpenc.ids)+len(queenc.ids) + 4] = 1

15
[(0, 7), (7, 10), (10, 11), (11, 14), (14, 17), (17, 24), (24, 26), (26, 40), (40, 51), (51, 59), (59, 64), (64, 76), (76, 79), (79, 88), (88, 89), (89, 96), (96, 105), (105, 109), (109, 118), (118, 119), (119, 125), (125, 133), (133, 140), (140, 152), (152, 165), (165, 166), (166, 173), (173, 180), (180, 181), (181, 187), (187, 197), (197, 198), (198, 207), (207, 208), (208, 212), (212, 221), (221, 222), (222, 225), (225, 228), (228, 239), (239, 243), (243, 246), (246, 250), (250, 254), (254, 259), (259, 270), (270, 280), (280, 281), (281, 291), (291, 294), (294, 298), (298, 299), (299, 305), (305, 306), (306, 310), (310, 319), (319, 320)]


Function to predict answer given a context and question

In [11]:
def generate_ans(con,que,model,tokenizer):
  inp_id = np.zeros((1,MAX_LEN),dtype='int32')
  attn_mask_input = np.zeros((1,MAX_LEN),dtype='int32')
  token_type_id_input = np.zeros((1,MAX_LEN),dtype='int32')
  inpenc = tokenizer.encode(con)
  queenc = tokenizer.encode(que)
  inp_id[0,:len(inpenc.ids)+len(queenc.ids) + 4] = [0] + queenc.ids + [2,2] + inpenc.ids + [2]
  attn_mask_input[0,:len(inpenc.ids)+len(queenc.ids) + 4] = 1
  s, f = model.predict([inp_id,attn_mask_input,token_type_id_input])
  s_ = np.argmax(s[0,])
  f_ = np.argmax(f[0,])
  ans = tokenizer.decode(inpenc.ids[s_ - 1: f_ + 1])

  return ans

Testing 1

In [12]:
con = 'Cancer is a group of diseases involving abnormal cell\
 growth with the potential to invade or spread to other parts of the body\
 . These contrast with benign tumors, which do not spread.\
  Possible signs and symptoms include a lump, abnormal bleeding\
  , prolonged cough, unexplained weight loss, and a change in\
   bowel movements. While these symptoms may indicate cancer,\
    they can also have other causes. Over 100 types of cancers \
    affect humans'
que = 'What disease involves abnormal cell growth?'
ans = generate_ans(con,que,model,tokenizer)
print(ans)

 cancer is


In [13]:
que = 'what are symptoms?'
ans = generate_ans(con,que,model,tokenizer)
print(ans)

 spread.  possible signs and symptoms include a lump, abnormal bleeding  , prolonged cough, unexplained weight loss, and a change in   bowel


In [14]:
que = 'how many types?'
ans = generate_ans(con,que,model,tokenizer)
print(ans)

 other causes. over 100


Testing 2

In [15]:
con2 = "Malaria is a life-threatening disease caused by parasites that are transmitted to people through the bites of infected female Anopheles mosquitoes. It is preventable and curable. In 2018, there were an estimated 228 million cases of malaria worldwide. The WHO African Region carries a disproportionately high share of the global malaria burden. In 2018, the region was home to 93% of malaria cases and 94% of malaria deaths."

In [16]:
que2 = 'What is Malaria?'
ans2 = generate_ans(con2,que2,model,tokenizer)
print(ans2)

 malaria is a life-threatening disease caused by parasites that are transmitted to people through the bites of infected female anopheles mosquitoes. it is preventable and cur


In [17]:
que2 = 'What causes Malaria?'
ans2 = generate_ans(con2,que2,model,tokenizer)
print(ans2)

 malaria is a life-threatening disease caused by parasites that are transmitted to people through the bites of infected female anopheles mosquitoes


In [18]:
que2 = 'Is malaria preventable?'
ans2 = generate_ans(con2,que2,model,tokenizer)
print(ans2)

. it is preventable and curable


Testing with a file

In [33]:
test_df = pd.read_csv('test_100.csv')
test_df.head()

Unnamed: 0,question,context
0,What status has the Brotherhood obtained in th...,"Despite periodic repression, the Brotherhood h..."
1,What impact does higher worker productivity an...,"In Marxian analysis, capitalist firms increasi..."
2,What is the goal of individual civil disobedie...,Non-revolutionary civil disobedience is a simp...
3,What is set aside for question periods in the ...,Parliamentary time is also set aside for quest...
4,What year was the University of Warsaw establi...,The University of Warsaw was established in 18...


In [46]:
answers = []
for index, row in test_df.iterrows():
    context = str(row["context"])
    question = str(row["question"])

    answer = generate_ans(context, question, model, tokenizer)
    print(answer)
    answers.append(answer)

 despite periodic repression, the brotherhood has become one of the most influential movements in the islamic world, particularly in the arab world. for
. the substitution of capital equipment for labor (mechanization and automation) raises the productivity of each worker, resulting in a situation of relatively stagnant wages for the working class amidst rising levels of property income for the capitalist class.
 non-revolutionary civil disobedience is a simple disobedience of laws on the grounds that they are judged "wrong" by an individual conscience, or as part of an effort to render certain laws ineffective, to cause their repeal, or to exert pressure to get one's political wishes on some other issue.
 parliamentary time is also
16,
 a progressive tax is
 there were 158,349 households
's full-sized cars reflected the crisis. by 1979, virtually all "full
 teaches nearly 10,000 pupils. the
 in 1872, the central
 of feynman diagrams.
 that disobedience
ogram-force (kgf) (sometimes kil

To append the 'predicted answers' to the test_100.csv file and save it

In [44]:
test_df["answer"] = answers

In [45]:
test_df.to_csv("test_100_with_answers.csv", index=False)