In [2]:
import re
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [3]:
data = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')
data.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [4]:
# Cleaning Dataset and Create a new dataset

all_list = []
for i in range(len(data)):
    for j in range(len(data.data[i]['questions'])):
        context = data.data[i]['story']
        question = data.data[i]['questions'][j]['input_text']
        answer = data.data[i]['answers'][j]['input_text']
        all_list.append([context,question,answer])

coll = ['Context', 'Question', 'Answer']

df_new = pd.DataFrame(all_list, columns= coll)
df_new.head()
    

Unnamed: 0,Context,Question,Answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [5]:
# Split dataset into Train and Test for fine-tunning

df_train, df_test = train_test_split(df_new, test_size=0.2, shuffle=True, random_state=42)
print(df_train.shape, df_test.shape)

(86917, 3) (21730, 3)


In [6]:
# Import Transformers

from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering
import tensorflow as tf

In [7]:
answer

'six points'

In [8]:
def answering(question, context):
    
    #Tokenize the context and return TensorFlow tensors
    tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    inputs = tokenizer(question, context, return_tensors= 'tf')
    
    #passing inputs into the model and return logits
    model = TFAutoModelForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    outputs = model(**inputs)
    
    start_index = int(tf.math.argmax(outputs.start_logits, axis = -1)[0])
    end_index = int(tf.math.argmax(outputs.end_logits, axis = -1)[0])
    
    predict_answer = inputs.input_ids[0, start_index: end_index + 1]
    
    
    return tokenizer.decode(predict_answer)
    
    

In [9]:
rand_num = np.random.randint(0, len(df_new))

qs = df_new['Question'][rand_num]
ct = df_new['Context'][rand_num]
orginal_answer = df_new['Answer'][rand_num]

In [10]:
answering(qs,ct)


All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


'jimmy and justin are also brothers'

In [13]:
qs

'Are they related?'

In [14]:
ct

'Mitchell and his brother, Graham are biking to the store to buy lemons. They want to make a lemonade stand for their friends. In order to do this, they need to buy lemons, sugar, and cups from the store. While they are at the store, they run into their friends Jimmy and Justin. Jimmy and Justin are also brothers. Mitchell and Graham stop to talk to their friends for a bit before they go back to their shopping. After filling up their basket with the items they need to make lemonade, they go to the front of the store to pay. Once they get home, they start making lemonade and set out their table by the sidewalk. They talk to a few of their neighbors as they walk buy and some of them buy some lemonade. After sitting outside for some time, they think about making a sign to let the neighbors know that they have lemonade for sale. Mitchell gets the markers and Graham gets the sign. They work together to make the sign. After putting the sign in front of the table, they find people want much m

In [11]:

print('Orginal Answer: ', orginal_answer)


Orginal Answer:  yes


In [19]:
new_q = 'where are the money'


In [20]:
answering(new_q,ct)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


'in their piggy banks'

In [263]:
ct

"Islamabad is the capital city of Pakistan located within the federal Islamabad Capital Territory. With a population of two million, it is the 10th largest city of Pakistan, while the larger Islamabad-Rawalpindi metropolitan area is the third largest in Pakistan with a population exceeding five\xa0million. The city is the political seat of Pakistan and is administered by the Islamabad Metropolitan Corporation, supported by the Capital Development Authority (CDA). \n\nIslamabad is located in the Pothohar Plateau in the northeastern part of the country, between Rawalpindi District and the Margalla Hills National Park to the north. The region has historically been a part of the crossroads of Punjab and Khyber Pakhtunkhwa with the Margalla Pass acting as the gateway between the two regions. \n\nIslamabad was built during the 1960s to replace Karachi as Pakistan's capital. The city's master-plan divides the city into eight zones, including administrative, diplomatic enclave, residential are

In [36]:
#Tokenize the context and return TensorFlow tensors
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

input_ids = tokenizer.encode(new_q,ct)

tokens = tokenizer.convert_ids_to_tokens(input_ids)

sep_id = input_ids.index(tokenizer.sep_token_id)

num_seg_a = sep_id + 1

num_seg_b = len(input_ids) - num_seg_a

segment_ids = [0]* num_seg_a + [1]*num_seg_b

In [37]:
model = TFAutoModelForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

input_ids_tensor = tf.convert_to_tensor([input_ids])
segment_ids_tensor = tf.convert_to_tensor([segment_ids])

output = model(input_ids_tensor, token_type_ids=segment_ids_tensor)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [33]:
def get_answer(text, question):
    inputs = tokenizer.encode_plus(question, text, return_tensors="tf", padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Perform inference
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Get the most likely answer
    answer_start_scores = outputs.start_logits.numpy()
    answer_end_scores = outputs.end_logits.numpy()

    answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
    answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1

    # Decode the answer tokens
    answer_tokens = input_ids[0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens)

    return answer

In [35]:
get_answer(ct, new_q)

'in their piggy banks'

In [38]:
start_index = int(tf.math.argmax(output.start_logits, axis = -1)[0])
end_index = int(tf.math.argmax(output.end_logits, axis = -1)[0])

In [40]:
start_index

287

In [41]:
if end_index > start_index:
    ans = tokens[start_index]
    for i in range(start_index+1, end_index +1):
        if tokens[i][0:2] == '##':
            ans = ans + tokens[i][2:]
        else:
            ans = ans + ' ' + tokens[i]
print("\nPredicted answer:\n{}".format(ans.capitalize()))


Predicted answer:
In their piggy banks


In [28]:
start_index

0

In [29]:
end_index

212

In [23]:
orginal_answer

'an alarm'

In [14]:
qs

'What does the alarm sound like?'

In [13]:
ct



In [34]:
int(tf.math.argmax(output.end_logits)[0])

0