# Importing all the required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer


# Importing the Dataset

In [2]:
# Load the CoQA v1.0 training JSON directly from the Stanford download URL into a DataFrame.
dataset = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')

In [3]:
# Preview the raw frame: each row carries a "version" value and a nested "data" dict.
dataset.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


# Preprocessing

In [4]:
# Drop the "version" column. The membership guard makes the cell safe to re-run:
# a bare `del` raises KeyError the second time because the column is already gone.
if "version" in dataset.columns:
    del dataset["version"]

In [5]:
# Flatten CoQA into one (story, question, answer) row per question turn.
# The answer at position `turn` belongs to the question at the same position.
cols = ["text", "question", "answer"]

comp_list = [
    [
        entry["story"],
        entry["questions"][turn]["input_text"],
        entry["answers"][turn]["input_text"],
    ]
    for entry in dataset["data"]
    for turn in range(len(entry["questions"]))
]
new_df = pd.DataFrame(comp_list, columns=cols)

In [6]:
# Persist the flattened table to CSV; index=False keeps the row index out of the file.
new_df.to_csv("CoQA_data.csv", index=False)

In [7]:
# Reload the flattened table from the CSV written by the previous cell and preview it.
data = pd.read_csv("CoQA_data.csv")
data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


# Importing BERT module and preparing for Question Answering

In [8]:
# The QA model and its tokenizer are both loaded from the same SQuAD-fine-tuned
# BERT-large (cased, whole-word-masking) checkpoint.
checkpoint_name = 'bert-large-cased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(checkpoint_name)
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [9]:
# Pick one random row of the flattened table and pull out its question and passage.
random_num = np.random.randint(0, len(data))

question = data.loc[random_num, "question"]
text = data.loc[random_num, "text"]

# Tokenisation

In [10]:
# Encode the (question, passage) pair into a single id sequence.
# This BERT checkpoint has 512 position embeddings, so a longer pair makes the
# forward pass fail. "only_second" trims the passage and never the question.
input_ids = tokenizer.encode(question, text, truncation="only_second", max_length=512)

In [11]:
# Map the ids back to token strings so the predicted answer span can be read off later.
tokens = tokenizer.convert_ids_to_tokens(input_ids)


In [12]:
# Segment A is everything up to and including the first [SEP]; the rest is segment B.
sep_idx = input_ids.index(tokenizer.sep_token_id)
print(sep_idx)

num_seg_a = sep_idx + 1
print(num_seg_a)

num_seg_b = len(input_ids) - num_seg_a

# 0 marks a segment-A position, 1 marks a segment-B position.
segment_ids = [int(position >= num_seg_a) for position in range(len(input_ids))]

assert len(segment_ids) == len(input_ids)

9
10


In [13]:

# Inference only: no_grad() skips building the autograd graph, which saves memory
# on this large model and does not change the logits.
with torch.no_grad():
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

In [14]:

# Positions of the highest-scoring start token and end token.
answer_start = output.start_logits.argmax()
answer_end = output.end_logits.argmax()

# Framing of Answer

In [15]:
# A span is only usable when the predicted end is not before the predicted start.
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    # Bind a fallback so the final print below never hits an undefined `answer`
    # (fresh kernel) or silently shows a stale one left over from an earlier run.
    answer = "Unable to find the answer to your question"
    print("The answer to this question is not possible please check with your question :)")

print("Text:\n{}".format(text.capitalize()))
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))

Text:
North carolina consists of three main geographic sections: the atlantic coastal plain, which occupies the eastern 45% of the state; the piedmont region, which contains the middle 35%; and the appalachian mountains and foothills. the extreme eastern section of the state contains the outer banks, a string of sandy, narrow barrier islands between the atlantic ocean and two inland waterways or "sounds": albemarle sound in the north and pamlico sound in the south. they are the two largest landlocked sounds in the united states. 

the coastal plain transitions to the piedmont region along the atlantic seaboard fall line, a line which marks the elevation at which waterfalls first appear on streams and rivers. the piedmont region of central north carolina is the state's most urbanized and densely populated section. it consists of gently rolling countryside frequently broken by hills or low mountain ridges. small, isolated, and deeply eroded mountain ranges and peaks are located in the pi

In [16]:
# Flatten the start/end logits into 1-D NumPy arrays (detached from autograd first).
start_scores, end_scores = (
    logits.detach().numpy().flatten()
    for logits in (output.start_logits, output.end_logits)
)

# Label each token as "<token>-<position>" so repeated tokens stay distinguishable.
token_labels = [f"{token}-{i}" for i, token in enumerate(tokens)]

In [17]:
# Sanity check: there is one label per token of the encoded input.
print(len(token_labels))

350


In [18]:
# Rebuild readable text from the predicted span: WordPiece continuation pieces
# ("##...") are glued onto the previous token, every other token gets a leading space.
answer_pieces = [tokens[answer_start]]
for piece in tokens[answer_start + 1:answer_end + 1]:
    answer_pieces.append(piece[2:] if piece.startswith("##") else " " + piece)
answer = "".join(answer_pieces)

# Generating the Answer

In [19]:
def question_answer(question, text, max_length=512):
    """Print the span of `text` that the BERT QA model selects as the answer.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        question: The question string.
        text: The passage in which to look for the answer.
        max_length: Upper bound on the encoded sequence length. When the pair is
            longer, the passage (never the question) is truncated. Defaults to
            512, the number of positions this BERT checkpoint supports.

    Returns:
        None. The answer, or a fallback message when no usable span is found,
        is printed.
    """
    # Without truncation, a pair longer than the model's position limit makes
    # the forward pass fail.
    input_ids = tokenizer.encode(
        question, text, truncation="only_second", max_length=max_length
    )
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment A = up to and including the first [SEP]; segment B = the rest.
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    assert len(segment_ids) == len(input_ids)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    no_answer = "Unable to find the answer to your question."

    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for token in tokens[answer_start + 1:answer_end + 1]:
            if token.startswith("##"):
                # WordPiece continuation: glue onto the previous piece.
                answer += token[2:]
            else:
                answer += " " + token
    else:
        # Previously `answer` was left unbound here, raising UnboundLocalError.
        answer = no_answer

    # A span that starts on [CLS] is treated as "no answer".
    if answer.startswith("[CLS]"):
        answer = no_answer
    print("\nAnswer:\n{}".format(answer.capitalize()))

In [20]:
# Smoke test of question_answer on a tiny hand-written passage.
# NOTE(review): "fo" in the question looks like a typo for "of". It is left as-is
# because the string is model input and the recorded output below was produced with it.
text = """John McCarthy was an American computer scientist and cognitive scientist. McCarthy was one of the founders of the discipline of artificial intelligence"""
question = "Who is the founder fo AI?"

question_answer(question, text)


Answer:
John mccarthy


In [22]:
import json
# Load the evaluation set. JSON text is UTF-8, so pass the encoding explicitly
# instead of relying on the platform's default locale encoding.
# NOTE: this rebinds `data`, which held the CoQA DataFrame in earlier cells.
with open('/content/data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
# Run the QA model over every evaluation item and collect lower-cased
# predicted / reference answer pairs for scoring.
predicted_answers = []
true_answers = []

for item in data:
    description = item['description']
    question = item['question']
    true_answer = item['answer']

    # Trim the description (never the question) so the pair fits the 512
    # positions this BERT checkpoint supports; longer inputs fail in the forward pass.
    input_ids = tokenizer.encode(
        question, description, truncation="only_second", max_length=512
    )
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    assert len(segment_ids) == len(input_ids)

    # Inference only: no_grad() avoids accumulating an autograd graph per item.
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)

    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start + 1, answer_end + 1):
            if tokens[i][0:2] == "##":
                # WordPiece continuation: glue onto the previous piece.
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
    else:
        # End before start: no usable span, record an empty prediction.
        answer = ""

    predicted_answers.append(answer.lower())
    true_answers.append(true_answer.lower())



In [None]:
#predicted_answers

In [None]:
#true_answers

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Cached spaCy pipeline; loaded on first use by _get_spacy_pipeline().
_SPACY_PIPELINE = None


def _get_spacy_pipeline():
    """Load the "en_core_web_sm" spaCy pipeline once and reuse it afterwards."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = spacy.load("en_core_web_sm")
    return _SPACY_PIPELINE


def filter_stop_words(text):
    """Return `text` lower-cased with spaCy stop-word tokens removed.

    The pipeline is loaded once and cached. Reloading it on every call, as
    before, dominated the runtime because this function is called twice per
    evaluated answer pair.

    Args:
        text: Input string.

    Returns:
        The remaining token texts, lower-cased and joined by single spaces
        (an empty string when every token is a stop word).
    """
    nlp = _get_spacy_pipeline()
    tokens = [token.text.lower() for token in nlp(text) if not token.is_stop]
    return ' '.join(tokens)

def calculate_cosine_similarity(text1, text2):
    """Cosine similarity between the bag-of-words count vectors of two strings.

    CountVectorizer raises ValueError ("empty vocabulary") when neither string
    contains a token it keeps, e.g. both are empty after stop-word filtering or
    hold only single-character tokens such as "3". That used to abort the whole
    evaluation. In that case the function now falls back to an exact comparison
    of the stripped strings.

    Args:
        text1: First string.
        text2: Second string.

    Returns:
        The cosine similarity of the two count vectors, or, in the
        empty-vocabulary fallback, 1.0 when the stripped strings are identical
        and 0.0 otherwise.
    """
    try:
        counts = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # No usable vocabulary: compare the raw strings instead of crashing.
        return 1.0 if text1.strip() == text2.strip() else 0.0
    vectors = counts.toarray()
    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

def evaluate_question_answering_model(predictions, true_answers, similarity_threshold=0.65):
    """Print the share of predictions that are close enough to their reference answer.

    Each prediction/reference pair is stop-word filtered with
    `filter_stop_words` and compared with `calculate_cosine_similarity`; the
    pair counts as correct when the similarity reaches `similarity_threshold`.

    Args:
        predictions: Sequence of predicted answer strings.
        true_answers: Sequence of reference answer strings, same length.
        similarity_threshold: Minimum similarity for a prediction to count as
            correct. Defaults to 0.65.

    Returns:
        None. The accuracy line is printed.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(predictions) != len(true_answers):
        raise ValueError("Number of predictions must be equal to the number of true answers.")

    correct_answers = 0
    total_questions = len(predictions)

    for predicted_answer, true_answer in zip(predictions, true_answers):
        filtered_predicted_answer = filter_stop_words(predicted_answer)
        filtered_true_answer = filter_stop_words(true_answer)
        similarity_score = calculate_cosine_similarity(filtered_predicted_answer, filtered_true_answer)
        if similarity_score >= similarity_threshold:
            correct_answers += 1

    # Empty input used to raise ZeroDivisionError; report 0% instead.
    accuracy = correct_answers / total_questions if total_questions else 0.0
    print(f"Accuracy: {accuracy * 100:.2f}% ({correct_answers}/{total_questions} correct answers)")



In [29]:
# Score the collected predictions against the reference answers and print the accuracy.
evaluate_question_answering_model(predicted_answers, true_answers)

Accuracy: 84.13% (53/63 correct answers)


### User Implementation

In [None]:
# Interactive loop: read one passage, then answer questions about it until the
# user declines to ask another.
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")

while True:
    question_answer(question, text)

    # Re-prompt until the reply starts with Y or N. The reply is stripped and
    # upper-cased so "y"/"n" are accepted, and [:1] (rather than [0]) keeps an
    # empty reply from raising IndexError.
    while True:
        response = input("\nDo you want to ask another question based on this text (Y/N)? ")
        choice = response.strip().upper()[:1]
        if choice in ("Y", "N"):
            break

    if choice == "N":
        print("\nBye!")
        break

    question = input("\nPlease enter your question: \n")