In [1]:
import requests
from bs4 import BeautifulSoup
import re
import spacy
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
import pandas as pd

In [2]:
url = "https://simple.wikipedia.org/wiki/Shivaji"

response = requests.get(url)
html_content = response.content

soup = BeautifulSoup(html_content, "html.parser")
raw_text = ""

paragraphs = soup.find_all('p')
for p in paragraphs:
    raw_text += p.get_text()

text = re.sub(r'\[.*?\]|\(.*?\)', '', raw_text)
text = re.sub(r'\b\n(?=[^\W\d_])', '', text)

text

"Shivaji was the founder of the Maratha Empire. He was born in the Shivneri Fort in Maharashtra probably on 19 February 1630. He is named after a local goddess, Shivai Devi.  \nShivaji is one of the revered historical figures of Maharashtra. He created an independent and sovereign state in the Maharashtra region. His mother, Raajmaata Jijabai was the daughter of Shri.Lakhuji Jadhavrao of Sindkhed. His father Shri.Shahajiraje Bhosale was a  Maratha general  in the Deccan. \nMost of the territory in Maharashtra was then under the possession of the Nizamshah of Ahmednagar and the Adilshah of Bijapur who were known as the Deccan sultanates. The Mughals launched a campaign to conquer the Nizamshahi Kingdom. The Adilshah of Bijapur allied with the Mughals in this campaign. Shri. Shahajiraje Bhosale tried to rebel, but he could not withstand the combined might of the Mughals and the Adilshahi. The Nizamshahi kingdom came to an end in 1636. Thereafter Shri.Shahajiraje became a Sardar of the Ad

In [3]:
num_parts = 10

full_stops_indices = [i for i, char in enumerate(text) if char == '.']
full_stops_per_part = len(full_stops_indices) // num_parts
split_indices = [full_stops_indices[(idx+1)*full_stops_per_part] for idx in range(num_parts-1)]
parts = [text[i:j] for i, j in zip([0] + split_indices, split_indices + [None])]

for i, part in enumerate(parts):
    print(f"Chunk {i+1}: {part.strip()}\n")

Chunk 1: Shivaji was the founder of the Maratha Empire. He was born in the Shivneri Fort in Maharashtra probably on 19 February 1630. He is named after a local goddess, Shivai Devi.  
Shivaji is one of the revered historical figures of Maharashtra. He created an independent and sovereign state in the Maharashtra region. His mother, Raajmaata Jijabai was the daughter of Shri.Lakhuji Jadhavrao of Sindkhed. His father Shri.Shahajiraje Bhosale was a  Maratha general  in the Deccan. 
Most of the territory in Maharashtra was then under the possession of the Nizamshah of Ahmednagar and the Adilshah of Bijapur who were known as the Deccan sultanates. The Mughals launched a campaign to conquer the Nizamshahi Kingdom. The Adilshah of Bijapur allied with the Mughals in this campaign. Shri. Shahajiraje Bhosale tried to rebel, but he could not withstand the combined might of the Mughals and the Adilshahi. The Nizamshahi kingdom came to an end in 1636. Thereafter Shri.Shahajiraje became a Sardar of 

In [4]:
def answer_question(context, question):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    inputs = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=512, truncation=True)

    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        
    start_scores = torch.softmax(outputs.start_logits, dim=1).cpu().numpy()[0]
    end_scores = torch.softmax(outputs.end_logits, dim=1).cpu().numpy()[0]

    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits)

    # Get the tokens as a list of strings
    all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].tolist()[0])

    answer_tokens = all_tokens[start_index:end_index+1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens)

    confidence = (start_scores[start_index] + end_scores[end_index]) / 2

    return answer, confidence

In [5]:
def main(question):
    answers_dict = {}
    for i, part in enumerate(parts):
        answer, confidence = answer_question(part, question)
        answers_dict[f"Chunk {i+1}"] = {answer :confidence}

        max_score = max(answers_dict.values(), key=lambda x: list(x.values())[0])
        max_value = list(max_score.keys())[0]
        
    print(answers_dict)
    print("max confidence:", list(max_score.values())[0])
    print("____________________________________________________________________________________")
    return max_value



In [6]:
question = "When Shivaji Maharahj Died"
print("Question:",question+"\nAnswer:",main(question))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trai

{'Chunk 1': {'.': 0.16971230506896973}, 'Chunk 2': {'[CLS]': 0.48739564418792725}, 'Chunk 3': {'[SEP]': 0.4951815903186798}, 'Chunk 4': {'[CLS]': 0.4951270818710327}, 'Chunk 5': {'[CLS]': 0.49649566411972046}, 'Chunk 6': {'[CLS]': 0.4956756830215454}, 'Chunk 7': {'[CLS]': 0.49542126059532166}, 'Chunk 8': {"[CLS] when shivaji maharahj died [SEP] . on the way back he fought a battle with the mughals at vani - dindori in nashik district . he defeated daud khan in the battle . the marathas under the leadership of moropant pingle captured trimbakgad . the maratha army then invaded baglan , a hilly district that was guarded by nine hill forts . the maratha army not only captured smaller hill forts of baglan but also captured mulher fort and salher which lay on the border of khandesh and gujarat . salher became a base of operations against the rich provinces of gujarat and khandesh . the mughals tried to recapture salher but failed . in 1672 , shivaji ' s army conquered the principality of ja