### Import libraries

In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

  from .autonotebook import tqdm as notebook_tqdm


### Load model

In [2]:
# Tokenizer and QA model are loaded from the same checkpoint name,
# so it is written once and shared by both calls
checkpoint = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

### QA function

In [3]:
def getAnswer(question:str, context:str)-> dict:
    """Extract the most likely answer span for ``question`` from ``context``.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        question: The question to answer.
        context: The passage searched for the answer. When the encoded
            question/context pair is longer than the tokenizer's maximum
            length, only the context is truncated (never the question).

    Returns:
        A dict with two keys:
        ``'answer'``: the decoded text of the tokens between the highest
        scoring start position and the highest scoring end position
        (inclusive). It is an empty string when the predicted end comes
        before the predicted start, because the token slice is then empty.
        ``'prob_score'``: the mean of the softmax probabilities of the chosen
        start and end positions, as a Python float.
    """
    # Calling the tokenizer directly replaces the deprecated encode_plus().
    # truncation="only_second" shortens the context instead of handing the
    # model a sequence longer than the tokenizer's configured maximum.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
    )
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Pure inference: no gradients are needed, so skip autograd bookkeeping
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # A single question/context pair is encoded, so work on row 0 only
    start_scores = outputs.start_logits[0]
    end_scores = outputs.end_logits[0]

    # Plain ints are safe for slicing and indexing (argmax yields 0-dim tensors)
    start_index = int(torch.argmax(start_scores))
    end_index = int(torch.argmax(end_scores))

    # Convert the selected token ids back to text
    tokens = input_ids[0][start_index : end_index + 1]
    answer = tokenizer.decode(tokens, skip_special_tokens=True)

    # Certainty of the answer: average probability of the chosen boundaries
    start_prob = F.softmax(start_scores, dim=0)[start_index].item()
    end_prob = F.softmax(end_scores, dim=0)[end_index].item()
    prob_score = (start_prob + end_prob) / 2

    return {
        'answer': answer,
        'prob_score' : prob_score
    }

### Testing

In [4]:
# Passage the model reads, followed by the question asked about it
context = """
The Apollo program was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA),
which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-person spacecraft
to follow the one-person Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of
"landing a man on the Moon and returning him safely to the Earth" by the end of the 1960s, which he proposed in an address to Congress on May 25, 1961.
"""

question = "What was the goal of the Apollo program?"


# Echo the question, then run the model and show only the extracted answer text
print(f"Question: {question}")
apollo_result = getAnswer(question=question, context=context)
print(f"Answer: {apollo_result['answer']}")


Question: What was the goal of the Apollo program?
Answer:  landing the first humans on the Moon


### PDF reader

In [3]:
import PyPDF2

In [4]:
# Open the PDF used as QA context in the cells below.
# The path is relative, so it resolves against the kernel's working directory.
reader = PyPDF2.PdfReader('../PDFs/context.pdf')

In [5]:
type(reader)

PyPDF2._reader.PdfReader

In [7]:
reader.pages[0].extract_text()

'Unit 1\nIntroduction to Cryptography\nCryptography\nCryptography\xa0is technique  of securing information  and communications  through use of\ncodes so that only those person for whom the information is intended can understand it and\nprocess it. Thus, preventing unauthorized access to information is known as cryptography.\nThe prefix “crypt” means “hidden” and suffix graphy means “writing”.\n1.In Cryptography, the techniques, which are used to protect information, are obtained\nfrom  mathematical  concepts  and  a  set  of  rule  based  calculations  known  as\nalgorithms to convert messages in ways that make it hard to decode it. \n2.Algorithms are used for cryptographic key generation, digital signing, verification to\nprotect  data  privacy,  web  browsing  on  internet  and  to  protect  confidential\ntransactions such as credit card and debit card transactions.\n3.Cryptography is associated with the process of converting ordinary plain text into\nunintelligible text and vice-ver

In [8]:
# Full document text: the extracted text of every page, in page order.
# NOTE(review): pages are joined with no separator, so the last line of one
# page and the first line of the next end up on the same line.
pdf_context = ''.join(page.extract_text() for page in reader.pages)

In [9]:
pdf_context

'Unit 1\nIntroduction to Cryptography\nCryptography\nCryptography\xa0is technique  of securing information  and communications  through use of\ncodes so that only those person for whom the information is intended can understand it and\nprocess it. Thus, preventing unauthorized access to information is known as cryptography.\nThe prefix “crypt” means “hidden” and suffix graphy means “writing”.\n1.In Cryptography, the techniques, which are used to protect information, are obtained\nfrom  mathematical  concepts  and  a  set  of  rule  based  calculations  known  as\nalgorithms to convert messages in ways that make it hard to decode it. \n2.Algorithms are used for cryptographic key generation, digital signing, verification to\nprotect  data  privacy,  web  browsing  on  internet  and  to  protect  confidential\ntransactions such as credit card and debit card transactions.\n3.Cryptography is associated with the process of converting ordinary plain text into\nunintelligible text and vice-ver

In [10]:
# Sample questions asked against the PDF text in the next cell
test_questions = [
    'What is Cryptography ?',
    'What is encryption ?',
]

In [11]:
# The context is the same for every question, so extract the first page once
# instead of re-parsing it on each iteration. Only its first 1000 characters
# are used (presumably to keep the input short enough for the model).
first_page_context = reader.pages[0].extract_text()[:1000]

for question in test_questions:
    # Print the question followed by the full result dict (answer + score)
    print(f"Question: {question}")
    print(f"Answer: {getAnswer(question=question, context=first_page_context)}")


Question: What is Cryptography ?
Answer: {'answer': ' technique  of securing information  and communications  through use of\ncodes', 'prob_score': 0.6295348554849625}
Question: What is encryption ?
Answer: {'answer': ' technique  of securing information  and communications  through use of\ncodes so that only those person for whom the information is intended can understand it and\nprocess it. Thus, preventing unauthorized access to information', 'prob_score': 0.3868671804666519}


### Sliding window approach

In [12]:
reader.pages[0].extract_text()

'Unit 1\nIntroduction to Cryptography\nCryptography\nCryptography\xa0is technique  of securing information  and communications  through use of\ncodes so that only those person for whom the information is intended can understand it and\nprocess it. Thus, preventing unauthorized access to information is known as cryptography.\nThe prefix “crypt” means “hidden” and suffix graphy means “writing”.\n1.In Cryptography, the techniques, which are used to protect information, are obtained\nfrom  mathematical  concepts  and  a  set  of  rule  based  calculations  known  as\nalgorithms to convert messages in ways that make it hard to decode it. \n2.Algorithms are used for cryptographic key generation, digital signing, verification to\nprotect  data  privacy,  web  browsing  on  internet  and  to  protect  confidential\ntransactions such as credit card and debit card transactions.\n3.Cryptography is associated with the process of converting ordinary plain text into\nunintelligible text and vice-ver

#### Segmentation

In [13]:
def to_segments(content:str, max_lines:int = 30)->list[str]:
    """Split ``content`` into consecutive chunks of at most ``max_lines`` lines.

    The text is split on ``'\\n'``; each chunk is the next ``max_lines`` lines
    joined back with ``'\\n'``. Chunks do not overlap, and joining the returned
    chunks with ``'\\n'`` reproduces ``content``.

    Args:
        content: The text to split.
        max_lines: Maximum number of lines per chunk. Values below 1 are
            treated as 1, i.e. one line per chunk.

    Returns:
        The chunks in document order. Every chunk except possibly the last
        holds exactly ``max_lines`` lines. An empty ``content`` yields ``['']``.
    """
    lines = content.split('\n')

    # Guard against a zero or negative step, which range() would reject
    step = max(1, max_lines)

    # Slicing with a fixed step keeps every chunk at `step` lines; there is no
    # running counter that has to be reset after each chunk is emitted.
    return ['\n'.join(lines[start:start + step]) for start in range(0, len(lines), step)]


In [14]:
# Split the whole PDF text into line-based segments (default max_lines of to_segments)
pdf_segments = to_segments(pdf_context)

In [15]:
question = 'What is Man-In-The-Middle attack ?'

# Ask the same question of every segment and keep only non-empty answers
answers = []
for segment in pdf_segments:
    ans = getAnswer(question=question, context=segment)
    if not ans['answer']:
        # The model extracted no text from this segment
        continue
    answers.append(ans)

In [16]:
# Keep only candidates with non-empty answer text
filtered_answers = [ans for ans in answers if ans["answer"]]

# Pick the candidate with the highest score; default=None avoids the
# ValueError that max() raises when no segment produced an answer
best_answer = max(filtered_answers, key=lambda x: x["prob_score"], default=None)

print(f"Question: {question}")
if best_answer is None:
    print("Answer: no answer found in any segment")
else:
    print(f"Answer: {best_answer}")

Question: What is Man-In-The-Middle attack ?
Answer: {'answer': 'What is Man-In-The-Middle attack?\uf0b7It is virtually un-hack able', 'prob_score': 0.42718422412872314}


In [17]:
filtered_answers

[{'answer': ' the  attacker  chooses  random  plaintexts',
  'prob_score': 0.38262370228767395},
 {'answer': 'MITM', 'prob_score': 0.373177170753479},
 {'answer': ' mostly public key', 'prob_score': 0.36386334896087646},
 {'answer': 'What is Man-In-The-Middle attack?\uf0b7It is virtually un-hack able',
  'prob_score': 0.42718422412872314}]

In [18]:
getAnswer(question=question, context='''Man-In-The-Middle (MITM) attack :
In this type of attack, attacker intercepts the message/key between two communicating
parties through a secured channel.''')

{'answer': ' attacker intercepts the message/key between two communicating\nparties through a secured channel',
 'prob_score': 0.602444589138031}