## Baseline question answering model

#### Functions to read pdf and convert it into smaller chunks

In [1]:

# Import libraries
import PyPDF2 

# Function to read pdf
def read_pdf(filepath:str) -> PyPDF2._reader.PdfReader:
    reader = PyPDF2.PdfReader(filepath)
    return reader

# Function that returns only content of pdf as string
def get_pdf_content(filepath:str) -> str:
    reader = read_pdf(filepath=filepath)
    pdf_content = ''
    for page in reader.pages:
        pdf_content += page.extract_text()
    
    return pdf_content


# Function that takes large block of text and return list of smaller segments / chunks
def content_to_segments(content:str, max_words:int = 250, overlap:int=20)->list[str]:
    lines = content.split(' ')
    chunks = []
    
    # Iterate through the lines with the specified overlap
    for i in range(0, len(lines), max_words - overlap):
        # Extract the chunk with the given number of lines, including overlap
        chunk = " ".join(lines[i:i + max_words])
        chunks.append(chunk)
    
    return chunks

In [2]:
content = get_pdf_content(filepath='Data/computer.pdf')
segments = content_to_segments(content)

### Question answering model

In [3]:
# Import libraries
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:

# Main function that returns answer given a context
def getAnswer(question:str, context:str)-> dict: 
    # Tokenize the input
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")

    # Get the input IDs and attention mask
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Get the start and end scores for the answer
    outputs = model(input_ids, attention_mask=attention_mask)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Apply softmax to get probabilities
    start_probs = F.softmax(start_scores, dim=1).tolist()[0]
    end_probs = F.softmax(end_scores, dim=1).tolist()[0]

    # Get the most likely beginning and end of the answer span
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    
    tokens = input_ids[0][start_index : end_index + 1]
    answer = tokenizer.decode(tokens, skip_special_tokens=True)

    # Certainty of answer
    prob_score = (start_probs[start_index] + end_probs[end_index]) / 2

    return {
        'answer': answer,
        'score' : prob_score
    }


## Get answer from multiple segments
def getAnswerFromSegments(question:str, segments:list[str])->list[dict]:
    answers = []
    for segment in segments:
        try:
            ans = getAnswer(question=question, context=segment)
            if(ans['answer'] != ''):
                answers.append(ans)
        except:
            ...
       
    
    try:
        ans = max(answers, key=lambda x: x['score'])
    except:
        ans='No answer found'

    return ans

### Test the model

In [5]:
import pandas as pd

test_data = pd.read_csv("Data/test_data.csv", delimiter=", ")

  test_data = pd.read_csv("Data/test_data.csv", delimiter=", ")


In [6]:
pred_answers = []
n = len(test_data['question'])
i = 1

for question in test_data['question'][:n]:
    ans = getAnswerFromSegments(question=question, segments=segments)
    pred_answers.append(ans)
    print(i, "Questions answered")
    i+=1

Token indices sequence length is longer than the specified maximum sequence length for this model (802 > 512). Running this sequence through the model will result in indexing errors


1 Questions answered
2 Questions answered
3 Questions answered
4 Questions answered
5 Questions answered
6 Questions answered
7 Questions answered
8 Questions answered
9 Questions answered
10 Questions answered
11 Questions answered
12 Questions answered
13 Questions answered
14 Questions answered
15 Questions answered
16 Questions answered
17 Questions answered
18 Questions answered
19 Questions answered
20 Questions answered
21 Questions answered
22 Questions answered
23 Questions answered
24 Questions answered
25 Questions answered


In [15]:
pred_answers[2]

'No answer found'

In [16]:

ans = []
for pred in pred_answers:
    try:
        ans.append(pred['answer'].replace("\n", " "))
    except:
        ans.append(pred)
    
model_result = pd.DataFrame({'question' : test_data['question'][:n], 
                            'answer' : test_data['answer'][:n], 
                            'pred_answers': ans})


In [17]:
model_result.to_csv('Data/test_result.csv')

In [23]:
len(content.split(' '))

47493

In [28]:
test_data['type'].value_counts()

type
s    22
l     3
Name: count, dtype: int64

# 1. Baseline model Evolution
### Used PDF
**Pdf used** : Computer Science Grade 10 [English version]  

**PDF size** : 257 pages, 4354 Lines, 47493 Words

### Models used

**Model** : deepset/roberta-base-squad2

**Tokenizer** : deepset/roberta-base-squad2 

### Segmentation
**Max words** : 250

**Overlap** : 20

**No of chunks** : 207

### Testing dataset
**No of questions** : 25

**Types** : 22 short and 3 long

### Result
**Time taken** : 55 min 45.2 sec

**Average time** : 2 min 13.8 sec

**Fully Correct**: 8 (6 Terminology,  2 Sentence)

**somehow right**: 2 (2 Sentence)

**partial answer**: 5 (3 Point based LQ, 1 Sentence, 1 Terminology)

**No ans**: 5

**Wrong** : 5
