# INSTALLING LIBRARIES

In [1]:
!pip install transformers PyPDF2 torch gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m134.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.2/298.2 kB[0m [31m35.9 MB/s[0m 

# IMPORTS

In [2]:
import gradio as gr
import os
from PyPDF2 import PdfReader
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# TEXT EXTRACTION FROM PDF

In [3]:
# Parse the PDF and extract text
def extract_text_from_pdf(pdf_path):
    text = ""
    pdf_reader = PdfReader(pdf_path)
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text

# LOADING MODEL

In [4]:
# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# DEFINING QUESTION ANSWERING FUNCTION

In [11]:
# Function to perform question answering on the PDF text
def perform_question_answering(pdf_file, user_question):

    try:
      pdf_text = extract_text_from_pdf(pdf_file.name)
      paragraphs = pdf_text.split("\n\n")

      best_answer = None
      #best_score = -1
      best_confidence = -float("inf")
      for paragraph in paragraphs:
          question = user_question
          context = paragraph

          inputs = tokenizer(question, context, return_tensors="pt")

          with torch.no_grad():
              outputs = model(**inputs)

          '''Assigning a higher score to answers that were longer in terms of the number of characters.
          Longer answers would receive a higher score under this scoring method.
          However, this scoring method is quite simplistic and may not necessarily reflect the quality
          or correctness of the answer. It's based solely on the length of the text and doesn't take into
          account the semantic meaning or relevance of the answer to the question. That's why it was
          recommended to use a more sophisticated scoring method like confidence scores or F1 score to
          improve the quality of the selected answer.

          CODE:
          answer_start = torch.argmax(outputs.start_logits)
          answer_end = torch.argmax(outputs.end_logits) + 1

          answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end]))

          score = len(answer)
          if score > best_score:
          best_score = score
          best_answer = answer'''

#Comparing confidence scores
          answer_start_scores = outputs.start_logits
          answer_end_scores = outputs.end_logits

          confidence = torch.max(answer_start_scores) - torch.max(answer_end_scores)

          answer_start = torch.argmax(answer_start_scores)
          answer_end = torch.argmax(answer_end_scores) + 1

          answer = tokenizer.convert_tokens_to_string(
              tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
          )

          if confidence > best_confidence:
              best_confidence = confidence
              best_answer = answer

      if best_answer:
          return best_answer
      else:
          return "No answer found."
    except Exception as e:
        # Return an error message with details in case of an exception
        return f"Error processing PDF: {str(e)}"

# IMPLEMENTING THROUGH GRADIO

In [12]:
# Define the Gradio interface
iface = gr.Interface(
    fn=perform_question_answering,
    inputs=[
        gr.inputs.File(label="Upload PDF"),
        gr.inputs.Textbox(label="Ask a question")
    ],
    outputs=gr.outputs.Textbox(label="Answer")
)

iface.launch()

  gr.inputs.File(label="Upload PDF"),
  gr.inputs.File(label="Upload PDF"),
  gr.inputs.File(label="Upload PDF"),
  gr.inputs.Textbox(label="Ask a question")
  gr.inputs.Textbox(label="Ask a question")
  gr.inputs.Textbox(label="Ask a question")
  outputs=gr.outputs.Textbox(label="Answer")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

