# INSTALLING LIBRARIES

In [1]:
!pip install transformers PyPDF2 torch gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.7/138.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.9/381.9 kB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m6

# IMPORTS

In [2]:
import gradio as gr
import os
from PyPDF2 import PdfReader
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# TEXT EXTRACTION FROM PDF

In [3]:
# Parse the PDF and extract text
def extract_text_from_pdf(pdf_path) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Whatever ``PdfReader`` accepts as its first argument
            (the caller in this notebook passes a file path).

    Returns:
        The page texts joined back-to-back with no separator. Pages that
        yield no text contribute an empty string, so the result is ``""``
        for a document without extractable text.
    """
    pdf_reader = PdfReader(pdf_path)
    # `or ""` is defensive: a page whose extraction yields None (or "") must
    # not abort the whole document with a TypeError during concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# LOADING MODEL

In [4]:
# Load BERT model and tokenizer
# Single source of truth for the checkpoint id, so the tokenizer and the model
# cannot accidentally be loaded from different checkpoints.
QA_MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = BertTokenizer.from_pretrained(QA_MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(QA_MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# DEFINING QUESTION ANSWERING FUNCTION

In [9]:
# Function to perform question answering on the PDF text
_MAX_SEQ_LEN = 512        # token budget per forward pass (same limit the original call used)
_MAX_ANSWER_TOKENS = 30   # longest answer span considered, in tokens
_WINDOW_WORDS = 200       # whitespace-separated context words per forward pass
_WINDOW_OVERLAP = 50      # words shared by consecutive windows so an answer is not cut in half


def _iter_context_windows(text, window_words=_WINDOW_WORDS, overlap=_WINDOW_OVERLAP):
    """Yield overlapping windows of ``text`` (split on whitespace) as strings.

    Nothing is yielded for empty or whitespace-only text.
    """
    words = text.split()
    step = max(1, window_words - overlap)
    for begin in range(0, len(words), step):
        yield " ".join(words[begin:begin + window_words])
        if begin + window_words >= len(words):
            # This window already reaches the end of the text.
            break


def _best_span(question, context):
    """Score the best answer span for ``question`` inside ``context``.

    Returns:
        ``(score, answer_text)`` where ``score`` is the start logit plus the
        end logit of the chosen span, or ``None`` when the encoded input
        holds no context token to answer from.
    """
    # "only_second" truncates the context only, never the question.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=_MAX_SEQ_LEN,
        truncation="only_second",
    )

    with torch.no_grad():
        outputs = model(**inputs)

    input_ids = inputs["input_ids"][0]
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]
    seq_len = input_ids.shape[0]

    # An answer may only use context tokens: segment id 1, minus the closing [SEP].
    in_context = (inputs["token_type_ids"][0] == 1) & (input_ids != tokenizer.sep_token_id)

    # valid[s, e] is True when s <= e, the span is short enough, and both ends
    # lie inside the context.
    positions = torch.arange(seq_len)
    span_len = positions[None, :] - positions[:, None]
    valid = (span_len >= 0) & (span_len < _MAX_ANSWER_TOKENS)
    valid &= in_context[:, None] & in_context[None, :]
    if not bool(valid.any()):
        return None

    # Joint score of a span = start logit + end logit; invalid spans are masked out.
    span_scores = (start_logits[:, None] + end_logits[None, :]).masked_fill(
        ~valid, float("-inf")
    )
    start, end = divmod(int(torch.argmax(span_scores)), seq_len)

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[start:end + 1].tolist())
    )
    return float(span_scores[start, end]), answer


def perform_question_answering(pdf_file, user_question):
    """Answer ``user_question`` from the text of an uploaded PDF.

    The PDF text is split on blank lines into paragraphs, and each paragraph
    into overlapping word windows, so text past the per-pass token limit is
    still searched. Every window is scored with the module-level ``tokenizer``
    and ``model``; the highest-scoring non-empty span across all windows wins.

    Args:
        pdf_file: The uploaded file. Either an object exposing the path as
            ``.name`` or the path itself; ``None`` means nothing was uploaded.
        user_question: The question text.

    Returns:
        The answer string, ``"No answer found."`` when no span could be
        extracted, or a human-readable message for missing input or errors.
        This function never raises: failures are reported as
        ``"Error processing PDF: ..."`` so the UI can display them.
    """
    if pdf_file is None:
        return "Error processing PDF: no file was uploaded."

    question = (user_question or "").strip()
    if not question:
        return "Please enter a question."

    try:
        # NOTE(review): gr.File appears to hand over either a temp-file wrapper
        # with `.name` or a plain path, depending on Gradio version/config.
        # Accept both.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        pdf_text = extract_text_from_pdf(pdf_path)

        best_answer = None
        best_score = -float("inf")

        for paragraph in pdf_text.split("\n\n"):
            # Blank paragraphs yield no windows, so they cost no model call.
            for window in _iter_context_windows(paragraph):
                result = _best_span(question, window)
                if result is None:
                    continue
                score, answer = result
                if answer and score > best_score:
                    best_score = score
                    best_answer = answer

        if best_answer:
            return best_answer
        return "No answer found."

    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"Error processing PDF: {str(e)}"

# perform_question_answering is wired into the Gradio interface in the next cell.


# IMPLEMENTING THROUGH GRADIO

In [10]:
# Define the Gradio interface
# Two inputs (the PDF upload and the question) are passed positionally to
# perform_question_answering; its returned string is shown in the answer box.
iface = gr.Interface(
    fn=perform_question_answering,
    inputs=[
        # Restrict the picker to PDFs so other file types are rejected up front
        # instead of surfacing later as a parsing error.
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Textbox(label="Ask a question")
    ],
    outputs=gr.Textbox(label="Answer")
)

iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://2cfbfca18eb80f5f92.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


