In [1]:
!pip install PyPDF2 transformers nltk gradio

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m342.1 kB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1

In [3]:
import gradio as gr
from google.colab import files
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import pipeline
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page_num in range(len(reader.pages)):
        text += reader.pages[page_num].extract_text()
    return text

# Function to preprocess text
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(cleaned_tokens)

# Load the QA pipeline model
qa_pipeline = pipeline("question-answering")

# Function to answer a question using the QA model
def answer_question(question, context):
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Function to split the report into sections
def split_report_into_sections(text):
    sections = re.split(r'\n\s*\n', text)  # Split based on double newline
    section_dict = {}
    for section in sections:
        if '\n' in section:
            heading, content = section.split('\n', 1)
            section_dict[heading.lower()] = content
    return section_dict

# Function to get the relevant section based on the question
def get_relevant_section(question, section_dict):
    for heading in section_dict:
        if any(word in heading for word in question.lower().split()):
            return section_dict[heading]
    return " ".join(section_dict.values())  # If no match, search the whole report

# Main function with sections
def main_with_sections(pdf_file, question):
    # Step 1: Extract and split the report into sections
    text = extract_text_from_pdf(pdf_file)
    section_dict = split_report_into_sections(text)

    # Step 2: Get the relevant section based on the question
    relevant_section = get_relevant_section(question, section_dict)

    # Step 3: Answer the question using the QA model
    answer = answer_question(question, relevant_section)

    return answer

# Gradio Interface Function
def process_pdf_question(pdf_file, question):
    return main_with_sections(pdf_file.name, question)

# Creating Gradio Interface using the updated API
with gr.Blocks() as interface:
    gr.Markdown("# PDF Question Answering System")
    gr.Markdown("Upload a PDF file and ask a question. The system will analyze the text and provide an answer.")

    pdf_input = gr.File(label="Upload PDF")
    question_input = gr.Textbox(label="Ask a Question", placeholder="e.g., What is the company's profit after tax?")

    output_text = gr.Textbox(label="Answer")

    # When the button is clicked, the process_pdf_question function will be executed
    gr.Button("Submit").click(process_pdf_question, inputs=[pdf_input, question_input], outputs=output_text)

# Launch the interface
interface.launch()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://5095783b384a21bd42.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


