In [1]:
pip install gradio



In [2]:
pip install pypdf2

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 11.1 MB/s eta 0:00:00
Installing collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [3]:
import gradio as gr
import PyPDF2
from transformers import pipeline

# Build both Hugging Face pipelines once, when this cell runs, so every
# request handled by the app reuses the same loaded models.
#   - summarizer:  "summarization" task backed by facebook/bart-large-cnn
#   - qa_pipeline: "question-answering" task backed by
#                  distilbert-base-cased-distilled-squad
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline(task="question-answering", model="distilbert-base-cased-distilled-squad")

# Extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF as a single string.

    Args:
        file: Passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        str: The text of each page joined by a newline. Pages for which
        ``extract_text()`` returns ``None`` or ``''`` are skipped, so the
        result is ``''`` when no page yields any text.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page; str.join also avoids building the result
    # through repeated string concatenation.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Split text into chunks
def split_text_into_chunks(text, chunk_size=1000):
    """Split ``text`` into pieces of at most ``chunk_size`` characters.

    A chunk boundary that would fall inside a word is moved back to the last
    whitespace character in the current window, so words are not cut in half.
    If the window contains no whitespace at all, the text is cut at exactly
    ``chunk_size`` characters. Each chunk is stripped of surrounding
    whitespace, and chunks that end up empty are dropped.

    Args:
        text (str): Text to split.
        chunk_size (int): Maximum number of characters per chunk; must be > 0.

    Returns:
        list[str]: Non-empty chunks in document order (``[]`` for empty or
        whitespace-only text).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive size would otherwise either fail with an unrelated
        # range() error (0) or silently discard the whole text (negative).
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = min(start + chunk_size, text_length)
        if end < text_length and not text[end].isspace():
            # The window ends mid-word: walk back to just after the last
            # whitespace character inside the window, if there is one.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end  # end > start always holds, so the loop makes progress
    return chunks

# Summarize PDF with chunking
def summarize_pdf(file):
    """Summarize an uploaded PDF chunk by chunk and return the combined text.

    The PDF text is extracted with ``extract_text_from_pdf``, split into
    chunks of up to 1000 characters with ``split_text_into_chunks``, and each
    non-blank chunk is summarized independently with ``summarizer``. The
    per-chunk summaries are joined with single spaces.

    Args:
        file: The uploaded PDF as delivered by the Gradio file input, or
            ``None`` when nothing has been uploaded.

    Returns:
        str: The combined summary, or a human-readable message when no file
        was provided, no text could be extracted, or an error occurred.
        Exceptions are never propagated, so the message can be shown
        directly in the output textbox.
    """
    if file is None:
        # Clicking the button without an upload would otherwise surface a
        # low-level library error.
        return "Please upload a PDF file first."
    try:
        text = extract_text_from_pdf(file)
        # Blank chunks carry nothing to summarize; skip them.
        chunks = [
            chunk
            for chunk in split_text_into_chunks(text, chunk_size=1000)  # Adjust chunk size as needed
            if chunk.strip()
        ]
        if not chunks:
            # e.g. a PDF whose pages yield no extractable text
            return "No extractable text was found in the document."
        summaries = [
            summarizer(
                chunk,
                max_length=150,
                min_length=30,
                do_sample=False,
                # Guard against a chunk that tokenizes to more tokens than
                # the model accepts.
                truncation=True,
            )[0]['summary_text']
            for chunk in chunks
        ]
        return " ".join(summaries)
    except Exception as e:
        # Deliberate catch-all: the UI shows the message instead of failing.
        return f"An error occurred: {str(e)}"

# Question Answering with chunking
def answer_question(file, question):
    """Answer a question about an uploaded PDF.

    The PDF text is extracted with ``extract_text_from_pdf`` and split into
    chunks of up to 500 characters with ``split_text_into_chunks``.
    ``qa_pipeline`` is run on every non-blank chunk, and the answer with the
    highest ``score`` (which must be greater than 0) is returned.

    Args:
        file: The uploaded PDF as delivered by the Gradio file input, or
            ``None`` when nothing has been uploaded.
        question (str): The question to ask about the document.

    Returns:
        str: The best-scoring answer, or a human-readable message when the
        file or question is missing, no answer was found, or an error
        occurred. Exceptions are never propagated, so the message can be
        shown directly in the output textbox.
    """
    if file is None:
        return "Please upload a PDF file first."
    if not question or not question.strip():
        return "Please enter a question."
    try:
        text = extract_text_from_pdf(file)
        best_answer = ""
        max_score = 0
        for chunk in split_text_into_chunks(text, chunk_size=500):  # Smaller chunks for QA
            if not chunk.strip():
                # A blank chunk cannot contain an answer; skipping it saves a
                # model call and keeps it from aborting the whole search if
                # the pipeline rejects such input.
                continue
            result = qa_pipeline(question=question, context=chunk)
            if result["score"] > max_score:
                max_score = result["score"]
                best_answer = result["answer"]
        return best_answer if best_answer else "No answer found in the document."
    except Exception as e:
        # Deliberate catch-all: the UI shows the message instead of failing.
        return f"An error occurred: {str(e)}"

# Create Gradio interface with tabs for Summarization and QA.
# Each tab declares its own file-upload component, so a PDF is uploaded
# separately in whichever tab is being used. Components appear in the order
# they are created inside the nested `with` blocks.
with gr.Blocks() as demo:
    gr.Markdown("# PDF Summarization and Question Answering Tool")

    with gr.Tabs():
        # Tab 1: upload a PDF and show the text returned by summarize_pdf.
        with gr.TabItem("Summarization"):
            with gr.Row():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            with gr.Row():
                summary_output = gr.Textbox(label="Summary")
            summarize_button = gr.Button("Summarize")
            # Clicking the button calls summarize_pdf(pdf_input) and writes
            # the returned string into the "Summary" textbox.
            summarize_button.click(fn=summarize_pdf, inputs=pdf_input, outputs=summary_output)

        # Tab 2: upload a PDF, type a question, and show the text returned
        # by answer_question.
        with gr.TabItem("Question Answering"):
            with gr.Row():
                pdf_input_qa = gr.File(label="Upload PDF", file_types=[".pdf"])
                question_input = gr.Textbox(label="Ask a Question")
            with gr.Row():
                answer_output = gr.Textbox(label="Answer")
            ask_button = gr.Button("Ask")
            # Inputs are passed positionally: (file, question).
            ask_button.click(fn=answer_question, inputs=[pdf_input_qa, question_input], outputs=answer_output)

# Launch Gradio interface with shareable link.
# NOTE(review): share=True publishes the app at a public *.gradio.live URL
# (see the recorded run output), so anyone with the link can use it while this
# cell is running — confirm that exposure is intended.
demo.launch(share=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ca13d71c39ea95accc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


