<a href="https://colab.research.google.com/github/Snechavan/PDFSummarizer/blob/main/Untitled22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Step 1: Install necessary libraries
# Each line below starts with "!", so it is executed as a shell command by the
# notebook (IPython/Colab); this cell is not valid as a plain Python script.
!pip install transformers
!pip install nltk
!pip install fpdf
# NOTE(review): gensim is not imported anywhere in the visible code - confirm it is still needed.
!pip install gensim
!pip install PyPDF2  # Install PyPDF2 for PDF handling

# Step 2: Import libraries
import nltk
from transformers import pipeline, AutoTokenizer
from google.colab import files
from fpdf import FPDF  # For saving summary as PDF
from PyPDF2 import PdfReader  # For extracting text from PDFs

# Step 3: Download NLTK resources
nltk.download('punkt')

# Steps 4-5: Load the summarization pipeline and the matching tokenizer.
# Both are built from the same BART checkpoint so that the token counts used
# for chunking agree with what the summarization model actually sees.
_BART_CHECKPOINT = "facebook/bart-large-cnn"

abstractive_summarizer = pipeline("summarization", model=_BART_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_BART_CHECKPOINT)

def abstractive_summary(text):
    """Summarize *text* with the BART pipeline, splitting long input into chunks.

    The text is tokenized with the module-level ``tokenizer``. Input that fits
    in one model window is summarized directly; longer input is cut into
    token-based chunks, each chunk is summarized on its own, and the partial
    summaries are joined with single spaces.

    Args:
        text: The text to summarize.

    Returns:
        The summary string, or an empty string when *text* yields no tokens.
    """
    max_tokens = 1024  # BART model max token limit
    # The pipeline re-tokenizes every chunk and wraps it in <s> ... </s>, so
    # leave room for those two special tokens inside the model window.
    chunk_size = max_tokens - 2

    # Encode without special tokens: otherwise <s>/</s> are counted as content
    # and a "full" chunk overflows the window once the pipeline adds them back.
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)
    if not tokens:
        return ""

    if len(tokens) <= chunk_size:
        # Short input: summarize the original string (no decode round-trip).
        pieces = [text]
    else:
        pieces = [
            tokenizer.decode(tokens[start:start + chunk_size], skip_special_tokens=True)
            for start in range(0, len(tokens), chunk_size)
        ]

    summaries = []
    for piece in pieces:
        # truncation=True is a safety net: decoding and re-encoding a chunk is
        # not guaranteed to reproduce exactly the same number of tokens.
        result = abstractive_summarizer(
            piece, min_length=30, max_length=200, truncation=True
        )
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)

# Step 6: Functions to extract text from uploaded files
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. Pages for which
        ``extract_text()`` returns nothing are skipped, so the result is an
        empty string when no page yields text.
    """
    with open(pdf_path, "rb") as pdf_file:
        # Extraction has to happen while the file is still open.
        page_texts = [page.extract_text() for page in PdfReader(pdf_file).pages]

    # Separate pages with a newline so the last word of one page does not get
    # fused with the first word of the next one.
    return "\n".join(page_text for page_text in page_texts if page_text)

def extract_text_from_txt(txt_path):
    """Read the UTF-8 text file at *txt_path* and return its whole content."""
    with open(txt_path, mode="r", encoding="utf-8") as handle:
        return handle.read()

# Step 7: Summarize uploaded files and create new PDF with summary
def summarize_uploaded_file(uploaded_files, method="abstractive"):
    """Summarize each uploaded file and offer the summary as a PDF download.

    For every path in *uploaded_files* the text is extracted (``.pdf`` and
    ``.txt`` are supported, matched case-insensitively), summarized, printed,
    written to ``<original name without extension>_summary.pdf`` and passed to
    ``files.download``. Files with an unsupported extension or without any
    extractable text are reported and skipped; the remaining files are still
    processed.

    Args:
        uploaded_files: Iterable of file paths to summarize.
        method: Summarization method; only ``"abstractive"`` is implemented.

    Returns:
        None. Results are printed and downloaded as a side effect.
    """
    import os  # local import: only needed for the path handling below

    # The method is the same for every file, so reject a bad value before doing
    # any extraction work.
    if method != "abstractive":
        print("Invalid summarization method selected.")
        return

    extractors = {
        ".pdf": extract_text_from_pdf,
        ".txt": extract_text_from_txt,
    }

    for file_path in uploaded_files:
        # splitext only splits off the final extension, so names such as
        # "report.v2.pdf" keep their full stem.
        base_path, extension = os.path.splitext(file_path)
        extractor = extractors.get(extension.lower())
        if extractor is None:
            print(f"Unsupported file format: {extension.lstrip('.')}")
            continue  # skip this file only; keep processing the others

        text = extractor(file_path)

        # Ensure text was extracted
        if not text.strip():
            print(f"No text extracted from {file_path}, skipping summarization.")
            continue

        # Display the extracted text for debugging
        print(f"Extracted text from {file_path}:\n{text[:500]}...")  # Display the first 500 characters

        summary = abstractive_summary(text)

        print(f"Summary for {file_path} ({method} summarization):")
        print(summary)
        print("\n" + "-"*80 + "\n")

        # Step 8: Save summary as a new PDF
        output_pdf_path = base_path + "_summary.pdf"
        save_summary_as_pdf(summary, output_pdf_path)

        # Trigger the download
        files.download(output_pdf_path)  # Download the summary PDF

# Step 9: Save summary to a PDF file (text is reduced to Latin-1 before rendering)
def save_summary_as_pdf(summary, output_pdf_path):
    """Write *summary* to a one-column PDF at *output_pdf_path*.

    The text is rendered with the "Arial" font at size 12 and is reduced to
    Latin-1 first. Common typographic punctuation (curly quotes, en/em dashes,
    ellipsis) is mapped to its ASCII equivalent; any other character that
    cannot be encoded as Latin-1 is replaced by "?".

    Args:
        summary: The summary text to write.
        output_pdf_path: Destination path of the PDF file.
    """
    # Punctuation outside Latin-1 that would otherwise be turned into "?" by
    # the 'replace' error handler below.
    ascii_punctuation = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2013": "-",
        "\u2014": "-",
        "\u2026": "...",
    })
    printable = summary.translate(ascii_punctuation)
    printable = printable.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # NOTE(review): "Arial" is not registered as a Unicode font here; to keep
    # non-Latin-1 characters a Unicode TTF font would have to be added instead.
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 6, printable)

    pdf.output(output_pdf_path)

# Step 10: Ask the user to upload files; the result is keyed by file name
uploaded = files.upload()
uploaded_file_paths = list(uploaded)

# Step 11: Summarize every uploaded file
summarize_uploaded_file(uploaded_file_paths, method="abstractive")


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... done
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=094244a471cc194177abfed6068e6b4e43f9bd2ba93532760756d5b59dd0781c
  Stored in directory: /root/.cache/pip/wheels/f9/95/ba/f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 3.9 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
