In [43]:
import fitz
from unidecode import unidecode
import os
import requests
import PIL.Image
import io
from dotenv import load_dotenv, find_dotenv

# Load environment variables from a .env file (located via find_dotenv) before
# DocumentSummarizer reads HUGGINGFACE_API_TOKEN with os.getenv at class
# definition time.
load_dotenv(find_dotenv())


class DocumentSummarizer:
    """Summarize a PDF section by section with a hosted summarization model.

    The document is opened with PyMuPDF (``fitz``). Its table of contents
    drives the split into sections; each section's text is sent to the
    endpoint at ``API_URL`` in fixed-size chunks and the partial summaries
    are joined. Embedded images are exported to ``IMAGE_DIR``.
    """

    # Read once, when the class body is executed.
    API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")

    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
    # API_URL = "https://api-inference.huggingface.co/models/Falconsai/text_summarization"
    headers = {"Authorization": f"Bearer {API_TOKEN}"}

    # Number of characters sent to the model per request.
    CHUNK_SIZE = 512
    # Seconds to wait for the inference endpoint before giving up.
    REQUEST_TIMEOUT = 60
    # Directory that receives the extracted images.
    IMAGE_DIR = "./images/"

    def __init__(self, path: str) -> None:
        """Open the PDF at *path* and cache its table of contents."""
        self.doc = fitz.open(path)
        self.toc = self.doc.get_toc(simple=True)

    def _summarize(self, payload: str) -> str:
        """Send *payload* to the inference endpoint and return its summary.

        A plain string is wrapped as ``{"inputs": payload}``; a dict is sent
        unchanged so callers can add their own request parameters.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        body = payload if isinstance(payload, dict) else {"inputs": payload}
        response = requests.post(
            self.API_URL,
            headers=self.headers,
            json=body,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail with the HTTP error instead of a KeyError on the error body.
        response.raise_for_status()
        return response.json()[0]["summary_text"]

    @staticmethod
    def _chunk_text(text: str, size: int = 512) -> list:
        """Split *text* into consecutive pieces of at most *size* characters.

        The final, shorter piece is kept; pieces containing only whitespace
        are dropped.

        Raises:
            ValueError: if *size* is smaller than 1.
        """
        if size < 1:
            raise ValueError("size must be at least 1")
        pieces = (text[i:i + size] for i in range(0, len(text), size))
        return [piece for piece in pieces if piece.strip()]

    def _page_text(self, index: int) -> str:
        """Return the ASCII-transliterated text of the page at 0-based *index*."""
        return unidecode(self.doc[index].get_text())

    def _section_text(self, index: int) -> str:
        """Return the raw text of the TOC entry at *index*.

        The section runs from just after its own heading to just before the
        next TOC entry's heading, including every page in between. The last
        TOC entry is limited to the page it starts on.
        """
        title, page_no = self.toc[index][1], self.toc[index][2]
        # Page text is transliterated, so the heading must be as well.
        heading = unidecode(title)

        first_page = self._page_text(page_no - 1)
        found = first_page.find(heading)
        # Heading not found on its page: fall back to the top of the page.
        start = 0 if found == -1 else found + len(heading)

        if index + 1 == len(self.toc):
            return first_page[start:]

        next_heading = unidecode(self.toc[index + 1][1])
        next_page = self.toc[index + 1][2]

        if next_page <= page_no:
            end = first_page.find(next_heading, start)
            return first_page[start:] if end == -1 else first_page[start:end]

        parts = [first_page[start:]]
        # Pages strictly between the two headings (0-based page indices).
        parts.extend(self._page_text(i) for i in range(page_no, next_page - 1))
        last_page = self._page_text(next_page - 1)
        end = last_page.find(next_heading)
        # Next heading not found: keep the whole page.
        parts.append(last_page if end == -1 else last_page[:end])
        return "".join(parts)

    def _get_images(self) -> None:
        """Export every image referenced by the document's pages to IMAGE_DIR."""
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        # Keep only the file name so a path with directories cannot escape
        # or break the output location.
        # NOTE(review): assumes doc.name is the path passed to fitz.open.
        stem = os.path.basename(self.doc.name)
        counter = 0

        for page in self.doc:
            for image_info in page.get_images():
                extracted = self.doc.extract_image(image_info[0])
                ext = extracted["ext"]
                picture = PIL.Image.open(io.BytesIO(extracted["image"]))
                # Saving by path lets PIL open and close the file itself.
                picture.save(
                    os.path.join(self.IMAGE_DIR, f"{stem}_image_{counter}.{ext}")
                )
                counter += 1

    def _get_abstract(self) -> str:
        """Return the first-page text between "abstract" and the first heading.

        Returns an empty string when the first page has no "abstract" marker.
        When the document has no TOC, or the first heading is not on the
        first page, the text runs to the end of the page.
        """
        page_1_text = unidecode(self.doc[0].get_text())
        lowered = page_1_text.lower()
        marker = "abstract"

        marker_idx = lowered.find(marker)
        if marker_idx == -1:
            return ""
        start_idx = marker_idx + len(marker)

        end_idx = -1
        if self.toc:
            first_heading = unidecode(self.toc[0][1]).lower()
            end_idx = lowered.find(first_heading, start_idx)

        if end_idx == -1:
            abstract = page_1_text[start_idx:]
        else:
            abstract = page_1_text[start_idx:end_idx]
        return abstract.replace("\n", " ")

    def summarize(self) -> dict:
        """Summarize the document and export its images.

        Returns:
            A dict whose "Abstract" key holds the unsummarized abstract text
            and whose remaining keys are TOC titles mapped to the summary of
            that section ("" when the section has no text).
        """
        summarized_doc = {"Abstract": self._get_abstract()}

        for index, entry in enumerate(self.toc):
            title = entry[1]
            content_text = self._section_text(index).replace("\n", " ")
            summaries = [
                self._summarize(chunk)
                for chunk in self._chunk_text(content_text, self.CHUNK_SIZE)
            ]
            summarized_doc[title] = " ".join(summaries)

        self._get_images()
        return summarized_doc

In [44]:
# Summarize the sample PDF; `doc` receives the dict returned by summarize()
# and is consumed later when the output PDF is written.
ds = DocumentSummarizer(path="test1.pdf")
doc = ds.summarize()

In [45]:
from fpdf import FPDF
from typing import Dict
import os
from PIL import Image
import shutil

class PdfDoc(FPDF):
    """FPDF document that renders a title-to-text mapping plus a figures page."""

    def footer(self):
        """Print the centered page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font("helvetica", "I", 10)
        self.cell(0, 10, f"Page {self.page_no()}", align="C")

    def write(self, doc: Dict[str, str]):
        """Render every section of *doc*, then append the figures page.

        NOTE(review): the FPDF base class also defines a ``write`` method for
        flowing text; this override replaces it on PdfDoc instances. Confirm
        nothing relies on the base behavior.
        """
        self._write_doc(doc)
        self._write_images()

    def _write_doc(self, doc: Dict[str, str]):
        """Write each title as a bold heading followed by its text, if any."""
        for title, text in doc.items():
            self.set_font("times", "B", size=16)
            self.cell(0, 15, text=title)
            self.ln()
            if text:
                self.set_font("times", "", 12)
                self.multi_cell(0, 5, text=text)
                self.ln()

    def _write_images(self):
        """Add a "Figures" page with every image in ./images/, then delete it.

        Images are placed in natural file-name order (``_2`` before ``_10``)
        so figure numbers are stable. Only files directly inside the
        directory are used. A missing directory leaves just the heading.
        """
        import re  # local import: only needed for natural sorting here

        image_dir = "./images/"

        self.add_page()
        self.set_font(family="helvetica", style="B", size=16)
        self.cell(0, 10, text="Figures", align="C")
        self.ln()

        if not os.path.isdir(image_dir):
            return

        def natural_key(name):
            # Split into alternating text/digit runs so numbers compare by value.
            return [
                int(token) if token.isdigit() else token
                for token in re.split(r"(\d+)", name)
            ]

        file_names = [
            name
            for name in os.listdir(image_dir)
            if os.path.isfile(os.path.join(image_dir, name))
        ]

        for counter, file_name in enumerate(sorted(file_names, key=natural_key), start=1):
            # Close the image handle promptly; open handles would otherwise
            # leak and can block the directory removal below on some systems.
            with Image.open(os.path.join(image_dir, file_name)) as picture:
                self.image(name=picture,
                           w=75,
                           h=75,
                           x=self.w / 2 - 37.5)
            self.ln()
            self.set_font(family="times", style="I", size=10)
            self.cell(0, 5, text=f"Figure {counter}", align="C")
            self.ln(20)

        shutil.rmtree(image_dir)
        


In [46]:
# Build the output PDF: portrait letter pages measured in millimetres, with
# automatic page breaks 15 mm above the bottom edge.
pdf = PdfDoc(orientation="P", unit="mm", format="letter")
pdf.set_auto_page_break(auto=True, margin=15)
# A page must exist before anything is drawn.
pdf.add_page()
# Render the summaries held in `doc`, followed by the figures page.
pdf.write(doc=doc)
pdf.output("summarized_doc.pdf")

In [49]:
# NOTE(review): `client` is not defined anywhere above this line in the file;
# presumably a PapersWithCodeClient instance left over from interactive
# notebook state. Confirm before running this top to bottom.
papers = client.search("A decomposable attention model")

In [60]:
# Inspect the PDF link of the first search hit.
papers.results[0].paper.url_pdf

# Presumably the notebook's echoed output for the expression above.
'http://arxiv.org/pdf/1606.01933v2.pdf'

In [61]:
from paperswithcode import PapersWithCodeClient
import requests
def download_ref_papers(title: str):
    """Download the PDF of the first Papers with Code search hit for *title*.

    The file is written to the current working directory as ``<title>.pdf``,
    with path separators in the title replaced by underscores.

    Args:
        title: Search query; also used to name the output file.

    Returns:
        The name of the file that was written.

    Raises:
        IndexError: if the search returns no results.
        ValueError: if the first hit has no PDF link.
        requests.HTTPError: if the download answers with an error status.
    """
    client = PapersWithCodeClient()
    papers = client.search(q=title)
    if not papers.results:
        raise IndexError(f"no Papers with Code results for {title!r}")

    pdf_url = papers.results[0].paper.url_pdf
    if not pdf_url:
        raise ValueError(f"first result for {title!r} has no PDF link")

    response = requests.get(pdf_url, timeout=60)
    # Do not save an HTML error page under a .pdf name.
    response.raise_for_status()

    # A separator in the title would otherwise point into another directory.
    safe_title = title.replace("/", "_").replace("\\", "_")
    file_name = f"{safe_title}.pdf"
    with open(file_name, "wb") as f:
        f.write(response.content)
    return file_name

# Example: save the first search hit for this title as a PDF in the current
# working directory.
download_ref_papers(title="A decomposable attention model")