In [1]:
import io
import re
import zipfile
from dataclasses import dataclass

import pypdf
from tqdm import tqdm

In [2]:
@dataclass
class Paper:
    """Text sections of a single paper, one string per section.

    Only ``filename`` is required; every other field defaults to ``''``.
    """
    filename: str
    title_authors_info: str = ''
    abstract: str = ''
    keywords: str = ''
    introduction: str = ''

    def __repr__(self):
        """Render each field as a dashed, titled section, in declaration order."""
        section_names = ('filename', 'title_authors_info', 'abstract', 'keywords', 'introduction')
        rendered = []
        for name in section_names:
            # Header is the field name framed by dashed rules; the value follows
            # after a blank line and a single leading space.
            rendered.append(f'----------\n {name} \n----------\n\n {getattr(self, name)}')
        # Consecutive sections are separated by one blank line.
        return '\n\n'.join(rendered)

In [3]:
zip_path = 'ml-engineer/ICDAR2024_papers.zip'

# Section-heading patterns, compiled once rather than once per PDF.
ABSTRACT_PATTERN = re.compile(r'Abstract\.?')
KEYWORDS_PATTERN = re.compile(r'Keywords:?')
INTRODUCTION_PATTERN = re.compile(r'1\s*Introduction|Introduction')


def parse_first_page(filename, text):
    """Split first-page text into title/authors, abstract, keywords, introduction.

    The text is cut at the first "Abstract" heading. What follows is cut at
    the first "Keywords" heading, and the tail of that is cut at the first
    "Introduction" heading. Every cut happens at most once, and the matched
    heading itself is dropped. Section text is stored as-is (not stripped).

    Args:
        filename: Name to store on the resulting ``Paper``.
        text: Text of the paper's first page.

    Returns:
        A populated ``Paper``, or ``None`` when no "Abstract" heading is found.
    """
    abstract_parts = ABSTRACT_PATTERN.split(text, maxsplit=1)
    if len(abstract_parts) == 1:
        return None

    paper = Paper(filename=filename)
    paper.title_authors_info, paper.abstract = abstract_parts

    # Name of the field that currently holds the not-yet-split tail of the page.
    tail_field = 'abstract'

    keywords_parts = KEYWORDS_PATTERN.split(paper.abstract, maxsplit=1)
    if len(keywords_parts) == 2:
        paper.abstract, paper.keywords = keywords_parts
        tail_field = 'keywords'

    # The introduction is cut off whichever section came last: the keywords
    # when a "Keywords" heading was found, otherwise the abstract.
    introduction_parts = INTRODUCTION_PATTERN.split(getattr(paper, tail_field), maxsplit=1)
    if len(introduction_parts) == 2:
        setattr(paper, tail_field, introduction_parts[0])
        paper.introduction = introduction_parts[1]

    return paper


def read_first_page_text(zip_ref, member_name):
    """Extract the text of the first page of a PDF stored in an open archive.

    Args:
        zip_ref: An open ``zipfile.ZipFile``.
        member_name: Name of the PDF member inside the archive.

    Returns:
        The extracted text, or ``None`` (after printing the reason) when the
        PDF has no pages or pypdf cannot read it.
    """
    # Read the member fully and hand pypdf an in-memory stream.
    with zip_ref.open(member_name) as pdf_file:
        pdf_bytes = pdf_file.read()

    try:
        pdf_reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
        if len(pdf_reader.pages) == 0:
            print(f'No pages found in {member_name}')
            return None
        return pdf_reader.pages[0].extract_text()
    except pypdf.errors.PdfReadError as error:
        # One unreadable file should not abort the whole batch.
        print(f'Could not read {member_name}: {error}')
        return None


papers: list[Paper] = []

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Select the PDF members up front so the progress bar counts only real work.
    pdf_infos = [
        info for info in zip_ref.infolist()
        if not info.is_dir() and info.filename.lower().endswith('.pdf')
    ]

    for file_info in tqdm(pdf_infos):
        text = read_first_page_text(zip_ref, file_info.filename)
        if text is None:
            continue

        paper = parse_first_page(file_info.filename, text)
        if paper is None:
            print(f'No abstract found in {file_info.filename}')
            continue

        papers.append(paper)

100%|██████████| 149/149 [00:05<00:00, 26.71it/s]


In [4]:
# Spot check: print the first parsed paper whose archive path matches, then stop.
for paper in papers:
    if paper.filename != 'ICDAR2024_proceedings_pdfs/0022.pdf':
        continue
    print(paper)
    break

----------
 filename 
----------

 ICDAR2024_proceedings_pdfs/0022.pdf

----------
 title_authors_info 
----------

 CREPE: Coordinate-Aware End-to-End
Document Parser
Yamato Okamoto1,2∗†[0009−0009−2153−3782],
Youngmin Baek1,2∗‡[0000−0001−7001−4641],
Geewook Kim1[0009 −0001−6713−3858], Ryota Nakao1,2[0009 −0001−6692−6952],
DongHyun Kim1[0000 −0001−9033−5231], Moon Bin Yim1[0000 −0002−7272−2198],
Seunghyun Park1[0000 −0002−8509−9163], and Bado Lee1[0000 −0003−4962−8977]
1NAVER Cloud AI, Seongnam-si, Gyeonggi-do, Korea
okamoto.yamato.w15@kyoto-u.jp
{youngmin.baek, gw.kim, dong.hyun }@navercorp.com
{moonbin.yim, seung.park, bado.lee }@navercorp.com
2LINE WORKS, Shibuya-city, Toyko, Japan
nakao.ryota@line-works.com


----------
 abstract 
----------

  In this study, we formulate an OCR-free sequence genera-
tion model for visual document understanding (VDU). Our model not
only parses text from document images but also extracts the spatial
coordinates of the text based on the multi-head ar