In [1]:
import io
import re
import pickle
import zipfile
from dataclasses import dataclass

import pypdf
from tqdm import tqdm

In [2]:
@dataclass
class Paper:
    """Text fields extracted from the first page of one PDF.

    Only ``filename`` is required; every other field defaults to an empty
    string so that a partially parsed paper is still a valid record.
    """
    filename: str
    title: str = ''
    authors: str = ''
    abstract: str = ''
    keywords: str = ''
    introduction: str = ''

    def __repr__(self):
        """Render each field as a titled section with a dashed underline."""
        section_names = (
            'filename',
            'title',
            'authors',
            'abstract',
            'keywords',
            'introduction',
        )
        # Sections are separated by one blank line; the heading and the value
        # each carry a single leading space.
        return '\n\n'.join(
            f' {name} \n----------\n {getattr(self, name)}'
            for name in section_names
        )

In [3]:
def remove_urls(text):
    """Return ``text`` with every URL deleted.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. The surrounding whitespace is left
    in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_whitespace(text):
    """Collapse every whitespace run in ``text`` to one space and trim the ends."""
    # `\s` already matches '\n', so one substitution folds newlines and
    # repeated spaces alike.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [4]:
zip_path = 'ml-engineer/ICDAR2024_papers.zip'

papers: list[Paper] = []


def _split_sections(text):
    """Split plain-mode page text into ``(abstract, keywords, introduction)``.

    Returns ``None`` when the text contains no 'Abstract' heading. A section
    whose heading is missing is returned as ``''``, so nothing carries over
    from a previously processed paper. The returned strings are raw (URLs and
    whitespace are not cleaned here).
    """
    # Split abstract only once
    abstract_split = re.split(r'Abstract\.?', text, maxsplit=1)
    if len(abstract_split) == 1:
        return None

    abstract = abstract_split[1]
    keywords = ''
    introduction = ''
    remainder = abstract

    # Split keywords only once
    keywords_split = re.split(r'Keywords:?', remainder, maxsplit=1)
    has_keywords = len(keywords_split) == 2
    if has_keywords:
        abstract, keywords = keywords_split
        remainder = keywords

    # Split at introduction only once
    introduction_split = re.split(r'1\s*Introduction|Introduction', remainder, maxsplit=1)
    if len(introduction_split) == 2:
        # The introduction heading ends whichever section came last.
        if has_keywords:
            keywords, introduction = introduction_split
        else:
            abstract, introduction = introduction_split

    return abstract, keywords, introduction


def _split_header(layout_text):
    """Return the raw ``(title, authors)`` chunks of layout-mode page text.

    The header is everything before the first 'Abstract' heading; its chunks
    are separated by one or more blank lines. Returns ``None`` when the header
    cannot be split into at least two chunks.
    """
    header = re.split(r'Abstract\.?', layout_text, maxsplit=1)[0]
    header_split = re.split(r'\n\n+', header.strip())
    if len(header_split) == 1:
        return None
    return header_split[0], header_split[1]


def _clean_authors(authors):
    """Normalise the raw author chunk into a 'A, B and C' style string."""
    authors = remove_whitespace(authors)
    authors = re.sub(r'[^A-Za-z\s,]', '', authors)          # remove non-alphabetical characters except spaces and commas
    authors = re.sub(r'\s+,', ',', authors)                 # remove whitespace before commas
    authors = re.sub(r',+', ',', authors)                   # replace multiple commas with one comma
    authors = re.sub(r',(?!\s)', ', ', authors)             # add space after comma if not present
    authors = re.sub(r' and(?!\s)', ' and ', authors)       # add space after 'and' if not present
    authors = authors.strip()
    authors = re.sub(r'\s\s+.*?,', '', authors)             # remove two or more whitespace characters and anything after, up to the closest comma
    authors = re.sub(r'\s\s+and', ' and', authors)          # replace two or more whitespace characters before 'and' with one space
    authors = re.sub(r'\s\s+.*?$', '', authors)             # remove two or more whitespace characters and anything after, up to the end
    authors = re.sub(r'([a-z])([A-Z])', r'\1 \2', authors)  # split small and capital letters
    authors = re.sub(r',$', '', authors)                    # remove trailing comma
    authors = re.sub(r', [a-zA-Z]$', '', authors)           # remove trailing comma and single letter
    authors = re.sub(r', and ', ' and ', authors)           # remove comma before 'and'
    return authors


# Parse the first page of every PDF in the archive. Each PDF gets a Paper
# record as soon as it is seen, so papers whose parsing is abandoned part-way
# stay in `papers` with the remaining fields left at their '' defaults.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    for file_info in tqdm(zip_ref.infolist()):
        if file_info.is_dir() or not file_info.filename.lower().endswith('.pdf'):
            continue

        paper = Paper(filename=file_info.filename)
        papers.append(paper)

        # Read the member fully into memory; the reader then works on its own
        # in-memory buffer.
        with zip_ref.open(paper.filename) as pdf_file:
            pdf_reader = pypdf.PdfReader(io.BytesIO(pdf_file.read()))
        first_page = pdf_reader.pages[0]

        ## PLAIN EXTRACTION ##

        sections = _split_sections(first_page.extract_text(extraction_mode="plain"))
        if sections is None:
            print(f'No abstract found in {paper.filename}')
            continue

        abstract, keywords, introduction = sections
        paper.abstract = remove_whitespace(remove_urls(abstract))
        paper.keywords = remove_whitespace(remove_urls(keywords))
        paper.introduction = remove_whitespace(remove_urls(introduction))

        ## LAYOUT EXTRACTION ##

        # Best effort: a page that fails layout extraction keeps the fields
        # from plain extraction. `Exception` (not a bare except) so that
        # KeyboardInterrupt still stops the run.
        try:
            layout_text = first_page.extract_text(extraction_mode="layout")
        except Exception:
            print(f'Difficulty with layout in {paper.filename}')
            continue

        header = _split_header(layout_text)
        if header is None:
            print(f'Difficulty splitting header in {paper.filename}')
            continue

        title, authors = header
        paper.title = remove_whitespace(title)
        paper.authors = _clean_authors(authors)

 69%|██████▉   | 103/149 [00:10<00:04, 11.10it/s]

Difficulty with layout in ICDAR2024_proceedings_pdfs/0191.pdf


100%|██████████| 149/149 [00:14<00:00, 10.44it/s]


In [5]:
# Persist the parsed papers so they can be reloaded without re-reading the PDFs.
# pickle stores Paper instances by class reference, so the loader needs the
# same Paper definition available when unpickling.
with open('papers.pkl', 'wb') as pickle_file:
    pickle.dump(papers, pickle_file)

In [6]:
# Display the first parsed paper (rendered through Paper.__repr__) as a quick sanity check.
papers[0]

 filename 
----------
 ICDAR2024_proceedings_pdfs/0004.pdf

 title 
----------
 SAGHOG: Self-Supervised Autoencoder for Generating HOG Features for Writer Retrieval

 authors 
----------
 Marco Peer, Florian Kleber and Robert Sablatnig

 abstract 
----------
 This paper introduces Saghog , a self-supervised pretraining strategy for writer retrieval using HOG features of the binarized input image. Our preprocessing involves the application of the Segment Any- thing technique to extract handwriting from various datasets, ending up with about 24k documents, followed by training a vision transformer on reconstructing masked patches of the handwriting. Saghog is then finetuned by appending NetRVLAD as an encoding layer to the pre- trained encoder. Evaluation of our approach on three historical datasets, Historical-WI, HisFrag20, and GRK-Papyri, demonstrates the effective- ness of Saghog for writer retrieval. Additionally, we provide ablation studies on our architecture and evaluate un- and 