### Objective

In this notebook, we automatically extract the articles listed in the table of contents of an ABB Review issue, together with each article's starting page and page count.

In [1]:
from langchain.document_loaders import PyMuPDFLoader
import re

In [2]:
# Issue to process: the PDF file name (without extension) under ./papers/.
# Alternative issue: 'ABB Review_03_2023_layout complete_EN_300dpi'
issue_name = 'ABB Review_02_2023_layout complete_EN_72-300dpi'

# Load the issue; `raw_documents` is indexed by page below
# (presumably one document per PDF page -- confirm against the loader docs).
loader = PyMuPDFLoader(f"./papers/{issue_name}.pdf")
raw_documents = loader.load()

# 1-based position of the table-of-contents page within `raw_documents`.
TOC_page = 3

# Text of the TOC page, with the U+FFFD replacement characters
# (left behind by a font issue) removed.
TOC = raw_documents[TOC_page - 1].page_content.replace('�', '')

In [3]:
def extract_articles(content):
    """Extract article entries from the text of a table-of-contents page.

    The text is split on every standalone number of 1 to 4 digits, each of
    which is treated as a potential page number. The text that follows a
    number is then inspected line by line:

    * the first line is the title; the entry is discarded when that line is
      empty or does not start with an alphabetic character;
    * the following non-blank lines, up to (but excluding) the first line
      containing an em dash, are joined with single spaces into the subtitle.

    Parameters
    ----------
    content : str
        Raw text of the table-of-contents page.

    Returns
    -------
    list of dict
        One dict per detected entry, in order of appearance, with the keys
        ``"title"`` (str, surrounding whitespace removed), ``"subtitle"``
        (str, or None when no subtitle line was found) and
        ``"starting page"`` (int, the number preceding the title).
    """
    # With a capturing group, re.split returns
    # [text, number, text, number, ..., text]: the numbers sit at the odd
    # indices and the text following each of them at the next even index.
    sections = re.split(r'\b(\d{1,4})\b', content)

    articles = []
    for page_number, following_text in zip(sections[1::2], sections[2::2]):
        # Strip every line so that the title is normalised the same way as
        # the subtitle lines (no trailing whitespace left on the title).
        lines = [line.strip() for line in following_text.strip().split("\n")]

        title = lines[0]
        # A number not followed by text starting with a letter is not a title.
        if not title or not title[0].isalpha():
            continue

        # Collect subtitle lines until the em-dash line is reached.
        subtitle_lines = []
        for line in lines[1:]:
            if '—' in line:
                break
            # Skip blank lines so they do not add stray spaces to the subtitle.
            if line:
                subtitle_lines.append(line)

        articles.append({
            "title": title,
            "subtitle": " ".join(subtitle_lines) if subtitle_lines else None,
            "starting page": int(page_number)
        })

    return articles

In [5]:
# Page that the 'Editorial' entry is mapped to when starting pages are rebased
# (presumably its position within the issue PDF -- confirm).
EDITORIAL_RELATIVE_PAGE = 5

# Extract the article entries (title, subtitle, starting page) from the TOC text
articles = extract_articles(TOC)

# Rearrange article order by starting page
articles.sort(key=lambda article: article['starting page'])

# Infer each article's ending page as the page before the next entry starts;
# the last entry has no successor, so its ending page is unknown.
end_pages = [following['starting page'] - 1 for following in articles[1:]]
end_pages.append("Unknown")

# Attach the page count ('page number') to each article
for article, end_page in zip(articles, end_pages):
    if end_page == "Unknown":
        article['page number'] = "Unknown"
    else:
        article['page number'] = end_page - article['starting page'] + 1

# Retain only valid articles: drop 'Subscribe' and everything after it
subscribe_index = next((i for i, article in enumerate(articles) if article['title'] == 'Subscribe'), None)
if subscribe_index is not None:
    articles = articles[:subscribe_index]

# Rebase starting pages so that the 'Editorial' entry lands on EDITORIAL_RELATIVE_PAGE
editorial_page = next((article['starting page'] for article in articles if article['title'] == 'Editorial'), None)
if editorial_page is None:
    # Without the anchor entry the offset is undefined; fail with a clear
    # message instead of a TypeError from arithmetic on None.
    raise ValueError("No 'Editorial' entry found in the table of contents; cannot compute relative page numbers.")
page_offset = EDITORIAL_RELATIVE_PAGE - editorial_page
for article in articles:
    article['starting page'] += page_offset

In [6]:
# Display the extracted article entries for inspection
articles

[{'title': 'pp.', 'subtitle': None, 'starting page': -59, 'page number': 0},
 {'title': 'pp.', 'subtitle': None, 'starting page': -59, 'page number': 15},
 {'title': 'the acknowledgments ',
  'subtitle': 'should read: “The authors wish to acknowledge the outstanding contribu- tions of colleagues from ABB Corporate Research, Västerås, Sweden and of colleagues in Ericsson R&D” The authors and editors of ABB Review apologize for these errors.',
  'starting page': -44,
  'page number': 49},
 {'title': 'Editorial',
  'subtitle': None,
  'starting page': 5,
  'page number': 5},
 {'title': 'For greater results',
  'subtitle': 'High Speed Alignment – visual servoing technology for ultra- high precision assembly',
  'starting page': 10,
  'page number': 8},
 {'title': 'The right moves',
  'subtitle': 'Software that optimizes robot performance',
  'starting page': 18,
  'page number': 6},
 {'title': 'The DCS of tomorrow',
  'subtitle': 'Envisioning the future of process automation',
  'starting 