### Objective

In this notebook, we try to automatically extract the table of contents of an ABB Review issue: for each article, its title, subtitle, starting page, and number of pages.

In [1]:
from langchain.document_loaders import PyMuPDFLoader
import re

In [3]:
# Other issues, kept commented out for quick switching:
# issue_name = 'ABB Review_03_2023_layout complete_EN_300dpi'
# issue_name = 'ABB Review_02_2023_layout complete_EN_72-300dpi'
issue_name = 'ABB Review_01_2023_layout complete_EN_72-300dpi'

# Load the selected issue from the local ./papers/ folder.
pdf_path = f"./papers/{issue_name}.pdf"
loader = PyMuPDFLoader(pdf_path)
raw_documents = loader.load()

# TOC_page is 1-based, so subtract one to index into the loaded documents.
TOC_page = 3
toc_raw = raw_documents[TOC_page - 1].page_content

# Drop the '�' characters (presumably left by a font-decoding problem).
TOC = toc_raw.replace('�', '')

In [4]:
# Display the cleaned table-of-contents text to inspect what the parser will receive.
TOC

'01|2023\n3\n— \nTransportation  \nand efficiency\n56 \nA circular future \nSustainability in the life of an \nelectric motor\n62 \nBORDLINE® ESS\nHigh-performance lithium-ion \nbatteries for rolling stock \n66 \nGrid support\nSynchronous condensers pro-\nvide inertia for grid stabilization\n72 \nBreaking ground\nSolid-state protection for DC \ndistribution onboard\n— \nBuzzword Demystifier\n80 \nOPC UA\n—\n81 \nSubscribe\n81 \nFrench and Spanish translations\n81 \nImprint\n—\n04\nGuest editorial\n05\nEditorial\n— \n900th edition\n06\nA word from the chairman\n07\nPostcards\n09 \nThe shoulders of giants \nABB Review publishes its  \n900th issue\n— \nBest innovations\n16 \nSelected innovations in brief\n— \nDigitalization\n34 \nCutting the cables \n5Gforprocessautomation\n40 \nSecure onboarding\nOPC UA helps make industrial \ncommunication secure\n44 \nPerfect circle\nDigitalization and sustainability\n50 \nDigital future\nABB’s next generation drive \n control platform\n—\nCover pictur

In [5]:
def extract_articles(content):
    """Parse table-of-contents text into a list of article records.

    The text is split on standalone 1-4 digit numbers, which are treated as
    candidate page numbers. The text following each number is accepted as an
    article entry when its first line starts with a letter: that line is the
    title, and the lines after it, up to the first line containing a '—'
    character, form the subtitle.

    Args:
        content: Raw text of the table-of-contents page.

    Returns:
        A list of dicts in order of appearance, each with the keys
        "title" (str, surrounding whitespace removed), "subtitle"
        (str, or None when there is no subtitle text) and
        "starting page" (int).
    """
    # Because the pattern has a capturing group, re.split keeps the numbers:
    # odd indices hold the matched numbers, even indices the text between them.
    sections = re.split(r'\b(\d{1,4})\b', content)

    articles = []
    for i in range(1, len(sections) - 1, 2):
        lines = sections[i + 1].strip().split("\n")

        # Strip the title just like the subtitle lines, so that values such as
        # 'The shoulders of giants ' do not keep a trailing space.
        title = lines[0].strip()

        # A number that is not followed by text starting with a letter
        # (e.g. the issue date, or a '—' separator) is not an article entry.
        if not title or not title[0].isalpha():
            continue

        # Collect subtitle lines until a '—' line is reached.
        subtitle_parts = []
        for line in lines[1:]:
            if '—' in line:
                break
            line = line.strip()
            # Skip blank lines so they do not add stray spaces when joined.
            if line:
                subtitle_parts.append(line)

        articles.append({
            "title": title,
            "subtitle": " ".join(subtitle_parts) if subtitle_parts else None,
            "starting page": int(sections[i]),
        })

    return articles

In [6]:
# Extract the article records (title, subtitle, starting page) from the TOC text
articles = extract_articles(TOC)

# Rearrange article order: the TOC text lists its columns out of page order
articles.sort(key=lambda x: int(x['starting page']))

# Infer each article's ending page as the page before the next article starts.
# The last article has no successor, so its end page stays "Unknown".
end_pages = [int(following['starting page']) - 1 for following in articles[1:]]
end_pages.append("Unknown")

# Attach the page count to each article (stored under the 'page number' key)
for article, end_page in zip(articles, end_pages):
    if end_page != "Unknown":
        article['page number'] = end_page - article['starting page'] + 1
    else:
        article['page number'] = "Unknown"

# Retain only valid articles: drop 'Subscribe' and everything after it.
# Titles are stripped before comparing so stray whitespace cannot hide the match.
subscribe_index = next(
    (i for i, article in enumerate(articles) if article['title'].strip() == 'Subscribe'),
    None,
)
if subscribe_index is not None:
    articles = articles[:subscribe_index]

# Make page numbers relative: shift them so that 'Editorial' lands on this page.
# NOTE(review): presumably the position of the editorial in the PDF — confirm.
EDITORIAL_TARGET_PAGE = 5
editorial_page = next(
    (article['starting page'] for article in articles if article['title'].strip() == 'Editorial'),
    None,
)
if editorial_page is None:
    # Without this guard the subtraction below fails with an opaque TypeError.
    raise ValueError("No 'Editorial' entry found in the TOC; cannot compute relative page numbers")
for article in articles:
    article['starting page'] = article['starting page'] - editorial_page + EDITORIAL_TARGET_PAGE

In [7]:
# Display the final article list (title, subtitle, starting page, page count) for review.
articles

[{'title': 'Guest editorial',
  'subtitle': None,
  'starting page': 4,
  'page number': 1},
 {'title': 'Editorial',
  'subtitle': None,
  'starting page': 5,
  'page number': 1},
 {'title': 'A word from the chairman',
  'subtitle': None,
  'starting page': 6,
  'page number': 1},
 {'title': 'Postcards',
  'subtitle': None,
  'starting page': 7,
  'page number': 2},
 {'title': 'The shoulders of giants ',
  'subtitle': 'ABB Review publishes its 900th issue',
  'starting page': 9,
  'page number': 7},
 {'title': 'Selected innovations in brief',
  'subtitle': None,
  'starting page': 16,
  'page number': 4},
 {'title': 'bits, and had an interruptible ',
  'subtitle': 'architecture with parallel input and output channels. The operator’s console and teletypewriter can be seen in the foreground. The cabinet with the central processor, storage and the input/output system are in the background. This computer and its applications are described in several articles in Brown Boveri Review',
  'sta