### Objective

In this notebook, we try to automatically extract the article titles, subtitles, and page numbers from the table of contents of an ABB Review issue.

In [1]:
from langchain.document_loaders import PyMuPDFLoader, PyPDFLoader
import re

In [7]:
# Pick the issue to process; the other file names are kept for quick switching
# issue_name = 'ABB Review_03_2023_layout complete_EN_300dpi'
issue_name = 'ABB Review_02_2023_layout complete_EN_72-300dpi'
# issue_name = 'ABB Review_01_2023_layout complete_EN_72-300dpi'

# Load the PDF; the result is indexed below as one entry per page
loader = PyPDFLoader(f"./papers/{issue_name}.pdf")
raw_documents = loader.load()

# The table of contents is read from page 3 (1-based), i.e. list index 2
TOC_page = 3

# Take that page's text and strip the '�' characters left by the font issue
TOC = raw_documents[TOC_page - 1].page_content.replace('�', '')

In [8]:
TOC

'02|202385\n— \nAssets in motion\n136 Clean machine\nCarbon emissions from EV \nbattery production and use\n140 Plug-in mines\nWorld’s first fully automated \ncharging system for mining \ntrucks\n146 Modeling flow\nMultiphysics-based reduced \n order model (ROM) for mine \npollution control\n— \nBuzzword Demystifier\n152 Industrial Metaverse\nHow can the Industrial \nMetaverse help ABB and its \n customers?\n—\n153 Subscribe\n153 French and Spanish translations\n153 Imprint—\n87\t Editorial\n— \n88\t 2022\tABB\tResearch\tAward\t\n  Prestigious award for work on \nconnected device security\n— \nEnhanced knowledge\n92 For greater results\nHigh Speed Alignment – visual \nservoing technology for ultra- \nhigh precision assembly\n100 The right moves\nSoftware that optimizes robot \nperformance\n106 The DCS of tomorrow\nEnvisioning the future of process \nautomation\n112 Safe cyber space\n  ABB Ability™ Cyber Security \nWorkplace\n118 The virtues of  virtualization\n  Virtual protection and 

In [9]:
# Split the TOC text on newlines so that every extracted line becomes
# one list element (page number, title and subtitle lines are still mixed)
sections = TOC.split('\n')

In [10]:
sections

['02|202385',
 '— ',
 'Assets in motion',
 '136 Clean machine',
 'Carbon emissions from EV ',
 'battery production and use',
 '140 Plug-in mines',
 'World’s first fully automated ',
 'charging system for mining ',
 'trucks',
 '146 Modeling flow',
 'Multiphysics-based reduced ',
 ' order model (ROM) for mine ',
 'pollution control',
 '— ',
 'Buzzword Demystifier',
 '152 Industrial Metaverse',
 'How can the Industrial ',
 'Metaverse help ABB and its ',
 ' customers?',
 '—',
 '153 Subscribe',
 '153 French and Spanish translations',
 '153 Imprint—',
 '87\t Editorial',
 '— ',
 '88\t 2022\tABB\tResearch\tAward\t',
 '  Prestigious award for work on ',
 'connected device security',
 '— ',
 'Enhanced knowledge',
 '92 For greater results',
 'High Speed Alignment – visual ',
 'servoing technology for ultra- ',
 'high precision assembly',
 '100 The right moves',
 'Software that optimizes robot ',
 'performance',
 '106 The DCS of tomorrow',
 'Envisioning the future of process ',
 'automation',
 '11

In [4]:
print(TOC)

01|20233
— 
Transportation  
and efficiency
56 A circular future 
Sustainability in the life of an 
electric motor
62 BORDLINE® ESS
High-performance lithium-ion 
batteries for rolling stock  
66 Grid support
Synchronous condensers pro -
vide inertia for grid stabilization
72 Breaking ground
Solid-state protection for DC 
distribution onboard
— 
Buzzword Demystifier
80 OPC UA
—
81 Subscribe
81 French and Spanish translations
81 Imprint—
04	 Guest editorial
05	 Editorial
— 
900th edition
06	 A word from the chairman
07	 Postcards
09 The shoulders of giants 
ABB Review publishes its  
900th issue
— 
Best innovations
16 Selected innovations in brief
— 
Digitalization
34 Cutting the cables 
5G	for	process	automation
40 Secure onboarding
OPC UA helps make industrial 
communication secure
44 Perfect circle
Digitalization and sustainability
50 Digital future
ABB’s next generation drive 
 control platform—
Cover pictures
The picture on the left shows a DP200 
computer. This computer was created

In [None]:
# Split the TOC on standalone 1-4 digit numbers. Because the pattern has a
# capture group, re.split keeps the matched numbers, so the list alternates:
# [text, number, text, number, ..., text]
sections = re.split(r'\b(\d{1,4})\b', TOC)

In [None]:
sections

In [None]:
# Walk over every second element starting at index 2; with `sections` coming
# from the capturing re.split, these are the text chunks that follow a number
for section_text in sections[2::2]:
    # Break the chunk into lines to inspect whether it starts like an article title
    potential_title_section = section_text.strip().split("\n")
    print(potential_title_section)

In [None]:
# Take the last element of `sections`, trim surrounding whitespace and split
# it into lines, to experiment with the subtitle extraction on a single chunk
potential_title_section = sections[-1].strip().split("\n")

In [None]:
potential_title_section

In [None]:
# Collect the lines after the title until a line containing the '—' separator
subtitle = []
for line in potential_title_section[1:]:
    if '—' in line:
        # Separator reached: everything from here on belongs to something else
        break
    subtitle.append(line.strip())

# Collapse the collected lines into one string; None when nothing was collected
if subtitle:
    subtitle = " ".join(subtitle)
else:
    subtitle = None

In [None]:
subtitle

In [None]:
def extract_articles(content):
    """Parse table-of-contents text into a list of article records.

    An entry starts on a line that opens with a 1-4 digit page number followed
    by whitespace and some text. The rest of that line is the title; the lines
    after it form the subtitle, up to the next entry or the next '—' separator.

    The page number is anchored to the start of a line so that numbers inside
    a title (for example the year in "88  2022 ABB Research Award") are not
    mistaken for page numbers. Text that precedes a '—' on the same line is
    kept, and runs of tabs/spaces are collapsed to single spaces.

    Args:
        content: TOC text with newline-separated layout lines.

    Returns:
        A list of dicts, in order of appearance, with the keys "title" (str),
        "subtitle" (str, or None when the entry has no subtitle lines) and
        "starting page" (int). Entries whose title is empty or does not start
        with a letter or digit are skipped.
    """
    # Page number at the start of a line, then whitespace, then some text
    entry_start = re.compile(r'^[ \t]*(\d{1,4})[ \t]+(?=\S)', re.MULTILINE)

    # The capture group makes split() keep the page numbers:
    # [preamble, page, entry text, page, entry text, ...]
    sections = entry_start.split(content)

    # Create a list to store extracted articles
    articles = []

    # Odd indices hold page numbers, the following even index holds the entry text
    for i in range(1, len(sections) - 1, 2):
        lines = sections[i + 1].split("\n")

        # Title: first line, cut at the separator dash, whitespace normalised
        title_text, separator, _ = lines[0].partition('—')
        title = " ".join(title_text.split())

        # Skip chunks that do not look like a title
        if not title or not title[0].isalnum():
            continue

        # Subtitle: following lines up to (and including the text before) a dash.
        # A dash on the title line itself means the entry has no subtitle.
        subtitle_parts = []
        if not separator:
            for line in lines[1:]:
                text, separator, _ = line.partition('—')
                text = " ".join(text.split())
                if text:
                    subtitle_parts.append(text)
                if separator:
                    break

        subtitle = " ".join(subtitle_parts) if subtitle_parts else None

        # Add the article title, subtitle, and page number to the list
        articles.append({
            "title": title,
            "subtitle": subtitle,
            "starting page": int(sections[i])
        })

    return articles

In [None]:
articles

In [None]:
# Sort the article records in place by their starting page (ascending).
# NOTE(review): `articles` is only assigned in a cell further down — confirm
# the intended execution order.
articles.sort(key=lambda x: int(x['starting page']))

In [None]:
articles

In [None]:
# Parse the TOC text into article records (title, subtitle, starting page)
# NOTE(review): `TOC_replaced` is not assigned in any cell visible here — confirm where it comes from.
articles = extract_articles(TOC_replaced)

# Put the articles in page order
articles.sort(key=lambda entry: int(entry['starting page']))

# An article ends on the page before the next one starts;
# the last article has no successor, so its end is marked "Unknown"
end_pages = [int(following['starting page']) - 1 for following in articles[1:]]
end_pages.append("Unknown")

# Store the article length (in pages) under 'page number'
for article, end_page in zip(articles, end_pages):
    if end_page == "Unknown":
        article['page number'] = "Unknown"
    else:
        article['page number'] = end_page - article['starting page'] + 1

# Keep only the entries that come before the first 'Subscribe' entry
subscribe_index = None
for position, article in enumerate(articles):
    if article['title'] == 'Subscribe':
        subscribe_index = position
        break
if subscribe_index is not None:
    articles = articles[:subscribe_index]

# Rebase the starting pages so that the 'Editorial' entry lands on page 5
# NOTE(review): if no entry is titled 'Editorial', editorial_page stays None and
# the subtraction below raises TypeError — confirm every issue has one.
editorial_page = None
for article in articles:
    if article['title'] == 'Editorial':
        editorial_page = article['starting page']
        break
for article in articles:
    article['starting page'] = article['starting page'] - editorial_page + 5

In [None]:
articles

In [None]:
articles

In [None]:
# Infer the last page of every article from the start of the next one.
# The same list (`articles`) is used for sorting, for computing the end pages
# and for attaching them, so the loop bounds and the indexed list always agree.
articles.sort(key=lambda x: int(x['starting page']))

# End page = page before the next article starts
end_pages = [int(following['starting page']) - 1 for following in articles[1:]]

# Add placeholder for the last article since we don't have its end page
end_pages.append("Unknown")

# Attach end page to articles
for article, end_page in zip(articles, end_pages):
    article['ending page'] = end_page

articles

In [None]:
# Print the character length of every subtitle that is present
# NOTE(review): `valid_articles` is not assigned in any cell visible here — confirm where it comes from.
for article in valid_articles:
    subtitle_text = article['subtitle']
    if subtitle_text is None:
        continue
    print(len(subtitle_text))

In [None]:
articles[0]

In [None]:
articles[1]