### Objective

In this notebook, we automatically extract each article's title, subtitle, start page, and length from the table of contents of an ABB Review issue.

In [1]:
from langchain.document_loaders import PyPDFLoader
import re

In [6]:
# Issue to process. The other known issues are kept as comments for quick switching.
# issue_name = 'ABB Review_03_2022_layout complete_EN_72-200dpi'
# issue_name = 'ABB Review_02_2023_layout complete_EN_72-300dpi'
# issue_name = 'ABB Review_01_2023_layout complete_EN_72-300dpi'
issue_name = 'ABB Review_03_2023_layout complete_EN_300dpi'

# Read the PDF from the local ./papers folder.
loader = PyPDFLoader(f"./papers/{issue_name}.pdf")
raw_documents = loader.load()

# 1-based position of the table-of-contents page within the loaded documents
# (assumes the loader yields one document per PDF page -- TODO confirm).
TOC_page = 3

# Take the TOC text and drop the '�' replacement characters that the
# extraction leaves behind (attributed to a font issue).
TOC = raw_documents[TOC_page - 1].page_content.replace('�', '')

In [7]:
def extract_metadata(text, max_subtitle_len=150):
    """Parse the text of a table-of-contents page into article records.

    An article entry is a line that, once stripped of surrounding whitespace,
    ends with ``<page number> <title>`` where the title contains no digits.
    The lines that follow are collected as the subtitle until one of:

    * a line containing the delimiter '—' (only the text left of the
      delimiter is kept, and the line is consumed), or
    * a line that, once stripped, starts with ``<page number> <title>``
      (left in place so it is parsed as the next entry), or
    * the end of the text.

    Parameters
    ----------
    text : str
        Newline-separated text of the TOC page. The last '|'-separated field
        of the first line, from its fifth character onward, must be the
        printed page number of the TOC page.
    max_subtitle_len : int, optional
        Entries whose subtitle has this many characters or more are treated
        as parsing noise and dropped. Defaults to 150.

    Returns
    -------
    articles : list of dict
        One dict per kept entry with keys ``"start_page"`` (int),
        ``"title"`` (str) and ``"subtitle"`` (str), in order of appearance.
    TOC_page_number : int
        The page number read from the first line.

    Raises
    ------
    ValueError
        If the first line does not carry a parsable page number.
    """
    # One pattern for both checks: `search` finds an entry anywhere at the end
    # of a line, `match` tests whether a line starts with an entry.
    entry_pattern = re.compile(r'(\d+)\s+([^\d]+)$')

    lines = text.split('\n')

    # Extract the TOC page number from the first line.
    header_field = lines[0].split('|')[-1][4:]
    try:
        TOC_page_number = int(header_field)
    except ValueError as exc:
        raise ValueError(
            f"Could not read the TOC page number from the first line: {lines[0]!r}"
        ) from exc

    articles = []
    i = 0
    while i < len(lines):
        match = entry_pattern.search(lines[i].strip())
        if not match:
            i += 1
            continue

        start_page, title = match.groups()
        i += 1

        # Gather the subtitle lines that belong to this entry.
        subtitle_parts = []
        while i < len(lines):
            # Strip before testing, exactly like the entry detection above;
            # otherwise an indented entry line is not recognised here and the
            # whole article is swallowed into this subtitle.
            line = lines[i].strip()
            if entry_pattern.match(line):
                break
            i += 1
            if '—' in line:
                # Keep only the text left of the delimiter and stop.
                subtitle_parts.append(line.split('—')[0].strip())
                break
            subtitle_parts.append(line)

        subtitle = ' '.join(subtitle_parts).strip()

        # Overly long subtitles indicate a mis-parsed entry; skip those.
        if len(subtitle) < max_subtitle_len:
            articles.append({"start_page": int(start_page), "title": title, "subtitle": subtitle})

    return articles, TOC_page_number

In [8]:
# Parse the TOC text into article records (title, subtitle, start page) and
# read the printed page number of the TOC page itself.
articles, TOC_page_number = extract_metadata(TOC)

# Rearrange article order by printed start page so neighbours are consecutive.
articles.sort(key=lambda article: article['start_page'])

# Infer ending pages: an article ends on the page before the next one starts.
# The last entry has no successor, so its end page is a placeholder.
end_pages = [following['start_page'] - 1 for following in articles[1:]]
end_pages.append("Unknown")

# Attach the length (in pages) to each article.
for article, end_page in zip(articles, end_pages):
    if end_page != "Unknown":
        article['length'] = end_page - article['start_page'] + 1
    elif article['title'] == 'Editorial':
        # A trailing editorial is taken to be a single page.
        article['length'] = 1
    else:
        article['length'] = "Unknown"

# Retain only valid articles: drop 'Subscribe' and everything after it.
# This runs after the lengths are computed, so the last kept article still
# gets its length from the start page of 'Subscribe'.
subscribe_index = next(
    (i for i, article in enumerate(articles) if article['title'] == 'Subscribe'),
    None,
)
if subscribe_index is not None:
    articles = articles[:subscribe_index]

# Convert printed page numbers to positions in the PDF. The TOC is printed as
# page TOC_page_number and sits at PDF position TOC_page, so the offset must
# follow TOC_page rather than a hard-coded 3.
for article in articles:
    article['start_page'] = article['start_page'] - TOC_page_number + TOC_page

In [9]:
# Display the article records produced by the post-processing cell.
articles

[{'start_page': 5, 'title': 'Editorial', 'subtitle': '', 'length': 3},
 {'start_page': 8,
  'title': 'Pathway to sustainability',
  'subtitle': "Interview with ABB's Head of Sustainability, Anke Hampel",
  'length': 6},
 {'start_page': 14,
  'title': 'Electric switch',
  'subtitle': 'Improving sustainability by switching to electric vehicles',
  'length': 4},
 {'start_page': 18,
  'title': 'In grid we trust',
  'subtitle': 'The electric grid is the silent enabler of a more sustainable energy system',
  'length': 8},
 {'start_page': 26,
  'title': 'On a mission',
  'subtitle': 'Smart energy and asset  mana  ge- ment makes buildings  energy- efficient',
  'length': 8},
 {'start_page': 34,
  'title': 'Sustainable living',
  'subtitle': 'Interview: Smarter buildings and the ABB Electrification Startup Challenge',
  'length': 6},
 {'start_page': 40,
  'title': 'Perfect partners',
  'subtitle': 'Microsoft and ABB: Enabling improved energy efficiency in customer operations',
  'length': 6},
 

In [None]:
# Number of article records currently held in `articles`.
len(articles)