### Objective

In this notebook, we will try to automatically extract the page numbers of an ABB review issue by using LLM.

In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.prompts import PromptTemplate
from langchain.llms import AzureOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List
import os

#### Load ABB review issues

In [2]:
issue_name = 'ABB Review_02_2023_layout complete_EN_72-300dpi'
loader = PyMuPDFLoader("./papers/"+issue_name+".pdf")
raw_documents = loader.load()
TOC_page = 3
TOC = raw_documents[TOC_page-1].page_content

In [3]:
TOC

'02|2023\n85\n— \nAssets in motion\n136 Clean machine\nCarbon emissions from EV \nbattery production and use\n140 Plug-in mines\nWorld’s first fully automated \ncharging system for mining \ntrucks\n146 Modeling flow\nMultiphysics-based reduced \n order model (ROM) for mine \npollution control\n— \nBuzzword Demystifier\n152 Industrial Metaverse\nHow can the Industrial \nMetaverse help ABB and its \n customers?\n—\n153 Subscribe\n153 French and Spanish translations\n153 Imprint\n—\n87�\nEditorial\n— \n88�\n2022�ABB�Research�Award�\n \n Prestigious award for work on \nconnected device security\n— \nEnhanced knowledge\n92 \nFor greater results\nHigh Speed Alignment – visual \nservoing technology for ultra- \nhigh precision assembly\n100 The right moves\nSoftware that optimizes robot \nperformance\n106 The DCS of tomorrow\nEnvisioning the future of process \nautomation\n 112 Safe cyber space\n ABB Ability™ Cyber Security \nWorkplace\n 118 The virtues of  virtualization\n Virtual protection 

#### Output parser

In [None]:
class table_of_content(BaseModel):
    title: str = Field(description="title of an article")
    subtitle: str = Field(description="subtitle of an article")
    starting_page_number: str = Field(description="starting page number of an article")

In [None]:
# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=table_of_content)

prompt = PromptTemplate(
    template="""Given the table of content page extracted from a company's technical review journal,
    please extract the title, subtitle, and starting page number of individual articles based on 
    your understanding of the text semantic.
    {format_instructions}
    {TOC}""",
    input_variables=["TOC"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

_input = prompt.format_prompt(TOC=TOC)

output = llm(_input.to_string())

parser.parse(output)

#### Set up output parser

In [None]:
response_schemas = [
    ResponseSchema(name="title", description="title of the article"),
    ResponseSchema(name="starting_page_number", description="starting page number of the article"),
    ResponseSchema(name="subtitle", description="subtitle of the article")
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [None]:
format_instructions = output_parser.get_format_instructions()
prompt = PromptTemplate(
    template="""Given the table of content page extracted from a company's technical review journal,
    please extract the title, subtitle, and starting page number of individual articles.
    {format_instructions}
    {TOC}""",
    input_variables=["TOC"],
    partial_variables={"format_instructions": format_instructions}
)

In [4]:
llm = AzureOpenAI(
    deployment_name="deployment-5af509f3323342ee919481751c6f8b7d",
    model_name="text-davinci-003",
    openai_api_base="https://abb-chcrc.openai.azure.com/",
    openai_api_version="2023-03-15-preview",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_type="azure",
)

In [5]:
prompt = f"""The following is the table of content page extracted from a company's technical review journal. 
Could you please extract the title, subtitle, as well as the starting page of individial articles? 
Please do so based on your understanding of the text semantic.

{TOC}"""

output = llm(prompt)

In [7]:
print(output)


Article 1: 
Title: Assets in motion 
Subtitle: Carbon emissions from EV battery production and use
Starting page: 136 

Article 2: 
Title: Buzzword Demystifier 
Subtitle: Industrial Metaverse 
Starting page: 152 

Article 3: 
Title: Editorial 
Subtitle: N/A 
Starting page: 88 

Article 4: 
Title: 2022 ABB Research Award 
Subtitle: Prestigious award for work on connected device security 
Starting page: 92 

Article 5: 
Title: Enhanced knowledge 
Subtitle: High Speed Alignment – visual servoing technology for ultra-high precision assembly 
Starting page: 100 

Article 6: 
Title: The right moves 
Subtitle: Software that optimizes robot performance 
Starting page: 106 

Article 7: 
Title: The DCS of tomorrow 
Subtitle: Envisioning the future of process automation 
Starting page: 112 

Article 8: 
Title: Safe cyber space 
Subtitle: ABB Ability™ Cyber Security Workplace 
Starting page: 118 




In [None]:
_input = prompt.format_prompt(TOC=TOC)
output = llm(_input.to_string())
output_parser.parse(output)

In [None]:
output