#### Imports

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from typing import List, Literal, Annotated
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.tools import tool
import getpass
import numpy as np
from datasets import load_dataset
from prompt_poet import Prompt
from tqdm import tqdm
import pandas as pd
from devtools import pprint
from typing import List, Optional

In [3]:
groq_api_key = getpass.getpass()

 ········


In [4]:
import requests
import PyPDF2

def download_arxiv_pdf(arxiv_id, save_path):
    # Construct the arXiv PDF URL
    url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
    
    # Send a GET request to download the PDF
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Write the content of the response (PDF) to a file
        with open(save_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"PDF downloaded successfully and saved to {save_path}")
    else:
        print(f"Failed to download PDF. Status code: {response.status_code}")

def extract_text_from_pdf(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""

        # Loop through all the pages
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text += page.extract_text()

    return text

In [5]:
download_arxiv_pdf("1706.03762","tmp/1706.03762.pdf")

PDF downloaded successfully and saved to tmp/1706.03762.pdf


In [6]:
pdf_text = extract_text_from_pdf("tmp/1706.03762.pdf")

In [7]:
print(pdf_text[0:500])

Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.comNoam Shazeer∗
Google Brain
noam@google.comNiki Parmar∗
Google Research
nikip@google.comJakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.comAidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.eduŁukasz Kaise


In [8]:
# estimating how long the document is

from transformers import GPT2Tokenizer

def estimate_context_length(text):
    # Load GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Tokenize the text
    tokens = tokenizer.encode(text)
    # Return the number of tokens
    return len(tokens)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [9]:
estimate_context_length(pdf_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (10540 > 1024). Running this sequence through the model will result in indexing errors


10540

Can we parse the structure of the paper, extract title, abstract, sections and their subsections?

#### Zero shot

In [26]:
raw_template = """
- name: system instructions
  role: system
  content: |
   You are an expert in parsing a given research article into title, abstract, section and subsections.

- name: user query
  role: user
  content: |
   Please extract properties defined in 'Parser' function from the text.
   {{ escape_special_characters(text) }}
"""

In [165]:
class Subsection(BaseModel):
    subsection_title: Optional[str] = Field(default=None,description="Title of subsection. Keep it short and within 2-3 words")

class Section(BaseModel):
    section_title: Optional[str] = Field(default=None,description="Title of section. Keep it short and within 2-3 words.")
    # subsections: Optional[List[Subsection]] = Field(default=None,description="List of titles of sub-sections within the section if any")
    
class Parser(BaseModel):
    title: str = Field(default=None,description="The title of the research article")
    authors: List[str] = Field(default=None,description="List of Authors of the research article, keep only the name")
    abstract: str = Field(default=None,description="Abstract of the research article")
    sections: Optional[List[Section]] = Field(default=None,description="Title of the sections mentioned in the text. Keep it short and within 2-3 words")

In [67]:
prompt = Prompt(
    raw_template=raw_template,
    template_data={"text": pdf_text[0:1000]}
)

In [166]:
llm_parser = ChatGroq(temperature=0, model_name="llama3-groq-70b-8192-tool-use-preview",
                      api_key=groq_api_key).with_structured_output(Parser)

In [69]:
result = llm_parser.invoke(prompt.messages)

In [70]:
pprint(result)

Parser(
    title='Attention Is All You Need',
    authors=[
        'Ashish Vaswani',
        'Noam Shazeer',
        'Niki Parmar',
        'Jakob Uszkoreit',
        'Llion Jones',
        'Aidan N. Gomez',
        'Łukasz Kaiser',
        'Illia Polosukhin',
    ],
    abstract=(
        'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks tha'
        't include an encoder and a decoder. The best performing models also connect the encoder and decoder through a'
        'n attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attenti'
        'on mechanisms, dispensing with recurrence and convolutions entirely. Experime'
    ),
    sections=[
        Section(
            section_title='Abstract',
        ),
    ],
)


In [76]:
prompt = Prompt(
    raw_template=raw_template,
    template_data={"text": pdf_text}
)

In [80]:
# result = llm_parser.invoke(prompt.messages)

In [None]:
pprint(result)

There are two problems here.

1. Context Length too large
2. Too many tasks in a single prompt
3. We don't see the structure of the text after parsing (Parsing PDFs is not easy)

What if we read page by page.

Or even better option is to use divide and conquer.

We can one prompt for extraction, another prompt for sections, another one for sub-sections etc.

#### Mutli-Prompt 

In [112]:
prompt_0_template = """
- name: system instructions
  role: system
  content: |
   You are an expert in parsing a given research article into title, abstract, section and subsections.
   Since I can not pass the whole document in a single iteration, I will pass text page by page.
   This is the first page. Read the page and understand what components do you need to do this task. 
   Keep in memory that the output from first page will be passed on to you when you process next page.

- name: user query
  role: user
  content: |
   Please extract title, abstract, sections, subsections from the following text.
   {{ escape_special_characters(text) }}
"""

In [113]:
def read_page_by_page(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""

        # Loop through all the pages
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            yield page.extract_text()

In [114]:
page_gen = read_page_by_page("tmp/1706.03762.pdf")

In [115]:
llm_base = ChatGroq(temperature=0, model_name="llama3-groq-70b-8192-tool-use-preview",
                      api_key=groq_api_key)

In [116]:
page_0 = next(page_gen)

In [117]:
prompt_0 = Prompt(
    raw_template=prompt_0_template,
    template_data={"text": page_0}
)

In [118]:
page0_resp = llm_base.invoke(prompt_0.messages)

In [119]:
prompt_x_template = """
- name: system instructions
  role: system
  content: |
   You are an expert in parsing a given research article into title, abstract, section and subsections.
   Since I can not pass the whole document in a single iteration, I will pass text page by page.
   This is the not the first page. Read the page and understand what components do you need to do this task. 
   The output from previous page is given below.
   {{ escape_special_characters(prev_page_response) }}

- name: user query
  role: user
  content: |
   Please extract title, abstract, sections, subsections from the following text.
   {{ escape_special_characters(text) }}
"""

In [120]:
page_1 = next(page_gen)

In [121]:
prompt_1 = Prompt(
    raw_template=prompt_x_template,
    template_data={"text": page_1, "prev_page_response": page0_resp.content}
)

In [122]:
page1_resp = llm_base.invoke(prompt_1.messages)

In [125]:
print(page1_resp.content)

Title: Attention Is All You Need

Abstract: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models f

Looks like it is working. Let me simplify the code.

In [150]:
parser_prompt = """
- name: system instructions
  role: system
  content: |
   You are an expert in parsing a given research article into title, abstract, section and subsections.
   Since I can not pass the whole document in a single iteration, I will pass text page by page.
   {% if page_number == 1 %}
   This is the first page. Read the page and understand what components do you need to do this task. 
   Keep in memory that the output from first page will be passed on to you when you process next page.
   Only keep the text that you need. Keep it short.
   {% endif %}
   {% if page_number != 1 %}
   This is the not the first page. Read the current page and understand what components do you need to do this task. 
   The output from previous page is given below.
    Only keep the text that you need. Keep it short.
   {{ escape_special_characters(prev_page_response) }}
   {% endif %}

- name: user query
  role: user
  content: |
   Please extract title, abstract, sections, subsections from the following text.
   {{ escape_special_characters(current_page) }}
"""

In [151]:
page_gen = read_page_by_page("tmp/1706.03762.pdf")

In [152]:
pages = list(page_gen)

In [153]:
len(pages)

15

In [155]:
current_resp = ''
i =1
for page in tqdm(pages):
    prompt = Prompt(
    raw_template=parser_prompt,
    template_data={"prev_page_response": current_resp, "current_page": page, "page_number" : i}
    )
    resp = llm_base.invoke(prompt.messages)
    # print(resp.content)
    current_resp += resp.content
    i += 1

100%|███████████████████████████████████████████████████████████████████████████████████| 15/15 [02:19<00:00,  9.27s/it]


In [156]:
print(current_resp)

Title: Attention Is All You Need
Abstract: We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train.Title: Attention Is All You Need
Abstract: We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train.

Section 1: Introduction
- Discusses the use of recurrent neural networks and their limitations in sequence modeling and transduction problems.
- Introduces the Transformer model as a new approach that relies entirely on self-attention mechanisms.

Section 2: Backgro

In [167]:
prompt = Prompt(
    raw_template=raw_template,
    template_data={"text": current_resp}
)

In [168]:
result = llm_parser.invoke(prompt.messages)

In [169]:
pprint(result)

Parser(
    title='Attention Is All You Need',
    authors=[
        'Ashish Vaswani',
        'Noam Shazeer',
        'Niki Parmar',
        'Llion Jones',
        'Aidan Gomez',
        'Lukasz Kaiser',
        'Illia Polosukhin',
    ],
    abstract=(
        'We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensi'
        'ng with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models '
        'to be superior in quality while being more parallelizable and requiring significantly less time to train.'
    ),
    sections=[
        Section(
            section_title='Introduction',
        ),
        Section(
            section_title='Background',
        ),
        Section(
            section_title='Model Architecture',
        ),
        Section(
            section_title='Why Self-Attention',
        ),
        Section(
            section_title='Training the Transformer Model',


Looks like the mentions of Law triggered the LLM to bring out noise.