#### Imports

In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from typing import List, Literal, Annotated
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.tools import tool
import getpass
import numpy as np
from datasets import load_dataset
from prompt_poet import Prompt
from tqdm import tqdm
import pandas as pd
from devtools import pprint
from typing import List, Optional

In [2]:
# Prompt for the Groq API key without echoing it, instead of hard-coding it in the notebook.
groq_api_key = getpass.getpass()

 ········


In [14]:
import requests
import PyPDF2

def download_arxiv_pdf(arxiv_id, save_path, timeout=30):
    """Download the PDF of an arXiv paper and write it to ``save_path``.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"1706.03762"``.
        save_path: Destination file path. Missing parent directories are
            created before the file is written.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``.

    Returns:
        None. The outcome is reported on stdout: a success message after the
        file is written, or a failure message containing the HTTP status code
        when the response is not 200 (no file is written in that case).

    Raises:
        Whatever ``requests.get`` raises for network problems (including a
        timeout) and ``OSError`` if the file cannot be written.
    """
    import os  # local import so this cell does not depend on the imports cell

    # Construct the arXiv PDF URL
    url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'

    # Without a timeout the request can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to download PDF. Status code: {response.status_code}")
        return

    # open(..., 'wb') does not create directories, so make sure the target
    # directory (e.g. "tmp/") exists before writing.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, 'wb') as pdf_file:
        pdf_file.write(response.content)
    print(f"PDF downloaded successfully and saved to {save_path}")

def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of the PDF at ``pdf_path``.

    Pages are visited in document order and their text is concatenated with
    no separator between pages.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Extract inside the `with` block: the reader was built on this open handle.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]

    return "".join(page_texts)


In [8]:
# Fetch arXiv paper 1706.03762 ("Attention Is All You Need") into the local tmp/ directory.
download_arxiv_pdf("1706.03762","tmp/1706.03762.pdf")

PDF downloaded successfully and saved to tmp/1706.03762.pdf


In [15]:
# Full text of the downloaded paper as a single string (all pages concatenated).
pdf_text = extract_text_from_pdf("tmp/1706.03762.pdf")

In [16]:
# Sanity check: show the first 500 characters of the extracted text.
print(pdf_text[:500])

Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.comNoam Shazeer∗
Google Brain
noam@google.comNiki Parmar∗
Google Research
nikip@google.comJakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.comAidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.eduŁukasz Kaise


In [32]:
from transformers import GPT2Tokenizer

# Tokenizers already loaded by estimate_context_length, keyed by model name.
_TOKENIZER_CACHE = {}


def estimate_context_length(text, model_name="gpt2"):
    """Estimate how many tokens ``text`` occupies.

    Args:
        text: The string to measure.
        model_name: Name passed to ``GPT2Tokenizer.from_pretrained``;
            defaults to ``"gpt2"``.

    Returns:
        The number of token ids produced by the tokenizer's ``encode``.

    Note:
        The count is relative to the chosen GPT-2 vocabulary. Other models
        tokenize differently, so treat the result as an estimate.
    """
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        # Load each tokenizer once and reuse it instead of re-loading on every call.
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer

    # Only the count is needed; the ids themselves are never fed to a model here.
    return len(tokenizer.encode(text))

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [34]:
# Token count of the whole paper under GPT-2 tokenization
# (presumably to gauge how much of it fits in the LLM context window — confirm).
estimate_context_length(pdf_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (10540 > 1024). Running this sequence through the model will result in indexing errors


10540

Can we parse the structure of the paper, extracting its title, abstract, sections, and their subsections?

#### Zero-shot

In [156]:
# Prompt template with one system turn and one user turn. The `text` variable
# is supplied through `template_data` when a Prompt is built from this string.
# NOTE(review): this appears to be Prompt Poet's YAML + Jinja2 format — confirm
# that `escape_special_characters` is provided by prompt_poet.
raw_template = """
- name: system instructions
  role: system
  content: |
   You are an expert in parsing a given research article into title, abstract, section and subsections.

- name: user query
  role: user
  content: |
   Please extract properties defined in 'Parser' function from the text.
   {{ escape_special_characters(text) }}
"""

In [166]:
# NOTE(review): `Section` is only defined in a later cell, so on a fresh kernel
# (Restart & Run All) this inspection cell raises NameError. Move it below the
# class definitions or delete it.
Section

__main__.Section

In [162]:
# class Subsection(BaseModel):
#     subsection_title: Optional[str] = Field(description="Title of subsection")

# One section heading extracted from the article.
class Section(BaseModel):
    # NOTE(review): `section_tile` looks like a typo for `section_title`. The
    # name is part of this model's schema and of any code that reads the field,
    # so rename it everywhere at once rather than here alone.
    section_tile: Optional[str] = Field(description="Title of section. Keep it short and within 2-3 words.")
    # subsections: Optional[List[Subsection]] = Field(description="List of sub-sections within the section")
    
# Target schema for the structured extraction; it is handed to the LLM through
# `with_structured_output(Parser)`.
class Parser(BaseModel):
    # title, authors and abstract declare no default, so they are required.
    # NOTE(review): when only a slice of the paper is passed in, these may not
    # be present in the text — consider whether they should be Optional.
    title: str = Field(description="The title of the research article")
    authors: List[str] = Field(description="List of Authors of the research article, keep only the name")
    abstract: str = Field(description="Abstract of the research article")
    # Section headings found in the supplied text.
    sections: Optional[List[Section]] = Field(description="Main Section mentioned in the text")

In [169]:
# Render the template against the opening 15,000 characters (not tokens) of the paper.
prompt = Prompt(raw_template=raw_template, template_data={"text": pdf_text[:15000]})

In [170]:
# Groq-hosted chat model whose replies are parsed into `Parser` instances.
llm_parser = ChatGroq(
    temperature=0,
    model_name="mixtral-8x7b-32768",
    api_key=groq_api_key,
).with_structured_output(Parser)

In [171]:
# Send the rendered prompt messages to the model; the reply comes back as a Parser object.
result = llm_parser.invoke(prompt.messages)

In [172]:
# Pretty-print the parsed result (devtools.pprint) for inspection.
pprint(result)

Parser(
    title='Attention is All You Need',
    authors=[
        'Ashish Vaswani∗',
        'Noam Shazeer∗',
        'Niki Parmar∗',
        'Jakob Uszkoreit∗',
        'Llion Jones∗',
        'Aidan N. Gomez∗ †',
        'Łukasz Kaiser∗',
        'Illia Polosukhin∗ ‡',
    ],
    abstract=(
        'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks tha'
        't include an encoder and a decoder. The best performing models also connect the encoder and decoder through a'
        'n attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attenti'
        'on mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation t'
        'asks show these models to be superior in quality while being more parallelizable and requiring significantly '
        'less time to train.'
    ),
    sections=[
        Section(
            section_tile='Abstract

In [176]:
# Second pass: the next slice of the paper, characters 15,000-25,000 (character
# offsets, not tokens). This rebinds `prompt`, replacing the first-chunk prompt.
prompt = Prompt(
    raw_template=raw_template,
    template_data={"text": pdf_text[15000:25000]}
)

In [177]:
# NOTE(review): identical to the `llm_parser` construction in the earlier cell;
# the existing object could be reused instead of being rebuilt here.
llm_parser = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768",api_key=groq_api_key).with_structured_output(Parser)

In [178]:
# Parse the second slice; this rebinds `result`, so the first-chunk result is no longer reachable by name.
result = llm_parser.invoke(prompt.messages)

In [179]:
# Pretty-print the parse of the second slice.
# NOTE(review): Parser requires title/authors/abstract, so for a mid-paper slice
# the model must supply values even if the slice does not contain them — verify
# these fields against the source text before trusting them.
pprint(result)

Parser(
    title='Self-Attention (restricted)',
    authors=[
        'Ashish Vaswani',
        'Noam Shazeer',
        'Niki Parmar',
        'Jakob Uszkoreit',
        'Llion Jones',
        'Aidan N. Gomez',
        'Lukasz Kaiser',
        'Illia Polosukhin',
    ],
    abstract=(
        'In this work, we introduce a new type of attention mechanism, called self-attention, which allows the model t'
        'o focus on different parts of the input sequence when producing an output. We show that self-attention can be'
        ' used to improve the performance of machine translation models, and we also demonstrate its effectiveness on '
        'other tasks such as language modeling and text classification.'
    ),
    sections=[
        Section(
            section_tile='Introduction',
        ),
        Section(
            section_tile='Self-Attention Mechanism',
        ),
        Section(
            section_tile='Machine Translation Experiments',
        ),
        Section(
   