In [1]:
import io
from pdfminer.high_level import extract_text
import tiktoken

In [2]:
def count_tokens(text: str):
    # Encoding for GPT-3 and later models
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text, disallowed_special=())
    num_tokens = len(tokens)
    return num_tokens

# Read the PDF
Let's read the PDF and see how many tokens it has. We'll also print out a snippet to see what it looks like.

In [14]:
pdf_path = '/Users/nicholasking/Library/CloudStorage/OneDrive-Personal/Computer Science/Papers/ML/Yi- Open Foundation Models by 01AI.pdf'
pdf_text = extract_text(pdf_path)

print(f"PDF file has {len(pdf_text)} chars, {count_tokens(pdf_text)} tokens")
print(pdf_text[:1000] + "...")

PDF file has 87764 chars, 24231 tokens
Yi: Open Foundation Models by 01.AI

01.AI

Code: https://github.com/01-ai/Yi
Model: https://huggingface.co/01-ai

Abstract

We introduce the Yi model family, a series of language and multimodal models that
demonstrate strong multi-dimensional capabilities. The Yi model family is based
on 6B and 34B pretrained language models, then we extend them to chat models,
200K long context models, depth-upscaled models, and vision-language models.
Our base models achieve strong performance on a wide range of benchmarks like
MMLU, and our finetuned chat models deliver strong human preference rate on
major evaluation platforms like AlpacaEval and Chatbot Arena. Building upon our
scalable super-computing infrastructure and the classical transformer architecture,
we attribute the performance of Yi models primarily to its data quality resulting
from our data-engineering efforts. For pretraining, we construct 3.1 trillion tokens
of English and Chinese corpora usi

# Summarize the PDF with an Guidance
Next, we'll prompt an LLM to generate a summary of the PDF using Guidance. 

In [3]:
import os
from guidance import models

azure_model = os.getenv("AZUREAI_CHAT_MODEL", "Please set the model")
azure_endpoint = os.getenv("AZUREAI_CHAT_ENDPOINT", "Please set the endpoint")
azure_api_key=os.getenv("AZUREAI_CHAT_KEY", "Please set API key")

gpt4 = models.AzureOpenAI(
    model=azure_model,
    azure_endpoint=azure_endpoint,
    api_key=azure_api_key
)

In [None]:
from guidance import system, user, assistant, gen

gpt4_summary = gpt4.copy()

with system():
    gpt4_summary += "You are a reading and summarization assistant."

with user():
    gpt4_summary += f"Please summarize the following machine learning paper for me.\n\n{pdf_text}"

with assistant():
    gpt4_summary += gen(max_tokens=2000, temperature=0.7)

gpt4_summary

# Read the paper as HTML
Try reading the paper using HTML layout and see how it changes the quality of the summary.

In [7]:
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
html_stringio = StringIO()
with open(pdf_path, 'rb') as fin:
    extract_text_to_fp(fin, html_stringio, laparams=LAParams(),
                       output_type='html', codec=None)

html_string = html_stringio.getvalue()

print(f"PDF -> HTML has {len(html_string)} chars, {count_tokens(html_string)} tokens")
print(html_string[:1000] + "...")

NameError: name 'pdf_path' is not defined

In [10]:
# There are too many tokens in the HTML, so we need to clean it up
# Also see: https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#using-pdfminer-to-generate-html-text

from bs4 import BeautifulSoup, NavigableString, Comment

def is_empty_element(elem):
    if isinstance(elem, (NavigableString, Comment)):
        # Skip NavigableString and Comment objects; they're not containers
        return False
    # Check if a tag is empty or contains only whitespace
    text_content = ''.join(elem.stripped_strings)
    return not text_content and all(is_empty_element(child) for child in elem.children)

def remove_empty_elements(soup):
    for elem in list(soup.find_all(True)):  # Find all tags
        if is_empty_element(elem) and elem.name != 'body':  # Avoid removing the body tag
            elem.decompose()

def minify_html(html_string):
    # Assuming `html_string` is your original HTML content
    soup = BeautifulSoup(html_string, 'html.parser')

    # Remove all <br> tags
    for br in soup.find_all("br"):
        br.decompose()

    # Iterate over all tags and clear their attributes
    for tag in soup.find_all(True):
        tag.attrs = {}

    # Remove empty elements
    remove_empty_elements(soup)

    # Convert the soup object back to a string without extra attributes, without <br> tags, and without empty elements
    return str(soup)



In [None]:
minified_html = minify_html(html_string)
print(f"Cleaned HTML has {len(minified_html)} chars, {count_tokens(minified_html)} tokens")
print(minified_html[:1000] + "...")

In [None]:
from guidance import system, user, assistant, gen

gpt4_summary_html = gpt4.copy()

with system():
    gpt4_summary_html += "You are a reading and summarization assistant."

with user():
    gpt4_summary_html += f"Please summarize the following machine learning paper for me.\n\n{minified_html}"

with assistant():
    gpt4_summary_html += gen(max_tokens=2000, temperature=0.7)

gpt4_summary_html

In [11]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to fetch
url = "https://arxiv.org/html/2403.04652v1"

# Fetch the HTML content of the webpage
response = requests.get(url)
html_content = response.text

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Select the <article> element and all its children
article_element = soup.find('article')

# Check if the article element exists and stringify it
if article_element:
    article_html = str(article_element)
else:
    article_html = "Article element not found"

# article_html now contains the stringified HTML of the <article> element and its children
print(f"HTML has {len(article_html)} chars, {count_tokens(article_html)} tokens")
print(article_html[:500])  # Print the first 500 characters to verify


HTML has 297354 chars, 114164 tokens
<article class="ltx_document ltx_authors_1line">
<h1 class="ltx_title ltx_title_document">Yi: Open Foundation Models by 01.AI </h1>
<div class="ltx_authors">
<span class="ltx_creator ltx_role_author">
<span class="ltx_personname">
<span class="ltx_text ltx_font_bold" id="id1.1.id1">01.AI</span>
<br class="ltx_break"/>â 
<br class="ltx_break"/><span class="ltx_text ltx_font_bold" id="id2.2.id2">Code:</span>â <a class="ltx_ref ltx_url ltx_font_typewriter" href="https://github.com/01-ai/Yi" tit


In [12]:
article_minified_html = minify_html(article_html)
print(f"Cleaned HTML has {len(article_minified_html)} chars, {count_tokens(article_minified_html)} tokens")
print(article_minified_html[:500] + "...")

Cleaned HTML has 123612 chars, 38154 tokens
<article>
<h1>Yi: Open Foundation Models by 01.AI </h1>
<div>
<span>
<span>
<span>01.AI</span>
â 
<span>Code:</span>â <a>https://github.com/01-ai/Yi</a>
<span>Model:</span>â <a>https://huggingface.co/01-ai</a>

</span><span>
<span>
</span></span></span>
</div>
<div>
<h6>Abstract</h6>
<p>We introduce the Yi model family, a series of language and multimodal models that demonstrate strong multi-dimensional capabilities.
The Yi model family is based on 6B and 34B pretrained language models, th...


In [None]:
from guidance import system, user, assistant, gen

gpt4_summary_html = gpt4.copy()

with system():
    gpt4_summary_html += "You are a reading and summarization assistant."

with user():
    gpt4_summary_html += f"Please summarize the following machine learning paper for me.\n\n{article_minified_html}"

with assistant():
    gpt4_summary_html += gen(max_tokens=4000, temperature=0.9)

gpt4_summary_html

# Test with local model

In [26]:
yi = models.LlamaCppChat("/Users/nicholasking/code/models/yi-34b-200k-llamafied.Q4_K_S.gguf", n_gpu_layers=-1, n_ctx=64000)

In [27]:
yi + "Hello, " + gen(max_tokens=100, temperature=0.9)

In [None]:
yi_chat = yi.copy()

with system():
    yi_chat += "You are a reading and summarization assistant."

with user():
    yi_chat += f"Please summarize the following machine learning paper for me.\n\n{article_minified_html}"

with assistant():
    # Fails with error message that context window is too large
    yi_chat += gen(max_tokens=500, temperature=0.9)