In [1]:
from langchain_community.document_loaders import PDFMinerLoader
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
from typing import Literal
from pathlib import Path

In [3]:
from io import StringIO
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

# Render the PDF as HTML into an in-memory text buffer; a later cell reads it
# back with output_string.getvalue().
output_string = StringIO()
# NOTE(review): this is a path relative to the working directory, whereas PATH
# (defined in a later cell) is an absolute path ending in the same file name.
with open("2308.04014v2.pdf", "rb") as fin:
    # output_type="html" requests HTML output instead of plain text.
    # NOTE(review): codec=None presumably keeps the output as str so that it can
    # be written to a StringIO -- confirm against the pdfminer documentation.
    extract_text_to_fp(
        fin, output_string, laparams=LAParams(), output_type="html", codec=None
    )

In [10]:
text = output_string.getvalue().strip()

In [11]:
len(text)

301245

In [6]:
PATH = r"C:\Users\tomas\Downloads\2308.04014v2.pdf"

In [8]:
# Load the PDF at PATH through LangChain's PDF-as-HTML loader and keep the first
# document it returns. In the cells shown in this notebook, `data` is only read
# for `data.metadata` (merged into section metadata further down).
loader = PDFMinerPDFasHTMLLoader(PATH)
data = loader.load()[0]

In [21]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(text, "html.parser")
content = soup.find_all("div")

In [22]:
import re

# Merge consecutive <div> elements that share a font size into
# (text, font_size) snippets. A div is skipped when it has no <span>, when the
# first <span> has no style attribute, or when that style has no
# "font-size:<N>px" declaration.

# Raw string: "\d" in a plain string literal is an invalid escape sequence and
# triggers a warning on recent Python versions. Compiled once instead of being
# looked up on every loop iteration.
_FONT_SIZE_RE = re.compile(r"font-size:(\d+)px")

cur_fs = None  # font size of the run being accumulated; None until the first match
cur_text = ""
snippets = []  # one (text, font_size) tuple per run of same-sized divs
for c in content:
    sp = c.find("span")
    if not sp:
        continue
    st = sp.get("style")
    if not st:
        continue
    match = _FONT_SIZE_RE.search(st)
    if not match:
        continue
    fs = int(match.group(1))
    # Compare against None explicitly so that a parsed size of 0 is not
    # mistaken for "no run started yet".
    if cur_fs is None:
        cur_fs = fs
    if fs == cur_fs:
        cur_text += c.text
    else:
        # Font size changed: close the current run and start a new one.
        snippets.append((cur_text, cur_fs))
        cur_fs = fs
        cur_text = c.text
# Flush the last run, but only if at least one div carried a font size;
# otherwise an empty ("", None) snippet would be emitted.
if cur_fs is not None:
    snippets.append((cur_text, cur_fs))
# Note: the grouping above is deliberately simple. Further strategies could be
# added, such as dropping duplicate snippets: headers and footers in a PDF repeat
# on multiple pages, so duplicated snippets are most likely redundant.

In [15]:
from langchain_community.docstore.document import Document

cur_idx = -1  # position of the section currently being filled; -1 until one exists
semantic_snippets = []
# Working assumption: a heading is set in a larger font than the text below it.
for s in snippets:
    snippet_text, snippet_font = s
    current = semantic_snippets[cur_idx] if semantic_snippets else None

    if current is None or snippet_font > current.metadata["heading_font"]:
        # Either the very first snippet, or one larger than the current
        # section's heading: open a new section with this snippet as its
        # heading, and merge in the loader's document metadata.
        metadata = {
            "heading": snippet_text,
            "content_font": 0,
            "heading_font": snippet_font,
        }
        metadata.update(data.metadata)
        semantic_snippets.append(Document(page_content="", metadata=metadata))
        cur_idx += 1
    elif (
        not current.metadata["content_font"]
        or snippet_font <= current.metadata["content_font"]
    ):
        # The section has no body yet, or the snippet is no larger than the
        # body seen so far: treat it as more body text for the same section.
        # (Sub-sections could be modelled as a tree instead, but that would
        # likely be data specific.)
        current.page_content += snippet_text
        current.metadata["content_font"] = max(
            snippet_font, current.metadata["content_font"]
        )
    else:
        # Larger than the body but not larger than the heading: start a new
        # section anyway, so that e.g. the document title (largest font) does
        # not swallow every following section.
        # NOTE(review): unlike the first branch, the loader metadata is NOT
        # merged here (the update call was commented out in the original).
        metadata = {
            "heading": snippet_text,
            "content_font": 0,
            "heading_font": snippet_font,
        }
        semantic_snippets.append(Document(page_content="", metadata=metadata))
        cur_idx += 1

In [18]:
semantic_snippets

[Document(page_content='Kshitij Gupta * 1 2 Benjamin Th´erien * 1 2 Adam Ibrahim * 1 2 Mats L. Richter 1 2 Quentin Anthony 1 2 3\nEugene Belilovsky 4 1 2 Irina Rish 1 2 Timoth´ee Lesort 1 2\n', metadata={'heading': 'Continual Pre-Training of Large Language Models: How to (re)warm your\nmodel?\n', 'content_font': 9, 'heading_font': 14, 'source': 'C:\\Users\\tomas\\Downloads\\2308.04014v2.pdf'}),
 Document(page_content='3\n2\n0\n2\np\ne\nS\n6\n]\nL\nC\n.\ns\nc\n[\n2\nv\n4\n1\n0\n4\n0\n.\n8\n0\n3\n2\n:\nv\ni\nX\nr\na\nLarge language models (LLMs) are routinely pre-\ntrained on billions of tokens, only to restart the\nprocess over again once new data becomes avail-\nable. A much cheaper and more efficient solution\nwould be to enable the continual pre-training of\nthese models, i.e. updating pre-trained models\nwith new data instead of re-training them from\nscratch. However, the distribution shift induced\nby novel data typically results in degraded per-\nformance on past data. Taking a s

In [16]:
from ice.paper import Paragraph, Section, Paper

  from .autonotebook import tqdm as notebook_tqdm


In [83]:
SectionType = Literal["abstract", "main", "back"]

In [84]:
def snippet_to_paragraph_parser(snippets):
    """Wrap a sequence of section snippets into a single ``Paper``.

    Each snippet must expose ``page_content`` and a ``metadata`` mapping with a
    ``"heading"`` entry. One ``Paragraph`` is produced per snippet:

    * ``sentences`` is a one-element list holding the whole ``page_content``
      (the text is not split into individual sentences);
    * ``sections`` is a one-element list with a ``Section`` titled by the heading;
    * ``sectionType`` is whatever ``section_type_parser`` returns for the heading.

    Returns a ``Paper`` whose ``paragraphs`` follow the order of ``snippets``.
    """

    def _as_paragraph(snippet):
        # Build the Paragraph for one snippet from its heading and body text.
        heading = snippet.metadata["heading"]
        return Paragraph(
            sentences=[snippet.page_content],
            sections=[Section(title=heading)],
            sectionType=section_type_parser(heading),
        )

    return Paper(paragraphs=[_as_paragraph(snippet) for snippet in snippets])

In [85]:
def section_type_parser(section_str: str) -> SectionType:
    """Classify a section heading by case-insensitive keyword lookup.

    Returns ``"abstract"`` if the heading contains "abstract", otherwise
    ``"back"`` if it contains "references", otherwise ``"main"``. The
    "abstract" check takes precedence when both keywords are present.
    """
    lowered = section_str.lower()
    # Ordered (keyword, label) pairs; the first keyword found decides the label.
    keyword_labels = (("abstract", "abstract"), ("references", "back"))
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return "main"

In [86]:
paper = snippet_to_paragraph_parser(semantic_snippets)

In [90]:
paper.paragraphs[0].sentences[0]

'Kshitij Gupta * 1 2 Benjamin Th´erien * 1 2 Adam Ibrahim * 1 2 Mats L. Richter 1 2 Quentin Anthony 1 2 3\nEugene Belilovsky 4 1 2 Irina Rish 1 2 Timoth´ee Lesort 1 2\n'

In [24]:
from dotenv import load_dotenv,dotenv_values

In [41]:
load_dotenv(dotenv_path="../.ought-ice/.env", override=True)

True

In [94]:
paper.paragraphs[0].sentences[1]

IndexError: list index out of range

In [65]:
from openai import OpenAI

# NOTE(review): OpenAI() is constructed without arguments, so credentials are
# presumably taken from the environment (e.g. populated by the load_dotenv call
# in another cell) -- confirm before running on a fresh kernel.
client = OpenAI()

# Request a single choice (n=1) limited to one generated token (max_tokens=1),
# with log-probabilities enabled: logprobs=True together with top_logprobs=5
# asks for the five most likely alternatives at each token position.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-3.5-turbo",
    top_p=1,
    n=1,
    logprobs=True,
    max_tokens=1,
    top_logprobs=5,
)

In [67]:
chat_completion.choices[0].logprobs.content

[ChatCompletionTokenLogprob(token='This', bytes=[84, 104, 105, 115], logprob=-0.00020735491, top_logprobs=[TopLogprob(token='This', bytes=[84, 104, 105, 115], logprob=-0.00020735491), TopLogprob(token='"This', bytes=[34, 84, 104, 105, 115], logprob=-9.357522), TopLogprob(token='Sure', bytes=[83, 117, 114, 101], logprob=-9.853229), TopLogprob(token=' This', bytes=[32, 84, 104, 105, 115], logprob=-10.9564085), TopLogprob(token='this', bytes=[116, 104, 105, 115], logprob=-11.682382)])]

In [71]:
bytes(tokens[0].bytes).decode("utf-8")

'This'

In [68]:
from IPython.display import display, HTML

from html import escape  # stdlib; neutralises markup inside model-generated text

# Per-token log-probability entries of the first choice.
tokens = chat_completion.choices[0].logprobs.content

# Palette cycled through so that adjacent tokens get different colours.
colors = [
    "#FF00FF",  # Magenta
    "#008000",  # Green
    "#FF8C00",  # Dark Orange
    "#FF0000",  # Red
    "#0000FF",  # Blue
]

color_idx = 0  # position in the palette for the next token
html_output = ""  # accumulated <span> markup, one span per token
for t in tokens:
    if t.bytes is None:
        # No byte representation available for this token: fall back to its
        # string form instead of failing on bytes(None).
        token_str = t.token
    else:
        # A single token may hold only part of a multi-byte UTF-8 character, so
        # decode leniently instead of raising UnicodeDecodeError.
        token_str = bytes(t.bytes).decode("utf-8", errors="replace")

    # Escape the token text so that characters such as "<" or "&" are shown
    # literally rather than being interpreted as HTML.
    html_output += f"<span style='color: {colors[color_idx]}'>{escape(token_str)}</span>"

    # Advance to the next colour, wrapping around at the end of the palette.
    color_idx = (color_idx + 1) % len(colors)
display(HTML(html_output))  # render the coloured tokens in the notebook
print(f"Total number of tokens: {len(tokens)}")

Total number of tokens: 1


In [80]:
[truncate_message(x.page_content) for x in semantic_snippets]

[True, True, True, True, True, True, True, True, False, True, True]

In [1]:
from ice.utils import make_gpt2_tokenizer

In [2]:
tokenizer = make_gpt2_tokenizer()

In [4]:
tokenizer.encode("this is a test")

[5661, 318, 257, 1332]