In [1]:
# Perform Google Colab installs (if running in Google Colab)
import os

if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, installing requirements.")
    !pip install -U torch # requires torch 2.1.1+ (for efficient sdpa implementation)
    !pip install PyMuPDF # for reading PDFs with Python
    !pip install tqdm # for progress bars
    !pip install sentence-transformers # for embedding models
    !pip install accelerate # for quantization model loading
    !pip install bitsandbytes # for quantizing models (less storage space)
    !pip install flash-attn --no-build-isolation # for faster attention mechanism = faster LLM inference

[INFO] Running in Google Colab, installing requirements.
Collecting torch
  Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cuf

In [2]:
import fitz
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm"
import re
import pandas as pd
import random
import numpy as np
import torch

## Import cleaned/preprocessed pdf:

In [3]:
pdf_path = 'utitlites.pdf'

## Parse the utitlities.pdf

In [4]:

def parse_metadata(text):
    """Uses regex to find and parse metadata that matches the specific pattern."""
    metadata_regex = r"Ticker: (\w+), Sector: ([\w\s]+), Filed At: (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}-\d{2}:\d{2})"
    match = re.search(metadata_regex, text)
    if match:
        return {
            "ticker": match.group(1),
            "sector": match.group(2),
            "filing_date": match.group(3)
        }
    return {"ticker": "", "sector": "", "filing_date": ""}

def text_formatter(text):
    """Replaces newlines with spaces and strips unnecessary whitespace."""
    return text.replace('\n', ' ').strip()

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, collects statistics,
    and parses metadata from the first line that matches a specific pattern.
    """
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in enumerate(doc):
        text = page.get_text()
        formatted_text = text_formatter(text)

        # Extract and parse metadata using the updated function
        metadata = parse_metadata(formatted_text)

        pages_and_texts.append({
            "page_number": page_number,
            "page_char_count": len(formatted_text),
            "page_word_count": len(formatted_text.split()),
            "page_sentence_count_raw": len(formatted_text.split(". ")),
            "page_token_count": len(formatted_text.split()) / 4,
            "ticker": metadata['ticker'],
            "sector": metadata['sector'],
            "filing_date": metadata['filing_date'],
            "text": formatted_text
        })
    return pages_and_texts


In [5]:
pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

In [6]:
df = pd.DataFrame(pages_and_texts)
df

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,ticker,sector,filing_date,text
0,0,5097,743,34,185.75,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023..."
1,1,3631,513,19,128.25,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023..."
2,2,3621,502,17,125.50,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023..."
3,3,3674,540,19,135.00,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023..."
4,4,3719,574,19,143.50,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023..."
...,...,...,...,...,...,...,...,...,...
1489,1489,3695,515,18,128.75,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023..."
1490,1490,3742,555,26,138.75,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023..."
1491,1491,3729,557,22,139.25,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023..."
1492,1492,3682,524,25,131.00,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023..."


## Split into Sentences using sPacy:

In [7]:
from spacy.lang.en import English # see https://spacy.io/usage for install instructions

nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7b76c0aadf40>

In [8]:
for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)

    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # Count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1494 [00:00<?, ?it/s]

In [9]:
df = pd.DataFrame(pages_and_texts)
df

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,ticker,sector,filing_date,text,sentences,page_sentence_count_spacy
0,0,5097,743,34,185.75,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023...","[Ticker: LNT, Sector: Utilities, Filed At: 202...",32
1,1,3631,513,19,128.25,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023...","[Ticker: LNT, Sector: Utilities, Filed At: 202...",19
2,2,3621,502,17,125.50,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023...","[Ticker: LNT, Sector: Utilities, Filed At: 202...",17
3,3,3674,540,19,135.00,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023...","[Ticker: LNT, Sector: Utilities, Filed At: 202...",19
4,4,3719,574,19,143.50,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023...","[Ticker: LNT, Sector: Utilities, Filed At: 202...",19
...,...,...,...,...,...,...,...,...,...,...,...
1489,1489,3695,515,18,128.75,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023...","[Ticker: XEL, Sector: Utilities, Filed At: 202...",18
1490,1490,3742,555,26,138.75,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023...","[Ticker: XEL, Sector: Utilities, Filed At: 202...",26
1491,1491,3729,557,22,139.25,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023...","[Ticker: XEL, Sector: Utilities, Filed At: 202...",22
1492,1492,3682,524,25,131.00,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023...","[Ticker: XEL, Sector: Utilities, Filed At: 202...",25


In [10]:
random.sample(pages_and_texts, k=1)

[{'page_number': 706,
  'page_char_count': 3706,
  'page_word_count': 578,
  'page_sentence_count_raw': 25,
  'page_token_count': 144.5,
  'ticker': 'NFE',
  'sector': 'Utilities',
  'filing_date': '2023-03-01T13:46:49-05:00',
  'text': 'Ticker: NFE, Sector: Utilities, Filed At: 2023-03-01T13:46:49-05:00 our property or result from our operations. Our current operations and future projects  are subject to the inherent risks associated with construction of energy-related  infrastructure, LNG, natural gas, power and maritime operations, shipping and  transportation of hazardous substances, including explosions, pollution, release of toxic  substances, fires, seismic events, hurricanes and other adverse weather conditions,  acts of aggression or terrorism, and other risks or hazards, each of which could result in  significant delays in commencement or interruptions of operations and/or result in  damage to or destruction of the facilities, liquefaction facilities and assets or damage to  

## Group the sentences into chunks:
### Make sure each chunk

In [11]:
import re
from tqdm import tqdm

# Assuming an average of 1 token = 4 characters, but you'll adjust based on your actual tokenization logic
TOKEN_CHAR_RATIO = 4
MAX_TOKEN_COUNT = 384  # Maximum token count per chunk

def split_list_by_token_count(sentences: list[str], max_token_count: int) -> list[list[str]]:
    """
    Splits a list of sentences into sublists, ensuring the total token count of each sublist
    is as close as possible to the max_token_count without exceeding it.
    """
    chunks = []
    current_chunk = []
    current_char_count = 0

    for sentence in sentences:
        sentence_char_count = len(sentence)
        if current_char_count + sentence_char_count <= max_token_count * TOKEN_CHAR_RATIO:
            current_chunk.append(sentence)
            current_char_count += sentence_char_count
        else:
            chunks.append(current_chunk)
            current_chunk = [sentence]
            current_char_count = sentence_char_count
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Now, instead of a fixed number of sentences per chunk, split based on token count
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    # Dynamically split sentences into chunks based on token count
    item["sentence_chunks"] = split_list_by_token_count(item["sentences"], MAX_TOKEN_COUNT)

    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        chunk_dict["page_number"] = item["page_number"]
        chunk_dict["ticker"] = item["ticker"]
        chunk_dict["sector"] = item["sector"]
        chunk_dict["filing_date"] = item["filing_date"]

        # Join sentences to form a paragraph-like chunk
        joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)  # Improving punctuation spacing

        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / TOKEN_CHAR_RATIO  # Approximate token count

        pages_and_chunks.append(chunk_dict)

# Display the total number of chunks created
total_chunks = len(pages_and_chunks)



100%|██████████| 1494/1494 [00:00<00:00, 9678.74it/s] 


In [12]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks)
df

Unnamed: 0,page_number,ticker,sector,filing_date,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,0,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023...",1458,212,364.50
1,0,LNT,Utilities,2023-02-24T16:03:25-05:00,2) WPL - is a public utility engaged principal...,1298,190,324.50
2,0,LNT,Utilities,2023-02-24T16:03:25-05:00,TABLE_START 3 ##TABLE_ENDTable of Co ntents Co...,1376,204,344.00
3,0,LNT,Utilities,2023-02-24T16:03:25-05:00,"Employees - At December 31, 2022, Alliant Ener...",914,138,228.50
4,1,LNT,Utilities,2023-02-24T16:03:25-05:00,"Ticker: LNT, Sector: Utilities, Filed At: 2023...",1415,200,353.75
...,...,...,...,...,...,...,...,...
4492,1491,XEL,Utilities,2023-02-23T14:54:01-05:00,Changes in environmental policies and regulati...,745,115,186.25
4493,1492,XEL,Utilities,2023-02-23T14:54:01-05:00,"Ticker: XEL, Sector: Utilities, Filed At: 2023...",1470,219,367.50
4494,1492,XEL,Utilities,2023-02-23T14:54:01-05:00,We establish strategies and expectations relat...,1494,209,373.50
4495,1492,XEL,Utilities,2023-02-23T14:54:01-05:00,"More frequent and severe drought conditions, e...",675,96,168.75


## Remove chunks below a threshold

In [13]:
# Show random chunks with under 20 tokens in length
min_token_length = 20
for row in df[df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 18.5 | Text: Clean Water Act The 1972 amendments to the Federal Water Pollution Control
Chunk token count: 0.0 | Text: 
Chunk token count: 0.0 | Text: 
Chunk token count: 0.0 | Text: 
Chunk token count: 0.0 | Text: 


In [14]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:3]

[{'page_number': 0,
  'ticker': 'LNT',
  'sector': 'Utilities',
  'filing_date': '2023-02-24T16:03:25-05:00',
  'sentence_chunk': 'Ticker: LNT, Sector: Utilities, Filed At: 2023-02-24T16:03:25-05:00 ITEM 1. BUSINESS A. GENERAL Alliant Energy maintains its principal executive offices in Madison, Wisconsin. Alliant Energy operates as a regulated investor-owned public utility holding company, and its purpose-driven strategy is to serve its customers and build stronger communities. Alliant Energys primary focus is to provide regulated electric and natural gas service to approximately 995,000 electric and approximately 425,000 natural gas customers in the Midwest through its two public utility subsidiaries, IPL and WPL. The primary first tier wholly-owned subsidiaries of Alliant Energy are as follows: 1) IPL - is a public utility engaged principally in the generation and distribution of electricity and the distribution and transportation of natural gas to retail customers in select markets 

In [15]:
# Get stats about our chunks
df = pd.DataFrame(pages_and_chunks_over_min_token_len)
df.to_csv("pages_and_chunks_over_min_token_len.csv", index=False)