## Chunking

! pip install transformers

In [None]:
# Alternative checkpoint, kept for reference:
# model_name = 'snowflake/snowflake-arctic-embed-m-long'

# Full Hub id of the checkpoint; the short name is the part after the
# organisation prefix.
model_full_name = "sentence-transformers/all-mpnet-base-v2"
model_name = model_full_name.rpartition("/")[2]

In [None]:
# Load the tokenizer from the full checkpoint id (model_full_name). It must
# match the embedding model so that the token counts computed below reflect
# what the model actually sees.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_full_name)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model by its short name, caching the weights locally.
# NOTE(review): trust_remote_code=True permits running custom model code
# shipped with a checkpoint; presumably a leftover from the commented-out
# snowflake model - confirm whether this checkpoint really needs it.
model = SentenceTransformer(
    model_name,
    cache_folder="./models",  # explicit cache location
    trust_remote_code=True,
)

# Embedding dimension as reported by the loaded model.
emb_dimensions = model.get_sentence_embedding_dimension()

In [None]:
# Bare expression: shows the tokenizer's repr as the notebook cell output.
tokenizer

In [None]:
# Paths for the chunking step. Everything before the first '.' of a name is
# treated as its stem, so the names below must not contain extra dots.
filename = "bfp-a3447q.pdf"
input_path = f"data/{filename.partition('.')[0]}.txt"
output_path = f"{input_path.partition('.')[0]}_v2_chunked.txt"
image_path = "data/images"

In [None]:
# Debug peek: print the text of the first record (its last field) and the
# token ids it encodes to, then stop.
# NOTE: `data` is only defined by a later cell, so that cell must have been
# executed before this one.
for chunk in data:
    text = chunk[-1]
    all_tokens = tokenizer.encode(text, add_special_tokens=False)
    print(text, all_tokens, sep="\n")
    break

In [None]:
import math
def chunk_text_by_lines(text, tokenizer, token_limit):
    r"""
    Split ``text`` into chunks of whole lines, bounded by a token budget.

    Cuts only happen between lines, so a line is never split and the ``\n``
    characters inside a chunk are preserved. Each chunk is stripped of
    leading/trailing whitespace, and whitespace-only chunks are dropped.

    Two limits are applied, using per-line token counts that are summed:

    * hard limit (``token_limit``): a line that would push the current chunk
      over the limit starts a new chunk instead;
    * soft limit (``chunk_size``): the total token count divided evenly over
      the minimum number of chunks required by ``token_limit``. A chunk is
      closed as soon as it exceeds this value, which balances chunk sizes.

    A single line that alone exceeds ``token_limit`` is kept whole.

    Args:
        text: Text to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a sized sequence with one item per token.
        token_limit: Hard token budget per chunk; must be at least 1.

    Returns:
        A tuple ``(chunks, chunk_size)``: the list of non-empty chunk strings
        and the soft limit that was used (``0`` when ``text`` has no tokens).

    Raises:
        ValueError: If ``token_limit`` is smaller than 1.
    """
    if token_limit < 1:
        raise ValueError("token_limit must be a positive number of tokens")

    total_tokens = len(tokenizer.encode(text, add_special_tokens=False))
    if total_tokens == 0:
        # Nothing to chunk; also avoids dividing by zero below.
        return [], 0

    # Soft limit: spread the tokens evenly over the minimum number of chunks.
    min_chunk_count = math.ceil(total_tokens / token_limit)
    chunk_size = math.ceil(total_tokens / min_chunk_count)

    chunks, current_chunk, current_tokens = [], [], 0

    def flush():
        # Emit the buffered lines as one chunk, skipping whitespace-only
        # buffers so callers never receive empty strings.
        piece = "".join(current_chunk).strip()
        if piece:
            chunks.append(piece)
        current_chunk.clear()

    for line in text.splitlines(keepends=True):  # keep '\n' on each line
        line_tokens = len(tokenizer.encode(line, add_special_tokens=False))

        # Hard limit. The buffer check prevents flushing an empty buffer
        # when a single line is larger than the limit on its own.
        if current_chunk and current_tokens + line_tokens > token_limit:
            flush()
            current_tokens = 0

        current_chunk.append(line)
        current_tokens += line_tokens

        # Soft limit: close the chunk once it has grown past its fair share.
        if current_tokens > chunk_size:
            flush()
            current_tokens = 0

    # Whatever is left forms the last chunk.
    if current_chunk:
        flush()

    return chunks, chunk_size

In [None]:
import re
import pathlib, json
# Load the extracted records. JSON is read explicitly as UTF-8 so the result
# does not depend on the platform's default locale encoding.
json_read = pathlib.Path(input_path).read_text(encoding="utf-8")
data = json.loads(json_read)
chunked_data = []
chunks = []
token_limit = 300
print(f'Token threshold: {token_limit}')

# Cleanup rules, applied in this order to every record's text. They are
# compiled once here instead of on every loop iteration.
cleanup_rules = [
    (re.compile(r"-<br\s*/?>"), ""),   # re-join words hyphenated across a <br>
    (re.compile(r"<br\s*/?>"), "; "),  # remaining <br> -> "; " to save tokens
    (re.compile(r" +"), " "),          # collapse runs of spaces
    (re.compile(r"\.+"), "."),         # collapse runs of dots into one
    (re.compile(r"~"), ""),            # drop tildes
]

for index, chunk in enumerate(data):
    # The last field of a record is its text; fields 0-2 are copied through
    # unchanged onto every chunk produced from that text.
    text = chunk[-1]
    for pattern, replacement in cleanup_rules:
        text = pattern.sub(replacement, text)

    chunks, size = chunk_text_by_lines(text, tokenizer, token_limit)
    chunked_data.extend(
        [chunk[0], chunk[1], chunk[2], text_chunk] for text_chunk in chunks
    )

    # Only report records that actually had to be split.
    if len(chunks) > 1:
        print(f"{index}) Text length: {len(text)}\tToken chunk size: {size}\tChunks count: {len(chunks)}\t for {chunk[1]}")


In [None]:
# Bare expression: shows the total number of chunk records as the cell output.
len(chunked_data)

In [None]:
import json
# Serialise the chunk records as indented JSON and store them at output_path,
# replacing any previous content of that file.
json_file = json.dumps(chunked_data, indent=2)
with open(output_path, 'w+') as out_file:
    out_file.write(json_file)

In [None]:
# Paths for the context step. Everything before the first '.' of `filename`
# is used as the stem, so the folder and file name must not contain extra
# dots.
data_folder = "data/"
filename = f"{data_folder}bfp-a3447q.pdf"
content_path = f"{filename.partition('.')[0]}_chunked.txt"
context_path = f"{filename.partition('.')[0]}_context.txt"
image_path = "data/images"

In [None]:
import pathlib, json
# Read the context file explicitly as UTF-8 so decoding does not depend on
# the platform's default locale encoding.
json_read = pathlib.Path(context_path).read_text(encoding="utf-8")
# JSON object keys are always strings; convert them back to integers.
data_context = {int(key): value for key, value in json.loads(json_read).items()}

In [None]:
# Report every context entry that encodes to more than 600 tokens and keep
# the largest such count in max_len (it stays 0 when no entry is that long).
max_len = 0
for k, v in data_context.items():
    token_count = len(tokenizer.encode(v, add_special_tokens=False))
    if token_count <= 600:
        continue
    max_len = max(max_len, token_count)
    print(f'For chapter no: {k}, token count is {token_count}')

In [None]:
import json
# Serialise the context mapping as indented JSON and write it back to
# context_path, replacing the file's previous content. The integer keys are
# emitted as JSON strings by json.dumps.
json_file = json.dumps(data_context, indent=2)
with open(context_path, 'w+') as out_file:
    out_file.write(json_file)