In [1]:
import os
import re
import numpy as np
import pdfplumber

from sklearn.metrics.pairwise import cosine_similarity
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

from openai import OpenAI
from dotenv import load_dotenv


In [2]:
load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

print("API Loaded Successfully")


API Loaded Successfully


In [4]:
def extract_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

pdf_path = r"C:\MY_Folder\Github\RAG_DIfferent_Chunks_Usage\data\Sample_pdf.pdf"
text = extract_text(pdf_path)

print("Total characters:", len(text))
print(text[:1000])


Total characters: 4334
1.Starting 2. Driving mode 3. Adapter
(cid:2)
Requirements Driving mode selector Select S68, S69, S70 or S71
➟
Motor is not running
➟ (cid:2) ➟
All emergency stop buttons are Turn S42 Adapter 61" , 96", 125" or
(cid:2) unlocked to the right 110" is activated.
Main battery switch is in position I
(cid:2) ➟
Activate handbrake, Stabilisers are retracted,
Brief (cid:2) Press S43 to the right H19 is lit green 61" 96"
Move S42 for approx. 10 seconds
to centre position
125"
Instructions 110"
Driving direction selector
(cid:2)
for Push S43 forwards Caution: Functions only in drive mode!
Starting motor
(cid:2) or backwards
Turn preheater starter switch
CHAMP 70W ➟ S1 to position 1 and position 2 ➟ 4. Lifting mode
Parking brake Parking brake
(cid:2)
H13 is red, goes out
H13 is lit red Activate handbrake,
Press S43 to the right
Caution: Motor is equipped with ➟
Parking brake
preheater (temperature-dependent)!
➟
H2 yellow, goes out Caution: The parking brake is released H13 

In [5]:
def clean_text(text):
    text = re.sub(r'\(cid:\d+\)', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

cleaned_text = clean_text(text)

print("After cleaning:")
print(cleaned_text[:1000])


After cleaning:
1.Starting 2. Driving mode 3. Adapter Requirements Driving mode selector Select S68, S69, S70 or S71 ➟ Motor is not running ➟ ➟ All emergency stop buttons are Turn S42 Adapter 61" , 96", 125" or unlocked to the right 110" is activated. Main battery switch is in position I ➟ Activate handbrake, Stabilisers are retracted, Brief Press S43 to the right H19 is lit green 61" 96" Move S42 for approx. 10 seconds to centre position 125" Instructions 110" Driving direction selector for Push S43 forwards Caution: Functions only in drive mode! Starting motor or backwards Turn preheater starter switch CHAMP 70W ➟ S1 to position 1 and position 2 ➟ 4. Lifting mode Parking brake Parking brake H13 is red, goes out H13 is lit red Activate handbrake, Press S43 to the right Caution: Motor is equipped with ➟ Parking brake preheater (temperature-dependent)! ➟ H2 yellow, goes out Caution: The parking brake is released H13 is lit red automatically. Vehicle may roll! Turn preheater starter swit

In [6]:
def fixed_chunk(text, chunk_size=500, overlap=50):
    splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap
    )
    return splitter.split_text(text)

fixed_chunks = fixed_chunk(cleaned_text)

print("Fixed chunk count:", len(fixed_chunks))
print("\nSample chunk:\n")
print(fixed_chunks[0])


Fixed chunk count: 1

Sample chunk:

1.Starting 2. Driving mode 3. Adapter Requirements Driving mode selector Select S68, S69, S70 or S71 ➟ Motor is not running ➟ ➟ All emergency stop buttons are Turn S42 Adapter 61" , 96", 125" or unlocked to the right 110" is activated. Main battery switch is in position I ➟ Activate handbrake, Stabilisers are retracted, Brief Press S43 to the right H19 is lit green 61" 96" Move S42 for approx. 10 seconds to centre position 125" Instructions 110" Driving direction selector for Push S43 forwards Caution: Functions only in drive mode! Starting motor or backwards Turn preheater starter switch CHAMP 70W ➟ S1 to position 1 and position 2 ➟ 4. Lifting mode Parking brake Parking brake H13 is red, goes out H13 is lit red Activate handbrake, Press S43 to the right Caution: Motor is equipped with ➟ Parking brake preheater (temperature-dependent)! ➟ H2 yellow, goes out Caution: The parking brake is released H13 is lit red automatically. Vehicle may roll! Turn p

In [7]:
def recursive_chunk(text, chunk_size=500, overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap
    )
    return splitter.split_text(text)

recursive_chunks = recursive_chunk(cleaned_text)

print("Recursive chunk count:", len(recursive_chunks))
print("\nSample chunk:\n")
print(recursive_chunks[0])


Recursive chunk count: 9

Sample chunk:

1.Starting 2. Driving mode 3. Adapter Requirements Driving mode selector Select S68, S69, S70 or S71 ➟ Motor is not running ➟ ➟ All emergency stop buttons are Turn S42 Adapter 61" , 96", 125" or unlocked to the right 110" is activated. Main battery switch is in position I ➟ Activate handbrake, Stabilisers are retracted, Brief Press S43 to the right H19 is lit green 61" 96" Move S42 for approx. 10 seconds to centre position 125" Instructions 110" Driving direction selector for Push S43 forwards


In [8]:
def get_embedding(text):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding


In [9]:
def split_into_sentences(text):
    sentences = re.split(r'(?<=[.!?]) +', text)
    return [s.strip() for s in sentences if s.strip()]


def semantic_chunk(text, threshold=0.75):
    sentences = split_into_sentences(text)

    embeddings = [get_embedding(sentence) for sentence in sentences]

    chunks = []
    current_chunk = [sentences[0]]

    for i in range(1, len(sentences)):
        sim = cosine_similarity(
            [embeddings[i - 1]],
            [embeddings[i]]
        )[0][0]

        if sim < threshold:
            chunks.append(" ".join(current_chunk))
            current_chunk = []

        current_chunk.append(sentences[i])

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


In [10]:
semantic_chunks = semantic_chunk(cleaned_text, threshold=0.70)

print("Semantic chunk count:", len(semantic_chunks))
print("\nSample chunk:\n")
print(semantic_chunks[0])


Semantic chunk count: 25

Sample chunk:

1.Starting 2.


In [11]:
print("Fixed:", len(fixed_chunks))
print("Recursive:", len(recursive_chunks))
print("Semantic:", len(semantic_chunks))


Fixed: 1
Recursive: 9
Semantic: 25


In [12]:
def retrieve(query, chunks, top_k=3):
    chunk_embeddings = [get_embedding(c) for c in chunks]
    query_embedding = get_embedding(query)

    similarities = cosine_similarity(
        [query_embedding],
        chunk_embeddings
    )[0]

    top_indices = similarities.argsort()[-top_k:][::-1]

    return [chunks[i] for i in top_indices]


In [None]:
query = "What is the main topic of this document?"

print("----- FIXED -----")
for r in retrieve(query, fixed_chunks):
    print(r[:300], "\n")

print("----- RECURSIVE -----")
for r in retrieve(query, recursive_chunks):
    print(r[:300], "\n")

print("----- SEMANTIC -----")
for r in retrieve(query, semantic_chunks):
    print(r[:300], "\n")


----- FIXED -----
1.Starting 2. Driving mode 3. Adapter Requirements Driving mode selector Select S68, S69, S70 or S71 ➟ Motor is not running ➟ ➟ All emergency stop buttons are Turn S42 Adapter 61" , 96", 125" or unlocked to the right 110" is activated. Main battery switch is in position I ➟ Activate handbrake, Stabi 

----- RECURSIVE -----
