In [3]:
import time
import json
import numpy as np
import redis
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import tiktoken


PDF_PATH = "sample.pdf"
CHUNK_SIZES = [200, 500, 1000]
CHUNK_OVERLAPS = [0, 50, 100]
EMBEDDING_MODEL = "thenlper/gte-base"
REDIS_HOST = "localhost"
REDIS_PORT = 6379
TOP_K = 3


# defining functions #

def extract_text_from_pdf(pdf_path):
    """read pdf with pypdf"""
    reader = PdfReader(pdf_path)
    text = "\n".join([page.extract_text() or "" for page in reader.pages])
    return text


def chunk_text(text, chunk_size, overlap):
    """text chunking"""
    # tokenizer
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        chunks.append(encoding.decode(chunk))
    
    return chunks


def embed_chunks(model, chunks):
    """generate embeddings for text chunks"""
    return model.encode(chunks, normalize_embeddings=True)


def embeddings_redis(redis_client, chunks, embeddings, chunk_size, overlap):
    """store chunk embeddings in Redis with metadata"""
    prefix = f"chunk_{chunk_size}_{overlap}"
    
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        key = f"{prefix}:{i}"
        redis_client.hset(key, mapping={
            "text": chunk,
            "embedding": json.dumps(embedding.tolist()),
            "chunk_size": chunk_size,
            "overlap": overlap
        })


def search_redis(redis_client, query_embedding, chunk_size, overlap):
    """retrieve most relevant chunks for a query using cosine similarity"""
    prefix = f"chunk_{chunk_size}_{overlap}"
    
    results = []
    for key in redis_client.keys(f"{prefix}:*"):
        chunk_data = redis_client.hgetall(key)
        stored_embedding = np.array(json.loads(chunk_data["embedding"]))
        similarity = 1 - cosine(query_embedding, stored_embedding) 
        results.append((chunk_data["text"], similarity))
    
    results.sort(key=lambda x: x[1], reverse=True)
    return results[:TOP_K]


# main script #

text = extract_text_from_pdf(PDF_PATH)
print(f"Extracted text length: {len(text)} characters\n")

# embedding model
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

# connecting to Redis
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=0, decode_responses=True)

redis_client.flushdb()

for chunk_size in CHUNK_SIZES:
    for overlap in CHUNK_OVERLAPS:
        print(f"Processing with chunk_size={chunk_size}, overlap={overlap}")

        # chunk text
        chunks = chunk_text(text, chunk_size, overlap)

        # generate embeddings
        start_time = time.time()
        embeddings = embed_chunks(embedding_model, chunks)
        embedding_time = time.time() - start_time

        # store in Redis
        embeddings_redis(redis_client, chunks, embeddings, chunk_size, overlap)
        print(f" - Stored {len(chunks)} chunks in Redis in {embedding_time:.2f} sec\n")

# testing
query = "Explain logistic regression"
query_embedding = embedding_model.encode([query], normalize_embeddings=True)[0]

print("\nQUERY RESULTS:\n")
for chunk_size in CHUNK_SIZES:
    for overlap in CHUNK_OVERLAPS:
        retrieved_chunks = search_redis(redis_client, query_embedding, chunk_size, overlap)
        print(f"Chunk Size: {chunk_size}, Overlap: {overlap}")
        for chunk, score in retrieved_chunks:
            print(f" - Score: {score:.4f}, Text: {chunk[:100]}...\n")


Extracted text length: 2212 characters

Processing with chunk_size=200, overlap=0
 - Stored 3 chunks in Redis in 0.43 sec

Processing with chunk_size=200, overlap=50
 - Stored 4 chunks in Redis in 0.58 sec

Processing with chunk_size=200, overlap=100
 - Stored 5 chunks in Redis in 0.76 sec

Processing with chunk_size=500, overlap=0
 - Stored 1 chunks in Redis in 0.40 sec

Processing with chunk_size=500, overlap=50
 - Stored 2 chunks in Redis in 0.71 sec

Processing with chunk_size=500, overlap=100
 - Stored 2 chunks in Redis in 0.76 sec

Processing with chunk_size=1000, overlap=0
 - Stored 1 chunks in Redis in 0.39 sec

Processing with chunk_size=1000, overlap=50
 - Stored 1 chunks in Redis in 0.39 sec

Processing with chunk_size=1000, overlap=100
 - Stored 1 chunks in Redis in 0.39 sec


QUERY RESULTS:

Chunk Size: 200, Overlap: 0
 - Score: 0.7634, Text:  Module 3 – Moving Beyond the Relational Model 
 ●  Benefits of the relational model 
 ○  Relational...

 - Score: 0.7566, Text:  ○ 