In [1]:
"""
This notebook loads the cleaned lecture text, splits it into chunks,
creates embeddings, and saves them into a FAISS vector database.
"""

'\nThis notebook loads the cleaned lecture text, splits it into chunks,\ncreates embeddings, and saves them into a FAISS vector database.\n'

In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import os
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Config ---
# Relative paths are resolved against the current working directory.
TEXT_DIR = Path("../data")            # where the cleaned lecture text is read from
VECTOR_DIR = Path("../vectorstore")   # where the FAISS index and chunk dump are written
VECTOR_DIR.mkdir(parents=True, exist_ok=True)

# Chunk length in whitespace-separated WORDS (not characters or tokens).
# all-MiniLM-L6-v2 truncates inputs longer than 256 word pieces, so chunks must
# stay well below that or the tail of every chunk is silently dropped from its
# embedding. ~180 words leaves headroom for sub-word tokenization.
CHUNK_SIZE = 180
EMBED_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'  # Lightweight and fast

In [3]:
# --- Load model ---
# Instantiate the embedding model named by EMBED_MODEL once, at module level.
# embed_chunks() reads this global `model`, so this cell must run before it is called.
model = SentenceTransformer(EMBED_MODEL)


In [4]:
# --- Helper functions ---
def chunk_text(text, chunk_size=CHUNK_SIZE):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are obtained with ``str.split()`` (any run of whitespace is a
    separator), so the original whitespace and line breaks are not preserved;
    the words of each chunk are re-joined with single spaces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk. Defaults to CHUNK_SIZE.

    Returns:
        A list of chunk strings in document order. The final chunk may hold
        fewer than ``chunk_size`` words; text with no words yields ``[]``.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

def embed_chunks(chunks):
    """Embed ``chunks`` with the module-level ``model``.

    Args:
        chunks: The chunk strings to encode (as produced by ``chunk_text``).

    Returns:
        Whatever ``model.encode`` returns for ``chunks`` -- presumably one
        embedding vector per chunk, in input order (TODO confirm against the
        sentence-transformers version in use).
    """
    # A progress bar is shown while batches are encoded.
    vectors = model.encode(chunks, show_progress_bar=True)
    return vectors

In [None]:
# --- Process and Store ---
def build_faiss_index(text_file):
    """Chunk a text file, embed the chunks, and persist a FAISS index.

    Reads ``text_file`` as UTF-8, splits it with ``chunk_text``, embeds the
    chunks with ``embed_chunks``, and writes two artifacts into ``VECTOR_DIR``:

    * ``ctse_faiss.index`` -- a flat L2 FAISS index built from the embeddings.
    * ``chunks.txt`` -- the chunk texts, in the order they were embedded,
      separated by blank lines.

    Args:
        text_file: Path (``str`` or ``pathlib.Path``) of the text file to index.

    Raises:
        ValueError: If the file yields no chunks (empty or whitespace-only).
            Without this check the failure would surface later as an opaque
            ``IndexError`` when reading the embedding dimension.
    """
    with open(text_file, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    chunks = chunk_text(raw_text)
    if not chunks:
        # Fail before the (expensive) embedding step with an actionable message.
        raise ValueError(
            f"No text to index in {text_file}: file is empty or whitespace-only"
        )

    embeddings = embed_chunks(chunks)

    # The index dimensionality is taken from the first embedding vector.
    dim = embeddings[0].shape[0]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    index_path = VECTOR_DIR / "ctse_faiss.index"
    faiss.write_index(index, str(index_path))  # str(): a plain string path is passed to faiss
    print(f"FAISS index saved to {index_path}")

    # Save chunks for reference. Chunks are built from whitespace-split words
    # re-joined with single spaces, so they contain no newlines and a blank
    # line is an unambiguous separator.
    with open(VECTOR_DIR / "chunks.txt", 'w', encoding='utf-8') as f:
        f.write("\n\n".join(chunks))


In [6]:
# --- Example Usage ---
# Build the index from the extracted lecture notes in TEXT_DIR; this writes
# ctse_faiss.index and chunks.txt into VECTOR_DIR.
build_faiss_index(TEXT_DIR / "ctse_lecture_notes_extracted.txt")

Batches: 100%|██████████| 1/1 [00:04<00:00,  4.19s/it]

FAISS index saved to ..\vectorstore\ctse_faiss.index



