In [1]:
# %% [markdown]
# # Build IT Lecture Knowledge Base (KB)
# 
# Steps:
# 1. Load seed IT lecture dataset (JSONL)
# 2. Chunk transcripts into small pieces
# 3. Generate embeddings (Gemini API if available, else Sentence Transformers)
# 4. Save chunks + embeddings + metadata for retrieval


In [2]:
# %% imports
import os, json
from pathlib import Path
import numpy as np
import pandas as pd


In [3]:
# %% paths
# Every artifact of the KB build lives in one directory, given relative to the
# current working directory.
KB_DIR = Path("../data/kb")
KB_DIR.mkdir(parents=True, exist_ok=True)  # exist_ok: re-running the cell is harmless

SEED = KB_DIR.joinpath("it_lectures_seed.jsonl")  # input: raw lecture dataset (JSONL)
CHUNKS = KB_DIR.joinpath("kb_chunks.jsonl")       # output: processed chunks (JSONL)
EMB_FILE = KB_DIR.joinpath("kb_embeddings.npy")   # output: embedding matrix
META = KB_DIR.joinpath("kb_meta.json")            # output: metadata about the embeddings


In [4]:
# %% helper: chunker
def chunk_text(text: str, max_words: int = 120) -> list:
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields, so any run of whitespace
    (spaces, tabs, newlines) separates words and is collapsed to a single
    space inside each chunk. Chunks do not overlap; the last one may be shorter.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk. Must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    # A negative step would make range() empty and silently discard all text,
    # and a zero step fails with an unrelated-looking range() error, so reject
    # both up front with a clear message.
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words!r}")

    words = text.split()
    # split() never yields empty strings, so every slice joins to a non-empty,
    # already-trimmed chunk.
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]


In [5]:
# %% load seed data
# Read the seed dataset: one JSON object per line (JSONL).
rows = []
with open(SEED, "r", encoding="utf-8") as f:
    for line in f:
        # Blank or whitespace-only lines carry no record, and json.loads would
        # raise on them, so skip them instead of aborting the whole load.
        if not line.strip():
            continue
        rows.append(json.loads(line))
df = pd.DataFrame(rows)
df.head()


Unnamed: 0,id,course,topic_tags,transcript,gold_outline
0,ml_basics_01,Machine Learning,"[supervised, unsupervised, reinforcement]",Machine learning is about learning patterns fr...,"{'title': 'ML Basics', 'topics': ['Supervised'..."
1,db_norm_01,Databases,"[normalization, 1NF, 2NF, 3NF]",Database normalization organizes data to reduc...,"{'title': 'Database Normalization', 'topics': ..."
2,net_funda_01,Networking,"[OSI, TCP/IP, routing]",The OSI model has seven layers from physical u...,"{'title': 'Networking Fundamentals', 'topics':..."


In [6]:
# %% chunk transcripts
# Flatten the lectures into one record per chunk. Each record keeps a link to
# its source lecture ("parent_id") and copies the lecture's course and tags.
chunk_rows = [
    {
        "id": f'{lecture["id"]}_c{idx}',
        "parent_id": lecture["id"],
        "course": lecture["course"],
        "topic_tags": lecture["topic_tags"],
        "text": piece,
    }
    for lecture in rows
    for idx, piece in enumerate(chunk_text(lecture["transcript"], max_words=120))
]

print("Total chunks:", len(chunk_rows))

# Persist the chunks as JSONL: one record per line, non-ASCII characters kept as-is.
with open(CHUNKS, "w", encoding="utf-8") as out:
    out.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in chunk_rows
    )


Total chunks: 3


In [49]:
# %% embeddings (corrected)
# Embed every chunk with the Gemini API when GEMINI_API_KEY is set, otherwise
# with a local Sentence Transformers model. Either way the result is:
#   E - float32 matrix with one L2-normalised row per entry of chunk_rows
#   D - embedding dimension (number of columns of E)
USE_GEMINI = bool(os.getenv("GEMINI_API_KEY"))

texts = [r["text"] for r in chunk_rows]
if not texts:
    # Fail early with a clear message instead of an opaque vstack/IndexError below.
    raise ValueError("No chunks to embed: chunk_rows is empty.")

if USE_GEMINI:
    # Optional dependency: imported only on the branch that needs it.
    from google import genai
    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

    vectors = []
    BATCH_SIZE = 32  # batch to avoid timeouts

    for i in range(0, len(texts), BATCH_SIZE):
        batch = texts[i : i + BATCH_SIZE]
        # The new GenAI SDK expects "contents" as list
        resp = client.models.embed_content(
            model="text-embedding-004",
            contents=batch
        )
        # resp.embeddings is a list of objects with `.values`
        batch_embeddings = resp.embeddings or []
        # Row k of E must describe chunk_rows[k]; a short or long response
        # would silently shift every later row, so stop instead.
        if len(batch_embeddings) != len(batch):
            raise RuntimeError(
                f"Embedding API returned {len(batch_embeddings)} vectors "
                f"for a batch of {len(batch)} texts (batch start index {i})."
            )
        vectors.extend(
            np.asarray(emb.values, dtype="float32") for emb in batch_embeddings
        )

    E = np.vstack(vectors)
    # The Sentence Transformers branch stores unit-length rows
    # (normalize_embeddings=True); normalise here too so the saved matrix means
    # the same thing whichever backend produced it. The floor on the norm
    # guards against dividing by zero for an all-zero vector.
    norms = np.linalg.norm(E, axis=1, keepdims=True)
    E = (E / np.maximum(norms, 1e-12)).astype("float32")
else:
    # Optional dependency: imported only on the branch that needs it.
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    E = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).astype("float32")

if E.shape[0] != len(texts):
    raise RuntimeError(
        f"Expected {len(texts)} embedding rows, got {E.shape[0]}."
    )
D = E.shape[1]

# Save embeddings matrix
np.save(EMB_FILE, E)

# Optionally display vector shape
print(f"Embeddings shape: {E.shape} (rows × dim)")


Embeddings shape: (3, 768) (rows × dim)


In [50]:
# %% save metadata
# Record which embedding model was used and the size of the embedding matrix.
if USE_GEMINI:
    model_name = "text-embedding-004"
else:
    model_name = "all-MiniLM-L6-v2"

meta = dict(model=model_name, dim=int(D), rows=int(E.shape[0]))
META.write_text(json.dumps(meta, indent=2), encoding="utf-8")

# Summarise where each artifact was written.
print("✅ KB built")
for artifact in (CHUNKS, EMB_FILE, META):
    print(" -", artifact)


✅ KB built
 - ..\data\kb\kb_chunks.jsonl
 - ..\data\kb\kb_embeddings.npy
 - ..\data\kb\kb_meta.json
