In [None]:
!pip install -q sentence-transformers faiss-cpu nltk tqdm pandas openpyxl xlrd

In [None]:
import re
import nltk
import numpy as np
import faiss

from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

# Download NLTK's "punkt" resource. sent_tokenize is imported above, but it is
# not called in the cells visible here — NOTE(review): confirm it is still needed.
nltk.download("punkt")

# Load the sentence-transformers model; `embedder` is the shared encoder used
# for both the KB chunks and the dataset questions in later cells.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import pandas as pd
from google.colab import files
# Prompt for the knowledge-base text file. files.upload() returns a dict keyed
# by file name.
uploaded = files.upload()

# Read the file that was actually uploaded rather than assuming a fixed name,
# the same way DATASET_PATH is derived from its upload below. The conventional
# path is kept as a fallback for when nothing was uploaded in this run.
KB_FILE_PATH = next(iter(uploaded), "/content/OOP Knowledge Base.txt")

# "utf-8-sig" decodes plain UTF-8 identically but also drops a leading BOM, so
# the first "Name:" line is recognised even when the file was saved with one.
with open(KB_FILE_PATH, "r", encoding="utf-8-sig") as f:
    KB_TEXT = f.read()

print("KB loaded. Characters:", len(KB_TEXT))

# Prompt for the question dataset (an Excel workbook).
uploaded_dataset = files.upload()
if not uploaded_dataset:
    # Give a clear message instead of an IndexError on an empty upload.
    raise FileNotFoundError("No dataset file was uploaded.")

# The first uploaded file is taken as the dataset.
DATASET_PATH = next(iter(uploaded_dataset))

df = pd.read_excel(DATASET_PATH)

print("Dataset loaded")

# Normalize column names (trimmed, lower-case) so later cells can look columns
# up by a fixed name. astype(str) keeps this working for non-text headers.
df.columns = df.columns.astype(str).str.strip().str.lower()
print("Normalized columns:", df.columns.tolist())
print("Total rows:", len(df))

In [None]:
def chunk_kb_by_concept_clean(text, min_words=5):
    """Split knowledge-base text into per-section chunks tagged with their concept.

    The text is first cut into concepts: a new concept starts at every line
    whose stripped form begins with ``Name:``. Each concept is then cut in
    front of every known section heading that is followed by a colon, so each
    chunk holds exactly one section (or the concept's preamble before the
    first heading).

    Args:
        text: Full knowledge-base text.
        min_words: Minimum number of whitespace-separated words a chunk must
            contain to be kept. Defaults to 5.

    Returns:
        A ``(chunks, metadata)`` tuple of two equally long lists, where
        ``metadata[i]`` is the concept name ``chunks[i]`` belongs to
        (``"Unknown"`` when the concept contains no ``Name:`` entry).
    """
    # Group lines into concepts; a line starting with "Name:" opens a new one.
    concepts = []
    current = []
    for line in text.split("\n"):
        if current and line.strip().startswith("Name:"):
            concepts.append("\n".join(current))
            current = []
        current.append(line)
    if current:
        concepts.append("\n".join(current))

    # Section headings
    headings = [
        "Definition","Key Properties","Related Concepts","How to Implement",
        "Variants / Language Differences","Common Patterns","Why It Exists",
        "Trade-offs","Best Practices","Code Snippet","Real-World Analogy",
        "Common Mistakes","Consequences","How to Avoid",
        "Questions the concept can answer","Cross-concept connections"
    ]
    # Zero-width lookahead: split *in front of* "<heading>:" so the heading
    # stays attached to its section. The group is non-capturing on purpose —
    # re.split() inserts captured groups into its result, which would put the
    # heading name in front of a section that already begins with it.
    heading_pattern = r"(?=(?:" + "|".join(re.escape(h) for h in headings) + r"):)"

    chunks = []
    metadata = []

    for concept in concepts:
        # Extract concept name
        name_match = re.search(r"Name:\s*(.*)", concept)
        concept_name = name_match.group(1).strip() if name_match else "Unknown"

        for section in re.split(heading_pattern, concept):
            section = section.strip()
            # Drop empty pieces and tiny chunks (fewer than min_words words).
            if section and len(section.split()) >= min_words:
                chunks.append(section)
                metadata.append(concept_name)

    return chunks, metadata

In [None]:
from collections import defaultdict

def clean_chunk_text(chunk):
    """Return ``chunk`` with blank and metadata lines removed and every line trimmed.

    Lines beginning with ``Name:``, ``Category:``, ``Difficulty:`` or ``Tags:``
    are discarded. Each remaining line loses its surrounding whitespace and any
    trailing colons, and the surviving lines are re-joined with newlines.
    """
    metadata_prefixes = ("Name:", "Category:", "Difficulty:", "Tags:")
    stripped_lines = (raw.strip() for raw in chunk.split("\n"))
    return "\n".join(
        text.rstrip(":").strip()
        for text in stripped_lines
        if text and not text.startswith(metadata_prefixes)
    )

def merge_chunks_unique(chunks):
    """Join the distinct non-blank lines of all chunks, keeping first-seen order.

    Every chunk is split on newlines and each line is whitespace-trimmed; a line
    that already appeared (in this or an earlier chunk) is skipped.
    """
    trimmed = (raw.strip() for block in chunks for raw in block.split("\n"))
    # dict keys act as an insertion-ordered set of the non-empty lines.
    return "\n".join(dict.fromkeys(text for text in trimmed if text))

def retrieve_context_for_question(question, k=8):
    """Retrieve the top-k KB chunks for a question, merged per concept.

    Uses the module-level ``embedder`` to encode the question, ``index`` to
    search for neighbours, and ``kb_chunks`` / ``kb_metadata`` to map each hit
    back to its text and concept name.

    Args:
        question: Question text to look up.
        k: Number of nearest chunks to request from the index.

    Returns:
        A list of strings, one per retrieved concept in order of first hit,
        each made of a ``Concept: <concept_name>`` line followed by the
        de-duplicated lines of that concept's retrieved chunks.
    """
    # Embed the question
    q_emb = embedder.encode([question], convert_to_numpy=True)

    # Search the index for top-k nearest neighbors (distances are not needed)
    _distances, indices = index.search(q_emb, k)

    # Group retrieved chunks by concept
    grouped = defaultdict(list)
    for idx in indices[0]:
        # FAISS pads the result with -1 when fewer than k neighbours exist.
        # Used as a Python index, -1 would silently pick the *last* chunk.
        if idx < 0:
            continue
        grouped[kb_metadata[idx]].append(kb_chunks[idx])

    # Merge chunks per concept and format output
    return [
        f"Concept: {concept}\n{merge_chunks_unique(chunks)}"
        for concept, chunks in grouped.items()
    ]


In [None]:
# === FINAL PIPELINE CELL (FIXED) ===
from tqdm import tqdm
import faiss

# --------------------------
# 1️⃣ Chunk the KB
# --------------------------
# Fail fast on a missing question column so the slow embedding steps below are
# not run for a dataset that cannot be processed afterwards.
if "questions" not in df.columns:
    raise KeyError(f"Dataset has no 'questions' column; found: {df.columns.tolist()}")

kb_chunks, kb_metadata = chunk_kb_by_concept_clean(KB_TEXT)
print(f"Total chunks after concept split: {len(kb_chunks)}")

# --------------------------
# 2️⃣ Clean all chunks
# --------------------------
# Cleaning can empty a chunk completely (e.g. a concept preamble made up only
# of Name:/Category:/Difficulty:/Tags: lines). Drop those together with their
# metadata so blank text is never embedded or returned as a search hit, and so
# kb_chunks and kb_metadata stay aligned index-for-index.
cleaned_pairs = [
    (clean_chunk_text(chunk), concept)
    for chunk, concept in zip(kb_chunks, kb_metadata)
]
cleaned_pairs = [(chunk, concept) for chunk, concept in cleaned_pairs if chunk.strip()]
if not cleaned_pairs:
    raise ValueError("No usable chunks were extracted from the knowledge base.")

kb_chunks = [chunk for chunk, concept in cleaned_pairs]
kb_metadata = [concept for chunk, concept in cleaned_pairs]
print(f"Chunks kept after cleaning: {len(kb_chunks)}")

# --------------------------
# 3️⃣ Encode chunks and build FAISS index
# --------------------------
chunk_embeddings = embedder.encode(kb_chunks, convert_to_numpy=True)
dim = chunk_embeddings.shape[1]

index = faiss.IndexFlatL2(dim)
index.add(chunk_embeddings)
print(f"FAISS index built with {index.ntotal} chunks.")

# --------------------------
# 4️⃣ Generate detailed context for each question in dataset
# --------------------------
K_TOP = 8  # number of top chunks to retrieve per question

# Follow-up prompts for the AI interviewer; identical for every question, so
# the string is built once outside the loop.
follow_up_prompts = (
    "\n\nFollow-Up Prompts for AI Interviewer:\n"
    "1. Explain why this concept is important.\n"
    "2. Provide a code example demonstrating it.\n"
    "3. Describe common mistakes developers make.\n"
    "4. Discuss trade-offs and best practices.\n"
)

detailed_contexts = []

for question in tqdm(df["questions"], desc="Generating detailed context"):
    # Blank spreadsheet cells are read as NaN; there is nothing to retrieve for
    # them, so keep the row aligned with an empty context instead of passing a
    # non-string to the encoder.
    if pd.isna(question) or not str(question).strip():
        detailed_contexts.append("")
        continue

    # Retrieve concept-level chunks (list of strings)
    concept_chunks = retrieve_context_for_question(str(question), k=K_TOP)

    # Merge all concepts into a single multi-line string
    merged_context = "\n\n".join(concept_chunks)

    # Append final string to list
    detailed_contexts.append(merged_context + follow_up_prompts)

# Assign to DataFrame
df["detailed-context"] = detailed_contexts
print("✅ Detailed context column added to DataFrame")

In [None]:
from google.colab import files

# File name of the exported workbook (a relative path, so it is written to the
# current working directory).
OUTPUT_PATH = "OOP_dataset.xlsx"

# Write the DataFrame to Excel without the index column.
df.to_excel(OUTPUT_PATH, index=False)

print(f"✅ Excel file saved as: {OUTPUT_PATH}")

# Hand the saved file to google.colab's download helper.
files.download(OUTPUT_PATH)
