# Day 3 (Course Version) — Evidently & DataTalksClub FAQ
#
# This section reproduces exactly what’s in the official course docs.
#
# Datasets:
#   - Evidently repo (docs folder, chunked into overlapping windows)
#   - DataTalksClub FAQ repo (filtered to data-engineering questions)
#
# Steps:
#   1. Clone the repos
#   2. Chunk docs (for Evidently)
#   3. Build lexical (text) search with minsearch
#   4. Build semantic (vector) search with sentence-transformers
#   5. Combine into hybrid search
#
# Example Queries (from docs):
#   - "What should be in a test dataset for AI evaluation?"   (Evidently)
#   - "Can I join the course now?"                           (FAQ)
#
# 👉 Use this cell if you want to reproduce the original course setup.
#

In [65]:
# === Day 3: Text, Vector, Hybrid Search (Evidently + DTC FAQ) ===
# Runs end-to-end with no external helpers.

!pip -q install minsearch sentence-transformers

import os, shutil, json
from glob import glob

from minsearch import Index, VectorSearch
from sentence_transformers import SentenceTransformer
import numpy as np

# -----------------------------
# Utilities
# -----------------------------
def sliding_window(text: str, window: int, stride: int):
    chunks = []
    n = len(text)
    if n == 0:
        return chunks
    i = 0
    while i < n:
        piece = text[i:i+window]
        if not piece:
            break
        chunks.append({"chunk": piece})
        i += stride
        # ensure tail
        if i >= n and i - stride + window < n:
            tail_start = max(0, n - window)
            tail_piece = text[tail_start:n]
            if not chunks or chunks[-1]["chunk"] != tail_piece:
                chunks.append({"chunk": tail_piece})
            break
    return chunks

def read_text_files(root, exts):
    paths = []
    for ext in exts:
        paths.extend(glob(os.path.join(root, f"**/*{ext}"), recursive=True))
    docs = []
    for p in paths:
        try:
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
        except Exception:
            continue
        rel = os.path.relpath(p, root).replace("\\", "/")
        title = os.path.splitext(os.path.basename(p))[0].replace("-"," ").replace("_"," ").title()
        docs.append({"filename": rel, "title": title, "description": "", "content": content})
    return docs

def pretty_print(results, fields=("title","filename","chunk"), max_chars=300):
    for i, r in enumerate(results, 1):
        print(f"\n[{i}] ---")
        for f in fields:
            val = r.get(f)
            if val:
                if isinstance(val, (list, dict)):
                    val = json.dumps(val, ensure_ascii=False)
                val = str(val)
                print(f"{f}: {val[:max_chars].replace('\n',' ')}{'...' if len(val)>max_chars else ''}")

def build_text_index(docs, fields):
    idx = Index(text_fields=fields, keyword_fields=[])
    idx.fit(docs)
    return idx

def build_vector_index(docs, embed_model_name, text_field_or_fields):
    model = SentenceTransformer(embed_model_name)
    texts = []
    if isinstance(text_field_or_fields, str):
        for d in docs:
            texts.append(d.get(text_field_or_fields, ""))
    else:
        for d in docs:
            parts = [d.get(f, "") for f in text_field_or_fields]
            texts.append(" ".join([p for p in parts if p]))
    E = model.encode(texts)
    if not isinstance(E, np.ndarray):
        E = np.array(E)
    v = VectorSearch()
    v.fit(E, docs)
    return v, model

def text_search(idx, q, k=5):
    return idx.search(q, num_results=k)

def vector_search(vindex, model, q, k=5):
    qv = model.encode(q)
    return vindex.search(qv, num_results=k)

def hybrid(text_res, vec_res, key="filename", k=5):
    seen, out = set(), []
    for r in (text_res + vec_res):
        ident = r.get(key) or r.get("id")
        if ident in seen:
            continue
        seen.add(ident)
        out.append(r)
        if len(out) >= k:
            break
    return out

# -----------------------------
# Part A: Evidently docs (chunked)
# -----------------------------
EVID_REPO_URL = "https://github.com/evidentlyai/evidently.git"
EVID_REPO_DIR = "/content/evidently"

# fresh clone
if os.path.exists(EVID_REPO_DIR):
    shutil.rmtree(EVID_REPO_DIR)
!git clone -q {EVID_REPO_URL} {EVID_REPO_DIR}

# read docs & chunk
evid_docs_dir = os.path.join(EVID_REPO_DIR, "docs")
evid_docs = read_text_files(evid_docs_dir, exts=(".md",".mdx",".txt",".rst"))
evid_chunks = []
for d in evid_docs:
    sw = sliding_window(d["content"], window=2000, stride=1000)
    for ch in sw:
        ch.update({"title": d["title"], "description": d["description"], "filename": d["filename"]})
    evid_chunks.extend(sw)

# text index over chunk + metadata
evid_tindex = build_text_index(evid_chunks, fields=["chunk","title","description","filename"])
# vector index over chunk only
evid_vindex, evid_model = build_vector_index(evid_chunks, embed_model_name="all-MiniLM-L6-v2", text_field_or_fields="chunk")

print(f"Evidently: files={len(evid_docs)}, chunks={len(evid_chunks)}")

# Example query (from docs):
q1 = "What should be in a test dataset for AI evaluation?"
tres = text_search(evid_tindex, q1, k=5)
vres = vector_search(evid_vindex, evid_model, q1, k=5)
hres = hybrid(tres, vres, key="filename", k=5)

print("\n=== Evidently | TEXT results ===")
pretty_print(tres, fields=("title","filename","chunk"))
print("\n=== Evidently | VECTOR results ===")
pretty_print(vres, fields=("title","filename","chunk"))
print("\n=== Evidently | HYBRID (top 5, dedup by filename) ===")
pretty_print(hres, fields=("title","filename","chunk"))

# -----------------------------
# Part B: DataTalksClub FAQ (no chunking; filter to data-engineering)
# -----------------------------
FAQ_REPO_URL = "https://github.com/DataTalksClub/faq.git"
FAQ_REPO_DIR = "/content/faq"

if os.path.exists(FAQ_REPO_DIR):
    shutil.rmtree(FAQ_REPO_DIR)
!git clone -q {FAQ_REPO_URL} {FAQ_REPO_DIR}

# read all markdown; filter filenames containing 'data-engineering'
faq_docs = read_text_files(FAQ_REPO_DIR, exts=(".md",".mdx",".txt",".rst"))
faq_de = [d for d in faq_docs if "data-engineering" in d["filename"].lower()]

# normalize fields like in course:
# question := title or filename; content := file text
for d in faq_de:
    d["question"] = d.get("title") or d.get("filename")
    d["content"]  = d.get("content","")

# text index over Q + A
faq_tindex = build_text_index(faq_de, fields=["question","content"])
# vector index over concatenated Q + A
faq_vindex, faq_model = build_vector_index(faq_de, embed_model_name="all-MiniLM-L6-v2",
                                           text_field_or_fields=["question","content"])

print(f"\nFAQ (data-engineering subset): files={len(faq_de)}")

# Example query (from docs):
q2 = "Can I join the course now?"
tres2 = text_search(faq_tindex, q2, k=5)
vres2 = vector_search(faq_vindex, faq_model, q2, k=5)
# dedupe by id/filename if exists; fall back to 'filename'
hres2 = hybrid(tres2, vres2, key="filename", k=5)

print("\n=== FAQ | TEXT results ===")
pretty_print(tres2, fields=("question","filename","content"))
print("\n=== FAQ | VECTOR results ===")
pretty_print(vres2, fields=("question","filename","content"))
print("\n=== FAQ | HYBRID (top 5) ===")
pretty_print(hres2, fields=("question","filename","content"))


Evidently: files=0, chunks=0

=== Evidently | TEXT results ===

=== Evidently | VECTOR results ===

=== Evidently | HYBRID (top 5, dedup by filename) ===

FAQ (data-engineering subset): files=449

=== FAQ | TEXT results ===

[1] ---
question: 003 3F1424Af17 Course Can I Still Join The Course After The Start
filename: _questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md
content: --- id: 3f1424af17 question: 'Course: Can I still join the course after the start date?' sort_order: 3 ---  Yes, even if you don't register, you're still eligible to submit the homework.  Be aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave ...

[2] ---
question: 001 9E508F2212 Course When Does The Course Start
filename: _questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md
content: --- id: 9e508f2212 question: 'Course: When does the course start?' sor

# Day 3 (My Project Version) — DermaScan Android App
#
# This section adapts the official Day 3 workflow to my own project.
#
# Dataset:
#   - Repository: DermaScan_AndroidApp (cloned from GitHub)
#   - Files scanned: .md, .txt, .java, .kt, .xml
#
# Steps:
#   1. Clone my repo into Colab
#   2. Chunk documentation & code (sliding window, saved as JSONL for caching)
#   3. Build lexical (text) search index with minsearch
#   4. Build semantic (vector) search index with sentence-transformers
#   5. Combine into hybrid search
#
# Example Queries (for my repo):
#   - "What models are used and what dataset is used in this project?"
#   - "How is the DermaScan Android app deployed and what technologies are used in the mobile app?"



In [66]:
#  Setup: install deps and check GPU
!pip -q install minsearch sentence-transformers
import torch, platform
print('Python', platform.python_version())
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')


Python 3.12.11
CUDA available: False
Using CPU


In [67]:
#  Configure your repo and file types
REPO_URL = "https://github.com/SiriYellu/DermaScan_AndroidApp.git"  # <- change if needed
REPO_DIR = "/content/DermaScan_AndroidApp"                          # where to clone
SCAN_DIR = REPO_DIR                                                  # which subfolder to scan
EXTS = [".md", ".txt", ".java", ".kt", ".xml"]               # add more: .py, .ipynb, .rst, etc.

# Query to run
QUERY = "What models are used and what dataset is used in this project?"
TOPK = 5


In [68]:
# ⬇ Clone repo (safe to re-run)
import os, shutil, subprocess
if os.path.exists(REPO_DIR):
    shutil.rmtree(REPO_DIR)
print('Cloning', REPO_URL)
!git clone -q {REPO_URL} {REPO_DIR}
print('Done!')
!ls -la {REPO_DIR} | head -n 20


Cloning https://github.com/SiriYellu/DermaScan_AndroidApp.git
Done!
total 35580
drwxr-xr-x 3 root root     4096 Oct  2 00:57 .
drwxr-xr-x 1 root root     4096 Oct  2 00:57 ..
drwxr-xr-x 8 root root     4096 Oct  2 00:57 .git
-rw-r--r-- 1 root root   759019 Oct  2 00:57 Poster-GC-123.pdf
-rw-r--r-- 1 root root  2405831 Oct  2 00:57 PPT.pptx
-rw-r--r-- 1 root root     4181 Oct  2 00:57 README.md
-rw-r--r-- 1 root root 10120802 Oct  2 00:57 Recording 2025-09-28 114054.mp4
-rw-r--r-- 1 root root 22967109 Oct  2 00:57 SCapp_split.zip
-rw-r--r-- 1 root root   151190 Oct  2 00:57 Screenshot 2025-09-28 114520.png


In [69]:
#  Helpers: chunking & IO
from glob import glob
from typing import List, Dict, Any
import json, os

def sliding_window(text: str, window: int, stride: int) -> List[Dict[str, Any]]:
    chunks = []
    n = len(text)
    if n == 0:
        return chunks
    i = 0
    while i < n:
        piece = text[i:i+window]
        if not piece:
            break
        chunks.append({"chunk": piece})
        i += stride
        # ensure tail
        if i >= n and i - stride + window < n:
            tail_start = max(0, n - window)
            tail_piece = text[tail_start:n]
            if not chunks or chunks[-1]["chunk"] != tail_piece:
                chunks.append({"chunk": tail_piece})
            break
    return chunks

def read_docs_from_dir(docs_dir: str, exts: List[str]) -> List[Dict[str, Any]]:
    paths = []
    for ext in exts:
        paths.extend(glob(os.path.join(docs_dir, f"**/*{ext}"), recursive=True))
    out = []
    for p in paths:
        try:
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()
        except Exception:
            continue
        rel = os.path.relpath(p, docs_dir)
        title = os.path.splitext(os.path.basename(p))[0].replace("-"," ").replace("_"," ").title()
        out.append({
            "filename": rel,
            "title": title,
            "description": "",
            "content": content
        })
    return out

def write_jsonl(path: str, rows: List[Dict[str, Any]]):
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


In [70]:
#  Build chunks (adjust window/stride as needed)
WINDOW, STRIDE = 1000, 500
print('Scanning', SCAN_DIR, 'for', EXTS)
docs = read_docs_from_dir(SCAN_DIR, EXTS)
print('Found files:', len(docs))

chunks = []
for d in docs:
    sw = sliding_window(d["content"], WINDOW, STRIDE)
    for ch in sw:
        ch.update({
            "title": d.get("title", ""),
            "description": d.get("description", ""),
            "filename": d.get("filename", ""),
        })
    chunks.extend(sw)
print('Total chunks:', len(chunks))

# Save chunks to enable embedding cache
CHUNKS_JSONL = "/content/chunks.jsonl"
write_jsonl(CHUNKS_JSONL, chunks)
print('Saved:', CHUNKS_JSONL)


Scanning /content/DermaScan_AndroidApp for ['.md', '.txt', '.java', '.kt', '.xml']
Found files: 1
Total chunks: 9
Saved: /content/chunks.jsonl


In [71]:
# 🔍 Build indices (with embedding cache)
from minsearch import Index, VectorSearch
from sentence_transformers import SentenceTransformer
import numpy as np, json, os, hashlib

def load_chunks(jsonl_path):
    rows = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                rows.append(json.loads(line))
    return rows

def file_hash(path):
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        while True:
            b = f.read(1024*1024)
            if not b:
                break
            h.update(b)
    return h.hexdigest()

chunks = load_chunks(CHUNKS_JSONL)
print('Loaded chunks:', len(chunks))

# Text index
text_fields = ["chunk", "title", "description", "filename"]
tindex = Index(text_fields=text_fields, keyword_fields=[])
tindex.fit(chunks)

# Vector index with cache
MODEL_NAME = "all-MiniLM-L6-v2"  # fast, good quality
EMB_PATH = "/content/embeddings.npy"
META_PATH = "/content/embeddings.meta.json"
model = SentenceTransformer(MODEL_NAME, device='cuda' if torch.cuda.is_available() else 'cpu')

need_encode = True
new_hash = file_hash(CHUNKS_JSONL)
if os.path.exists(EMB_PATH) and os.path.exists(META_PATH):
    with open(META_PATH, 'r') as f:
        meta = json.load(f)
    if meta.get('chunks_hash') == new_hash and meta.get('model') == MODEL_NAME and meta.get('count') == len(chunks):
        E = np.load(EMB_PATH)
        need_encode = False
        print('✅ Loaded cached embeddings:', E.shape)

if need_encode:
    print('Encoding embeddings ...')
    texts = [c['chunk'] for c in chunks]
    E = model.encode(texts)
    if not isinstance(E, np.ndarray):
        E = np.array(E)
    np.save(EMB_PATH, E)
    with open(META_PATH, 'w') as f:
        json.dump({
            'chunks_hash': new_hash,
            'model': MODEL_NAME,
            'count': len(chunks)
        }, f)
    print('Saved cache:', EMB_PATH)

vindex = VectorSearch()
vindex.fit(E, chunks)
print('Vector index ready:', E.shape)


Loaded chunks: 9
✅ Loaded cached embeddings: (9, 384)
Vector index ready: (9, 384)


In [72]:
# 🔎 Run hybrid search
def text_search(q):
    return tindex.search(q, num_results=TOPK)

def vector_search(q):
    qv = model.encode(q)
    return vindex.search(qv, num_results=TOPK)

def hybrid_search(q):
    a = text_search(q)
    b = vector_search(q)
    seen, out = set(), []
    for r in a + b:
        key = r.get('filename') or r.get('id')
        if key in seen:
            continue
        seen.add(key)
        out.append(r)
        if len(out) >= TOPK:
            break
    return out

print('QUERY:', QUERY)
results = hybrid_search(QUERY)
for i, r in enumerate(results, 1):
    print(f"\n[{i}] ---")
    print('filename:', r.get('filename'))
    print('title   :', r.get('title'))
    chunk = (r.get('chunk') or '')
    print('chunk   :', chunk[:500].replace('\n',' ') + ('...' if len(chunk)>500 else ''))


QUERY: What models are used and what dataset is used in this project?

[1] ---
filename: README.md
title   : Readme
chunk   : ng** – Combines multiple models for robust decision-making.   - 🧩 **Lesion Segmentation** – Attention U-Net for precise boundary detection.   - 📚 **Awareness Content** – In-app educational resources for users.    ---  ## 🛠 Tech Stack   - **Android**: Java / Kotlin   - **AI Models**: TensorFlow Lite (TFLite)   - **Model Training**: Python (PyTorch/TensorFlow)   - **Image Processing**: OpenCV   - **Dataset**: [HAM10000](https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000) (10,000+ dermoscopic...


In [None]:
q = input("Enter your question: ")
res = hybrid_search(q)
for i, r in enumerate(res, 1):
    print(f"\n[{i}] ---")
    print('filename:', r.get('filename'))
    print('title   :', r.get('title'))
    ch = (r.get('chunk') or '')
    print('chunk   :', ch[:500].replace('\n',' ') + ('...' if len(ch)>500 else ''))
