## ipynb to test embedding models


In [1]:
import os
import time
import json
import requests
import numpy as np
from pathlib import Path
from typing import List


In [2]:
DATA_DIR = Path(r"C:\Users\Jamin Carter\Downloads\web_archive")

MODELS = [
    "nomic-embed-text",
    "mxbai-embed-large",
    "all-minilm"
]

OLLAMA_URL = "http://localhost:11434/api/embeddings"

TOP_K = 5
MAX_FILES = 10
RANDOM_SEED = 42

import random
random.seed(RANDOM_SEED)



In [3]:
def load_documents_limited(dir_path, max_files=50):
    files = list(dir_path.glob("*.txt"))
    
    if len(files) > max_files:
        files = random.sample(files, max_files)

    docs = {}
    for file in files:
        docs[file.name] = file.read_text(encoding="utf-8")

    return docs


In [4]:
documents = load_documents_limited(DATA_DIR, MAX_FILES)
len(documents)


10

In [5]:
def chunk_text(text, chunk_size=300, overlap=50):
    words = text.split()
    chunks = []
    i = 0
    while i < len(words):
        chunk = words[i:i + chunk_size]
        chunks.append(" ".join(chunk))
        i += chunk_size - overlap
    return chunks



In [6]:
chunks = []
chunk_meta = []

for doc_name, text in documents.items():
    for chunk in chunk_text(text):
        chunks.append(chunk)
        chunk_meta.append(doc_name)

print(len(chunks))

MAX_CHUNKS = 200

if len(chunks) > MAX_CHUNKS:
    indices = random.sample(range(len(chunks)), MAX_CHUNKS)
    chunks = [chunks[i] for i in indices]
    chunk_meta = [chunk_meta[i] for i in indices]



743


In [7]:
def embed(text: str, model: str):
    response = requests.post(
        OLLAMA_URL,
        json={"model": model, "prompt": text},
        timeout=60
    )
    response.raise_for_status()
    return response.json()["embedding"]


In [17]:
def embed_corpus(chunks, model):
    embeddings = []
    times = []
    i=0
    for c in chunks:
        i+=1
        if i % 50 == 0:
            print(f"Embedded {i} / {len(chunks)} chunks")
        start = time.perf_counter()
        emb = embed(c, model)
        end = time.perf_counter()

        embeddings.append(emb)
        times.append(end - start)
        

    return np.array(embeddings), times


In [18]:
embed("test sentence", "nomic-embed-text")


[0.8493291139602661,
 0.43883460760116577,
 -3.4721643924713135,
 -0.16817086935043335,
 1.3417420387268066,
 -0.2561345100402832,
 0.518891453742981,
 -0.1819201409816742,
 0.33330604434013367,
 -0.4347298741340637,
 -0.13181941211223602,
 1.356335997581482,
 -0.32810524106025696,
 0.4962317645549774,
 -1.0731232166290283,
 -1.0636467933654785,
 1.3862249851226807,
 -2.0697250366210938,
 -0.7877135872840881,
 0.10299061983823776,
 0.2702336311340332,
 -0.450927197933197,
 -0.5212723016738892,
 -0.6690362095832825,
 2.5549535751342773,
 0.35938674211502075,
 -1.4385085105895996,
 0.5460162162780762,
 -1.0647046566009521,
 -0.8236260414123535,
 0.8933995962142944,
 0.17097486555576324,
 0.13953211903572083,
 -0.6649188995361328,
 -1.7843492031097412,
 -1.1704142093658447,
 0.4544524550437927,
 1.593052864074707,
 -0.566588282585144,
 -1.0162768363952637,
 0.8070845603942871,
 0.16378211975097656,
 -0.8856894969940186,
 -1.070311427116394,
 1.9495694637298584,
 -0.32044121623039246,
 0.8

In [None]:
results = {}

for model in MODELS:
    print(f"Embedding with {model}")
    embs, times = embed_corpus(chunks, model)

    results[model] = {
        "embeddings": embs,
        "embed_times": times,
        "dim": embs.shape[1]
    }
    
    print(f"Done. Avg time: {np.mean(times):.2f}s, Dim: {embs.shape[1]}")

Embedding with nomic-embed-text
Embedded 50 / 200 chunks


In [10]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [11]:
import random

NUM_QUERIES = 20          # how many queries to test
QUERY_WORDS = 30          # length of query sampled from chunk
RANDOM_SEED = 42

random.seed(RANDOM_SEED)


In [12]:
def generate_queries(chunks, meta, num_queries=20, query_words=30):
    queries = []

    indices = random.sample(range(len(chunks)), num_queries)

    for idx in indices:
        chunk = chunks[idx]
        words = chunk.split()

        if len(words) <= query_words:
            query = chunk
        else:
            start = random.randint(0, len(words) - query_words)
            query = " ".join(words[start:start + query_words])

        queries.append({
            "query": query,
            "true_doc": meta[idx],
            "chunk_index": idx
        })

    return queries


In [13]:
queries = generate_queries(
    chunks,
    chunk_meta,
    num_queries=NUM_QUERIES,
    query_words=QUERY_WORDS
)

len(queries)


20

In [14]:
def recall_at_k(ranked_indices, true_doc, meta, k):
    return any(meta[i] == true_doc for i in ranked_indices[:k])


def reciprocal_rank(ranked_indices, true_doc, meta):
    for rank, idx in enumerate(ranked_indices, start=1):
        if meta[idx] == true_doc:
            return 1 / rank
    return 0


In [15]:
def evaluate_model(model, model_data, queries, meta, top_k=5):
    recall_hits = 0
    mrr_total = 0.0
    q_times = []
    r_times = []

    for q in queries:
        query_text = q["query"]
        true_doc = q["true_doc"]

        # Query embedding
        q_start = time.perf_counter()
        q_emb = embed(query_text, model)
        q_times.append(time.perf_counter() - q_start)

        # Retrieval
        r_start = time.perf_counter()
        sims = [
            cosine_similarity(q_emb, emb)
            for emb in model_data["embeddings"]
        ]
        ranked = np.argsort(sims)[::-1]
        r_times.append(time.perf_counter() - r_start)

        # Metrics
        if recall_at_k(ranked, true_doc, meta, top_k):
            recall_hits += 1

        mrr_total += reciprocal_rank(ranked, true_doc, meta)

    return {
        "recall@k": recall_hits / len(queries),
        "mrr": mrr_total / len(queries),
        "avg_query_embed_time": sum(q_times) / len(q_times),
        "avg_retrieval_time": sum(r_times) / len(r_times),
    }


In [16]:
for model in MODELS:
    print(f"\nModel: {model}")

    metrics = evaluate_model(
        model=model,
        model_data=results[model],
        queries=queries,
        meta=chunk_meta,
        top_k=TOP_K
    )

    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")



Model: nomic-embed-text


NameError: name 'results' is not defined

In [7]:
# ===================== SINGLE CELL: STABLE OLLAMA EMBEDDING BENCHMARK =====================

import time, random, requests
import numpy as np
from pathlib import Path
from tqdm import tqdm

# ---------------- CONFIG ----------------
DATA_DIR = Path(r"C:\Users\Jamin Carter\Downloads\web_archive")

MODELS = [
    "all-minilm",
    "nomic-embed-text",
    "mxbai-embed-large"
]

MAX_FILES = 10
CHUNK_OVERLAP = 40
MAX_CHARS = 6000
CHUNK_SIZE = 800

NUM_QUERIES = 20
QUERY_WORDS = 25
TOP_K = 5

OLLAMA_URL = "http://localhost:11434/api/embeddings"

REQUEST_SLEEP = 0     # REQUIRED to avoid Ollama crashes
TIMEOUT = 120
SEED = 42

random.seed(SEED)

# ---------------- LOAD FILES ----------------
files = list(DATA_DIR.glob("*.txt"))
files = random.sample(files, min(len(files), MAX_FILES))

documents = {
    f.name: f.read_text(encoding="utf-8", errors="ignore")
    for f in files
}

# ---------------- CHUNKING ----------------
def chunk_text(text):
    words = text.split()
    out, i = [], 0
    while i < len(words):
        out.append(" ".join(words[i:i + CHUNK_SIZE])[:MAX_CHARS])
        i += CHUNK_SIZE - CHUNK_OVERLAP
    return out

chunks, meta = [], []
for name, text in documents.items():
    for c in chunk_text(text):
        chunks.append(c)
        meta.append(name)

print(f"Files: {len(documents)} | Chunks: {len(chunks)}")

# ---------------- SAFE EMBEDDING ----------------
session = requests.Session()

def embed_safe(text, model):
    try:
        r = session.post(
            OLLAMA_URL,
            json={"model": model, "prompt": text},
            timeout=TIMEOUT
        )
        time.sleep(REQUEST_SLEEP)
        if r.status_code != 200:
            return None
        return r.json().get("embedding")
    except Exception:
        return None

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# ---------------- EMBED CORPUS ----------------
results = {}

for model in MODELS:
    print(f"\nEmbedding model: {model}")

    # Warm-up (critical)
    embed_safe("warmup", model)

    embs, times = [], []

    for c in tqdm(chunks, desc=f"Embedding [{model}]"):
        t0 = time.perf_counter()
        e = embed_safe(c, model)
        if e is not None:
            embs.append(e)
            times.append(time.perf_counter() - t0)

    if len(embs) == 0:
        print(f"⚠️  No embeddings produced for {model}. Skipping.")
        continue

    embs = np.array(embs)

    results[model] = {
        "embeddings": embs,
        "dim": embs.shape[1],
        "avg_embed_time": sum(times) / len(times)
    }

# ---------------- AUTO QUERY GENERATION ----------------
if len(results) == 0:
    raise RuntimeError("No models produced embeddings. Ollama is unstable.")

indices = random.sample(range(len(chunks)), min(NUM_QUERIES, len(chunks)))
queries = []

for i in indices:
    words = chunks[i].split()
    if len(words) <= QUERY_WORDS:
        q = chunks[i]
    else:
        s = random.randint(0, len(words) - QUERY_WORDS)
        q = " ".join(words[s:s + QUERY_WORDS])
    queries.append((q[:MAX_CHARS], meta[i]))

# ---------------- EVALUATION ----------------
for model, data in results.items():
    embs = data["embeddings"]

    recall, mrr = 0, 0
    q_times, r_times = [], []

    for q, true_doc in tqdm(queries, desc=f"Retrieval [{model}]"):
        t0 = time.perf_counter()
        q_emb = embed_safe(q, model)
        if q_emb is None:
            continue
        q_times.append(time.perf_counter() - t0)

        t1 = time.perf_counter()
        sims = [cosine(q_emb, e) for e in embs]
        ranked = np.argsort(sims)[::-1]
        r_times.append(time.perf_counter() - t1)

        if any(meta[i] == true_doc for i in ranked[:TOP_K]):
            recall += 1

        for rank, i in enumerate(ranked, 1):
            if meta[i] == true_doc:
                mrr += 1 / rank
                break

    print(f"\nMODEL: {model}")
    print(f"Embedding dim: {data['dim']}")
    print(f"Avg embed time/chunk: {data['avg_embed_time']:.4f}s")
    print(f"Recall@{TOP_K}: {recall / len(queries):.4f}")
    print(f"MRR: {mrr / len(queries):.4f}")
    print(f"Avg query embed time: {sum(q_times)/len(q_times):.4f}s")
    print(f"Avg retrieval time: {sum(r_times)/len(r_times):.4f}s")

# ===================== END =====================


Files: 10 | Chunks: 249

Embedding model: all-minilm


Embedding [all-minilm]: 100%|██████████| 249/249 [00:05<00:00, 42.04it/s]



Embedding model: nomic-embed-text


Embedding [nomic-embed-text]: 100%|██████████| 249/249 [00:16<00:00, 15.21it/s]



Embedding model: mxbai-embed-large


Embedding [mxbai-embed-large]: 100%|██████████| 249/249 [00:06<00:00, 38.66it/s]
Retrieval [all-minilm]: 100%|██████████| 20/20 [00:00<00:00, 27.34it/s]



MODEL: all-minilm
Embedding dim: 384
Avg embed time/chunk: 0.0424s
Recall@5: 0.0000
MRR: 0.0000
Avg query embed time: 0.0363s
Avg retrieval time: 0.0001s


Retrieval [nomic-embed-text]: 100%|██████████| 20/20 [00:01<00:00, 14.85it/s]



MODEL: nomic-embed-text
Embedding dim: 768
Avg embed time/chunk: 0.0656s
Recall@5: 1.0000
MRR: 0.9750
Avg query embed time: 0.0532s
Avg retrieval time: 0.0137s


Retrieval [mxbai-embed-large]: 100%|██████████| 20/20 [00:00<00:00, 22.08it/s]


MODEL: mxbai-embed-large
Embedding dim: 1024
Avg embed time/chunk: 0.0716s
Recall@5: 0.0000
MRR: 0.0000
Avg query embed time: 0.0448s
Avg retrieval time: 0.0003s



