In [1]:
!pip -q install nltk -U sentence-transformers chromadb tqdm

import nltk
from nltk.tokenize import sent_tokenize
from typing import List

# Download tokenizer
nltk.download('punkt')
nltk.download('punkt_tab')

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m104.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m36.2 MB/s[0m eta [36m0:00

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import pandas as pd
import json

# Set LOCAL variable values
# Article (data) selection
# Read the article store through a context manager so the file handle is
# closed deterministically, and decode it explicitly as UTF-8.
with open("wikinews_data.json", "r", encoding="utf-8") as data_file:
  main_data = json.load(data_file)

# List every available article with its index so one can be picked below.
for i, title in enumerate(d['title'] for d in main_data):
  print(f'{i}: {title}')

# Change the index for new article
ARTICLE_INDEX = 14
data = main_data[ARTICLE_INDEX]
print(f"\nWill be working with '{data['title']}' article data!")

# Per-article inputs and storage locations used by the later cells.
source_data = data['source_data']
chroma_collection_name = data['chroma_collection_name']
chroma_db_path = "chroma_db"
model_path = "models"
print(f"\nStarted with the chunking and embedding of '{data['title']}' article data")

0: US: Tulsa residents approve $814 million infrastructure package
1: 78th British Academy Film Awards held in London
2: Ryan Gosling cast in upcoming Star Wars film
3: Thai officials seize 238 tons of illegal e-waste at Bangkok port
4: 20-year-old astrophotographer captures rare solar eclipse on Saturn
5: Scientists discover seagrass off Australia is world's largest plant
6: India defeats New Zealand to win 2025 Champions Trophy
7: Researchers film colossal squid in its natural habitat for the first time
8: SpaceX will return stranded astronauts in February 2025, NASA announces
9: Microsoft, Nware sign 10-year cloud gaming deal
10: United Kingdom buries Queen Elizabeth II after state funeral
11: UK heavy metal band Black Sabbath announces final performance with original lineup
12: GSK rejects three Unilever bids to buy consumer healthcare arm, says unit was 'fundamentally undervalued'
13: FIFA World Cup 2018 Last 16: France, Uruguay send Argentina, Portugal home
14: European Union to 

In [None]:
from nltk.tokenize import sent_tokenize


def pick_n_by_source_data_wc(word_count_source_data):
    """
    Choose the sliding-window size and stride from the source word count.

    Shorter sources get wider windows: under 700 words -> 4 sentences,
    under 1800 words -> 3 sentences, otherwise 2 sentences.
    The stride is always 1.

    Returns: tuple (n, stride).
    """
    stride = 1
    # (exclusive upper bound on word count, sentences per window)
    for upper_bound, window in ((700, 4), (1800, 3)):
        if word_count_source_data < upper_bound:
            return window, stride
    return 2, stride


def pair_sentences(source_data, n, stride, max_chars=None):
    """
    Make sliding-window chunks of 'n' sentences from a single reference string.

    Args:
        source_data: text to split into sentences (via `sent_tokenize`).
        n: number of sentences per window. Values <= 0 yield no chunks.
        stride: step between window starts; values below 1 are treated as 1.
        max_chars: optional soft limit on chunk length. Longer chunks are cut
            at the last space before the limit to avoid a mid-word cut.
            `None` (or 0) disables trimming.

    Returns: list of {'chunk_id', 'chunk_text'}, where 'chunk_id' is the
    index of the window's first sentence as a string.

    Edge cases:
        - Empty text yields an empty list.
        - Text with fewer than 'n' sentences yields one chunk holding all of
          them (instead of no chunks at all).
        - When the stride does not land on the last possible window, that
          window is appended so trailing sentences are never dropped.
    """
    if not source_data:
        return []

    sentences = [s.strip() for s in sent_tokenize(source_data) if s.strip()]
    if not sentences or n <= 0:
        return []

    step = max(1, stride)
    # Last start index that still gives a full window; 0 for short texts.
    last_start = max(0, len(sentences) - n)
    starts = list(range(0, last_start + 1, step))
    if starts[-1] != last_start:
        # Cover the tail that a stride > 1 would otherwise skip.
        starts.append(last_start)

    chunks = []
    for start in starts:
        text = " ".join(sentences[start:start + n])
        if max_chars and len(text) > max_chars:
            # soft trim at last space to avoid mid-word cut
            cut = text[:max_chars].rsplit(" ", 1)[0]
            text = cut if cut else text[:max_chars]
        chunks.append({"chunk_id": str(start), "chunk_text": text})
    return chunks


# Derive the window settings from the stored word count, then chunk the article.
wc_source = data['word_count_source_data']
n, stride = pick_n_by_source_data_wc(wc_source)
chunked_articles = pair_sentences(source_data, n=n, stride=stride, max_chars=900)

# Report the chunking configuration and outcome, one item per line.
print(
    f"Source word count: {wc_source}",
    f"Using n={n}, stride={stride}",
    f"Total chunks: {len(chunked_articles)}",
    sep="\n",
)
# Show the first chunk as a quick sanity check (skipped when nothing was produced).
if len(chunked_articles) > 0:
    print(f"Sample chunk -- {chunked_articles[0]}")

Source word count: 3911
Using n=2, stride=1
Total chunks: 144
Sample chunk -- {'chunk_id': '0', 'chunk_text': 'The EU has adopted ambitious new targets to curb climate change, with a pledge to make them legally binding. Under a new law agreed between member states and the EU Parliament, the bloc will cut carbon emissions by at least 55% by 2030, compared with 1990 levels.'}


In [4]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch
import os

# Prefer the GPU when one is visible to torch
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hub identifier of the embedding model and the folder it is cached in
model_name = "BAAI/bge-large-en-v1.5"
local_dir = f"{model_path}/{model_name.split('/')[1]}"

# Load from the local copy when it exists; otherwise fetch by name and cache it
model_is_cached = os.path.exists(local_dir)
if model_is_cached:
  print(f"Model already downloaded in {local_dir}. Loading from local path.")

model = SentenceTransformer(local_dir if model_is_cached else model_name, device=device)
model.max_seq_length = 512

if model_is_cached:
  print(f"Loaded model from {local_dir} successfully on {device.upper()}!")
else:
  # First run: persist the downloaded model so later runs load it from disk
  model.save(local_dir)
  print("Model download complete and saved locally!")

Model already downloaded in /content/drive/MyDrive/Colab Notebooks/Dissertation/models/bge-large-en-v1.5. Loading from local path.
Loaded model from /content/drive/MyDrive/Colab Notebooks/Dissertation/models/bge-large-en-v1.5 successfully on CUDA!


In [8]:
from sentence_transformers import util

# Three probe words for a quick sanity check of the embedding model
words = ["cat", "tiger", "paris"]

# Embed the probe words with normalisation requested from the encoder
embeddings = model.encode(words, normalize_embeddings=True)

# Pairwise cosine similarity of every word against every word
similarity_matrix = util.cos_sim(embeddings, embeddings)

# Label rows and columns with the words and print rounded to 3 decimals
df = pd.DataFrame(data=similarity_matrix.numpy(), index=words, columns=words)
print(df.round(decimals=3))

         cat  tiger  paris
cat    1.000  0.792  0.654
tiger  0.792  1.000  0.633
paris  0.654  0.633  1.000


In [7]:
import torch
import gc

# Free up memory: run Python's garbage collector first so unreferenced
# objects are collected, then ask PyTorch to empty its CUDA cache.
# The order matters -- collecting first lets the cache release more.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import chromadb
from chromadb import PersistentClient

# Define embedding + ChromaDB storage function
def embed_and_store_chunks_in_chroma(chroma_db_path, chunked_articles, collection_name, persist_directory="./chroma_store"):
    """
    Embeds and stores chunked_articles in ChromaDB.

    Any existing collection named `collection_name` is deleted first, so each
    call rebuilds the collection from scratch.

    Args:
        chroma_db_path: directory passed to `PersistentClient(path=...)`.
        chunked_articles: list of dicts with 'chunk_id' and 'chunk_text' keys.
        collection_name: name of the Chroma collection to (re)create.
        persist_directory: unused; kept only so existing callers keep working.

    Returns: the newly populated Chroma collection.

    Raises:
        ValueError: if `chunked_articles` is empty.

    Note: embeddings come from the notebook-level `model` object, not from a
    parameter, so the model cell must have been run first.
    """
    if not chunked_articles:
        # Fail early with a clear message instead of sending empty lists
        # through the encoder and into the vector store.
        raise ValueError("chunked_articles is empty; nothing to embed or store.")

    # NOTE(review): the "passage: " document prefix (and the matching
    # "query: " prefix used at query time) looks like the E5-style convention;
    # confirm it is intended for the embedding model loaded in this notebook.
    documents = [f"passage: {c['chunk_text']}" for c in chunked_articles]
    ids = [str(c['chunk_id']) for c in chunked_articles]
    metadatas = [{"chunk_id": c["chunk_id"],
                  "orig_text": c["chunk_text"]}
                 for c in chunked_articles]

    # Generate embeddings
    embeddings = model.encode(documents, normalize_embeddings=True, show_progress_bar=True).tolist()

    # Setup Chroma client on the persistent path
    chroma_client = PersistentClient(path=chroma_db_path)

    # Remove existing collection with same name.
    # Depending on the installed Chroma version, list_collections() yields
    # either collection objects or plain name strings; accept both.
    existing_names = {getattr(col, "name", col) for col in chroma_client.list_collections()}
    if collection_name in existing_names:
        chroma_client.delete_collection(name=collection_name)
        print("Previous collection deleted!")

    # Create Collection; the metadata records how the vectors were produced
    collection = chroma_client.get_or_create_collection(
        name=collection_name,
        metadata={
            "hnsw:space": "cosine",
            "embedder": getattr(model, "name_or_path", str(model)),
            "doc_prefix": "passage: ",
            "normalized": str(True),
        },
    )

    # Add vectors
    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
    )

    return collection


# Run the embedding + vector-store stage for the selected article
chroma_collection = embed_and_store_chunks_in_chroma(
    chroma_db_path,
    chunked_articles,
    chroma_collection_name,
)
print("Done with embedding and db storage!")

# Sanity check: how many documents ended up in the collection
print("Total stored documents:", chroma_collection.count())

# Re-open the persisted store from disk (TEST)
chroma_client = PersistentClient(path=str(chroma_db_path))
print("All available collections currently stored,")
for col in chroma_client.list_collections():
  print(col)

# Fetch this article's collection by name from the re-opened client
col = chroma_client.get_collection(name=chroma_collection_name)
print(f"Currently working with {col} Chroma collection!")

# Pull a single stored record, embeddings only, and preview its vector
result = col.get(limit=1, include=["embeddings"])
embedding = result["embeddings"][0]

print("Full embedding vector length:", len(embedding))
print("First 5 values of an embedding:", embedding[:5])   # preview

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Previous collection deleted!
Done with embedding and db storage!
Total stored documents: 144
All available collections currently stored,
Collection(name=gsk_vector_embedding_store)
Collection(name=CT_vector_embedding_store)
Collection(name=colossal_squid_vector_embedding_store)
Collection(name=ryan_gosling_vector_embedding_store)
Collection(name=fifa_wc_vector_embedding_store)
Collection(name=tulsa_residents_vector_embedding_store)
Collection(name=eu_reduce_vector_embedding_store)
Collection(name=spacex_crew_vector_embedding_store)
Collection(name=Queen_funeral_vector_embedding_store)
Collection(name=microsoft_vector_embedding_store)
Collection(name=queen_funeral_vector_embedding_store)
Collection(name=black_sabbath_vector_embedding_store)
Collection(name=bafta_vector_embedding_store)
Collection(name=ewaste_vector_embedding_store)
Collection(name=saturn_vector_embedding_store)
Collection(name=seagrass_vector_embedding_store)
Currently working with Collection(name=eu_reduce_vector_embed

In [None]:
import pandas as pd


def _get_first_hit(results, keywords):
    """
    Finds the rank and max similarity score of the first document with a keyword hit.
    Searches both document content and metadata.
    """
    first_hit_rank = None
    max_similarity = 0

    docs = results["documents"][0]
    distances = results['distances'][0]
    metadatas = results.get('metadatas', [None])[0]

    for rank, (doc, dist) in enumerate(zip(docs, distances), start=1):
        # Check for keyword in document content
        hit_in_doc = any(k.lower() in doc.lower() for k in keywords)

        # Check for keyword in metadata, converting to string
        hit_in_metadata = False
        if metadatas:
            hit_in_metadata = any(k.lower() in str(metadatas[rank - 1]).lower() for k in keywords)

        similarity = 1 - dist
        if similarity > max_similarity:
            max_similarity = similarity

        if (hit_in_doc or hit_in_metadata) and first_hit_rank is None:
            first_hit_rank = rank

    return first_hit_rank, max_similarity


def evaluate_suite(test_data, model, chroma_collection, k=10):
    """
    Evaluates a full test data suite of queries & keywords.

    Every query is embedded (with a "query: " prefix), the top-k results are
    fetched from the collection, and the query counts as a hit when any of its
    suite's keywords appears in a returned document or its metadata.
    A summary and the list of missed queries are printed.

    Args:
        test_data: sequence of suites, each a mapping with a 'queries' list
            and a 'keywords' list.
        model: encoder exposing `encode(text, normalize_embeddings=True)`
            whose result has `.tolist()`.
        chroma_collection: collection exposing
            `query(query_embeddings=..., n_results=..., include=...)`.
        k: number of results to retrieve per query.

    Returns: tuple (df, summary_df)
        df: one row per query with columns 'suite', 'query',
            'keyword_present', 'keyword', 'first_hit_rank@k',
            'max_similarity_score@k', 'Recall@k' and 'MRR@k'.
        summary_df: single-row frame with the total number of queries, the
            number of hits, the hit rate, the mean reciprocal rank and the
            mean of the per-query maximum similarity.

    An empty `test_data` produces an empty `df` (with the columns above) and
    an all-zero summary instead of raising.
    """
    rank_col = f'first_hit_rank@{k}'
    similarity_col = f'max_similarity_score@{k}'
    recall_col = f'Recall@{k}'
    mrr_col = f'MRR@{k}'
    result_columns = ['suite', 'query', 'keyword_present', 'keyword',
                      rank_col, similarity_col, recall_col, mrr_col]

    all_results = []

    for suite_idx, suite in enumerate(test_data):
        for q in suite['queries']:
            # Embedding query
            query_embedding = model.encode("query: " + q, normalize_embeddings=True)

            # Get top-k candidates
            results = chroma_collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=k,
                include=["documents", "metadatas", "distances"]
            )

            # Find the first keyword (in keyword order) that appears in any
            # returned document, falling back to the metadata entries.
            found_keyword = None
            for kw in suite['keywords']:
                for doc in results["documents"][0]:
                    if kw.lower() in doc.lower():
                        found_keyword = kw
                        break
                if not found_keyword and results.get("metadatas"):
                    for meta in results["metadatas"][0]:
                        if kw.lower() in str(meta).lower():
                            found_keyword = kw
                            break
                if found_keyword:
                    break

            keyword_present = found_keyword is not None
            keyword = found_keyword if found_keyword else None

            # Rank-based metrics for this query
            first_hit_rank, max_similarity = _get_first_hit(results, suite['keywords'])
            mrr_at_k = (1.0 / first_hit_rank) if first_hit_rank else 0.0
            recall_at_k = 1 if first_hit_rank else 0

            all_results.append({
                'suite': suite_idx,
                'query': q,
                'keyword_present': keyword_present,
                'keyword': keyword,
                rank_col: first_hit_rank,
                similarity_col: round(max_similarity, 4),
                recall_col: recall_at_k,
                mrr_col: mrr_at_k
            })

    # Naming the columns keeps them present even when there are no rows, so
    # the summary below cannot fail with a KeyError on an empty suite.
    df = pd.DataFrame(all_results, columns=result_columns)

    # Summary
    total_queries = len(df)
    hit_count = df[rank_col].notna().sum()
    if total_queries > 0:
        hit_rate = hit_count / total_queries
        mean_mrr = df[mrr_col].mean()
        mean_similarity = df[similarity_col].mean()
    else:
        hit_rate = 0
        mean_mrr = 0.0
        mean_similarity = 0.0

    print(f"Total Queries: {total_queries}")
    print(f"Queries with a Hit: {hit_count}")
    print(f"Hit Rate (Recall@{k}): {hit_rate:.2%}")
    print(f"Mean Reciprocal Rank (MRR): {mean_mrr:.3f}")
    print(f"Mean Max Similarity Score: {mean_similarity:.4f}")

    summary_df = pd.DataFrame([{'Total Queries': total_queries,
                               "Queries with a Hit": hit_count,
                               f"Hit Rate (Recall@{k})": hit_rate,
                               "Mean Reciprocal Rank (MRR)": mean_mrr,
                               "Mean Max Similarity Score": mean_similarity}])

    # Misses
    misses = df[df[rank_col].isna()]
    if not misses.empty:
        print(f"\nQueries with No Hit in Top {k}:")
        for _, row in misses.iterrows():
            print(f"  Suite {row['suite']}: {row['query']}")
    else:
        print(f"\nNo queries missed the top {k} results!")

    return df, summary_df


# load test data from data dictionary
test_data = data['embedding_test_data_suite']

# top embeddings to search for
k_value = 5

results_dense, summary = evaluate_suite(test_data, model, chroma_collection, k=k_value)

# Write the per-query and summary metrics next to the article's other outputs.
output_dir = f"/content/drive/MyDrive/Colab Notebooks/Dissertation/Article_{ARTICLE_INDEX}"
file_stem = data['title'].replace(' ', '_')
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
results_dense.to_csv(f"{output_dir}/{file_stem}_embedding_metric_scores_full.csv", index=False)
summary.to_csv(f"{output_dir}/{file_stem}_embedding_metric_scores_summary.csv", index=False)

# Missing values (e.g. the rank of a query with no hit) become None so they
# are written as JSON null rather than the non-standard NaN literal.
data['embedding_results'] = [
    {key: (None if pd.isna(value) else value) for key, value in row.items()}
    for row in results_dense.to_dict('records')
]
main_data[ARTICLE_INDEX] = data

# Serialise fully BEFORE opening the file for writing: opening with "w"
# truncates it, so a serialisation error would otherwise wipe the data file.
# `ensure_ascii` is the json-module keyword for keeping non-ASCII characters;
# `force_ascii` and `orient` belong to pandas' to_json and make json.dump raise.
serialized_main_data = json.dumps(main_data, indent=3, ensure_ascii=False)
with open("wikinews_data.json", "w", encoding="utf-8") as data_file:
    data_file.write(serialized_main_data)

Total Queries: 95
Queries with a Hit: 90
Hit Rate (Recall@5): 94.74%
Mean Reciprocal Rank (MRR): 0.834
Mean Max Similarity Score: 0.7393

Queries with No Hit in Top 5:
  Suite 2: EU's net zero emissions target date?
  Suite 2: EU's long-term climate goal?
  Suite 11: Environmental carbon capture methods?
  Suite 16: Consumer changes needed for net zero?
  Suite 16: What must people do differently for climate targets?
