In [2]:
import json

# Load all three datasets.
# JSON is UTF-8 by specification; passing the encoding explicitly keeps
# non-ASCII titles intact on platforms whose default encoding differs.
with open("data/youtube.json", encoding="utf-8") as f:
    yt = json.load(f)
with open("data/podcasts.json", encoding="utf-8") as f:
    podcasts = json.load(f)
with open("data/webpages_by_tag_rich.json", encoding="utf-8") as f:
    pages = json.load(f)

# Normalize all into one consistent structure.
# Every record ends up with exactly these keys:
#   title, description, source, url, tags (always a list).
contents = []
for c in yt + podcasts + pages:
    # `or ""` covers both a missing key and an explicit null, so the
    # substring test below never runs against None.
    link = c.get("link") or ""

    # Prefer the plural "tags" field and fall back to the single "tag" field.
    raw_tags = c.get("tags")
    if raw_tags is None:
        raw_tags = c.get("tag")

    if raw_tags is None:
        tags = []
    elif isinstance(raw_tags, str):
        # A bare string would otherwise be treated as a sequence of characters.
        tags = [raw_tags]
    else:
        # Copy so later edits to the record do not alias the loaded data,
        # and drop null entries that would break string joins downstream.
        tags = [t for t in raw_tags if t is not None]

    contents.append({
        "title": c.get("title"),
        "description": c.get("description") or "",
        # Records without an explicit source are labelled from their link.
        "source": c.get("source", "youtube" if "youtube" in link else "unknown"),
        "url": c.get("link") or c.get("url"),
        "tags": tags,
    })

In [4]:
pip install -q sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
from sentence_transformers import SentenceTransformer, util

# Embedding model shared by the tag strings here and any text compared to them.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load tag names from tags.json (explicit UTF-8: JSON is UTF-8 by spec and
# the platform default encoding is not guaranteed to match).
with open("data/tags.json", encoding="utf-8") as f:
    tags_data = json.load(f)

# Flatten the per-category tag lists into a single list.
# dict.fromkeys drops repeats while keeping first-seen order, so a tag that
# appears under several categories is embedded once and cannot occupy more
# than one slot in a top-k similarity lookup.
all_tags = list(dict.fromkeys(
    tag
    for cat in tags_data["categories"].values()
    for tag in cat["tags"]
))

# Embed all tag strings once; index i of tag_embeds corresponds to all_tags[i].
tag_embeds = model.encode(all_tags, convert_to_tensor=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Number of model-suggested tags to add to each item.
TOP_K = 3

# Combine title + description for embedding; `or ""` guards against nulls.
texts = [
    (c.get("title", "") or "") + " " + (c.get("description", "") or "")
    for c in contents
]

tagged_contents = []

if texts:
    # Encode every item in one batched call instead of one call per item.
    content_embeds = model.encode(texts, convert_to_tensor=True)

    # scores[i][j] = cosine similarity between item i and tag j.
    scores = util.cos_sim(content_embeds, tag_embeds)

    # topk raises if k exceeds the number of candidates, so clamp it.
    k = min(TOP_K, len(all_tags))
    top_tags_idx = scores.topk(k, dim=1).indices.tolist()

    for c, idx_row in zip(contents, top_tags_idx):
        top_tags = [all_tags[i] for i in idx_row]

        # Merge existing and new tags. A null "tags" value is treated as
        # empty, and falsy entries are dropped so every tag is a usable label.
        existing_tags = [t for t in (c.get("tags") or []) if t]
        combined_tags = list(dict.fromkeys(existing_tags + top_tags))  # preserves order & removes duplicates

        # Update and store. The record is updated in place, so the entries
        # of `contents` and `tagged_contents` are the same dict objects.
        c["tags"] = combined_tags
        tagged_contents.append(c)

In [8]:
# Print each item's title followed by its merged tag list.
# The marker characters were mojibake (UTF-8 bytes decoded as cp1252);
# they are restored to the intended symbols here.
for c in tagged_contents:
    print(f"🎬 {c['title']}")
    print("   → Tags:", ", ".join(c["tags"]))
    print()

🎬 What is AI Ethics?
   → Tags: AI ethics, Artificial Intelligence, Data Science, Economics

🎬 AI Ethics | Ethics Defined
   → Tags: AI ethics, Artificial Intelligence, Psychology, Economics

🎬 The Ethics of AI│Stuart J. Russell (University of California, Berkeley, Professor)
   → Tags: AI ethics, Artificial Intelligence, Article, Economics

🎬 Ethics & AI: Equal Access and Algorithmic Bias
   → Tags: AI ethics, Artificial Intelligence, Economics, Psychology

🎬 Ethics of AI: Challenges and Governance
   → Tags: AI ethics, Artificial Intelligence, Innovation, Leadership

🎬 AI is an Ethical Nightmare
   → Tags: AI ethics, Artificial Intelligence, Economics, Minimalism

🎬 AI Is Dangerous, but Not for the Reasons You Think | Sasha Luccioni | TED
   → Tags: AI ethics, Artificial Intelligence, Documentaries, True Crime

🎬 How to implement AI Ethics
   → Tags: AI ethics, Artificial Intelligence, Machine Learning, Management

🎬 EU AI Act with Punit B