# 1. Data Acquisition (ETL)

## Installation and Setup
First, install all required libraries for the analysis.

In [None]:
# Install core libraries
%pip install pandas pyarrow fastparquet

# Install NLP and ML libraries
%pip install sentence-transformers torch safetensors accelerate
%pip install scikit-learn scipy
%pip install matplotlib umap-learn

# Install translation library
%pip install googletrans==4.0.0-rc1

# Optional: For advanced BERTopic analysis
# %pip install bertopic hdbscan transformers

## Load Article Corpus

Load the article corpus from the public Parquet file hosted on GitHub.

In [None]:
import pandas as pd

# Load data from GitHub
# The URL points at a Parquet export in the course repository; pandas reads it
# directly from the URL using the fastparquet engine. The file name suggests a
# ~25 MB sample rather than the full corpus (not verifiable from this notebook).
url = "https://github.com/Tao-Pi/CAS-Applied-Data-Science/raw/main/Module-3/01_Module%20Final%20Assignment/export_articles_v2_sample25mb.parquet"
srgssr_article_corpus = pd.read_parquet(url, engine="fastparquet")

# For full dataset access (requires permissions):
# NOTE(review): needs a Spark session named `spark`; .toPandas() yields a pandas DataFrame.
# srgssr_article_corpus = spark.table("swi_audience_prd.pdp_articles_v2.articles_v2").toPandas()

In [None]:
# Check access level
# Manual switch read by the dataset-size cell below to decide whether the corpus
# is reported as the full dataset or as the public sample. It is hard-coded here;
# none of the visible cells sets it automatically.
has_read_access_udp_articles_v2 = False

# 2. Dataset Overview

## Check Dataset Version and Size

In [None]:
def format_rowcount(n):
    """Render a row count as a short human-readable phrase.

    Counts of at least one million / one thousand are floored to that unit and
    prefixed with "more than"; smaller counts are returned as the plain number.
    """
    # Largest unit first so that millions are not reported as thousands.
    for threshold, unit in ((1_000_000, "million"), (1_000, "thousand")):
        if n >= threshold:
            return f"more than {n // threshold} {unit}"
    return f"{n}"

# Determine the row count in a way that works for both container types.
# pandas' DataFrame.count() returns a per-column Series of non-null counts, not a
# row count, so it must not be passed to format_rowcount(); use len() instead.
# Non-pandas objects are presumably Spark DataFrames, whose count() returns the
# number of rows (TODO confirm if another backend is ever used).
if isinstance(srgssr_article_corpus, pd.DataFrame):
    rowcount = len(srgssr_article_corpus)
else:
    rowcount = srgssr_article_corpus.count()

# The access flag only selects which message is shown.
if has_read_access_udp_articles_v2:
    print(f"✓ Full dataset loaded: {format_rowcount(rowcount)} articles from SRG-SSR")
else:
    print(f"✓ Public sample loaded: {format_rowcount(rowcount)} articles from SRG-SSR")
    print("You can access the dataframe via 'srgssr_article_corpus'")

## Data Structure Overview

In [None]:
# Summarise every column: its name, its dtype and a sample value from the first row.
if srgssr_article_corpus.empty:
    first_row = {}
else:
    first_row = srgssr_article_corpus.iloc[0].to_dict()

cols_info = [
    dict(column=name, type=str(kind), example=first_row.get(name))
    for name, kind in zip(srgssr_article_corpus.columns, srgssr_article_corpus.dtypes)
]

print(f"Dataset shape: {srgssr_article_corpus.shape}")
print(f"Columns: {len(cols_info)}")
# Last expression of the cell: rendered as a table (capped at 20 columns).
pd.DataFrame(cols_info).head(20)

## Preview Data

In [None]:
# Rich-render the first 10 rows as a quick sanity check of the loaded corpus.
display(srgssr_article_corpus.head(10))

## Limit Dataset for Testing

For faster iteration during development, we can work with a subset of the data.

In [None]:
# Work with subset for faster processing (optional)
# NOTE: this rebinds srgssr_article_corpus, so every later cell (embeddings,
# clustering, translation) only sees the first 1000 rows. Re-run the load cell
# to get the complete sample back.
srgssr_article_corpus = srgssr_article_corpus.head(1000)
print(f"Working with {len(srgssr_article_corpus)} articles")

# 3. Semantic Search Implementation

## USE CASE: Quickly Search Articles Without Google

**Goal:** Search all articles without external tools. This helps writers check if a story was already written by colleagues in different branches.

**Approach:**
- Use Sentence Transformers to create semantic embeddings
- Implement similarity-based search
- Enable keyword, phrase, or topic queries

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Column names used by the embedding / search cells.
TEXT_COL = "content_text_csv"
ID_COL = "id"

# Prepare data
# Work on a copy so the corpus frame itself is not modified; missing texts become
# empty strings and every value is coerced to str before it is encoded.
df = srgssr_article_corpus.copy()
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str)

# Lazily-created, module-wide embedding model (re-running this cell resets the cache).
_model = None


def get_embedder():
    """Return the shared SentenceTransformer, loading it on first use.

    The model ("sentence-transformers/all-MiniLM-L6-v2") is instantiated at most
    once per cache lifetime; later calls return the same object.
    """
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model

print("Creating embeddings for semantic search...")

# Plain Python lists, aligned row-for-row with the embedding matrix built below.
ids = df[ID_COL].tolist()
texts = df[TEXT_COL].tolist()

model = get_embedder()
# normalize_embeddings=True is requested so that the dot product used by the
# search cell can serve as the similarity score.
emb_matrix = model.encode(
    texts,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=64,
)

print(f"✓ Created {emb_matrix.shape[0]} embeddings of dimension {emb_matrix.shape[1]}")

In [None]:
def semantic_search(query: str, top_k: int = 10) -> pd.DataFrame:
    """Search articles using semantic similarity.

    Args:
        query: Free-text query; it is embedded with the notebook's shared model.
        top_k: Maximum number of hits to return. Values <= 0 yield an empty result.

    Returns:
        DataFrame with columns ``id``, ``content_text_csv`` (first 200 characters
        followed by "...") and ``similarity``, ordered from most to least similar.
        Contains fewer than ``top_k`` rows when the corpus is smaller.
    """
    no_hits = pd.DataFrame({"id": [], "content_text_csv": [], "similarity": []})

    # A negative top_k would otherwise be used as a slice bound and silently
    # return "all but the last |top_k|" rows instead of failing or being empty.
    if top_k <= 0:
        return no_hits
    # np.argpartition raises on an empty array (kth=-1 is out of bounds).
    if len(ids) == 0:
        return no_hits

    q = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    sims = emb_matrix @ q

    # Partial selection of the best candidates first, then an exact sort of just those.
    top_idx = np.argpartition(-sims, kth=min(top_k, len(sims) - 1))[:top_k]
    top_idx = top_idx[np.argsort(-sims[top_idx])]
    return pd.DataFrame({
        "id": [ids[i] for i in top_idx],
        "content_text_csv": [texts[i][:200] + "..." for i in top_idx],  # Truncate for display
        "similarity": [float(sims[i]) for i in top_idx],
    })

# Example usage
# Runs one query against the embeddings built above and renders the hit table
# (id, truncated text, similarity score).
results = semantic_search("climate change", top_k=10)
print("Top 10 articles about 'climate change':")
display(results)

# 4. Topic Clustering Analysis

## USE CASE: Discover What Topics SRG Writes About

**Goal:** Identify common topics and themes in SRG articles to enhance navigation and filtering.

**Approach:**
- Use K-means clustering on semantic embeddings
- Extract topic keywords from each cluster
- Visualize clusters in 2D using UMAP

In [None]:
from sklearn.cluster import KMeans
from collections import Counter
import re

# Number of topic clusters to fit; downstream cells iterate over range(n_clusters).
n_clusters = 10
print(f"Clustering articles into {n_clusters} topics...")

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
labels = kmeans.fit_predict(emb_matrix)

# One row per article, in the same order as `ids` / `texts` / `emb_matrix`.
df_clusters = pd.DataFrame(dict(id=ids, content_text_csv=texts, cluster=labels))

print("✓ Clustering complete")

In [None]:
# Extract topic keywords for each cluster
def get_topic_keywords(cluster_id, df_clusters, top_n=3):
    """Extract most common meaningful words from articles in a cluster.

    Args:
        cluster_id: Value of the ``cluster`` column selecting the articles.
        df_clusters: DataFrame with a ``cluster`` column and a ``content_text_csv``
            column of strings.
        top_n: Number of keywords to return.

    Returns:
        The ``top_n`` most frequent non-stopword words (length >= 4), joined with
        ", "; or ``"Topic <cluster_id>"`` when the cluster yields no words.
    """
    cluster_texts = df_clusters[df_clusters['cluster'] == cluster_id]['content_text_csv'].tolist()

    # Combine all texts in the cluster
    combined_text = ' '.join(cluster_texts).lower()

    # Extract words of 4+ letters. The class covers the accented letters of German,
    # French and Italian; because of the \b anchors a word containing a letter that
    # is missing from the class (previously ß, ç, â, î, ì, ò, ...) is not merely
    # truncated but dropped entirely, e.g. "straße" or "français".
    words = re.findall(r'\b[a-zäöüßàâáæçéèêëíìîïñóòôœúùûÿ]{4,}\b', combined_text)

    # Multilingual stopwords
    stopwords = {
        'dass', 'sind', 'wird', 'wurden', 'wurde', 'haben', 'sein',
        'eine', 'einem', 'einen', 'einer', 'dies', 'diese', 'dieser',
        'auch', 'mehr', 'beim', 'über', 'nach', 'sich', 'oder', 'kann',
        'können', 'müssen', 'soll', 'sollen', 'noch', 'bereits', 'aber',
        'wenn', 'weil', 'denn', 'dann', 'sowie', 'damit', 'with',
        'from', 'have', 'this', 'that', 'will', 'been', 'were', 'their',
        'what', 'which', 'when', 'where', 'there', 'pour', 'dans', 'avec',
        'sont', 'être', 'cette', 'mais', 'plus', 'comme', 'fait'
    }

    # Filter and count words (ties keep first-seen order via Counter.most_common)
    word_counts = Counter(w for w in words if w not in stopwords)

    # Get top keywords
    top_words = [word for word, _ in word_counts.most_common(top_n)]
    return ', '.join(top_words) if top_words else f"Topic {cluster_id}"

# Build a keyword label for every cluster and report the cluster sizes.
topic_labels = {}
print("\nCluster Topics (based on most frequent keywords):")
print("=" * 70)
for cluster_id in range(n_clusters):
    keywords = get_topic_keywords(cluster_id, df_clusters, top_n=3)
    topic_labels[cluster_id] = keywords
    # Number of articles assigned to this cluster
    count = int((df_clusters['cluster'] == cluster_id).sum())
    print(f"Cluster {cluster_id}: {keywords:<40} ({count} articles)")

# Attach the label of its cluster to every article row.
df_clusters['cluster_topic'] = df_clusters['cluster'].map(topic_labels)

print("\n✓ Topic extraction complete")
display(df_clusters.head(10))

## Visualize Topic Clusters with UMAP

In [None]:
import matplotlib.pyplot as plt
import umap

print("Reducing embeddings to 2D for visualization...")

# Reduce embeddings to 2D using UMAP
# The reducer is fitted on the article embeddings here and reused further down to
# project the K-means centres into the same 2D space.
reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
embedding_2d = reducer.fit_transform(emb_matrix)

# Create scatter plot
# One point per article, coloured by its K-means label.
plt.figure(figsize=(16, 12))
scatter = plt.scatter(
    embedding_2d[:, 0], 
    embedding_2d[:, 1], 
    c=labels, 
    cmap='tab10', 
    alpha=0.6, 
    s=50
)

plt.colorbar(scatter, label='Cluster')
plt.title('Topic Clusters Visualization with Keywords (UMAP Projection)', fontsize=16)
plt.xlabel('UMAP Dimension 1', fontsize=12)
plt.ylabel('UMAP Dimension 2', fontsize=12)
plt.grid(True, alpha=0.3)

# Add cluster centers with topic labels
# Centres live in embedding space, so they are mapped with the already-fitted
# reducer (transform, not fit_transform) to land in the same projection.
kmeans_centers_2d = reducer.transform(kmeans.cluster_centers_)
plt.scatter(
    kmeans_centers_2d[:, 0], 
    kmeans_centers_2d[:, 1], 
    c='red', 
    marker='X', 
    s=200, 
    edgecolors='black', 
    linewidths=2,
    label='Cluster Centers'
)

# Add text labels for each cluster center
# Each annotation shows "C<id>: <keywords>" taken from topic_labels, offset by
# 10 points from the centre marker with an arrow pointing back to it.
for cluster_id in range(n_clusters):
    x, y = kmeans_centers_2d[cluster_id]
    label_text = f"C{cluster_id}: {topic_labels[cluster_id]}"
    plt.annotate(
        label_text,
        xy=(x, y),
        xytext=(10, 10),
        textcoords='offset points',
        fontsize=9,
        bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.7),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0', color='black', lw=1)
    )

plt.legend()
plt.tight_layout()
plt.show()

# Print cluster distribution
# Article count and share of the clustered corpus per cluster, in cluster-id order.
print("\nCluster Distribution:")
cluster_counts = df_clusters['cluster'].value_counts().sort_index()
for cluster_id, count in cluster_counts.items():
    percentage = (count / len(df_clusters)) * 100
    print(f"Cluster {cluster_id} ({topic_labels[cluster_id]}): {count} articles ({percentage:.1f}%)")

# 5. Translation Pipeline

## USE CASE: Translate All Articles to English

**Goal:** Translate the entire corpus to English for consistent analysis and categorization.

**Approach:**
- Use the `googletrans` library (an unofficial client for the Google Translate web service, not the official Cloud Translation API)
- Implement retry logic and rate limiting
- Handle errors gracefully

In [None]:
from googletrans import Translator
import time

# Initialize translator
# A single shared client; translate_text() below refers to it by this global name.
translator = Translator()

# Create a copy of the dataframe to store translations
# (the corpus was limited to a subset in an earlier cell, so this is that subset).
df_translated = srgssr_article_corpus.copy()

# Function to translate text with error handling
def translate_text(text, dest='en', max_retries=3):
    """Translate ``text`` into ``dest``, retrying on any error.

    Args:
        text: Value to translate; missing values and "" give "" immediately.
        dest: Target language code passed to the translator.
        max_retries: Maximum number of translation attempts.

    Returns:
        The translated string, or the (truncated) input text when every attempt
        failed or no attempt was made.
    """
    if pd.isna(text) or text == "":
        return ""

    # Only the first 5000 characters are sent to the translator.
    text_str = str(text)[:5000]

    attempts_made = 0
    while attempts_made < max_retries:
        try:
            return translator.translate(text_str, dest=dest).text
        except Exception as e:
            attempts_made += 1
            if attempts_made >= max_retries:
                # Out of attempts: report and fall back to the untranslated text.
                print(f"Translation failed after {max_retries} attempts: {str(e)[:100]}")
                return text_str
            time.sleep(1)  # Wait before retry

    # Reached only when max_retries allows no attempt at all.
    return text_str

print("Translating articles to English...")
print(f"Total articles to translate: {len(df_translated)}")
print("Note: This may take several minutes. Google Translate API has rate limits.\n")

# Translate row by row, collecting results in corpus order.
translated_texts = []
for idx, text in enumerate(df_translated['content_text_csv']):
    # Progress line before every 50th article (including the very first one)
    if not idx % 50:
        print(f"Progress: {idx}/{len(df_translated)} articles translated...")

    translated = translate_text(text, dest='en')
    translated_texts.append(translated)

    # Brief pause after every 10th article (but not the first) to ease rate limiting
    if idx > 0 and idx % 10 == 0:
        time.sleep(0.5)

# Store the results next to the original text.
df_translated['content_text_en'] = translated_texts

print(f"\n✓ Translation complete! Translated {len(df_translated)} articles.")
print("\nShowing first 3 translated articles:")
display(df_translated[['id', 'content_text_csv', 'content_text_en']].head(3))

# 6. Enhanced Topic Categorization

## Cluster Translated Articles

Create embeddings and clusters for the translated English text.

In [None]:
print("Creating embeddings for translated English articles...")

# Use the English translated column; missing values become "" and all values str.
df_en = df_translated.copy()
df_en['content_text_en'] = df_en['content_text_en'].fillna("").astype(str)

# Create embeddings for the English text.
# Reuse the notebook's shared embedder instead of constructing a second, identical
# SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"): same model, but it
# avoids reloading the weights and holding two copies in memory.
model_en = get_embedder()
emb_matrix_en = model_en.encode(
    df_en['content_text_en'].tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Perform clustering on English translations (fixed seed for reproducibility)
n_clusters_en = 10
kmeans_en = KMeans(n_clusters=n_clusters_en, random_state=42, n_init="auto")
labels_en = kmeans_en.fit_predict(emb_matrix_en)

# Create dataframe with cluster assignments; rows follow the order of df_en and
# therefore of emb_matrix_en.
df_clusters_en = pd.DataFrame({
    "id": df_en['id'].tolist(),
    "original_text": df_en['content_text_csv'].tolist(),
    "translated_text_en": df_en['content_text_en'].tolist(),
    "cluster": labels_en
})

print(f"✓ Created {emb_matrix_en.shape[0]} embeddings and {n_clusters_en} clusters")

In [None]:
# Extract topic keywords from English translations
def get_topic_keywords_en(cluster_id, df_clusters, top_n=3):
    """Return the most frequent non-stopword English words of one cluster.

    Reads the ``translated_text_en`` column of the rows whose ``cluster`` equals
    ``cluster_id``. The result is the ``top_n`` words joined with ", ", or
    ``"Topic <cluster_id>"`` when no word qualifies.
    """
    # English stopwords
    stopwords = {
        'this', 'that', 'with', 'from', 'have', 'been', 'were', 'their',
        'what', 'which', 'when', 'where', 'there', 'will', 'would', 'could',
        'should', 'about', 'after', 'also', 'many', 'more', 'most', 'other',
        'some', 'such', 'than', 'them', 'then', 'these', 'they', 'very',
        'into', 'just', 'like', 'only', 'over', 'said', 'same', 'says',
        'does', 'make', 'made', 'well', 'much', 'even', 'back', 'through',
        'year', 'years', 'being', 'people', 'according', 'since', 'during',
        'first', 'time', 'last', 'still', 'however', 'while', 'before'
    }

    in_cluster = df_clusters['cluster'] == cluster_id
    corpus = ' '.join(df_clusters.loc[in_cluster, 'translated_text_en'].tolist()).lower()

    # The pattern already guarantees at least four letters per word.
    tally = Counter(
        token
        for token in re.findall(r'\b[a-z]{4,}\b', corpus)
        if token not in stopwords
    )

    best = [token for token, _ in tally.most_common(top_n)]
    if not best:
        return f"Topic {cluster_id}"
    return ', '.join(best)

# Build a keyword label for every English cluster and report the cluster sizes.
topic_labels_en = {}
print("\nCluster Topics (based on English translated text):")
print("=" * 70)
for cluster_id in range(n_clusters_en):
    keywords = get_topic_keywords_en(cluster_id, df_clusters_en, top_n=3)
    topic_labels_en[cluster_id] = keywords
    # Number of articles assigned to this cluster
    count = int((df_clusters_en['cluster'] == cluster_id).sum())
    print(f"Cluster {cluster_id}: {keywords:<40} ({count} articles)")

# Attach the label of its cluster to every article row.
df_clusters_en['cluster_topic'] = df_clusters_en['cluster'].map(topic_labels_en)

print("\n✓ Topic extraction complete for English articles")

## Visualize English Article Clusters

In [None]:
print("Creating 2D visualization of English article clusters...")

# Reduce embeddings to 2D using UMAP
# A separate reducer is fitted on the English embeddings; it is reused below to
# project the K-means centres and its output is also used by the category plot.
reducer_en = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
embedding_2d_en = reducer_en.fit_transform(emb_matrix_en)

# Create scatter plot
# One point per article, coloured by its K-means label.
plt.figure(figsize=(16, 12))
scatter = plt.scatter(
    embedding_2d_en[:, 0], 
    embedding_2d_en[:, 1], 
    c=labels_en, 
    cmap='tab10', 
    alpha=0.6, 
    s=50
)

plt.colorbar(scatter, label='Cluster')
plt.title('Topic Clusters of Translated English Articles (UMAP Projection)', fontsize=16)
plt.xlabel('UMAP Dimension 1', fontsize=12)
plt.ylabel('UMAP Dimension 2', fontsize=12)
plt.grid(True, alpha=0.3)

# Add cluster centers
# Centres are mapped with the already-fitted reducer (transform, not fit_transform)
# so they share the projection of the article points.
kmeans_centers_2d_en = reducer_en.transform(kmeans_en.cluster_centers_)
plt.scatter(
    kmeans_centers_2d_en[:, 0], 
    kmeans_centers_2d_en[:, 1], 
    c='red', 
    marker='X', 
    s=200, 
    edgecolors='black', 
    linewidths=2,
    label='Cluster Centers'
)

# Add text labels for each cluster center
# Each annotation shows "C<id>: <keywords>" taken from topic_labels_en.
for cluster_id in range(n_clusters_en):
    x, y = kmeans_centers_2d_en[cluster_id]
    label_text = f"C{cluster_id}: {topic_labels_en[cluster_id]}"
    plt.annotate(
        label_text,
        xy=(x, y),
        xytext=(10, 10),
        textcoords='offset points',
        fontsize=9,
        bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.7),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0', color='black', lw=1)
    )

plt.legend()
plt.tight_layout()
plt.show()

print("\n✓ Visualization complete")
# Show a sample of the per-article cluster assignments and labels.
display(df_clusters_en[['id', 'translated_text_en', 'cluster', 'cluster_topic']].head(10))

## Hierarchical Topic Categorization

Map clusters to high-level categories (Politics, Sports, Culture, Science, etc.)

In [None]:
# Define keyword patterns for major topic categories
topic_categories = {
    'Politics': ['government', 'election', 'parliament', 'minister', 'political', 'policy', 'president', 
                 'vote', 'party', 'democrat', 'republican', 'law', 'congress', 'senate', 'council',
                 'federal', 'state', 'referendum', 'campaign', 'diplomat'],
    
    'Sports': ['football', 'soccer', 'tennis', 'basketball', 'hockey', 'olympic', 'champion', 'team',
               'player', 'match', 'game', 'tournament', 'league', 'coach', 'athlete', 'sport',
               'championship', 'victory', 'defeat', 'goal', 'score'],
    
    'Economy & Business': ['economy', 'economic', 'business', 'market', 'bank', 'finance', 'investment', 'trade',
                           'company', 'stock', 'price', 'inflation', 'currency', 'export', 'import', 'growth',
                           'gdp', 'employment', 'unemployment', 'budget', 'debt', 'profit'],
    
    'Science & Technology': ['science', 'technology', 'research', 'study', 'university', 'scientist',
                             'experiment', 'discovery', 'innovation', 'digital', 'computer', 'internet',
                             'software', 'data', 'artificial', 'intelligence', 'robot', 'space', 'energy'],
    
    'Health': ['health', 'medical', 'hospital', 'doctor', 'patient', 'disease', 'treatment', 'medicine',
               'virus', 'vaccine', 'pandemic', 'covid', 'care', 'mental', 'clinic', 'drug', 'therapy'],
    
    'Environment & Climate': ['climate', 'environment', 'environmental', 'weather', 'temperature', 'global',
                              'warming', 'carbon', 'pollution', 'sustainable', 'renewable', 'energy', 'nature',
                              'forest', 'ocean', 'animal', 'species', 'biodiversity', 'ecological'],
    
    'Culture & Entertainment': ['culture', 'cultural', 'music', 'film', 'movie', 'concert', 'festival',
                                'artist', 'museum', 'exhibition', 'theater', 'performance', 'book',
                                'author', 'literature', 'entertainment', 'celebrity', 'show'],
    
    'Society & Education': ['social', 'society', 'community', 'people', 'family', 'education', 'school', 'student',
                            'teacher', 'child', 'women', 'rights', 'justice', 'police', 'crime', 'court', 'prison']
}

def assign_category(cluster_keywords):
    """Assign a category based on keyword matching"""
    keywords_lower = cluster_keywords.lower()
    scores = {}
    
    for category, category_keywords in topic_categories.items():
        score = sum(1 for kw in category_keywords if kw in keywords_lower)
        if score > 0:
            scores[category] = score
    
    if scores:
        return max(scores.items(), key=lambda x: x[1])[0]
    else:
        return 'Other'

# Derive one high-level category per cluster from its keyword label.
cluster_categories = {}
print("Mapping clusters to higher-level topic categories:\n")
print(f"{'Cluster':<10} {'Keywords':<40} {'Category':<25}")
print("=" * 75)

for cluster_id, keywords in ((cid, topic_labels_en[cid]) for cid in range(n_clusters_en)):
    category = assign_category(keywords)
    cluster_categories[cluster_id] = category
    print(f"{cluster_id:<10} {keywords:<40} {category:<25}")

# Propagate the cluster's category to each of its articles.
df_clusters_en['topic_category'] = df_clusters_en['cluster'].map(cluster_categories)

# Article counts per category, most frequent first (value_counts order).
print("\n\nArticle Distribution by Topic Category:")
print("=" * 50)
category_counts = df_clusters_en['topic_category'].value_counts()
for category, count in category_counts.items():
    percentage = (count / len(df_clusters_en)) * 100
    print(f"{category:<25} {count:>5} articles ({percentage:>5.1f}%)")

## Visualize Categories with Scatter Plot and Pie Chart

In [None]:
# Create a color map for categories: one evenly spaced Set3 colour per category,
# assigned in alphabetical category order.
unique_categories = sorted(df_clusters_en['topic_category'].unique())
category_colors = plt.cm.Set3(np.linspace(0, 1, len(unique_categories)))
category_color_map = dict(zip(unique_categories, category_colors))

# Create visualization with categories
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Left plot: Colored by category
for category in unique_categories:
    # Select rows by a positional boolean mask. Using the DataFrame's index labels
    # to index the NumPy array is only correct while the index is the default
    # 0..n-1 range; the mask stays aligned for any index.
    mask = (df_clusters_en['topic_category'] == category).to_numpy()
    ax1.scatter(
        embedding_2d_en[mask, 0],
        embedding_2d_en[mask, 1],
        c=[category_color_map[category]],
        label=category,
        alpha=0.6,
        s=50
    )

ax1.set_title('Articles by Topic Category', fontsize=16, fontweight='bold')
ax1.set_xlabel('UMAP Dimension 1', fontsize=12)
ax1.set_ylabel('UMAP Dimension 2', fontsize=12)
ax1.legend(loc='best', fontsize=10)
ax1.grid(True, alpha=0.3)

# Right plot: Pie chart of category distribution, using the same colours as the
# scatter plot so both panels share one legend.
ax2.pie(
    category_counts.values,
    labels=category_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=[category_color_map[cat] for cat in category_counts.index]
)
ax2.set_title('Distribution of Articles by Topic Category', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

print("\n✓ Category visualization complete")

## Enhance Categories with Corpus-Derived Keywords

Analyze actual word frequencies in each cluster to expand keyword lists.

In [None]:
print("Analyzing corpus to extract additional keywords...\n")
print("=" * 80)

# Extended stopwords (identical for every cluster, so defined once)
stopwords = {
    'this', 'that', 'with', 'from', 'have', 'been', 'were', 'their',
    'what', 'which', 'when', 'where', 'there', 'will', 'would', 'could',
    'should', 'about', 'after', 'also', 'many', 'more', 'most', 'other',
    'some', 'such', 'than', 'them', 'then', 'these', 'they', 'very',
    'into', 'just', 'like', 'only', 'over', 'said', 'same', 'says',
    'does', 'make', 'made', 'well', 'much', 'even', 'back', 'through',
    'year', 'years', 'being', 'people', 'according', 'since', 'during',
    'first', 'time', 'last', 'still', 'however', 'while', 'before'
}

# For each cluster keep its 20 most frequent words as (word, count) pairs.
all_cluster_words = {}
for cluster_id in range(n_clusters_en):
    in_cluster = df_clusters_en['cluster'] == cluster_id
    cluster_texts = df_clusters_en.loc[in_cluster, 'translated_text_en'].tolist()
    combined_text = ' '.join(cluster_texts).lower()

    # Words of four or more ASCII letters that are not stopwords
    words = [w for w in re.findall(r'\b[a-z]{4,}\b', combined_text) if w not in stopwords]
    word_counts = Counter(words)
    all_cluster_words[cluster_id] = word_counts.most_common(20)

print("Top 10 words per cluster:")
# Only the first three clusters are printed to keep the output short.
for cluster_id in range(min(3, n_clusters_en)):
    print(f"\nCluster {cluster_id} ({topic_labels_en[cluster_id]}):")
    top_words = [pair[0] for pair in all_cluster_words[cluster_id][:10]]
    print(f"  {', '.join(top_words)}")

print("\n✓ Corpus analysis complete")
print("\nThese words can be used to enhance the topic_categories dictionary")

# 7. BERTopic Analysis (Advanced - Optional)

This section demonstrates advanced topic modeling using BERTopic with custom embeddings.

**Note:** This requires additional libraries and computational resources.

In [None]:
# Uncomment to use BERTopic analysis

# from bertopic import BERTopic
# from sklearn.feature_extraction.text import CountVectorizer
# from hdbscan import HDBSCAN
# 
# # Use existing embeddings
# print("Training BERTopic model...")
# 
# # Configure BERTopic with custom settings
# vectorizer_model = CountVectorizer(stop_words="english", min_df=2, max_df=0.95)
# hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', prediction_data=True)
# 
# # Create BERTopic model with pre-computed embeddings
# topic_model = BERTopic(
#     embedding_model=model_en,
#     vectorizer_model=vectorizer_model,
#     hdbscan_model=hdbscan_model,
#     verbose=True
# )
# 
# # Fit the model
# topics, probs = topic_model.fit_transform(
#     df_en['content_text_en'].tolist(),
#     embeddings=emb_matrix_en
# )
# 
# # Visualize topics
# fig = topic_model.visualize_topics()
# fig.show()
# 
# # Get topic information
# topic_info = topic_model.get_topic_info()
# display(topic_info)
# 
# print("✓ BERTopic analysis complete")

# Summary and Next Steps

## What We Accomplished

1. ✅ **Data Acquisition**: Loaded SRG-SSR article corpus from GitHub/Databricks
2. ✅ **Semantic Search**: Implemented fast similarity-based article search
3. ✅ **Topic Clustering**: Identified 10 main topic clusters using K-means
4. ✅ **Translation**: Translated the working corpus (the 1,000-article subset, first 5,000 characters per article) to English for consistent analysis
5. ✅ **Categorization**: Mapped clusters to high-level categories (Politics, Sports, etc.)
6. ✅ **Visualization**: Created UMAP projections and pie charts for topic distribution
7. ✅ **Enhancement**: Extracted corpus-specific keywords to improve categorization

## Potential Next Steps

- **Expand Translation**: Translate to multiple languages (FR, IT, DE, etc.)
- **Time Analysis**: Analyze topic trends over time
- **Author Analysis**: Identify topics by author or publication
- **Advanced Models**: Experiment with BERTopic, LDA, or transformer-based models
- **Production Pipeline**: Automate the workflow for continuous updates
- **Interactive Dashboard**: Build a Streamlit/Dash app for exploration

## Resources

- [Sentence Transformers Documentation](https://www.sbert.net/)
- [UMAP Documentation](https://umap-learn.readthedocs.io/)
- [BERTopic Documentation](https://maartengr.github.io/BERTopic/)
- [SRG-SSR GitHub Repository](https://github.com/Tao-Pi/CAS-Applied-Data-Science)

# Appendix: Alternative Approaches

## A. Databricks-Specific Translation

If running in Databricks environment, use the native `ai_translate` function:

```python
from pyspark.sql.functions import expr

# Check if in Databricks
try:
    spark
    is_databricks = True
except NameError:
    is_databricks = False

if is_databricks:
    # Convert to Spark DataFrame
    if isinstance(srgssr_article_corpus, pd.DataFrame):
        df_spark = spark.createDataFrame(srgssr_article_corpus)
    else:
        df_spark = srgssr_article_corpus
    
    # Add translation columns
    lang_map = {"en": "en", "fr": "fr", "it": "it", "de": "de"}
    df_translated = df_spark
    for lang, db_lang in lang_map.items():
        df_translated = df_translated.withColumn(
            f"content_text_{lang}",
            expr(f"ai_translate(content_text_csv, '{db_lang}')")
        )
    
    display(df_translated)
```

## B. Reading from Delta Tables

For users with access to the full dataset:

```python
from pyspark.sql.utils import AnalysisException

def has_read_permission(table_name):
    try:
        spark.sql(f"SELECT 1 FROM {table_name} LIMIT 1")
        return True
    except AnalysisException:
        return False

# Check permissions
has_access = has_read_permission("udp_prd_atomic.pdp.articles_v2")

if has_access:
    # Read full dataset
    df = spark.table("udp_prd_atomic.pdp.articles_v2")
    srgssr_article_corpus = df.toPandas()
```

## C. Custom Embedding Models

For specialized domains, consider fine-tuned models:

```python
# Multilingual model (covers German among other languages)
model_de = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

# Larger model for better quality
model_large = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Custom model (requires training)
from sentence_transformers import SentenceTransformer, InputExample, losses
# ... fine-tuning code ...
```

---

**Notebook Version:** 1.0 (Merged)  
**Last Updated:** November 26, 2025  
**Authors:** CAS Applied Data Science Module 3 Team  
**License:** For educational purposes only

---