In [2]:
import json
import umap
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Load the precomputed embedding matrix that every UMAP fit below consumes.
embeddings = np.load("embeddings.npy")
print(embeddings.shape)  # Should be (n, 384)

# Fail fast: UMAP and the clustering step need a non-empty 2-D (samples x features)
# array, and discovering a malformed file hours into the sweep is expensive.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        f"embeddings.npy must hold a non-empty 2-D array, got shape {embeddings.shape}"
    )


(66198, 384)


In [None]:

# Search space for the sweep: UMAP hyper-parameters plus the agglomerative cut-off
n_components_range = range(10, 101, 10)  # 10, 20, 30, ..., 100
n_neighbors_range = range(10, 101, 10)   # 10, 20, 30, ..., 100
min_dist_values = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.65, 0.8, 0.99]
distance_thresholds = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

# On-disk checkpoint (resume index) and the cumulative results table
progress_file = "clustering_progress.json"
results_file = "clustering_results.csv"

# Cartesian product of the UMAP settings, ordered with n_components varying
# slowest and min_dist fastest (same order as three nested loops).
umap_params = [
    (components, neighbors, spread)
    for components in n_components_range
    for neighbors in n_neighbors_range
    for spread in min_dist_values
]

# Load existing results and progress if available.
# `results` accumulates one record per (UMAP params, distance_threshold) run;
# `completed_params` holds the same runs as tuples so they can be skipped on resume.
results = []
completed_params = set()

if os.path.exists(results_file):
    try:
        # round_trip parsing makes the floats read back from the CSV compare equal
        # to the literals in min_dist_values / distance_thresholds, which the
        # tuple-membership check used for skipping depends on.
        df_results = pd.read_csv(results_file, float_precision="round_trip")
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. a write that was interrupted) means "no results yet".
        df_results = pd.DataFrame()
    results = df_results.to_dict('records')

    # Extract already completed parameter combinations
    for row in results:
        param_key = (row["n_components"], row["n_neighbors"], row["min_dist"], row["distance_threshold"])
        completed_params.add(param_key)

    print(f"Loaded {len(results)} existing results from {results_file}")

if os.path.exists(progress_file):
    with open(progress_file, 'r') as f:
        progress_data = json.load(f)
    # Clamp the stored index: a negative or oversized value would make the
    # slice umap_params[last_index:] start from the wrong place.
    last_index = max(0, min(int(progress_data.get('last_index', 0)), len(umap_params)))
    print(f"Resuming from parameter combination index {last_index}")
else:
    last_index = 0

# Process remaining parameter combinations.
# For every UMAP setting not yet finished: reduce the embeddings once, then cluster
# the reduced space at each distance threshold, score it, dump the cluster
# assignments to JSON and checkpoint the results table + progress index.
# NOTE(review): keywords_2011 is not defined in the visible cells — presumably it
# is created in an earlier cell; zip() below silently truncates if its length
# differs from the number of embeddings — confirm they match.
try:
    for param_idx, (n_comp, n_neigh, min_dist) in enumerate(umap_params[last_index:], start=last_index):
        # Skip the (expensive) UMAP fit entirely when nothing is left to do for it.
        has_pending = any(
            (n_comp, n_neigh, min_dist, dist_thresh) not in completed_params
            for dist_thresh in distance_thresholds
        )
        if not has_pending:
            print(f"Skipping UMAP: n_components={n_comp}, n_neighbors={n_neigh}, min_dist={min_dist} (all thresholds already processed)")
            continue

        print(f"Processing UMAP: n_components={n_comp}, n_neighbors={n_neigh}, min_dist={min_dist} [{param_idx+1}/{len(umap_params)}]")

        # Apply UMAP
        umap_reducer = umap.UMAP(
            n_components=n_comp,
            n_neighbors=n_neigh,
            min_dist=min_dist,
            metric='cosine',
            random_state=42
        )
        umap_embeddings = umap_reducer.fit_transform(embeddings)

        for dist_thresh in distance_thresholds:
            # Skip if this combination has already been processed
            param_key = (n_comp, n_neigh, min_dist, dist_thresh)
            if param_key in completed_params:
                print(f"  - Skipping AgglomerativeClustering with distance_threshold={dist_thresh} (already processed)")
                continue

            print(f"  - Applying AgglomerativeClustering with distance_threshold={dist_thresh}")

            # Apply Agglomerative Clustering; n_clusters=None lets the threshold decide
            cluster_model = AgglomerativeClustering(n_clusters=None, distance_threshold=dist_thresh)
            labels = cluster_model.fit_predict(umap_embeddings)
            num_clusters = len(set(labels))

            # Silhouette is only defined for 2 <= n_clusters <= n_samples - 1
            if 1 < num_clusters < len(labels):
                try:
                    sil_score = silhouette_score(umap_embeddings, labels)
                except Exception as exc:
                    # Catch Exception, not a bare except: KeyboardInterrupt must reach
                    # the outer handler instead of being recorded as a failed score.
                    print(f"    - Silhouette computation failed: {exc}")
                    sil_score = -1  # Error in calculation
            else:
                sil_score = -1  # Invalid case (all points in one cluster)

            # Save cluster assignments to JSON (cluster label -> list of keywords)
            clustered_keywords = {}
            for keyword, cluster in zip(keywords_2011, labels):
                clustered_keywords.setdefault(str(cluster), []).append(keyword)

            filename = f"clustered_keywords_ncomp{n_comp}_nneigh{n_neigh}_mindist{min_dist}_dthresh{dist_thresh}.json"
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(clustered_keywords, f, indent=4, ensure_ascii=False)
            print(f"    - Saved results to {filename}")

            # Store results in table
            results.append({
                "n_components": n_comp,
                "n_neighbors": n_neigh,
                "min_dist": min_dist,
                "distance_threshold": dist_thresh,
                "num_clusters": num_clusters,
                "silhouette_score": sil_score
            })

            # Save results incrementally so an interruption loses at most one run
            df_results = pd.DataFrame(results)
            df_results.to_csv(results_file, index=False)

            # Update progress: this combination is still in flight, so resume here
            with open(progress_file, 'w') as f:
                json.dump({'last_index': param_idx}, f)

            completed_params.add(param_key)

        # Every threshold for this combination is done: resume from the next one
        with open(progress_file, 'w') as f:
            json.dump({'last_index': param_idx + 1}, f)

except KeyboardInterrupt:
    print("\nProcess interrupted by user. Progress has been saved.")

print(f"\nClustering completed/paused. Results saved in '{results_file}'.")

# If we have enough results, generate the visualizations:
#   1. report the best-scoring parameter set,
#   2. one line plot per n_components (silhouette vs. distance threshold),
#   3. a heatmap of the top-scoring combinations.
# A silhouette_score of -1 is the sentinel written by the sweep for runs that
# could not be scored, so it is excluded from the plots.
if results:
    df_results = pd.DataFrame(results)

    # Find best parameters (only strictly positive scores are considered)
    valid_results = df_results[df_results["silhouette_score"] > 0]
    if not valid_results.empty:
        best_params = valid_results.loc[valid_results["silhouette_score"].idxmax()]
        print(f"\nBest parameters found so far:")
        print(f"  n_components: {best_params['n_components']}")
        print(f"  n_neighbors: {best_params['n_neighbors']}")
        print(f"  min_dist: {best_params['min_dist']}")
        print(f"  distance_threshold: {best_params['distance_threshold']}")
        print(f"  num_clusters: {best_params['num_clusters']}")
        print(f"  silhouette_score: {best_params['silhouette_score']}")

    # Create a more readable visualization with subplots
    plt.figure(figsize=(20, 15))

    # Group by n_components and create a subplot for each
    unique_n_comp = sorted(df_results["n_components"].unique())
    num_plots = len(unique_n_comp)
    rows = int(np.ceil(num_plots / 3))  # 3 plots per row

    for i, n_comp in enumerate(unique_n_comp):
        ax = plt.subplot(rows, 3, i+1)

        comp_data = df_results[df_results["n_components"] == n_comp]
        # One line per (n_neighbors, min_dist) pair
        for (n_neigh, min_dist), group in comp_data.groupby(["n_neighbors", "min_dist"]):
            valid_data = group[group["silhouette_score"] > -1]
            if not valid_data.empty:
                ax.plot(
                    valid_data["distance_threshold"],
                    valid_data["silhouette_score"],
                    marker="o",
                    label=f"n_neigh={n_neigh}, min_dist={min_dist}"
                )

        ax.set_xlabel("Distance Threshold")
        ax.set_ylabel("Silhouette Score")
        ax.set_title(f"n_components = {n_comp}")
        ax.grid(True)

        # Only show legend for the first subplot to save space
        if i == 0:
            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.savefig("silhouette_scores.png", dpi=300, bbox_inches="tight")

    try:
        # Create a heatmap for top parameter combinations; drop the -1 sentinel
        # rows first so unscored runs cannot appear among the "top" results.
        top_results = (
            df_results[df_results["silhouette_score"] > -1]
            .sort_values("silhouette_score", ascending=False)
            .head(100)
        )
        if top_results.empty:
            raise ValueError("no runs with a valid silhouette score yet")

        plt.figure(figsize=(12, 8))
        pivot = pd.pivot_table(
            top_results,
            values="silhouette_score",
            index=["n_components", "n_neighbors"],
            columns=["min_dist", "distance_threshold"]
        )
        plt.imshow(pivot, cmap="viridis")
        plt.colorbar(label="Silhouette Score")
        plt.title("Top 100 Parameter Combinations by Silhouette Score")
        plt.savefig("top_parameters_heatmap.png", dpi=300)
    except Exception as exc:
        # Best-effort plot: report the actual cause instead of hiding it, and let
        # KeyboardInterrupt/SystemExit propagate.
        print(f"Could not create heatmap visualization (likely due to sparse data): {exc}")

Processing UMAP: n_components=10, n_neighbors=10, min_dist=0.01 [1/1000]


  warn(


  - Applying AgglomerativeClustering with distance_threshold=0.5
