# Clustering Tools Tests

This notebook tests the clustering algorithms exposed by the `clustering_tool` module to ensure they work as expected.

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, make_moons
from sklearn.preprocessing import StandardScaler
from clustering_tool import clustering_tool, evaluate_clustering
# Notebook-wide plot defaults: apply seaborn's "whitegrid" theme first, then
# set the default figure size to (12, 6).
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Generate Sample Data

Generate synthetic data for testing the clustering algorithms.

In [None]:
def generate_moon_data(n_samples=300, noise=0.1, random_state=42):
    """Build a two-moons dataset in the record format passed to clustering_tool.

    Args:
        n_samples: Number of points, forwarded to ``make_moons``.
        noise: Noise level, forwarded to ``make_moons``.
        random_state: Seed forwarded to ``make_moons`` so runs are repeatable.

    Returns:
        A ``(data, X, y)`` tuple. ``data`` is a list with one
        ``{'id': ..., 'embedding': ...}`` record per row of ``X``, where
        ``id`` is the row index and ``embedding`` is the row converted to a
        plain list. ``X`` and ``y`` are the arrays returned by ``make_moons``.
    """
    X, y = make_moons(n_samples=n_samples, noise=noise, random_state=random_state)

    # clustering_tool is given a list of documents, each carrying its feature
    # vector under the 'embedding' key.
    data = [{'id': i, 'embedding': row.tolist()} for i, row in enumerate(X)]

    return data, X, y

# Build the shared dataset once: `data` is the record list handed to
# clustering_tool, `X` holds the raw points and `Y` the true moon labels.
data, X, Y = generate_moon_data()

# Plot the raw points coloured by their true label so the clustering results
# further down can be compared against them by eye.
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.set_title('Moon Data (Non-linear Clusters)')
ax.scatter(x=X[:, 0], y=X[:, 1], s=50, c=Y, alpha=0.8, cmap='viridis')

plt.tight_layout()
plt.show()

## 2. KMeans

In [None]:
# Cluster the moon records with KMeans, requesting three clusters.
kmeans_labels = clustering_tool(
    data,
    algorithm="kmeans",
    feature_key="embedding",
    n_clusters=3,
)

# Scatter the points coloured by the label each one received.
plt.figure(figsize=(10, 6))
kmeans_points = plt.scatter(
    x=X[:, 0], y=X[:, 1], c=kmeans_labels, s=50, alpha=0.8, cmap='viridis'
)
plt.title('KMeans Clustering Results (n_clusters=3)')
plt.colorbar(kmeans_points, label='Cluster Label')
plt.show()

# Score the labelling with evaluate_clustering (imported from clustering_tool)
# and print each returned metric to four decimal places.
metrics = evaluate_clustering(X, np.array(kmeans_labels))
print("Clustering Evaluation Metrics:")
for name, score in metrics.items():
    print(f"{name}: {score:.4f}")

## 3. DBSCAN

Note: DBSCAN is well suited to non-linear, density-based clusters.

In [None]:
# Cluster the moon records with DBSCAN using eps=0.2 and min_samples=5.
dbscan_labels = clustering_tool(
    data,
    algorithm="dbscan",
    feature_key="embedding",
    min_samples=5,
    eps=0.2,
)

# Scatter the points coloured by the label each one received.
plt.figure(figsize=(10, 6))
dbscan_points = plt.scatter(
    x=X[:, 0], y=X[:, 1], c=dbscan_labels, s=50, alpha=0.8, cmap='viridis'
)
plt.title('DBSCAN Clustering Results')
plt.colorbar(dbscan_points, label='Cluster Label')
plt.show()

# Score the labelling and print each returned metric to four decimal places.
metrics = evaluate_clustering(X, np.array(dbscan_labels))
print("Clustering Evaluation Metrics:")
for name, score in metrics.items():
    print(f"{name}: {score:.4f}")

## 4. Agglomerative Clustering

In [None]:
# Cluster the moon records agglomeratively (ward linkage, three clusters).
agg_labels = clustering_tool(
    data,
    algorithm="agglomerative",
    feature_key="embedding",
    linkage="ward",
    n_clusters=3,
)

# Scatter the points coloured by the label each one received.
plt.figure(figsize=(10, 6))
agg_points = plt.scatter(
    x=X[:, 0], y=X[:, 1], c=agg_labels, s=50, alpha=0.8, cmap='viridis'
)
plt.title('Agglomerative Clustering Results (n_clusters=3)')
plt.colorbar(agg_points, label='Cluster Label')
plt.show()

# Score the labelling and print each returned metric to four decimal places.
metrics = evaluate_clustering(X, np.array(agg_labels))
print("Clustering Evaluation Metrics:")
for name, score in metrics.items():
    print(f"{name}: {score:.4f}")

## 5. Gaussian Mixture Model Clustering

In [None]:
# Cluster the moon records with a Gaussian mixture (full covariance, three
# clusters requested).
gmm_labels = clustering_tool(
    data,
    algorithm="gaussian_mixture",
    feature_key="embedding",
    covariance_type="full",
    n_clusters=3,
)

# Scatter the points coloured by the label each one received.
plt.figure(figsize=(10, 6))
gmm_points = plt.scatter(
    x=X[:, 0], y=X[:, 1], c=gmm_labels, s=50, alpha=0.8, cmap='viridis'
)
plt.title('Gaussian Mixture Model Clustering Results (n_components=3)')
plt.colorbar(gmm_points, label='Cluster Label')
plt.show()

# Score the labelling and print each returned metric to four decimal places.
metrics = evaluate_clustering(X, np.array(gmm_labels))
print("Clustering Evaluation Metrics:")
for name, score in metrics.items():
    print(f"{name}: {score:.4f}")

## 6. HDBSCAN

In [None]:
# This cell is best-effort: any failure is reported instead of aborting the
# notebook, so the remaining cells can still be run.
try:
    hdbscan_labels = clustering_tool(
        data,
        feature_key="embedding",
        algorithm="hdbscan",
        min_cluster_size=5,
        min_samples=2
    )
    plt.figure(figsize=(10, 6))
    plt.scatter(X[:, 0], X[:, 1], c=hdbscan_labels, cmap='viridis', s=50, alpha=0.8)
    plt.title('HDBSCAN Clustering Results')
    plt.colorbar(label='Cluster Label')
    plt.show()
    metrics = evaluate_clustering(X, np.array(hdbscan_labels))
    print("Clustering Evaluation Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
except ImportError as e:
    # Only a failed import points at a missing package, so the install hint
    # is limited to this case.
    print(f"HDBSCAN test failed: {e}")
    print("You may need to install hdbscan: pip install hdbscan")
except Exception as e:
    # Any other failure (clustering, plotting, metric computation) is reported
    # with its exception type so it is not mistaken for a missing dependency.
    print(f"HDBSCAN test failed ({type(e).__name__}): {e}")

## BERTopic for semantic clustering

Testing BERTopicCluster Functionality

In [13]:


import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic  # Ensure bertopic is installed

# Set styling for plots: re-apply seaborn's "whitegrid" theme, then raise the
# default figure size to (14, 10) for the cells that follow.
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (14, 10)

# Sample text documents for topic modeling. Each entry below is a
# (category, content) pair; ids are assigned sequentially starting at 1, and
# every record is a dict with the keys "id", "content" and "category".
news_articles = [
    {"id": doc_id, "content": body, "category": topic}
    for doc_id, (topic, body) in enumerate(
        (
            (
                "Technology",
                "The latest advancements in artificial intelligence have led to breakthroughs in natural language processing. Researchers have developed models that can understand context and generate human-like text with unprecedented accuracy.",
            ),
            (
                "Healthcare",
                "Machine learning models are now being deployed in healthcare to predict patient outcomes and improve diagnostic accuracy. Hospitals are increasingly adopting AI systems to assist medical professionals.",
            ),
            (
                "Environment",
                "Climate scientists warn that global temperatures continue to rise at an alarming rate. New data suggests that immediate action is necessary to prevent irreversible damage to ecosystems worldwide.",
            ),
            (
                "Environment",
                "Renewable energy investments reached a record high last quarter. Solar and wind projects are becoming more cost-effective than traditional fossil fuel plants in many regions.",
            ),
            (
                "Finance",
                "The stock market showed volatility following the central bank's announcement on interest rates. Investors are cautiously monitoring economic indicators for signs of inflation.",
            ),
            (
                "Technology",
                "Tech companies are facing increased scrutiny over data privacy practices. Regulators are proposing new frameworks to protect consumer information and ensure transparency.",
            ),
            (
                "Healthcare",
                "A new study shows promising results for an experimental cancer treatment using immunotherapy. Clinical trials demonstrated significant tumor reduction in patients with advanced stages of the disease.",
            ),
            (
                "Finance",
                "Financial analysts predict continued growth in the cryptocurrency market despite recent regulatory challenges. Institutional adoption is driving mainstream acceptance of digital assets.",
            ),
            (
                "Technology",
                "Researchers have developed more efficient batteries that could extend electric vehicle range by up to 40%. This breakthrough addresses one of the main barriers to widespread EV adoption.",
            ),
            (
                "Environment",
                "Conservation efforts have led to the recovery of several endangered species in protected habitats. Biodiversity initiatives are showing positive results in ecosystem restoration.",
            ),
        ),
        start=1,
    )
]



In [None]:

from sentence_transformers import SentenceTransformer

# Encode every article body into an embedding with the 'all-MiniLM-L6-v2'
# sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
texts = [item["content"] for item in news_articles]
embeddings = model.encode(texts)

# Pair each article with its embedding in the record layout passed to
# clustering_tool; the vector is stored as a plain list under "embedding".
data_for_clustering = [
    dict(
        id=doc["id"],
        embedding=vector.tolist(),
        content=doc["content"],
        category=doc["category"],
    )
    for doc, vector in zip(news_articles, embeddings)
]

print(f"Generated embeddings with shape: {embeddings.shape}")
        

In [None]:
import bertopic

# Run BERTopic through clustering_tool, asking for evaluation metrics to be
# returned alongside the topic labels.
result = clustering_tool(
    inputs=data_for_clustering,
    algorithm="bertopic",
    feature_key="embedding",
    return_metrics=True,
    nr_topics="auto",  # Let BERTopic determine optimal number of topics
    min_topic_size=2,
    n_neighbors=3,  # UMAP parameter
    n_components=5,  # UMAP dimensionality
    verbose=True,
)

# Accept either return shape: a dict carrying "labels" and "metrics", or a
# bare label sequence (in which case no metrics are available).
if not isinstance(result, dict):
    labels, metrics = result, {}
else:
    labels = result.get("labels", [])
    metrics = result.get("metrics", {})

# One row per article: display name, 50-character preview of the text, the
# assigned topic, and the article's own category for comparison.
results_df = pd.DataFrame({
    "Document": [f"Doc {position}" for position in range(1, len(texts) + 1)],
    "Text": [f"{body[:50]}..." for body in texts],
    "Topic": labels,
    "Category": [entry["category"] for entry in news_articles],
})

print("\nClustering Results:")
display(results_df)