# FGSD Clustering Experiment on IMDB-MULTI
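This notebook runs unsupervised clustering (K-Means and Spectral Clustering) on FGSD embeddings of the IMDB-MULTI dataset, scores each clustering with the Adjusted Rand Index and the silhouette score, and visualizes the results with t-SNE (and UMAP when `umap-learn` is installed).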

# Imports

In [1]:
import sys
import os

# Put the directory that contains the notebook's working directory on the
# import path, so modules living one level up can be imported.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]

if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

print(f"Added to path: {parent_dir}")

Added to path: /home/stavros/emb3/fgsd_method/src


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import warnings
import urllib.request
import zipfile
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize 

try:
    import umap
    HAS_UMAP = True
except ImportError:
    print("Warning: 'umap-learn' not found. Visualization will only use t-SNE.")
    HAS_UMAP = False

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, silhouette_score



# Setup Paths

In [3]:
# Ensure we can import from src: the parent of the working directory must be
# on sys.path before the project modules below are imported.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

# Project-local embedding models (import order kept as-is).
from optimized_method import HybridFGSD
from fgsd import FlexibleFGSD

# Silence every warning for the remainder of the session.
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Data Loading

In [4]:
def download_and_load_imdb(data_dir='/tmp/IMDB-MULTI'):
    """Download (if necessary) and parse the IMDB-MULTI graph dataset.

    The archive is fetched and extracted only when one of the three text
    files needed below is missing. Checking the files (rather than just the
    extracted directory) means an interrupted or partial extraction triggers
    a fresh download instead of a later parse failure.

    Parameters
    ----------
    data_dir : str, optional
        Directory used to store the zip archive and its extracted contents.
        Created if it does not exist. Defaults to '/tmp/IMDB-MULTI'.

    Returns
    -------
    graphs : list of networkx.Graph
        One graph per entry in the labels file, with nodes relabelled to
        consecutive integers per graph.
    labels : numpy.ndarray
        Graph labels from the labels file, shifted down by one
        (presumably the file is 1-based -- verify against the dataset).
    """
    os.makedirs(data_dir, exist_ok=True)

    base_url = 'https://www.chrsmrrs.com/graphkerneldatasets/IMDB-MULTI.zip'
    zip_path = os.path.join(data_dir, 'IMDB-MULTI.zip')
    dataset_path = os.path.join(data_dir, 'IMDB-MULTI')

    indicator_file = os.path.join(dataset_path, 'IMDB-MULTI_graph_indicator.txt')
    edges_file = os.path.join(dataset_path, 'IMDB-MULTI_A.txt')
    labels_file = os.path.join(dataset_path, 'IMDB-MULTI_graph_labels.txt')
    required_files = (indicator_file, edges_file, labels_file)

    if not all(os.path.isfile(path) for path in required_files):
        print("Downloading IMDB-MULTI dataset...")
        urllib.request.urlretrieve(base_url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print("Download complete.")

    # ndmin keeps the arrays iterable even if a file holds a single row.
    graph_indicator = np.loadtxt(indicator_file, dtype=int, ndmin=1)
    edges = np.loadtxt(edges_file, dtype=int, delimiter=',', ndmin=2)
    graph_labels = np.loadtxt(labels_file, dtype=int, ndmin=1)

    # IMDB-MULTI does not have node labels

    num_graphs = len(graph_labels)
    graphs = [nx.Graph() for _ in range(num_graphs)]

    # Line i (1-based) of the indicator file gives the graph id of node i.
    for node_id, graph_id in enumerate(graph_indicator, start=1):
        graphs[graph_id - 1].add_node(node_id)

    # Each edge is stored in the graph that owns its first endpoint.
    for node1, node2 in edges:
        graph_id = graph_indicator[node1 - 1]
        graphs[graph_id - 1].add_edge(node1, node2)

    # Replace the dataset-wide node ids with per-graph consecutive integers.
    graphs = [nx.convert_node_labels_to_integers(g) for g in graphs]
    labels = graph_labels - 1

    return graphs, labels

# Feature Generation

In [5]:
def generate_embeddings(graphs, config):
    """
    Build standardized spectral features for IMDB-MULTI.

    ``config['func']`` selects the model: ``'hybrid'`` uses ``HybridFGSD``
    (reading ``harm_bins``, ``harm_range``, ``pol_bins``, ``pol_range``);
    any other value uses ``FlexibleFGSD`` (reading ``bins`` and ``range``)
    with that value as ``func_type``. The model is fitted on ``graphs`` and
    its embedding is passed through a freshly fitted ``StandardScaler``.

    Returns the scaled embedding.
    """
    func_type = config['func']

    if func_type != 'hybrid':
        embedder = FlexibleFGSD(
            hist_bins=config['bins'],
            hist_range=config['range'],
            func_type=func_type,
            seed=42,
        )
    else:
        embedder = HybridFGSD(
            harm_bins=config['harm_bins'],
            harm_range=config['harm_range'],
            pol_bins=config['pol_bins'],
            pol_range=config['pol_range'],
            func_type='hybrid',
            seed=42,
        )

    embedder.fit(graphs)
    raw_embedding = embedder.get_embedding()

    # Normalize spectral features
    return StandardScaler().fit_transform(raw_embedding)

# Clustering Analysis

In [13]:
def perform_clustering_analysis(X, y_true, n_neighbors=50):
    """Cluster the embeddings with K-Means and Spectral Clustering.

    ``X`` is L2-normalized row-wise and reduced with PCA (keeping 95% of the
    variance); both algorithms then run on that reduced matrix. The number
    of clusters equals the number of distinct values in ``y_true``.

    Parameters
    ----------
    X : feature matrix accepted by ``sklearn.preprocessing.normalize``.
    y_true : ground-truth labels, used only to count the classes.
    n_neighbors : neighbors used to build the spectral affinity graph.

    Returns
    -------
    (X_pca, y_kmeans, y_spectral) : the PCA-reduced features followed by the
    K-Means and Spectral Clustering assignments.
    """
    n_classes = len(np.unique(y_true))

    unit_rows = normalize(X, norm='l2')
    reducer = PCA(n_components=0.95, random_state=42)
    X_pca = reducer.fit_transform(unit_rows)

    # K-Means: n_init restarts with different centroid seeds.
    kmeans_model = KMeans(n_clusters=n_classes, random_state=42, n_init=50)
    y_kmeans = kmeans_model.fit_predict(X_pca)

    # Spectral Clustering: k-nearest-neighbors affinity, labels assigned by
    # discretization rather than k-means.
    spectral_options = dict(
        n_clusters=n_classes,
        affinity='nearest_neighbors',
        n_neighbors=n_neighbors,
        assign_labels='discretize',
        random_state=42,
        n_jobs=-1,
    )
    y_spectral = SpectralClustering(**spectral_options).fit_predict(X_pca)

    return X_pca, y_kmeans, y_spectral

# Visualization

In [7]:
def visualize_clusters(X_scaled, y_true, y_kmeans, y_spectral, config_name):
    """Plot 2-D projections of the features colored by each labelling.

    Three panels per projection show the ground truth, the K-Means
    assignment and the Spectral Clustering assignment. A UMAP row is drawn
    above the t-SNE row when ``HAS_UMAP`` is true; otherwise only the t-SNE
    row is drawn. The figure is saved to
    ``clustering_imdb_<config_name>.png`` in the working directory, shown,
    and then closed so repeated calls do not accumulate open figures.

    Parameters
    ----------
    X_scaled : feature matrix to project.
    y_true, y_kmeans, y_spectral : label vectors used to color the points.
    config_name : str inserted into the output file name.
    """
    labels_list = [y_true, y_kmeans, y_spectral]

    def _scatter_row(row_axes, embedding, method):
        # One panel per labelling, all sharing the same 2-D projection.
        titles = [f'GT ({method})', f'KMeans ({method})', f'Spectral ({method})']
        for ax, labels, title in zip(row_axes, labels_list, titles):
            ax.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='tab10', s=15)
            ax.set_title(title)
            ax.axis('off')

    if HAS_UMAP:
        fig, axes = plt.subplots(2, 3, figsize=(18, 10))
        tsne_row = axes[1]
        reducer_umap = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
        embedding_umap = reducer_umap.fit_transform(X_scaled)
        _scatter_row(axes[0], embedding_umap, 'UMAP')
    else:
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        tsne_row = axes

    reducer_tsne = TSNE(n_components=2, perplexity=30, random_state=42)
    embedding_tsne = reducer_tsne.fit_transform(X_scaled)
    _scatter_row(tsne_row, embedding_tsne, 't-SNE')

    fig.tight_layout()
    fig.savefig(f"clustering_imdb_{config_name}.png")

    # Render now (so the plot appears next to the caller's log lines) and
    # release the figure; this function is called once per configuration.
    plt.show()
    plt.close(fig)

# Main Execution

In [15]:
# Embedding configurations to evaluate; 'name' is used in logs and file names.
configurations = [
    {'name':'hybrid_100_3.5_200_3.5','func':'hybrid','harm_bins':100,'harm_range':3.5,'pol_bins':200,'pol_range':3.5},
    {'name':'polynomial_200_3.1','func':'polynomial','bins':200,'range':3.1},
    {'name':'harmonic_100_3.5','func':'harmonic','bins':100,'range':3.5},
]

# Hyperparameters to tune
neighbor_values = [10, 20]

print("Loading IMDB-MULTI Data...")
graphs, labels = download_and_load_imdb()

for config in configurations:
    print(f"\n{'='*80}")
    print(f"Processing Configuration: {config['name']}")
    print(f"{'='*80}")

    # Embeddings depend only on the configuration, so compute them once.
    X = generate_embeddings(graphs, config)

    for run_idx, n_neighbors in enumerate(neighbor_values):
        print(f"\n--- Tuning: n_neighbors={n_neighbors} ---")
        X_scaled, y_kmeans, y_spectral = perform_clustering_analysis(X, labels, n_neighbors=n_neighbors)

        # ARI compares against the ground truth; silhouette is label-free.
        km_ari = adjusted_rand_score(labels, y_kmeans)
        km_sil = silhouette_score(X_scaled, y_kmeans)

        sp_ari = adjusted_rand_score(labels, y_spectral)
        sp_sil = silhouette_score(X_scaled, y_spectral)
        print(f"K-Means  -> ARI: {km_ari:.4f} | Silhouette: {km_sil:.4f}")
        print(f"Spectral -> ARI: {sp_ari:.4f} | Silhouette: {sp_sil:.4f}")

        # Visualize only for the first neighbor setting to avoid clutter.
        # Keyed on the loop position, not a literal value, so editing
        # neighbor_values cannot silently disable the plots.
        if run_idx == 0:
            print(f"Generating Visualization (n_neighbors={n_neighbors})...")
            visualize_clusters(X_scaled, labels, y_kmeans, y_spectral, config['name'])
            print(f"Plots displayed above and saved to 'clustering_imdb_{config['name']}.png'")

Loading IMDB-MULTI Data...

Processing Configuration: hybrid_100_3.5_200_3.5

Processing Configuration: hybrid_100_3.5_200_3.5

--- Tuning: n_neighbors=50 ---

--- Tuning: n_neighbors=50 ---
K-Means  -> ARI: 0.0096 | Silhouette: 0.3309
Spectral -> ARI: 0.0065 | Silhouette: 0.2963

Processing Configuration: polynomial_200_3.1
K-Means  -> ARI: 0.0096 | Silhouette: 0.3309
Spectral -> ARI: 0.0065 | Silhouette: 0.2963

Processing Configuration: polynomial_200_3.1

--- Tuning: n_neighbors=50 ---

--- Tuning: n_neighbors=50 ---
K-Means  -> ARI: 0.0059 | Silhouette: 0.3373
Spectral -> ARI: 0.0059 | Silhouette: 0.3367

Processing Configuration: harmonic_100_3.5
K-Means  -> ARI: 0.0059 | Silhouette: 0.3373
Spectral -> ARI: 0.0059 | Silhouette: 0.3367

Processing Configuration: harmonic_100_3.5

--- Tuning: n_neighbors=50 ---

--- Tuning: n_neighbors=50 ---
K-Means  -> ARI: 0.0101 | Silhouette: 0.3458
Spectral -> ARI: 0.0062 | Silhouette: 0.3296
K-Means  -> ARI: 0.0101 | Silhouette: 0.3458
Spectr