# FGSD Experiment on IMDB-MULTI
This notebook runs the Flexible FGSD and Hybrid FGSD experiments on the IMDB-MULTI dataset.

## Imports

In [3]:
import sys
import os

# Make the project's source directory importable: the notebook is assumed to
# be launched from a sub-folder, so the modules live one level above the
# current working directory.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]

already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.append(parent_dir)

print(f"✅ Added to path: {parent_dir}")

✅ Added to path: /home/stavros/emb3/fgsd_method/src


In [4]:

import numpy as np
import time
import tracemalloc
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
import networkx as nx
from karateclub.estimator import Estimator
import warnings
import os
import urllib.request
import zipfile
import pandas as pd
import sys
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

sys.path.append(".")
from fgsd import FlexibleFGSD
from optimized_method import HybridFGSD


  from .autonotebook import tqdm as notebook_tqdm


## Download and Load IMDB-MULTI Dataset

In [5]:
def download_and_load_imdb(data_dir='/tmp/IMDB-MULTI'):
    """Download (when needed) and parse the IMDB-MULTI graph dataset.

    The archive is fetched into ``data_dir`` and extracted there. The download
    is skipped only when all three text files read below are already present,
    so a partially extracted directory triggers a fresh download instead of a
    failure while parsing.

    Parameters
    ----------
    data_dir : str, optional
        Directory used to store the zip archive and the extracted files.
        It is created if it does not exist.

    Returns
    -------
    graphs : list of networkx.Graph
        One graph per entry of the graph-labels file, with nodes relabelled
        to consecutive integers by ``nx.convert_node_labels_to_integers``.
    labels : numpy.ndarray
        The values of the graph-labels file shifted down by one.
    """
    os.makedirs(data_dir, exist_ok=True)

    base_url = 'https://www.chrsmrrs.com/graphkerneldatasets/IMDB-MULTI.zip'
    zip_path = os.path.join(data_dir, 'IMDB-MULTI.zip')
    dataset_path = os.path.join(data_dir, 'IMDB-MULTI')

    indicator_file = os.path.join(dataset_path, 'IMDB-MULTI_graph_indicator.txt')
    edges_file = os.path.join(dataset_path, 'IMDB-MULTI_A.txt')
    labels_file = os.path.join(dataset_path, 'IMDB-MULTI_graph_labels.txt')

    # Check the files themselves rather than just the directory: an interrupted
    # extraction can leave the directory behind without its contents.
    required_files = (indicator_file, edges_file, labels_file)
    if not all(os.path.isfile(path) for path in required_files):
        print("Downloading IMDB-MULTI dataset...")
        urllib.request.urlretrieve(base_url, zip_path)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print("Download complete.")

    graph_indicator = np.loadtxt(indicator_file, dtype=int)
    edges = np.loadtxt(edges_file, dtype=int, delimiter=',')
    graph_labels = np.loadtxt(labels_file, dtype=int)

    # IMDB-MULTI does not have node labels

    num_graphs = len(graph_labels)
    graphs = [nx.Graph() for _ in range(num_graphs)]

    # Node ids are the 1-based positions in the indicator file; each entry is
    # the 1-based id of the graph that owns the node.
    for node_id, graph_id in enumerate(graph_indicator, start=1):
        graphs[graph_id - 1].add_node(node_id)

    # An edge is stored in the graph that owns its first endpoint.
    for node1, node2 in edges:
        graph_id = graph_indicator[node1 - 1]
        graphs[graph_id - 1].add_edge(node1, node2)

    graphs = [nx.convert_node_labels_to_integers(g) for g in graphs]
    labels = graph_labels - 1

    return graphs, labels

## Evaluation Function

In [6]:

def evaluate_classifier(X_train, X_test, y_train, y_test, classifier_name, clf):
    """Fit ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test
        Feature matrices passed unchanged to ``clf.fit`` / ``clf.predict``.
    y_train, y_test
        Target labels for the two splits.
    classifier_name : str
        Label stored in the result and used in the AUC warning message.
    clf
        Estimator exposing ``fit`` and ``predict``. ``predict_proba`` is used
        for the AUC when available, otherwise ``decision_function``.

    Returns
    -------
    dict
        Keys ``classifier``, ``train_accuracy``, ``accuracy``, ``f1_score``
        (weighted), ``auc``, ``train_time`` and ``inference_time`` (seconds).
        ``auc`` is ``None`` when the classifier exposes no score function,
        when the binarized test labels have a single column, or when the AUC
        computation raises; in the last case a ``RuntimeWarning`` is emitted.
    """
    # perf_counter is monotonic, so durations are immune to clock adjustments.
    fit_started = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - fit_started

    train_accuracy = accuracy_score(y_train, clf.predict(X_train))

    predict_started = time.perf_counter()
    y_pred = clf.predict(X_test)
    inference_time = time.perf_counter() - predict_started

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    auc = None
    try:
        y_test_bin = label_binarize(y_test, classes=np.unique(y_train))
        if hasattr(clf, 'predict_proba'):
            y_score = clf.predict_proba(X_test)
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(X_test)
            if y_score.ndim == 1:
                y_score = y_score.reshape(-1, 1)
        else:
            y_score = None

        if y_score is not None and y_test_bin.shape[1] > 1:
            auc = roc_auc_score(y_test_bin, y_score, average='weighted', multi_class='ovr')
    except Exception as exc:
        # AUC is best-effort: keep the other metrics, but say why it is missing.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        warnings.warn(
            f"AUC could not be computed for {classifier_name}: {exc}",
            RuntimeWarning,
        )
        auc = None

    return {
        'classifier': classifier_name,
        'train_accuracy': train_accuracy,
        'accuracy': accuracy,
        'f1_score': f1,
        'auc': auc,
        'train_time': train_time,
        'inference_time': inference_time
    }


## Run FGSD Experiments

In [10]:
def _describe_config(config):
    """Return the human-readable part of the banner line for one config."""
    func = config['func']
    if func == 'hybrid':
        return f"Function='{func}'"
    return f"Function='{func}', Bins={config['bins']}, Range={config['range']}"


def _build_fgsd_model(config, seed):
    """Instantiate the embedding model selected by ``config['func']``."""
    func = config['func']
    if func == 'hybrid':
        return HybridFGSD(
            harm_bins=config.get('harm_bins', 200),
            harm_range=config.get('harm_range', 20),
            pol_bins=config.get('pol_bins', 70),
            pol_range=config.get('pol_range', 4.1),
            func_type='hybrid', seed=seed
        )
    return FlexibleFGSD(
        hist_bins=config['bins'], hist_range=config['range'],
        func_type=func, seed=seed
    )


def _build_classifiers(seed):
    """Create fresh, unfitted classifiers keyed by their display name."""
    return {
        'SVM (RBF) + Scaler': make_pipeline(
            StandardScaler(),
            SVC(kernel='rbf', C=500, gamma='scale', probability=True, random_state=seed)
        ),
        'Random Forest': RandomForestClassifier(n_estimators=3000, random_state=seed),
        'MLP': make_pipeline(
            StandardScaler(),
            MLPClassifier(
                hidden_layer_sizes=(1024, 512, 256, 128),
                activation='relu',
                solver='adam',
                alpha=0.001,
                learning_rate_init=0.001,
                learning_rate='adaptive',
                max_iter=2000,
                early_stopping=True,
                n_iter_no_change=20,
                random_state=seed
            )
        )
    }


def run_experiment(configs, test_size=0.15, random_state=42):
    """Embed IMDB-MULTI with each config and evaluate three classifiers on it.

    The dataset is loaded through ``download_and_load_imdb`` and split once
    with a stratified ``train_test_split``; every config reuses that split.

    Parameters
    ----------
    configs : list of dict
        Each dict needs a ``'func'`` key. ``'hybrid'`` optionally takes
        ``harm_bins``, ``harm_range``, ``pol_bins`` and ``pol_range``
        (defaults 200, 20, 70 and 4.1); any other value requires ``bins``
        and ``range``.
    test_size : float, optional
        Fraction of graphs held out for testing.
    random_state : int, optional
        Seed shared by the split, the embedding models and the classifiers.

    Returns
    -------
    list of dict
        One record per (config, classifier) pair: the metrics returned by
        ``evaluate_classifier``, the config entries, ``generation_time``
        (seconds spent building the train and test embeddings) and
        ``peak_memory_mb`` (peak traced allocation during that step, in MiB).
    """
    print("Loading IMDB-MULTI dataset...")
    graphs, labels = download_and_load_imdb()

    print("IMDB-MULTI has no node labels. Using only spectral features.")

    graphs_train, graphs_test, y_train, y_test = train_test_split(
        graphs, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    results = []
    banner = '=' * 80

    for i, config in enumerate(configs, start=1):
        print(f"\n{banner}")
        print(f"Experiment {i}/{len(configs)}: {_describe_config(config)}")
        print(banner)

        # tracemalloc is started before the timer, so generation_time
        # includes its tracing overhead.
        tracemalloc.start()
        try:
            start_time = time.perf_counter()

            model = _build_fgsd_model(config, random_state)
            model.fit(graphs_train)
            X_train = model.get_embedding()
            X_test = model.infer(graphs_test)

            generation_time = time.perf_counter() - start_time
            _, peak_bytes = tracemalloc.get_traced_memory()
        finally:
            # Always stop tracing, otherwise a failed embedding would leave
            # tracemalloc active for the rest of the session.
            tracemalloc.stop()

        print(f"Embedding Shape: {X_train.shape}")

        # Classifiers are rebuilt for every config so no fitted state leaks
        # from one embedding to the next.
        for clf_name, clf in _build_classifiers(random_state).items():
            res = evaluate_classifier(X_train, X_test, y_train, y_test, clf_name, clf)
            res.update(config)
            res['generation_time'] = generation_time
            res['peak_memory_mb'] = peak_bytes / (1024 * 1024)
            results.append(res)
            print(f"  -> {clf_name}: Train Acc={res['train_accuracy']:.4f}, Test Acc={res['accuracy']:.4f}, F1={res['f1_score']:.4f}")

    return results

## Summary Function

In [11]:

def print_summary(results):
    """Print a table of experiment results, best test accuracy first.

    Parameters
    ----------
    results : list of dict
        Records with the keys ``func``, ``classifier``, ``train_accuracy``,
        ``accuracy``, ``f1_score`` and ``generation_time``. The parameter
        column is built from ``harm_bins``/``harm_range``/``pol_bins``/
        ``pol_range`` for ``func == 'hybrid'`` and from ``bins``/``range``
        otherwise; missing parameter keys are shown as ``None``.

    The parameters column is at least 30 characters wide and grows to fit the
    longest parameter string, so every row stays aligned with the header.
    """
    ranked = sorted(results, key=lambda r: r['accuracy'], reverse=True)

    rows = []
    for r in ranked:
        if r['func'] == 'hybrid':
            params = f"h_bins={r.get('harm_bins')},h_range={r.get('harm_range')},p_bins={r.get('pol_bins')},p_range={r.get('pol_range')}"
        else:
            params = f"bins={r.get('bins')}, range={r.get('range')}"
        rows.append((r, params))

    # A fixed 30-character column is overflowed by the hybrid parameter string,
    # which pushes the remaining columns out of alignment.
    params_width = max([30] + [len(params) for _, params in rows])
    # 77 = widths of the other six columns (71) plus six separating spaces.
    rule_width = max(120, params_width + 77)

    print("\n" + "=" * rule_width)
    print("SUMMARY OF RESULTS")
    print("=" * rule_width)
    print(f"{'Func':<12} {'Parameters':<{params_width}} {'Classifier':<20} {'Train Acc':<11} {'Test Acc':<10} {'F1':<10} {'GenTime':<8}")
    print("-" * rule_width)

    for r, params in rows:
        print(f"{r['func']:<12} {params:<{params_width}} {r['classifier']:<20} "
              f"{r['train_accuracy']:<11.4f} {r['accuracy']:<10.4f} {r['f1_score']:<10.4f} {r['generation_time']:<8.2f}")


## Run the Full Experiment

In [12]:
# Embedding configurations to compare; every entry is evaluated with all
# classifiers by run_experiment.
configs = [
    dict(func='hybrid', harm_bins=100, harm_range=3.5, pol_bins=200, pol_range=3.5),
    dict(func='polynomial', bins=200, range=3.1),
    dict(func='harmonic', bins=100, range=3.5),
]

print("Starting Multi-Configuration FGSD Experiment on IMDB-MULTI...")
results = run_experiment(configs)
print_summary(results)

# Persist the raw per-classifier records next to the notebook.
df = pd.DataFrame(results)
df.to_csv("fgsd_imdb_results.csv", index=False)

Starting Multi-Configuration FGSD Experiment on IMDB-MULTI...
Loading IMDB-MULTI dataset...
IMDB-MULTI has no node labels. Using only spectral features.

Experiment 1/3: Function='hybrid'
Embedding Shape: (1275, 300)
  -> SVM (RBF) + Scaler: Train Acc=0.6486, Test Acc=0.4489, F1=0.4403
  -> Random Forest: Train Acc=0.6541, Test Acc=0.4756, F1=0.4607
  -> MLP: Train Acc=0.6212, Test Acc=0.4667, F1=0.4464

Experiment 2/3: Function='polynomial', Bins=200, Range=3.1
Embedding Shape: (1275, 200)
  -> SVM (RBF) + Scaler: Train Acc=0.6376, Test Acc=0.4844, F1=0.4724
  -> Random Forest: Train Acc=0.6376, Test Acc=0.4978, F1=0.4841
  -> MLP: Train Acc=0.5584, Test Acc=0.4222, F1=0.4237

Experiment 3/3: Function='harmonic', Bins=100, Range=3.5


  func_w = np.where(w > 1e-9, 1.0 / w, 0)


Embedding Shape: (1275, 100)
  -> SVM (RBF) + Scaler: Train Acc=0.6384, Test Acc=0.4756, F1=0.4624
  -> Random Forest: Train Acc=0.6525, Test Acc=0.4756, F1=0.4605
  -> MLP: Train Acc=0.6275, Test Acc=0.4800, F1=0.4575

SUMMARY OF RESULTS
Func         Parameters                     Classifier           Train Acc   Test Acc   F1         GenTime 
------------------------------------------------------------------------------------------------------------------------
polynomial   bins=200, range=3.1            Random Forest        0.6376      0.4978     0.4841     10.40   
polynomial   bins=200, range=3.1            SVM (RBF) + Scaler   0.6376      0.4844     0.4724     10.40   
harmonic     bins=100, range=3.5            MLP                  0.6275      0.4800     0.4575     9.51    
hybrid       h_bins=100,h_range=3.5,p_bins=200,p_range=3.5 Random Forest        0.6541      0.4756     0.4607     8.91    
harmonic     bins=100, range=3.5            SVM (RBF) + Scaler   0.6384      0.4756  