# Instance Selector for VAMOS

This notebook analyzes the TSP instances available in `data/tsplib` to help select a diverse set of benchmark instances. 
It adapts the methodology from [GRAFO-URJC/DRFLP](https://github.com/GRAFO-URJC/DRFLP/tree/master/instance-selector) (Graph Analysis).

## Methodology
1. **Load Instances**: Read `.tsp` files from `data/tsplib`.
2. **Graph Conversion**: Convert TSP coordinates into NetworkX graphs (Euclidean distance weights).
3. **Metric Extraction**: Calculate graph metrics (degree and edge-weight statistics; density is always 1.0 for complete graphs, so it is omitted).
4. **Feature Correlation**: Identify redundant metrics.
5. **Dimensionality Reduction**: Use PCA to visualize the instance space.
6. **Clustering**: Apply K-Means to group similar instances.

In [None]:
import os
import math
import itertools
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer

## 1. Load Data
We read the TSPLIB files and parse the `NODE_COORD_SECTION`.

In [None]:
def read_tsp_file(filepath):
    """Parse the node coordinates of a TSPLIB file.

    Only lines located after a line starting with ``NODE_COORD_SECTION`` are
    interpreted, and parsing stops at the first line starting with ``EOF``
    (or at the end of the file). Lines in that region with fewer than three
    whitespace-separated fields are ignored; extra fields are discarded.

    Args:
        filepath: Path of the ``.tsp`` file to read.

    Returns:
        A ``(name, coords)`` tuple where ``name`` is the base name of
        ``filepath`` and ``coords`` maps each integer node index to its
        ``(x, y)`` pair of floats.

    Raises:
        ValueError: If a coordinate line has a non-integer index or a
            non-numeric coordinate.
    """
    with open(filepath, "r") as handle:
        raw_lines = list(handle)

    coords = {}
    reading_coords = False

    for raw in raw_lines:
        text = raw.strip()

        if text.startswith("NODE_COORD_SECTION"):
            reading_coords = True
            continue
        if text.startswith("EOF"):
            break
        if not reading_coords:
            # Header lines (NAME, TYPE, DIMENSION, ...) carry no coordinates.
            continue

        fields = text.split()
        if len(fields) < 3:
            continue

        node_id = int(fields[0])
        coords[node_id] = (float(fields[1]), float(fields[2]))

    return os.path.basename(filepath), coords


def create_graph_from_tsp(name, coords):
    """Build a complete, distance-weighted graph from TSP coordinates.

    Args:
        name: Value stored as the graph's ``name`` attribute.
        coords: Mapping from node identifier to a coordinate sequence whose
            first two entries are used as x and y.

    Returns:
        A ``networkx.Graph`` with one node per entry of ``coords`` (its
        coordinates stored under the ``pos`` node attribute) and one edge per
        unordered node pair, whose ``weight`` is the Euclidean distance
        between the two endpoints.
    """

    def euclidean(a, b):
        # Plain 2-D Euclidean distance on the first two coordinates.
        return math.sqrt((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2)

    graph = nx.Graph(name=name)

    for node_id, position in coords.items():
        graph.add_node(node_id, pos=position)

    # Every unordered pair is connected, so the edge count grows quadratically
    # with the node count; acceptable for instances below ~500 nodes, larger
    # ones may need pruning.
    graph.add_weighted_edges_from(
        (u, v, euclidean(pos_u, pos_v))
        for (u, pos_u), (v, pos_v) in itertools.combinations(coords.items(), 2)
    )

    return graph


# Location of the TSPLIB instances, relative to the notebook's working directory.
DATA_DIR = "../data/tsplib"
# One complete distance graph per successfully loaded instance.
graphs = []

# isdir (rather than exists) so that a regular file at this path is reported
# with the friendly message instead of failing inside os.listdir.
if not os.path.isdir(DATA_DIR):
    print(f"Directory {DATA_DIR} not found. Please ensure you are running this from 'notebooks/' and data exists.")
else:
    # os.listdir returns entries in arbitrary order; sort so the instance order
    # (and every row-order-dependent result derived from it) is reproducible.
    files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith(".tsp"))
    print(f"Found {len(files)} TSP instances.")

    for f in files:
        path = os.path.join(DATA_DIR, f)
        name, coords = read_tsp_file(path)

        if len(coords) < 2:
            # Files without a NODE_COORD_SECTION (e.g. explicit weight matrices)
            # yield no coordinates; a graph without edges has no edge-weight
            # statistics, so such instances are left out of the analysis.
            print(f"Skipped {name}: fewer than 2 node coordinates parsed.")
            continue

        G = create_graph_from_tsp(name, coords)
        graphs.append(G)
        print(f"Loaded {name}: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges.")

## 2. Extract Metrics
We calculate various graph metrics for each instance.

In [None]:
# One dict of summary statistics per analysed graph.
metrics_data = []

for G in graphs:
    # Degree stats
    degrees = [val for (node, val) in G.degree()]

    # Edge weights
    weights = [d["weight"] for u, v, d in G.edges(data=True)]

    if not weights:
        # np.min / np.max raise on empty input and mean / std would be NaN,
        # which the later scaling and PCA steps cannot use.
        print(f"Skipping {G.name}: no edges, edge-weight statistics are undefined.")
        continue

    # Computed once and reused for the coefficient of variation below.
    mean_weight = np.mean(weights)
    std_weight = np.std(weights)

    m = {
        "name": G.name,
        "num_vertices": G.number_of_nodes(),
        # For complete graphs, edges = n*(n-1)/2, so density is always 1.0 and is
        # not recorded. An MST or KNN graph might expose more interesting
        # structure; for now we keep basic stats of the full distance matrix.
        "vertex_avg_degree": np.mean(degrees),
        "edge_mean_weight": mean_weight,
        "edge_std_weight": std_weight,
        "edge_min_weight": np.min(weights),
        "edge_max_weight": np.max(weights),
        # Coefficient of variation for weights (dispersion)
        "edge_weight_cv": std_weight / mean_weight if mean_weight > 0 else 0,
    }
    metrics_data.append(m)

# Passing the columns explicitly keeps the frame well-formed when no instance
# was loaded: without it an empty frame has no "name" column and set_index
# raises KeyError.
metric_columns = [
    "name",
    "num_vertices",
    "vertex_avg_degree",
    "edge_mean_weight",
    "edge_std_weight",
    "edge_min_weight",
    "edge_max_weight",
    "edge_weight_cv",
]
df = pd.DataFrame(metrics_data, columns=metric_columns)
df.set_index("name", inplace=True)
print(df.head())

---

## 3. Feature Correlation Analysis
Identify redundant features by computing the Pearson correlation matrix.

In [None]:
if len(df) <= 1:
    print("Not enough instances for correlation analysis.")
else:
    # Pairwise correlation between all metric columns.
    corr_matrix = df.corr()

    # Annotated heatmap of the correlation matrix, centred on zero.
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap="coolwarm",
        center=0,
        fmt=".2f",
        square=True,
        linewidths=0.5,
    )
    plt.title("Feature Correlation Heatmap")
    plt.tight_layout()
    plt.show()

    # Walk the upper triangle once and keep the pairs with |r| > 0.9.
    high_corr = []
    for first, second in itertools.combinations(corr_matrix.columns, 2):
        coefficient = corr_matrix.loc[first, second]
        if abs(coefficient) > 0.9:
            high_corr.append((first, second, coefficient))

    if not high_corr:
        print("\nNo highly correlated pairs found.")
    else:
        print("\nHighly Correlated Feature Pairs (|r| > 0.9):")
        for col1, col2, r in high_corr:
            print(f"  {col1} <-> {col2}: r={r:.3f}")

## 4. PCA & Clustering
Standardize the metrics, reduce them with PCA, then use the elbow method to choose the number of K-Means clusters and cluster the instances.

In [None]:
if len(df) > 1:
    # The clustering cell stores its labels in df["cluster"]. Drop that column
    # (if present) so that re-running this cell does not feed earlier cluster
    # labels back in as an input feature.
    features = df.drop(columns=["cluster"], errors="ignore")

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features)

    # PCA
    pca = PCA(n_components=0.95)  # Keep 95% variance
    X_pca = pca.fit_transform(X_scaled)

    print(f"Reduced to {X_pca.shape[1]} components explaining {np.sum(pca.explained_variance_ratio_):.2%} variance.")

    # Elbow Method for K
    # NOTE(review): yellowbrick appears to treat a 2-tuple k as a range with an
    # exclusive upper bound, so this scans k = 2 .. min(10, n - 1) - 1 and the
    # range is degenerate for fewer than 4 instances -- confirm the intended bounds.
    model = KMeans(random_state=42, n_init=10)
    visualizer = KElbowVisualizer(model, k=(2, min(10, len(df) - 1)))
    visualizer.fit(X_pca)
    visualizer.show()
else:
    print("Not enough instances for clustering (need at least 2).")

In [None]:
# Cluster the PCA-reduced instances with the K chosen by the elbow visualizer
# and plot them on the first two principal components.
if len(df) > 1 and visualizer.elbow_value_:
    k = visualizer.elbow_value_
    print(f"Selected K={k}")

    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_pca)
    df["cluster"] = clusters

    # The PCA step keeps only as many components as the 95% variance target
    # requires, which can be a single one; indexing column 1 would then raise
    # IndexError, so fall back to a flat second axis.
    pc1 = X_pca[:, 0]
    pc2 = X_pca[:, 1] if X_pca.shape[1] > 1 else np.zeros(X_pca.shape[0])

    # Plot PCA colored by cluster
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=pc1, y=pc2, hue=clusters, palette="viridis", s=100)

    for i, name in enumerate(df.index):
        plt.annotate(name, (pc1[i], pc2[i]), fontsize=8, alpha=0.7)

    plt.title("TSP Instances PCA + Clustering")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()

    print("\nCluster Assignments:")
    print(df["cluster"].sort_values())
else:
    # Reached with too few instances or when no elbow value was detected;
    # say so instead of finishing silently.
    print("Clustering skipped: no elbow value available.")

## 5. Hardness Proxy Visualization
Color the PCA plot by a "hardness proxy" (e.g., `num_vertices` or `edge_weight_cv`).

In [None]:
# Scatter the instances on the first two principal components, coloured by a
# simple hardness proxy.
if len(df) > 1:
    # Use num_vertices as a proxy for hardness (larger = harder)
    hardness_proxy = df["num_vertices"].values

    # The PCA step keeps only as many components as the 95% variance target
    # requires, which can be a single one; indexing column 1 would then raise
    # IndexError, so fall back to a flat second axis.
    pc1 = X_pca[:, 0]
    pc2 = X_pca[:, 1] if X_pca.shape[1] > 1 else np.zeros(X_pca.shape[0])

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(pc1, pc2, c=hardness_proxy, cmap="plasma", s=100, edgecolors="black", linewidths=0.5)
    plt.colorbar(scatter, label="Hardness Proxy (num_vertices)")

    for i, name in enumerate(df.index):
        plt.annotate(name, (pc1[i], pc2[i]), fontsize=8, alpha=0.7)

    plt.title("TSP Instances PCA (Colored by Hardness Proxy)")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.grid(True, alpha=0.3)
    plt.show()
else:
    print("Not enough instances.")