# Edge Coloring Dataset Exploration

This notebook explores the graph datasets used for edge coloring experiments, visualizes different types of graphs, and analyzes extracted features.

In [None]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from pathlib import Path

# Make the project root importable so that `config` and the `src` package resolve.
# NOTE(review): '..' is resolved against the kernel's working directory, so this
# presumably assumes the notebook is run from a folder directly below the
# project root — confirm.
sys.path.append('..')
import config
from src.utils import setup_logging, set_seed
from src.graph_generation.random_graphs import generate_random_graph
from src.graph_generation.scale_free_graphs import generate_scale_free_graph
from src.graph_generation.small_world_graphs import generate_small_world_graph
from src.graph_generation.geometric_graphs import generate_geometric_graph
from src.coloring.greedy import greedy_edge_coloring
from src.coloring.vizing import vizing_edge_coloring
from src.features.graph_features import extract_graph_features
from src.features.edge_features import extract_edge_features
from src.visualization.graph_viz import visualize_graph, visualize_coloring, visualize_coloring_comparison

# Set random seed for reproducibility.
# NOTE(review): set_seed is a project helper; which random number generators it
# seeds is not visible from this notebook — confirm before relying on it.
set_seed(42)

## 1. Generate Sample Graphs for Each Type

Let's create sample graphs of each type supported by the framework:

In [None]:
# Build one 20-node example of every graph family, all with the same seed.
random_graph = generate_random_graph(n=20, p=0.3, seed=42)
scale_free_graph = generate_scale_free_graph(n=20, m=2, seed=42)
small_world_graph = generate_small_world_graph(n=20, k=4, p=0.1, seed=42)
geometric_graph = generate_geometric_graph(n=20, radius=0.3, seed=42)

# Each example paired with its panel title, in left-to-right drawing order.
sample_panels = [
    (random_graph, "Random Graph (Erdős-Rényi)"),
    (scale_free_graph, "Scale-Free Graph (Barabási-Albert)"),
    (small_world_graph, "Small-World Graph (Watts-Strogatz)"),
    (geometric_graph, "Geometric Random Graph"),
]

# One row of four panels.
plt.figure(figsize=(20, 5))
for panel_idx, (sample, panel_title) in enumerate(sample_panels, start=1):
    # Select the panel first: visualize_graph is not handed an axes object here.
    plt.subplot(1, 4, panel_idx)
    visualize_graph(sample, title=panel_title)

plt.tight_layout()
plt.show()

## 2. Graph Properties Analysis

Let's analyze and compare the structural properties of different graph types:

In [None]:
# Ten instances per graph family (seeds 0-9), every graph with 20 nodes.
instance_seeds = range(10)
graph_types = {
    'random': [generate_random_graph(n=20, p=0.3, seed=s) for s in instance_seeds],
    'scale_free': [generate_scale_free_graph(n=20, m=2, seed=s) for s in instance_seeds],
    'small_world': [generate_small_world_graph(n=20, k=4, p=0.1, seed=s) for s in instance_seeds],
    'geometric': [generate_geometric_graph(n=20, radius=0.3, seed=s) for s in instance_seeds]
}

# Graph-level feature records, grouped by family.
graph_features = {
    family: [extract_graph_features(member) for member in members]
    for family, members in graph_types.items()
}

# One DataFrame per family, labelled with the family name.
feature_dfs = {
    family: pd.DataFrame(records).assign(graph_type=family)
    for family, records in graph_features.items()
}

# Stack the per-family frames into a single table with a fresh 0..N-1 index.
all_features = pd.concat(list(feature_dfs.values()), ignore_index=True)

# Feature column and title for each panel of the 2x2 grid, in reading order.
property_panels = [
    ('density', 'Graph Density by Type'),
    ('clustering_coefficient', 'Clustering Coefficient by Graph Type'),
    ('avg_path_length', 'Average Path Length by Graph Type'),
    ('degree_assortativity', 'Degree Assortativity by Graph Type'),
]

plt.figure(figsize=(20, 15))
for panel_idx, (feature_col, panel_title) in enumerate(property_panels, start=1):
    plt.subplot(2, 2, panel_idx)
    sns.boxplot(x='graph_type', y=feature_col, data=all_features)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

## 3. Degree Distribution Analysis

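Let's examine the degree distributions of the different graph types. They are particularly important for edge coloring because the maximum degree Δ bounds the number of colors needed: by Vizing's theorem, a simple graph can always be properly edge-colored with either Δ or Δ + 1 colors.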

In [None]:
# One histogram panel per graph family, pooling degrees over all of its instances.
plt.figure(figsize=(20, 15))

for panel_idx, (family, members) in enumerate(graph_types.items(), start=1):
    plt.subplot(2, 2, panel_idx)

    # Flatten the degree of every node in every graph of this family.
    all_degrees = [deg for member in members for _, deg in member.degree()]

    # Unit-width bins with edges at 0, 1, ..., max degree + 1.
    unit_bins = range(max(all_degrees) + 2)
    sns.histplot(all_degrees, kde=True, bins=unit_bins)

    family_label = family.replace("_", " ").title()
    plt.title(f'Degree Distribution: {family_label}')
    plt.xlabel('Degree')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Maximum degree per graph: the quantity that matters most for edge coloring.
max_degrees = {
    family: [max(deg for _, deg in member.degree()) for member in members]
    for family, members in graph_types.items()
}

# One box per family; each column of the frame holds that family's maxima.
max_degree_table = pd.DataFrame(max_degrees)
plt.figure(figsize=(10, 6))
sns.boxplot(data=max_degree_table)
plt.title('Maximum Degree by Graph Type')
plt.ylabel('Maximum Degree')
plt.show()

## 4. Edge Coloring Visualization

Let's visualize edge colorings using different algorithms:

In [None]:
# Single scale-free example that all three algorithms will color.
G = generate_scale_free_graph(n=20, m=2, seed=42)

# Two greedy variants (different edge orderings) plus the Vizing-based coloring.
random_coloring = greedy_edge_coloring(G, 'random')
degree_coloring = greedy_edge_coloring(G, 'degree')
vizing_coloring = vizing_edge_coloring(G)

# Keep the colorings in one list so the plot and the printout share the order.
colorings = [random_coloring, degree_coloring, vizing_coloring]
panel_titles = ["Random Ordering", "Degree Ordering", "Vizing's Algorithm"]

# Side-by-side drawing of the three colorings.
visualize_coloring_comparison(G, colorings, panel_titles, figsize=(18, 6))

# Report how many distinct colors each method used, next to the maximum degree.
max_degree = max(dict(G.degree()).values())
print(f"Maximum degree: {max_degree}")

report_labels = ("Random ordering", "Degree ordering", "Vizing's algorithm")
for report_label, edge_colors in zip(report_labels, colorings):
    # The coloring's values are the assigned colors; count the distinct ones.
    distinct_colors = len(set(edge_colors.values()))
    print(f"{report_label}: {distinct_colors} colors")

## 5. Feature Correlation Analysis

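Analyze how different graph features correlate with coloring quality. Quality is measured here by the *color ratio*: the number of colors used by the degree-ordered greedy algorithm divided by the graph's maximum degree (values closer to 1 are better).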

In [None]:
# Draw the per-graph hyperparameters from a generator owned by this cell rather
# than from the global np.random stream. This way, re-running the cell (or
# running cells out of order) always rebuilds exactly the same 120 graphs.
# NOTE(review): the graph generators receive seed=i; if they reseed the global
# NumPy stream internally, the original global draws were also coupled to those
# seeds — confirm against the generator implementations.
param_rng = np.random.default_rng(42)

# 30 graphs per family, all with 20 nodes; only the family parameter varies.
graphs = [generate_random_graph(n=20, p=float(param_rng.uniform(0.1, 0.8)), seed=i) for i in range(30)]
# integers(1, 5) draws from {1, 2, 3, 4}, the same range as np.random.randint(1, 5).
graphs += [generate_scale_free_graph(n=20, m=int(param_rng.integers(1, 5)), seed=i) for i in range(30)]
graphs += [generate_small_world_graph(n=20, k=4, p=float(param_rng.uniform(0.05, 0.5)), seed=i) for i in range(30)]
graphs += [generate_geometric_graph(n=20, radius=float(param_rng.uniform(0.2, 0.4)), seed=i) for i in range(30)]

# Extract features and compute coloring metrics
features = []
color_counts = []
color_ratios = []

for G in graphs:
    # Graph-level feature record for this graph.
    graph_feat = extract_graph_features(G)
    features.append(graph_feat)

    # Degree-ordered greedy coloring; its values are the assigned colors.
    coloring = greedy_edge_coloring(G, 'degree')

    # Colors used relative to the maximum degree.
    max_degree = max(dict(G.degree()).values())
    num_colors = len(set(coloring.values()))
    # A graph without edges has maximum degree 0; record the ratio as undefined
    # instead of raising ZeroDivisionError.
    color_ratio = num_colors / max_degree if max_degree > 0 else np.nan

    color_counts.append(num_colors)
    color_ratios.append(color_ratio)

# Add coloring metrics to features
feature_df = pd.DataFrame(features)
feature_df['color_count'] = color_counts
feature_df['color_ratio'] = color_ratios

# Correlate numeric (and boolean) columns only, so a non-numeric feature column
# cannot make corr() raise on recent pandas versions.
# NOTE(review): whether extract_graph_features emits non-numeric values is not
# visible here; if it never does, this selection is a no-op.
corr = feature_df.select_dtypes(include=['number', 'bool']).corr()

# Create correlation heatmap
plt.figure(figsize=(15, 12))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()

# Correlation of every feature with the color ratio. The trivial
# self-correlation (always 1.0) and undefined entries (constant columns give
# NaN) are dropped so the chart only shows informative bars.
color_corr = (
    corr['color_ratio']
    .drop('color_ratio')
    .dropna()
    .sort_values(ascending=False)
)

if color_corr.empty:
    # Happens when the color ratio is constant across all graphs.
    print("No defined correlations with color_ratio to plot.")
else:
    plt.figure(figsize=(10, 8))
    sns.barplot(x=color_corr.values, y=color_corr.index)
    plt.title('Feature Correlation with Color Ratio')
    plt.tight_layout()
    plt.show()

## 6. Edge Feature Analysis

Analyze edge-level features that might be relevant for edge coloring:

In [None]:
# Scale-free example graph whose edges are analysed below.
G = generate_scale_free_graph(n=20, m=2, seed=42)

# Edge-level features, loaded into a DataFrame.
edge_feats = extract_edge_features(G)
edge_df = pd.DataFrame(edge_feats)

# Degree-ordered greedy coloring of the same graph.
coloring = greedy_edge_coloring(G, 'degree')

# Look up each edge's color, trying the reversed endpoint order as a fallback
# (the fallback is None when neither orientation is a key of the coloring).
# NOTE(review): this treats every index label of edge_df as an edge (u, v);
# that holds only if extract_edge_features returns data that pandas indexes by
# edge — confirm its return shape.
edge_colors = []
for edge in edge_df.index:
    flipped_edge = (edge[1], edge[0])
    fallback_color = coloring.get(flipped_edge)
    edge_colors.append(coloring.get(edge, fallback_color))
edge_df['color'] = edge_colors

# Feature column and title for each panel of the 2x2 grid, in reading order.
edge_panels = [
    ('endpoint_degree_sum', 'Endpoint Degree Sum by Color'),
    ('betweenness_centrality', 'Edge Betweenness Centrality by Color'),
    ('adjacent_edges_count', 'Adjacent Edge Count by Color'),
    ('neighborhood_overlap', 'Neighborhood Overlap by Color'),
]

# Distribution of each edge feature, split by the color assigned to the edge.
plt.figure(figsize=(16, 12))
for panel_idx, (feature_col, panel_title) in enumerate(edge_panels, start=1):
    plt.subplot(2, 2, panel_idx)
    sns.boxplot(x='color', y=feature_col, data=edge_df)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

## 7. Dataset Size and Composition Analysis

Let's explore the raw dataset used for model training:

In [None]:
# Graph families whose names contain an underscore must be matched as a whole;
# taking only the text before the first '_' would shorten 'scale_free' to
# 'scale' and 'small_world' to 'small'.
KNOWN_GRAPH_TYPES = ('random', 'scale_free', 'small_world', 'geometric')


def _parse_graph_dir_name(dir_name):
    """Split a dataset directory name into ``(graph_type, size)``.

    NOTE(review): assumes directories are named ``<graph_type>_n<size>[_...]``
    (for example ``small_world_n20``) — confirm against the dataset generator.

    Args:
        dir_name: Name of one sub-directory of the raw ``graphs`` folder.

    Returns:
        A ``(graph_type, size)`` tuple of strings. ``graph_type`` is the known
        family the name starts with, or the text before the first underscore
        for unrecognised names. ``size`` is the digits of the first
        ``n<digits>`` token after the first one, or ``'unknown'`` if none exists.
    """
    tokens = dir_name.split('_')

    graph_type = next(
        (known for known in KNOWN_GRAPH_TYPES
         if dir_name == known or dir_name.startswith(known + '_')),
        tokens[0],  # unrecognised family: keep the first token
    )

    # Require digits after the 'n' so tokens such as 'new' or 'normal' are not
    # mistaken for a size.
    size = next(
        (token[1:] for token in tokens[1:]
         if token.startswith('n') and token[1:].isdigit()),
        'unknown',
    )
    return graph_type, size


# Analyze the composition of the dataset
graphs_root = os.path.join(config.RAW_DATA_DIR, 'graphs')
colorings_root = os.path.join(config.RAW_DATA_DIR, 'colorings')

# Sorted so that tables and plots do not depend on filesystem listing order.
graph_dirs = sorted(
    d for d in os.listdir(graphs_root)
    if os.path.isdir(os.path.join(graphs_root, d))
)

# Collect dataset statistics: one record per graph directory.
dataset_stats = []
for graph_dir in graph_dirs:
    dir_path = os.path.join(graphs_root, graph_dir)
    num_graphs = len([f for f in os.listdir(dir_path) if f.endswith('.graphml')])

    # Determine graph type and size from directory name
    graph_type, size = _parse_graph_dir_name(graph_dir)

    # Every entry under colorings/<graph_dir> is counted as one available method.
    coloring_path = os.path.join(colorings_root, graph_dir)
    coloring_methods = sorted(os.listdir(coloring_path)) if os.path.isdir(coloring_path) else []

    dataset_stats.append({
        'graph_type': graph_type,
        'size': size,
        'num_graphs': num_graphs,
        'coloring_methods': coloring_methods
    })

# Explicit columns keep the frame well-formed even when no directory was found.
stats_df = pd.DataFrame(
    dataset_stats,
    columns=['graph_type', 'size', 'num_graphs', 'coloring_methods'],
)

if stats_df.empty:
    print(f"No graph directories found under {graphs_root}")
else:
    # Visualize dataset composition
    plt.figure(figsize=(12, 6))
    sns.countplot(x='graph_type', hue='size', data=stats_df)
    plt.title('Dataset Composition by Graph Type and Size')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Summary table. Select the numeric column before aggregating: summing the
    # whole frame would also try to sum the list-valued 'coloring_methods' column.
    display(stats_df.groupby(['graph_type', 'size'])['num_graphs'].sum())

# Count in how many graph directories each coloring method is available.
method_counts = {}
for methods in stats_df['coloring_methods']:
    for method in methods:
        method_counts[method] = method_counts.get(method, 0) + 1

if method_counts:
    plt.figure(figsize=(10, 4))
    sns.barplot(x=list(method_counts.keys()), y=list(method_counts.values()))
    plt.title('Availability of Coloring Methods in Dataset')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print(f"No coloring results found under {colorings_root}")