# Dataset
- Compare correlation

In [2]:
import numpy as np
import pandas as pd
from grad_fw.data_loader import DatasetLoader

def get_dataset_metrics(dataset_name, **kwargs):
    """
    Computes key characteristics for a single dataset:
    - Dimensions (n, p)
    - Condition Number (Kappa)
    - Average & Maximum Feature Correlation
    """
    loader = DatasetLoader()
    
    try:
        # Load Data
        A, _  = loader.load(dataset_name, **kwargs)
        
        if A is None:
            return None

        # 1. Basic Dimensions
        n, p = A.shape
        
        # 2. Condition Number Calculation (Landscape Flatness)
        # Normalize first to ensure scale-invariance
        X_centered = A - np.mean(A, axis=0)
        std = np.std(X_centered, axis=0)
        std[std == 0] = 1.0 # Prevent div by zero
        X_scaled = X_centered / std
        
        # Compute eigenvalues of Gram Matrix (X.T @ X)
        # Note: For very large p, this might be slow. If p > 5000, consider randomized SVD.
        # But for your thesis datasets, this is fine.
        Gram = X_scaled.T @ X_scaled
        evals = np.linalg.eigvalsh(Gram)
        
        lambda_max = evals[-1]
        lambda_min = max(evals[0], 1e-9) # Avoid 0
        cond_number = lambda_max / lambda_min

        # 3. Correlation Statistics (Redundancy)
        df_temp = pd.DataFrame(A)
        corr_matrix = df_temp.corr().abs()
        
        # Exclude diagonal (self-correlation is always 1)
        mask = np.ones(corr_matrix.shape, dtype=bool)
        np.fill_diagonal(mask, 0)
        off_diag = corr_matrix.values[mask]
        
        avg_corr = np.mean(off_diag) if len(off_diag) > 0 else 0.0
        max_corr = np.max(off_diag) if len(off_diag) > 0 else 0.0

        return {
            "Dataset": dataset_name.capitalize(),
            "Samples (n)": n,
            "Features (p)": p,
            "Condition No. (κ)": cond_number,
            "Avg. Corr": avg_corr,
            "Max. Corr": max_corr
        }

    except Exception as e:
        print(f"Error processing {dataset_name}: {e}")
        return None

def generate_characteristics_table():
    # List of your thesis datasets
    datasets = [
        ("synthetic_high_corr", {"n_blocks": 1, "correlation_strength": 0.95}),
        ("synthetic_toeplitz", {}),
        ("residential", {}),
        ("secom", {}),
        ("arrhythmia", {}),
        ("mnist", {}),
        ("madelon", {}),
        ("myocardial", {})
    ]

    results = []
    print("Computing metrics... (This may take a moment for large datasets)")
    
    for name, kwargs in datasets:
        print(f"Processing {name}...")
        metrics = get_dataset_metrics(name, **kwargs)
        if metrics:
            results.append(metrics)

    # Create DataFrame
    df = pd.DataFrame(results)
    
    # --- Formatting for Display ---
    # Scientific notation for Condition Number (it can get HUGE)
    pd.set_option('display.float_format', '{:.2e}'.format)
    
    # We create a display copy to make it pretty
    df_display = df.copy()
    df_display["Condition No. (κ)"] = df_display["Condition No. (κ)"].map(lambda x: f"{x:.2e}")
    df_display["Avg. Corr"] = df_display["Avg. Corr"].map(lambda x: f"{x:.3f}")
    df_display["Max. Corr"] = df_display["Max. Corr"].map(lambda x: f"{x:.3f}")

    print("\n" + "="*60)
    print("DATASET CHARACTERISTICS SUMMARY")
    print("="*60)
    print(df_display.to_string(index=False))
    
    # Generate LaTeX code for your thesis
    print("\n" + "="*60)
    print("LATEX CODE (Copy to Thesis)")
    print("="*60)
    print(df_display.to_latex(index=False, caption="Statistical Characteristics of Benchmark Datasets", label="tab:datasets_stats"))

if __name__ == "__main__":
    generate_characteristics_table()

Computing metrics... (This may take a moment for large datasets)
Processing synthetic_high_corr...
  > Loading dataset: synthetic_high_corr
    Generating Synthetic Data: N=2000, p=500, Blocks=1, Corr=0.95
    Computed Correlation Matrix A: (500, 500)
Processing synthetic_toeplitz...
  > Loading dataset: synthetic_toeplitz
    Generating Toeplitz Trap: p=500, rho=0.9
Processing residential...
  > Loading dataset: residential
    Source: /Users/nautilus/gridfw/data/residential.xlsx
    Raw Data Shape: 372 rows x 103 features
    Computed Correlation Matrix A: (103, 103)
Processing secom...
  > Loading dataset: secom
    Source: /Users/nautilus/gridfw/data/secom.data
    [secom Cleaning] Dropped 112 constant columns.
    Raw Data Shape: 1567 rows x 478 features
    Computed Correlation Matrix A: (478, 478)
Processing arrhythmia...
  > Loading dataset: arrhythmia
    Source: /Users/nautilus/gridfw/data/arrhythmia.data
    [arrhythmia Cleaning] Dropped 17 constant columns.
    Raw Data Sha

In [3]:
import numpy as np
import pandas as pd
from grad_fw.data_loader import DatasetLoader

def analyze_feature_correlation(dataset_name, n_samples=None, **kwargs):
    """
    Loads a dataset and computes the pairwise Pearson correlation between its columns (features).
    Prints statistics to prove feature redundancy.
    
    Args:
        dataset_name (str): Name of the dataset (e.g., 'synthetic_high_corr', 'residential').
        n_samples (int, optional): Limit rows for speed (e.g., 2000). None uses all data.
        **kwargs: Extra arguments for synthetic generation (e.g., n_blocks=1).
    """
    loader = DatasetLoader()
    print(f"\n{'='*60}")
    print(f"ANALYZING DATASET: {dataset_name}")
    print(f"{'='*60}")

    # 1. Load Data
    # Note: We rely on the loader to return 'A' (matrix) and 'X_norm' (the actual data).
    # We only care about X_norm here.
    try:
        _, X_raw = loader.load(dataset_name, **kwargs)
    except Exception as e:
        print(f"Error loading {dataset_name}: {e}")
        return

    if X_raw is None:
        print("Dataset not found or failed to load.")
        return

    # Subsample rows if requested (does not affect column correlation logic, just speed)
    if n_samples and X_raw.shape[0] > n_samples:
        X_raw = X_raw[:n_samples]

    # 2. Convert to Pandas for easy Correlation calculation
    df = pd.DataFrame(X_raw)
    n_rows, n_cols = df.shape
    
    print(f"\n[1] DATASET DIMENSIONS")
    print(f"    Rows: {n_rows}")
    print(f"    Columns (Features): {n_cols}")

    # Get condition number
    print(f"\n[!!] Condition number : {np.linalg.cond(X_raw)}")
    
    # 3. Compute Full Correlation Matrix (Feature vs Feature)
    print("\n[2] COMPUTING COLUMN CORRELATIONS...")
    corr_matrix = df.corr().abs() # We use absolute value to treat -0.99 same as 0.99

    # 4. Extract Key Statistics
    # Mask the diagonal (which is always 1.0) to find true cross-correlations
    mask = np.ones(corr_matrix.shape, dtype=bool)
    np.fill_diagonal(mask, 0)
    off_diag_corrs = corr_matrix.values[mask]

    avg_corr = np.mean(off_diag_corrs)
    max_corr = np.max(off_diag_corrs)
    
    # 5. Proof: Specific Examples
    print(f"\n[3] REDUNDANCY STATISTICS")
    print(f"    Average Feature Correlation: {avg_corr:.4f}")
    print(f"    Maximum Feature Correlation: {max_corr:.4f}")

    # Show a specific high-correlation pair
    # Find indices of max correlation
    # We flatten the matrix, find argmax, then unravel index
    # (Using a simpler loop for clarity in output)
    
    print(f"\n[4] SAMPLE CORRELATION BLOCK (First 5 Features)")
    print("    This shows how similar the first few features are to each other:")
    print(df.iloc[:, :5].corr().round(4))
    


    print("-" * 60)

# ==========================================
# EXECUTION EXAMPLES
# ==========================================

# 1. Test your "Dense" Synthetic Data (The one for the email)
# This proves the "All columns are clones" argument
analyze_feature_correlation(
    "synthetic_high_corr", 
    n_blocks=1, 
    correlation_strength=0.95
)

# 2. Test Real Data (Residential)
# This likely has lower average correlation, which explains why FW works differently here
analyze_feature_correlation("residential")
analyze_feature_correlation("secom")
analyze_feature_correlation("arrhythmia")
analyze_feature_correlation("mnist")
analyze_feature_correlation("madelon")

# 3. Test Myocardial
# analyze_feature_correlation("myocardial")


ANALYZING DATASET: synthetic_high_corr
  > Loading dataset: synthetic_high_corr
    Generating Synthetic Data: N=2000, p=500, Blocks=1, Corr=0.95
    Computed Correlation Matrix A: (500, 500)

[1] DATASET DIMENSIONS
    Rows: 2000
    Columns (Features): 500

[!!] Condition number : 845.3990247539451

[2] COMPUTING COLUMN CORRELATIONS...

[3] REDUNDANCY STATISTICS
    Average Feature Correlation: 0.9973
    Maximum Feature Correlation: 0.9976

[4] SAMPLE CORRELATION BLOCK (First 5 Features)
    This shows how similar the first few features are to each other:
        0       1       2       3       4
0  1.0000  0.9974  0.9973  0.9973  0.9975
1  0.9974  1.0000  0.9973  0.9974  0.9974
2  0.9973  0.9973  1.0000  0.9973  0.9974
3  0.9973  0.9974  0.9973  1.0000  0.9973
4  0.9975  0.9974  0.9974  0.9973  1.0000
------------------------------------------------------------

ANALYZING DATASET: residential
  > Loading dataset: residential
    Source: https://archive.ics.uci.edu/ml/machine-learn



    Raw Data Shape: 372 rows x 103 features
    Computed Correlation Matrix A: (103, 103)

[1] DATASET DIMENSIONS
    Rows: 372
    Columns (Features): 103

[!!] Condition number : 1.4223510128590456e+17

[2] COMPUTING COLUMN CORRELATIONS...

[3] REDUNDANCY STATISTICS
    Average Feature Correlation: 0.5868
    Maximum Feature Correlation: 1.0000

[4] SAMPLE CORRELATION BLOCK (First 5 Features)
    This shows how similar the first few features are to each other:
        0       1       2       3       4
0  1.0000 -0.2295 -0.1381 -0.2869 -0.3136
1 -0.2295  1.0000  0.9465  0.7695  0.2266
2 -0.1381  0.9465  1.0000  0.6392  0.1544
3 -0.2869  0.7695  0.6392  1.0000  0.5823
4 -0.3136  0.2266  0.1544  0.5823  1.0000
------------------------------------------------------------

ANALYZING DATASET: secom
  > Loading dataset: secom
    Source: https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data




KeyboardInterrupt: 