In [3]:
import numpy as np
import pandas as pd
from grad_fw.data_loader import DatasetLoader

def analyze_feature_correlation(dataset_name, n_samples=None, **kwargs):
    """
    Load a dataset and print pairwise Pearson correlation statistics for its columns.

    The report contains the dataset dimensions, the mean and maximum absolute
    off-diagonal correlation, the pair of features attaining that maximum, and
    the signed correlation block of the first five features.

    Args:
        dataset_name (str): Name of the dataset (e.g., 'synthetic_high_corr', 'residential').
        n_samples (int, optional): Keep only the first `n_samples` rows for speed
            (e.g., 2000). None (or 0) uses all rows.
        **kwargs: Forwarded unchanged to `DatasetLoader.load` (e.g., n_blocks=1).

    Returns:
        None. Everything is printed to stdout. If loading raises or yields no
        data, a message is printed and the function returns early.
    """
    loader = DatasetLoader()
    print(f"\n{'='*60}")
    print(f"ANALYZING DATASET: {dataset_name}")
    print(f"{'='*60}")

    # 1. Load Data
    # The loader returns a 2-tuple; only the second element (the data matrix)
    # is used here. Presumably the first element is the loader's own
    # correlation matrix 'A' -- confirm against DatasetLoader.
    try:
        _, X_raw = loader.load(dataset_name, **kwargs)
    except Exception as e:
        # Deliberate best-effort: report the failure and move on so that a
        # batch of datasets can be analyzed in one run.
        print(f"Error loading {dataset_name}: {e}")
        return

    if X_raw is None:
        print("Dataset not found or failed to load.")
        return

    # Optional row cap: keeps the first n_samples rows (no shuffling).
    if n_samples and X_raw.shape[0] > n_samples:
        X_raw = X_raw[:n_samples]

    # 2. Convert to Pandas for easy Correlation calculation
    df = pd.DataFrame(X_raw)
    n_rows, n_cols = df.shape

    print("\n[1] DATASET DIMENSIONS")
    print(f"    Rows: {n_rows}")
    print(f"    Columns (Features): {n_cols}")

    # 3. Compute Full Correlation Matrix (Feature vs Feature)
    print("\n[2] COMPUTING COLUMN CORRELATIONS...")
    corr_matrix = df.corr().abs()  # absolute value treats -0.99 the same as 0.99

    # 4. Extract Key Statistics
    # Work on a float copy with the diagonal (always 1.0) replaced by NaN, so
    # that "NaN" uniformly marks every entry to ignore: self-correlations and
    # pairs involving a constant column (whose Pearson correlation is undefined).
    abs_corr = corr_matrix.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(abs_corr, np.nan)
    usable = ~np.isnan(abs_corr)

    print("\n[3] REDUNDANCY STATISTICS")
    if usable.any():
        avg_corr = abs_corr[usable].mean()
        # First occurrence of the maximum in row-major order, so row < col.
        row, col = np.unravel_index(np.nanargmax(abs_corr), abs_corr.shape)
        max_corr = abs_corr[row, col]

        print(f"    Average Feature Correlation: {avg_corr:.4f}")
        print(f"    Maximum Feature Correlation: {max_corr:.4f}")
        # 5. Proof: name the specific pair that attains the maximum.
        print(
            f"    Most Correlated Pair: features {df.columns[row]} "
            f"and {df.columns[col]}"
        )
    else:
        # Fewer than two features, or every pairwise correlation is undefined.
        print("    No valid feature pairs to compute cross-correlations.")

    print("\n[4] SAMPLE CORRELATION BLOCK (First 5 Features)")
    print("    This shows how similar the first few features are to each other:")
    print(df.iloc[:, :5].corr().round(4))

    print("-" * 60)

# ==========================================
# EXECUTION EXAMPLES
# ==========================================

# 1. "Dense" synthetic data (the one for the email).
# This proves the "All columns are clones" argument.
_synthetic_options = {"n_blocks": 1, "correlation_strength": 0.95}
analyze_feature_correlation("synthetic_high_corr", **_synthetic_options)

# 2. Real datasets (Residential first).
# These likely have lower average correlation, which explains why FW works
# differently here.
_real_datasets = ("residential", "secom", "arrhythmia", "mnist", "madelon")
for _dataset in _real_datasets:
    analyze_feature_correlation(_dataset)

# 3. Myocardial is currently disabled.
# To include it, call analyze_feature_correlation with "myocardial".


ANALYZING DATASET: synthetic_high_corr
  > Loading dataset: synthetic_high_corr
    Generating Synthetic Data: N=2000, p=500, Blocks=1, Corr=0.95
    Computed Correlation Matrix A: (500, 500)

[1] DATASET DIMENSIONS
    Rows: 2000
    Columns (Features): 500

[2] COMPUTING COLUMN CORRELATIONS...

[3] REDUNDANCY STATISTICS
    Average Feature Correlation: 0.9973
    Maximum Feature Correlation: 0.9976

[4] SAMPLE CORRELATION BLOCK (First 5 Features)
    This shows how similar the first few features are to each other:
        0       1       2       3       4
0  1.0000  0.9973  0.9972  0.9972  0.9973
1  0.9973  1.0000  0.9972  0.9971  0.9973
2  0.9972  0.9972  1.0000  0.9972  0.9974
3  0.9972  0.9971  0.9972  1.0000  0.9973
4  0.9973  0.9973  0.9974  0.9973  1.0000
------------------------------------------------------------

ANALYZING DATASET: residential
  > Loading dataset: residential
    Source: https://archive.ics.uci.edu/ml/machine-learning-databases/00437/Residential-Building-Da



    Raw Data Shape: 372 rows x 103 features
    Computed Correlation Matrix A: (103, 103)

[1] DATASET DIMENSIONS
    Rows: 372
    Columns (Features): 103

[2] COMPUTING COLUMN CORRELATIONS...

[3] REDUNDANCY STATISTICS
    Average Feature Correlation: 0.5868
    Maximum Feature Correlation: 1.0000

[4] SAMPLE CORRELATION BLOCK (First 5 Features)
    This shows how similar the first few features are to each other:
        0       1       2       3       4
0  1.0000 -0.2295 -0.1381 -0.2869 -0.3136
1 -0.2295  1.0000  0.9465  0.7695  0.2266
2 -0.1381  0.9465  1.0000  0.6392  0.1544
3 -0.2869  0.7695  0.6392  1.0000  0.5823
4 -0.3136  0.2266  0.1544  0.5823  1.0000
------------------------------------------------------------

ANALYZING DATASET: secom
  > Loading dataset: secom
    Source: https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data




    [secom Cleaning] Dropped 112 constant columns.
    Raw Data Shape: 1567 rows x 478 features
    Computed Correlation Matrix A: (478, 478)

[1] DATASET DIMENSIONS
    Rows: 1567
    Columns (Features): 478

[2] COMPUTING COLUMN CORRELATIONS...

[3] REDUNDANCY STATISTICS
    Average Feature Correlation: 0.0429
    Maximum Feature Correlation: 1.0000

[4] SAMPLE CORRELATION BLOCK (First 5 Features)
    This shows how similar the first few features are to each other:
        0       1       2       3       4
0  1.0000 -0.0478  0.0076  0.0094 -0.0009
1 -0.0478  1.0000 -0.0304  0.0060  0.0022
2  0.0076 -0.0304  1.0000  0.3238  0.0204
3  0.0094  0.0060  0.3238  1.0000 -0.0540
4 -0.0009  0.0022  0.0204 -0.0540  1.0000
------------------------------------------------------------

ANALYZING DATASET: arrhythmia
  > Loading dataset: arrhythmia
    Source: https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data




    [arrhythmia Cleaning] Dropped 17 constant columns.
    Raw Data Shape: 452 rows x 262 features
    Computed Correlation Matrix A: (262, 262)

[1] DATASET DIMENSIONS
    Rows: 452
    Columns (Features): 262

[2] COMPUTING COLUMN CORRELATIONS...

[3] REDUNDANCY STATISTICS
    Average Feature Correlation: 0.0890
    Maximum Feature Correlation: 1.0000

[4] SAMPLE CORRELATION BLOCK (First 5 Features)
    This shows how similar the first few features are to each other:
        0       1       2       3       4
0  1.0000 -0.0590 -0.1095  0.3816 -0.0040
1 -0.0590  1.0000 -0.1247 -0.2481 -0.3371
2 -0.1095 -0.1247  1.0000 -0.0750 -0.0063
3  0.3816 -0.2481 -0.0750  1.0000  0.1001
4 -0.0040 -0.3371 -0.0063  0.1001  1.0000
------------------------------------------------------------

ANALYZING DATASET: mnist
  > Loading dataset: mnist
    Fetching MNIST from OpenML (this may take a moment)...
    [mnist Cleaning] Dropped 145 constant columns.
    Raw Data Shape: 2000 rows x 639 features
    C