In [1]:
import numpy as np
import pandas as pd
import io
import requests
from ucimlrepo import fetch_ucirepo

# Registry of supported datasets.
# DATASETS_URL maps a short dataset name to the URL that
# `load_dataset_online` downloads with `requests.get`.
DATASETS_URL = {
    "residential": "https://archive.ics.uci.edu/ml/machine-learning-databases/00437/Residential-Building-Data-Set.xlsx",
    "secom": "https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data",
    "arrhythmia": "https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data",
}

# DATASETS_ID maps a short dataset name to the numeric id that
# `load_dataset_online` passes to `fetch_ucirepo(id=...)`.
DATASETS_ID = {"myocardial": 579}


def load_dataset_online(name_or_url):
    """
    General entry point to load any supported CSSP dataset.

    Args:
        name_or_url (str): A registry key, matched case-insensitively, from
            DATASETS_URL ('residential', 'secom', 'arrhythmia') or
            DATASETS_ID ('myocardial'), or a direct URL to a data file.

    Returns:
        A (np.ndarray): Correlation matrix (p x p).
        X_norm (np.ndarray): Normalized feature matrix (N x p).
        Both are None if the data could not be fetched or parsed.
    """
    # 1. Resolve URL. The lower-cased key is reused for every registry check
    #    below so that the lookup and the dispatch can never disagree.
    key = name_or_url.lower()
    url = DATASETS_URL.get(key, name_or_url)

    print(f"  > Loading dataset: {name_or_url}")
    print(f"    Source: {url}")

    try:
        if key in DATASETS_ID:
            # Datasets served through the ucimlrepo client instead of a raw URL.
            if key != "myocardial":
                # Fail before hitting the network when no parser exists.
                raise ValueError(f"no parser registered for dataset '{key}'")
            features = fetch_ucirepo(id=DATASETS_ID[key]).data.features
            X_raw = _parse_myocardial(features)
        else:
            # 2. Download Data. Registry URLs and caller-supplied URLs share
            #    this path, so a direct URL is actually fetched.
            # NOTE(review): verify=False disables TLS certificate checking, so
            # the payload is not authenticated. Kept as the author chose it;
            # confirm whether the host still needs it.
            # The timeout keeps a stalled connection from hanging the kernel.
            response = requests.get(url, verify=False, timeout=60)
            response.raise_for_status()
            content = io.BytesIO(response.content)

            # 3. Dispatch to specific loader based on known URLs
            if "Residential-Building" in url:
                X_raw = _parse_residential(content)
            elif "secom" in url:
                X_raw = _parse_secom(content)
            elif "arrhythmia" in url:
                X_raw = _parse_arrhythmia(content)
            else:
                # Fallback: Try generic CSV loading, numeric columns only
                print("    Unknown format. Attempting generic CSV load...")
                df = pd.read_csv(content)
                X_raw = df.select_dtypes(include=[np.number]).to_numpy()

        # Every parser signals failure by returning None (it has already
        # printed its own error), so check once for both branches.
        if X_raw is None:
            return None, None

        # 4. Standardize and Compute Correlation (Shared Logic)
        return _standardize_and_correlate(X_raw)

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back the
        # (None, None) sentinel rather than aborting the notebook.
        print(f"    CRITICAL ERROR loading data: {e}")
        return None, None


# --- SPECIFIC PARSERS ---


def _parse_residential(content):
    """
    Parser for the UCI Residential Building workbook (Excel).

    Args:
        content: Anything `pd.read_excel` accepts (the caller passes an
            in-memory `io.BytesIO` of the downloaded file).

    Returns:
        np.ndarray (float64) built from sheet columns 4..106, with cells that
        cannot be parsed as numbers turned into NaN and the result passed
        through `np.nan_to_num`; None if anything in the parse fails.
    """
    try:
        # header=1: column names are taken from the second row of the sheet.
        sheet = pd.read_excel(content, header=1)

        # Positional slice of the feature columns. The original author's note
        # says the first 4 columns are ID/date fields and the trailing ones
        # are targets -- TODO confirm against the workbook layout.
        feature_frame = sheet.iloc[:, 4:107]

        # Coerce column by column; non-numeric cells become NaN.
        numeric_frame = feature_frame.apply(pd.to_numeric, errors="coerce")
        features = numeric_frame.to_numpy(dtype=np.float64)
        return np.nan_to_num(features)

    except Exception as e:
        print(f"    Error parsing Residential Excel: {e}")
        return None


def _parse_secom(content):
    """
    Parser for SECOM (whitespace-separated values, no header row).

    Args:
        content: Anything `pd.read_csv` accepts (the caller passes an
            in-memory `io.BytesIO` of the downloaded file).

    Returns:
        np.ndarray (float64) with missing values replaced via
        `np.nan_to_num` and constant columns removed by
        `_clean_constant_rows`; None if anything in the parse fails.
    """
    try:
        # Any run of whitespace separates two fields; every row is data.
        table = pd.read_csv(content, header=None, sep=r"\s+")
        values = table.to_numpy(dtype=np.float64)

        # Missing entries arrive as NaN; replace them before the variance
        # filter runs.
        filled = np.nan_to_num(values)
        return _clean_constant_rows(filled, "secom")

    except Exception as e:
        print(f"    Error parsing SECOM CSV: {e}")
        return None


def _parse_arrhythmia(content):
    """
    Parser for Arrhythmia (comma-separated, no header, '?' marks missing data).

    Args:
        content: Anything `pd.read_csv` accepts (the caller passes an
            in-memory `io.BytesIO` of the downloaded file).

    Returns:
        np.ndarray (float64) of every column except the last, with missing or
        non-numeric cells set to 0 and constant columns removed by
        `_clean_constant_rows`; None if anything in the parse fails.
    """
    try:
        # Default comma separator; '?' cells are read as NaN.
        table = pd.read_csv(content, header=None, na_values="?")

        # Keep everything but the final column -- presumably the class label,
        # per the original author's note; confirm against the data file.
        feature_frame = table.iloc[:, :-1]

        # Coerce to numbers, then zero-fill whatever could not be parsed.
        numeric_frame = feature_frame.apply(pd.to_numeric, errors="coerce")
        X_raw = numeric_frame.fillna(0).to_numpy(dtype=np.float64)

        return _clean_constant_rows(X_raw, "arrhythmia")

    except Exception as e:
        print(f"    Error parsing Arrhythmia CSV: {e}")
        return None

def _parse_myocardial(content):
    """
    Parser for the 'myocardial' dataset features.

    Args:
        content (pd.DataFrame): Feature table, as returned by
            `fetch_ucirepo(...).data.features`.

    Returns:
        np.ndarray (float64) with non-numeric or missing cells set to 0 and
        constant columns removed by `_clean_constant_rows`; None if the
        conversion fails.
    """
    try:
        # Coerce column by column (unparseable cells become NaN), then
        # zero-fill so the array contains no NaN.
        X_raw = (
            content.apply(pd.to_numeric, errors="coerce")
            .fillna(0)
            .to_numpy(dtype=np.float64)
        )

        return _clean_constant_rows(X_raw, "myocardial")

    except Exception as e:
        # Name the right dataset so a failure here is not mistaken for an
        # Arrhythmia parsing problem.
        print(f"    Error parsing Myocardial data: {e}")
        return None

# --- SHARED MATH ---


def _clean_constant_rows(X_raw, name):
    """
    Drop constant (zero-variance) COLUMNS from a feature matrix.

    The function name says "rows", but the standard deviation is taken along
    axis 0 and the filter is applied to the column axis.

    Args:
        X_raw (np.ndarray): 2-D data matrix.
        name (str): Dataset label, used only in the log line.

    Returns:
        np.ndarray: The columns of X_raw whose standard deviation exceeds
        1e-9, in their original order.
    """
    # Per the original author's note, constant columns make the downstream
    # matrix singular and crash the solver, hence this filter.
    column_spread = np.std(X_raw, axis=0)
    is_varying = column_spread > 1e-9
    keep_idx = np.flatnonzero(is_varying)

    print(
        f"    [{name} Cleaning] Dropped {X_raw.shape[1] - len(keep_idx)} constant columns."
    )
    return X_raw[:, keep_idx]

def _standardize_and_correlate(X_raw):
    """
    Z-score every column of X_raw and build A = (X_norm^T X_norm) / N.

    Shared by all datasets so that the math is identical everywhere.

    Args:
        X_raw (np.ndarray): 2-D data matrix with N rows and p columns.

    Returns:
        A (np.ndarray): (p x p) matrix (X_norm^T X_norm) / N.
        X_norm (np.ndarray): (N x p) column-standardized copy of X_raw.
    """
    N, p = X_raw.shape
    print(f"    Raw Data Shape: {N} rows x {p} features")

    # Column-wise statistics (population standard deviation, numpy default).
    col_mean = np.mean(X_raw, axis=0)
    col_std = np.std(X_raw, axis=0)

    # A zero-spread column is divided by 1 instead of 0, so it becomes all
    # zeros rather than NaN.
    safe_std = np.where(col_std == 0, 1.0, col_std)
    X_norm = (X_raw - col_mean) / safe_std

    # Correlation matrix of the standardized columns.
    A = np.matmul(X_norm.T, X_norm) / N

    print(f"    Computed Correlation Matrix A: {A.shape}")
    return A, X_norm



# Smoke test: load the "myocardial" dataset. The call is the last expression
# of the cell, so the returned (A, X_norm) tuple is echoed as the cell output.
load_dataset_online("myocardial")


  > Loading dataset: myocardial
    Source: myocardial
    [myocardial Cleaning] Dropped 0 constant columns.
    Raw Data Shape: 1700 rows x 111 features
    Computed Correlation Matrix A: (111, 111)


(array([[ 1.        , -0.37434968,  0.08990689, ..., -0.05034158,
         -0.05099073, -0.03022833],
        [-0.37434968,  1.        ,  0.03998751, ..., -0.00922243,
          0.02960847,  0.05883139],
        [ 0.08990689,  0.03998751,  1.        , ..., -0.04475086,
         -0.01392607,  0.01097591],
        ...,
        [-0.05034158, -0.00922243, -0.04475086, ...,  1.        ,
         -0.10236636, -0.21385819],
        [-0.05099073,  0.02960847, -0.01392607, ..., -0.10236636,
          1.        ,  0.05558756],
        [-0.03022833,  0.05883139,  0.01097591, ..., -0.21385819,
          0.05558756,  1.        ]], shape=(111, 111)),
 array([[ 1.28601462,  0.77216846,  1.73022388, ...,  0.59818682,
         -0.13403012, -0.50091895],
        [-0.54708801,  0.77216846,  0.53405446, ...,  0.59818682,
         -0.13403012,  1.99633095],
        [-0.79705655,  0.77216846, -0.66211495, ...,  0.59818682,
         -0.13403012, -0.50091895],
        ...,
        [-0.54708801,  0.77216846,  

In [33]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI dataset id 579 (the id registered under "myocardial" in
# DATASETS_ID) and keep its features and targets under separate names.
myocardial = fetch_ucirepo(id=579)
X = myocardial.data.features
Y = myocardial.data.targets


In [34]:
import numpy as np
# Coerce every feature column to numeric (unparseable cells become NaN) and
# materialize the result as a float64 array. Relies on `pd` from the first cell.
X_raw = X.apply(pd.to_numeric, errors="coerce").to_numpy(dtype=np.float64)


In [36]:
# Inspect the converted feature matrix (numpy abbreviates the repr).
X_raw

array([[77.,  1.,  2., ...,  1.,  0.,  0.],
       [55.,  1.,  1., ...,  1.,  0.,  1.],
       [52.,  1.,  0., ...,  1.,  0.,  0.],
       ...,
       [55.,  1.,  3., ...,  1.,  0.,  0.],
       [79.,  0.,  2., ...,  1.,  0.,  0.],
       [63.,  1.,  2., ...,  0.,  0.,  0.]], shape=(1700, 111))

In [27]:
# Replace NaN with 0 so the standard-deviation filter below sees no NaN.
X_raw = np.nan_to_num(X_raw)

In [28]:
# Find the non-constant columns: per-column standard deviation (axis=0), then
# the indices of columns whose spread exceeds 1e-9. Nothing is dropped yet;
# the selection is applied in a later cell.
std_devs = np.std(X_raw, axis=0)
keep_idx = np.where(std_devs > 1e-9)[0]

In [29]:
# Report how many columns the filter removes; stay silent when none are.
dropped_count = X_raw.shape[1] - len(keep_idx)
if dropped_count > 0:
    print(f"Cleaning Dropped {dropped_count} constant column")

In [30]:
# Keep only the non-constant columns; X_raw itself is left untouched.
X_clean = X_raw[: , keep_idx]

In [None]:
# import os
# from pathlib import Path


# def load_dataset_local(name):
#     """Load dataset from local"""
#     try:
#         DATA_DIR = Path(__file__).resolve().parent / "data"
#     except NameError:
#         DATA_DIR = Path(os.getcwd()).resolve().parent / "data"
#     try:
#         if "protein" in name:
#             df = pd.read_csv(DATA_DIR / "protein" / "secondary-structure.data")
#             return df
#     except Exception as e:
#         print(f"    Error parsing Residential Excel: {e}")
#         return None

    
# load_dataset_local("protein")

    Error parsing Residential Excel: Error tokenizing data. C error: Expected 1 fields in line 4, saw 2

