In [28]:

import os
import ast
import warnings
import numpy as np
import pandas as pd
import wfdb
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from scipy.integrate import trapezoid
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import class_weight

In [29]:
BASE = r"C:\Users\iamsn\ECG Classifier\ECG Classifier"
TARGET_FS = 100  


if not os.path.exists(BASE):
    raise FileNotFoundError(f"Base directory not found: {BASE}")
    
csv_path = os.path.join(BASE, "ptbxl_database.csv")
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"Database CSV not found: {csv_path}")


print("Loading PTB-XL database...")
df_meta = pd.read_csv(csv_path)
print(f"Loaded {len(df_meta)} ECG records from database")
print(f"Database columns: {list(df_meta.columns)}")


print(f"\nSample metadata:")
print(df_meta.head())


Loading PTB-XL database...
Loaded 21837 ECG records from database
Database columns: ['ecg_id', 'patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site', 'device', 'recording_date', 'report', 'scp_codes', 'heart_axis', 'infarction_stadium1', 'infarction_stadium2', 'validated_by', 'second_opinion', 'initial_autogenerated_report', 'validated_by_human', 'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems', 'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr']

Sample metadata:
   ecg_id  patient_id   age  sex  height  weight  nurse  site     device  \
0       1     15709.0  56.0    1     NaN    63.0    2.0   0.0  CS-12   E   
1       2     13243.0  19.0    0     NaN    70.0    2.0   0.0  CS-12   E   
2       3     20372.0  37.0    1     NaN    69.0    2.0   0.0  CS-12   E   
3       4     17014.0  24.0    0     NaN    82.0    2.0   0.0  CS-12   E   
4       5     17448.0  19.0    1     NaN    70.0    2.0   0.0  CS-12   E   

        recording_date

In [30]:
print("Analyzing data availability...")
available_files = []
missing_files = []


sample_ids = df_meta.ecg_id.values[:1000]
print(f"Checking availability of first {len(sample_ids)} ECG files...")

for i, ecg_id in enumerate(sample_ids):
    if i % 200 == 0:  # Progress indicator
        print(f"  Checked {i}/{len(sample_ids)} files...")
    
    row = df_meta[df_meta.ecg_id == ecg_id].iloc[0]
    rec_path = os.path.join(BASE, row.filename_lr)
    
    if os.path.exists(rec_path + '.hea') and os.path.exists(rec_path + '.dat'):
        available_files.append(ecg_id)
    else:
        missing_files.append(ecg_id)

print(f"\n Data Availability Summary:")
print(f"Available files: {len(available_files)}")
print(f"Missing files: {len(missing_files)}")
print(f"Availability rate: {len(available_files)/len(sample_ids)*100:.1f}%")
if len(available_files) == 0:
    raise RuntimeError("No ECG data files found! Please check your data directory.")
elif len(available_files) < 100:
    print(f"Warning: Only {len(available_files)} files available. This may not be sufficient for training.")

# Use only available files for processing
df_meta_available = df_meta[df_meta.ecg_id.isin(available_files)]
print(f"\n Will process {len(df_meta_available)} available ECG records")

Analyzing data availability...
Checking availability of first 1000 ECG files...
  Checked 0/1000 files...
  Checked 200/1000 files...
  Checked 400/1000 files...
  Checked 600/1000 files...
  Checked 800/1000 files...

 Data Availability Summary:
Available files: 30
Missing files: 970
Availability rate: 3.0%

 Will process 30 available ECG records


In [31]:
print("Available ECG Records Summary:")
print(f"Shape: {df_meta_available.shape}")
print(f"Columns: {list(df_meta_available.columns)}")
print("\nFirst few available records:")
df_meta_available.head()

Available ECG Records Summary:
Shape: (30, 28)
Columns: ['ecg_id', 'patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site', 'device', 'recording_date', 'report', 'scp_codes', 'heart_axis', 'infarction_stadium1', 'infarction_stadium2', 'validated_by', 'second_opinion', 'initial_autogenerated_report', 'validated_by_human', 'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems', 'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr']

First few available records:


Unnamed: 0,ecg_id,patient_id,age,sex,height,weight,nurse,site,device,recording_date,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,...,True,,", I-V1,",,,,,3,records100/00000/00001_lr,records500/00000/00001_hr
1,2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,...,True,,,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr
2,3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,...,True,,,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr
3,4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,...,True,", II,III,AVF",,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr
4,5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,...,True,", III,AVR,AVF",,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr


In [32]:
# === ECG Data Loading Functions ===

def load_and_normalize(ecg_id, df_source=None):
    """
    Load and normalize ECG signal data.
    
    Args:
        ecg_id: ECG record ID
        df_source: Source dataframe (defaults to df_meta_available)
    
    Returns:
        Normalized ECG signal array
    """
    if df_source is None:
        df_source = df_meta_available
    
    # Find the record
    matching_rows = df_source[df_source.ecg_id == ecg_id]
    if len(matching_rows) == 0:
        raise ValueError(f"ECG ID {ecg_id} not found in metadata")
    
    row = matching_rows.iloc[0]
    rec_path = os.path.join(BASE, row.filename_lr)
    
    # Validate files exist
    if not os.path.exists(rec_path + '.hea'):
        raise FileNotFoundError(f"Header file not found: {rec_path}.hea")
    if not os.path.exists(rec_path + '.dat'):
        raise FileNotFoundError(f"Data file not found: {rec_path}.dat")
    try:
        # Load ECG signal
        sig, meta = wfdb.rdsamp(rec_path)
        
        # Validate sampling frequency
        if meta['fs'] != TARGET_FS:
            print(f" Warning: Expected {TARGET_FS} Hz, got {meta['fs']} Hz for record {ecg_id}")
        
        # Validate signal shape
        if sig is None or len(sig) == 0:
            raise ValueError(f"Empty signal for record {ecg_id}")
        
        if sig.shape[1] != 12:
            print(f" Warning: Expected 12 leads, got {sig.shape[1]} leads for record {ecg_id}")
        
        # Normalize signal (per lead)
        sig_norm = StandardScaler().fit_transform(sig)
        
        return sig_norm
        
    except Exception as e:
        raise RuntimeError(f"Failed to load ECG {ecg_id}: {str(e)}")
print("ECG loading functions defined")

ECG loading functions defined


In [33]:
def extract_features_from_ecg(sig, fs=100):
    """
    Extract comprehensive features from ECG signal.
    
    Args:
        sig: ECG signal array (n_samples, n_leads)
        fs: Sampling frequency
    
    Returns:
        Dictionary of extracted features
    """
    if sig is None or len(sig) == 0:
        raise ValueError("Empty signal provided")
    
    feat = {}
    n_leads = sig.shape[1] if len(sig.shape) > 1 else 1
    
    # Ensure signal is 2D
    if len(sig.shape) == 1:
        sig = sig.reshape(-1, 1)
        n_leads = 1
    
    for lead in range(n_leads):
        s = sig[:, lead]
        
        # Handle potential NaN or infinite values
        if np.any(np.isnan(s)) or np.any(np.isinf(s)):
            print(f"Warning: NaN or infinite values found in lead {lead}")
            s = np.nan_to_num(s, nan=0.0, posinf=0.0, neginf=0.0)
        
        # Time-domain statistical features
        try:
            feat[f'L{lead}_mean'] = np.mean(s)
            feat[f'L{lead}_std'] = np.std(s)
            feat[f'L{lead}_min'] = np.min(s)
            feat[f'L{lead}_max'] = np.max(s)
            feat[f'L{lead}_median'] = np.median(s)
            feat[f'L{lead}_range'] = np.max(s) - np.min(s)
            feat[f'L{lead}_skew'] = skew(s)
            feat[f'L{lead}_kurtosis'] = kurtosis(s)
            feat[f'L{lead}_rms'] = np.sqrt(np.mean(s**2))
            
            # Additional statistical features
            feat[f'L{lead}_var'] = np.var(s)
            feat[f'L{lead}_q25'] = np.percentile(s, 25)
            feat[f'L{lead}_q75'] = np.percentile(s, 75)
            feat[f'L{lead}_iqr'] = feat[f'L{lead}_q75'] - feat[f'L{lead}_q25']
            
        except Exception as e:
            print(f" Warning: Error computing time-domain features for lead {lead}: {e}")
            # Set default values
            for feature_name in ['mean', 'std', 'min', 'max', 'median', 'range', 'skew', 'kurtosis', 'rms', 'var', 'q25', 'q75', 'iqr']:
                feat[f'L{lead}_{feature_name}'] = 0.0
        
        # Frequency-domain features
        try:
            # Use appropriate nperseg based on signal length
            nperseg = min(256, len(s) // 4)
            if nperseg < 4:
                nperseg = len(s)
            
            f, Pxx = welch(s, fs=fs, nperseg=nperseg)
            
            # Clinical frequency bands
            mask_total = (f >= 0.5) & (f <= 40)
            mask_low = (f >= 0.5) & (f <= 4)
            mask_mid = (f >= 4) & (f <= 15)
            mask_high = (f >= 15) & (f <= 40)
            
            if np.any(mask_total):
                feat[f'L{lead}_bandpower_total'] = trapezoid(Pxx[mask_total], f[mask_total])
                feat[f'L{lead}_dominant_freq'] = f[mask_total][np.argmax(Pxx[mask_total])]
            else:
                feat[f'L{lead}_bandpower_total'] = 0.0
                feat[f'L{lead}_dominant_freq'] = 0.0
            
            # Band-specific power
            for band_name, mask in [('low', mask_low), ('mid', mask_mid), ('high', mask_high)]:
                if np.any(mask):
                    feat[f'L{lead}_bandpower_{band_name}'] = trapezoid(Pxx[mask], f[mask])
                else:
                    feat[f'L{lead}_bandpower_{band_name}'] = 0.0
                    
        except Exception as e:
            print(f"Warning: Error computing frequency-domain features for lead {lead}: {e}")
            # Set default values
            for feature_name in ['bandpower_total', 'dominant_freq', 'bandpower_low', 'bandpower_mid', 'bandpower_high']:
                feat[f'L{lead}_{feature_name}'] = 0.0
    
    return feat

def validate_features(features_dict):
    """Validate extracted features for NaN or infinite values."""
    invalid_features = []
    for key, value in features_dict.items():
        if np.isnan(value) or np.isinf(value):
            invalid_features.append(key)
            features_dict[key] = 0.0  # Replace with default value
    
    if invalid_features:
        print(f"Replaced {len(invalid_features)} invalid feature values with 0.0")
    
    return features_dict

print("Feature extraction functions defined")

Feature extraction functions defined


In [34]:
print("Analyzing diagnostic labels (SCP codes)...")

# Convert SCP codes to proper format
scp_codes_sample = df_meta_available['scp_codes'].head(20)
print("Sample SCP codes (raw):")
for i, code in enumerate(scp_codes_sample):
    print(f"  {i+1}: {code}")

# Parse SCP codes and analyze distribution
try:
    df_meta_available_copy = df_meta_available.copy()
    df_meta_available_copy['scp_codes_parsed'] = df_meta_available_copy['scp_codes'].apply(ast.literal_eval)
    
    # Count normal vs abnormal
    normal_count = 0
    abnormal_count = 0
    
    for codes in df_meta_available_copy['scp_codes_parsed']:
        if 'NORM' in codes:
            normal_count += 1
        else:
            abnormal_count += 1
    
    print(f"\n Label Distribution:")
    print(f"  Normal (NORM): {normal_count} ({normal_count/len(df_meta_available_copy)*100:.1f}%)")
    print(f"   Abnormal: {abnormal_count} ({abnormal_count/len(df_meta_available_copy)*100:.1f}%)")
    
    # Check class balance
    if normal_count == 0 or abnormal_count == 0:
        print("Warning: Severely imbalanced dataset - only one class present!")
    elif min(normal_count, abnormal_count) / max(normal_count, abnormal_count) < 0.1:
        print("Warning: Highly imbalanced dataset - consider class weighting")
    
    # Show most common abnormal codes
    all_abnormal_codes = []
    for codes in df_meta_available_copy['scp_codes_parsed']:
        if 'NORM' not in codes:
            all_abnormal_codes.extend(list(codes.keys()))
    
    if all_abnormal_codes:
        from collections import Counter
        common_codes = Counter(all_abnormal_codes).most_common(10)
        print(f"\nMost common abnormal codes:")
        for code, count in common_codes:
            print(f"  {code}: {count}")
            
except Exception as e:
    print(f"Error parsing SCP codes: {e}")
    print("Using fallback label analysis...")
    print(f"Total available records: {len(df_meta_available)}")

Analyzing diagnostic labels (SCP codes)...
Sample SCP codes (raw):
  1: {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
  2: {'NORM': 80.0, 'SBRAD': 0.0}
  3: {'NORM': 100.0, 'SR': 0.0}
  4: {'NORM': 100.0, 'SR': 0.0}
  5: {'NORM': 100.0, 'SR': 0.0}
  6: {'NORM': 100.0, 'SR': 0.0}
  7: {'NORM': 100.0, 'SR': 0.0}
  8: {'IMI': 35.0, 'ABQRS': 0.0, 'SR': 0.0}
  9: {'NORM': 100.0, 'SR': 0.0}
  10: {'NORM': 100.0, 'SR': 0.0}
  11: {'NORM': 80.0, 'SARRH': 0.0}
  12: {'NORM': 80.0, 'SBRAD': 0.0}
  13: {'NORM': 100.0, 'SR': 0.0}
  14: {'NORM': 100.0, 'SR': 0.0}
  15: {'NORM': 100.0, 'SARRH': 0.0}
  16: {'NORM': 100.0, 'SR': 0.0}
  17: {'AFLT': 100.0, 'ABQRS': 0.0, 'AFIB': 0.0}
  18: {'AFLT': 100.0}
  19: {'NORM': 100.0, 'SR': 0.0}
  20: {'AFLT': 100.0, 'ABQRS': 0.0}

 Label Distribution:
  Normal (NORM): 21 (70.0%)
   Abnormal: 9 (30.0%)

Most common abnormal codes:
  SR: 5
  AFLT: 4
  ABQRS: 3
  NST_: 2
  DIG: 2
  IMI: 1
  AFIB: 1
  NDT: 1
  LVH: 1


In [35]:
print("Starting feature extraction pipeline...")

# Use only available ECG IDs and limit processing for efficiency
max_samples = min(1000, len(df_meta_available))  # Process up to 1000 samples
ecg_ids_to_process = df_meta_available.ecg_id.values[:max_samples]

print(f"Processing {len(ecg_ids_to_process)} ECG records...")

features_all = []
successful_count = 0
failed_count = 0
error_summary = {}

# Process ECGs with progress tracking
for i, ecg_id in enumerate(ecg_ids_to_process):
    # Progress indicator
    if i % 100 == 0:
        print(f"  Progress: {i}/{len(ecg_ids_to_process)} ({i/len(ecg_ids_to_process)*100:.1f}%)")
        
    try:
        # Load and normalize ECG signal
        sig = load_and_normalize(ecg_id, df_meta_available)
        
        # Extract features
        feats = extract_features_from_ecg(sig, fs=TARGET_FS)
        
        # Validate features
        feats = validate_features(feats)
        
        # Add metadata
        feats['ecg_id'] = ecg_id
        features_all.append(feats)
        successful_count += 1
        
    except Exception as e:
        failed_count += 1
        error_type = type(e).__name__
        error_summary[error_type] = error_summary.get(error_type, 0) + 1
        
        # Print first few errors for debugging
        if failed_count <= 5:
            print(f"Error processing {ecg_id}: {str(e)[:100]}...")

print(f"\n Feature Extraction Summary:")
print(f"  Successfully processed: {successful_count}")
print(f"  Failed: {failed_count}")
print(f"  Success rate: {successful_count/(successful_count+failed_count)*100:.1f}%")

if error_summary:
    print(f"  Error breakdown:")
    for error_type, count in error_summary.items():
        print(f"    {error_type}: {count}")

# Validate we have enough data
if successful_count == 0:
    raise RuntimeError("No ECG records were successfully processed!")
elif successful_count < 50:
    print(f"Warning: Only {successful_count} records processed. This may not be sufficient for reliable ML training.")

# Convert to DataFrame
print("\nConverting features to DataFrame...")
features_df = pd.DataFrame(features_all).set_index('ecg_id')

print(f"Feature matrix created:")
print(f"  Shape: {features_df.shape}")
print(f"  Features per record: {features_df.shape[1]}")
print(f"  Memory usage: {features_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Validate feature matrix
print(f"\nFeature matrix validation:")
nan_count = features_df.isnull().sum().sum()
inf_count = np.isinf(features_df.select_dtypes(include=[np.number])).sum().sum()
print(f"  NaN values: {nan_count}")
print(f"  Infinite values: {inf_count}")

if nan_count > 0:
    print("Warning: NaN values found - replacing with 0")
    features_df = features_df.fillna(0)

if inf_count > 0:
    print("Warning: Infinite values found - replacing with 0")
    features_df = features_df.replace([np.inf, -np.inf], 0)

# Preview features
print(f"\n Sample features (first 5 columns):")
print(features_df.iloc[:3, :5])

print(f"\n Feature statistics:")
print(features_df.describe().iloc[:, :5])

Starting feature extraction pipeline...
Processing 30 ECG records...
  Progress: 0/30 (0.0%)

 Feature Extraction Summary:
  Successfully processed: 30
  Failed: 0
  Success rate: 100.0%

Converting features to DataFrame...
Feature matrix created:
  Shape: (30, 216)
  Features per record: 216
  Memory usage: 0.0 MB

Feature matrix validation:
  NaN values: 0
  Infinite values: 0

 Sample features (first 5 columns):
             L0_mean  L0_std    L0_min    L0_max  L0_median
ecg_id                                                     
1      -1.705303e-16     1.0 -1.802458  6.461912  -0.096384
2       9.237056e-17     1.0 -2.599301  4.992821  -0.305926
3      -4.973799e-17     1.0 -1.330285  6.846310  -0.282427

 Feature statistics:
            L0_mean        L0_std     L0_min     L0_max  L0_median
count  3.000000e+01  3.000000e+01  30.000000  30.000000  30.000000
mean  -2.469136e-17  1.000000e+00  -2.313966   5.415311  -0.206003
std    8.214848e-17  7.456158e-16   0.874352   0.927771   

In [36]:
print("Detailed feature analysis:")
print(f"Total features extracted: {len(features_df.columns)}")

# Group features by type
feature_types = {}
for col in features_df.columns:
    if '_mean' in col or '_std' in col or '_min' in col or '_max' in col:
        feature_types.setdefault('Statistical', []).append(col)
    elif '_bandpower' in col or '_freq' in col:
        feature_types.setdefault('Frequency', []).append(col)
    else:
        feature_types.setdefault('Other', []).append(col)

print("\nFeature breakdown by type:")
for ftype, features in feature_types.items():
    print(f"  {ftype}: {len(features)} features")

# Check for constant features (no variation)
constant_features = []
for col in features_df.select_dtypes(include=[np.number]).columns:
    if features_df[col].nunique() <= 1:
        constant_features.append(col)

if constant_features:
    print(f"\n Warning: Found {len(constant_features)} constant features that should be removed:")
    for feat in constant_features[:5]:  # Show first 5
        print(f"    {feat}")
    if len(constant_features) > 5:
        print(f"    ... and {len(constant_features)-5} more")
else:
    print("\n No constant features found")

# Show feature correlation (sample)
if len(features_df.columns) > 1:
    sample_corr = features_df.iloc[:, :10].corr()
    print(f"\nSample feature correlations (first 10 features):")
    print(sample_corr.iloc[:5, :5].round(3))

Detailed feature analysis:
Total features extracted: 216

Feature breakdown by type:
  Statistical: 48 features
  Other: 108 features
  Frequency: 60 features

 No constant features found

Sample feature correlations (first 10 features):
           L0_mean  L0_std  L0_min  L0_max  L0_median
L0_mean      1.000  -0.403  -0.432  -0.413      0.208
L0_std      -0.403   1.000   0.434   0.231     -0.251
L0_min      -0.432   0.434   1.000   0.785     -0.436
L0_max      -0.413   0.231   0.785   1.000     -0.274
L0_median    0.208  -0.251  -0.436  -0.274      1.000


In [37]:
# === Label Preparation ===

print("Preparing labels for machine learning...")

# Filter metadata to match processed ECG records
print(f"Matching metadata to {len(features_df)} processed ECG records...")
df_meta_filtered = df_meta_available[df_meta_available.ecg_id.isin(features_df.index)].copy()
df_meta_filtered = df_meta_filtered.set_index('ecg_id').loc[features_df.index]

print(f"Matched {len(df_meta_filtered)} metadata records to feature matrix")

# Validate metadata alignment
if len(df_meta_filtered) != len(features_df):
    print(f"Warning: Metadata count ({len(df_meta_filtered)}) != Feature count ({len(features_df)})")

# Convert scp_codes from string to actual dict (safe eval)
print("Parsing SCP diagnostic codes...")
try:
    df_meta_filtered['scp_codes_parsed'] = df_meta_filtered['scp_codes'].apply(ast.literal_eval)
    print("Successfully parsed SCP codes")
except Exception as e:
    print(f"Error parsing SCP codes: {e}")
    # Fallback: create dummy labels if parsing fails
    print("Creating fallback labels...")
    df_meta_filtered['scp_codes_parsed'] = [{'UNKNOWN': 1.0}] * len(df_meta_filtered)

# Validate SCP codes structure
sample_codes = df_meta_filtered['scp_codes_parsed'].iloc[0]
print(f"Sample SCP codes structure: {sample_codes}")
print(f"SCP codes type: {type(sample_codes)}")

print(f"\nProcessed metadata shape: {df_meta_filtered.shape}")
print(f"Available columns: {list(df_meta_filtered.columns)}")

Preparing labels for machine learning...
Matching metadata to 30 processed ECG records...
Matched 30 metadata records to feature matrix
Parsing SCP diagnostic codes...
Successfully parsed SCP codes
Sample SCP codes structure: {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
SCP codes type: <class 'dict'>

Processed metadata shape: (30, 28)
Available columns: ['patient_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site', 'device', 'recording_date', 'report', 'scp_codes', 'heart_axis', 'infarction_stadium1', 'infarction_stadium2', 'validated_by', 'second_opinion', 'initial_autogenerated_report', 'validated_by_human', 'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems', 'extra_beats', 'pacemaker', 'strat_fold', 'filename_lr', 'filename_hr', 'scp_codes_parsed']


In [38]:
# Display filtered metadata summary
print("Filtered Metadata Summary:")
print(f"Shape: {df_meta_filtered.shape}")
print(f"Index type: {type(df_meta_filtered.index)}")
print(f"Index name: {df_meta_filtered.index.name}")

# Show first few records
print(f"\nFirst 3 records:")
display_cols = ['age', 'sex', 'scp_codes_parsed'] if 'scp_codes_parsed' in df_meta_filtered.columns else df_meta_filtered.columns[:3]
print(df_meta_filtered[display_cols].head(3))

# Verify index alignment with features
print(f"\nIndex alignment check:")
print(f"Features index sample: {list(features_df.index[:5])}")
print(f"Metadata index sample: {list(df_meta_filtered.index[:5])}")
index_match = len(set(features_df.index) & set(df_meta_filtered.index))
print(f"Matching indices: {index_match}/{len(features_df)}")

Filtered Metadata Summary:
Shape: (30, 28)
Index type: <class 'pandas.core.indexes.base.Index'>
Index name: ecg_id

First 3 records:
         age  sex                          scp_codes_parsed
ecg_id                                                     
1       56.0    1  {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
2       19.0    0              {'NORM': 80.0, 'SBRAD': 0.0}
3       37.0    1                {'NORM': 100.0, 'SR': 0.0}

Index alignment check:
Features index sample: [1, 2, 3, 4, 5]
Metadata index sample: [1, 2, 3, 4, 5]
Matching indices: 30/30


In [39]:
# === Label Creation ===

print("Creating binary classification labels (Normal vs Abnormal)...")

# Create labels based on SCP codes
def create_label(scp_codes):
    """Convert SCP codes to binary label: 0=Normal, 1=Abnormal"""
    try:
        if isinstance(scp_codes, str):
            # If still string, try to parse
            scp_codes = ast.literal_eval(scp_codes)
        
        if isinstance(scp_codes, dict):
            # Check if NORM is present
            return 0 if 'NORM' in scp_codes else 1
        else:
            # Fallback for unexpected format
            print(f"Unexpected SCP code format: {type(scp_codes)}")
            return 1  # Default to abnormal
            
    except Exception as e:
        print(f"Error processing SCP codes: {e}")
        return 1  # Default to abnormal if error

# Apply label creation
if 'scp_codes_parsed' in df_meta_filtered.columns:
    labels = df_meta_filtered['scp_codes_parsed'].apply(create_label)
else:
    # Fallback to original scp_codes column
    labels = df_meta_filtered['scp_codes'].apply(create_label)

# Add labels to features dataframe
features_df = features_df.copy()  # Ensure we can modify
features_df['label'] = labels

# Validate label creation
print(f"Labels created successfully")
print(f"Label distribution:")
label_counts = features_df['label'].value_counts().sort_index()
for label, count in label_counts.items():
    label_name = "Normal" if label == 0 else "Abnormal"
    percentage = count / len(features_df) * 100
    print(f"  {label} ({label_name}): {count} ({percentage:.1f}%)")

# Check for missing labels
missing_labels = features_df['label'].isnull().sum()
if missing_labels > 0:
    print(f"Warning: {missing_labels} records have missing labels")
    features_df['label'] = features_df['label'].fillna(1)  # Default to abnormal

# Class balance analysis
if len(label_counts) == 2:
    minority_class = label_counts.min()
    majority_class = label_counts.max()
    imbalance_ratio = minority_class / majority_class
    
    if imbalance_ratio < 0.1:
        print(f"Severe class imbalance detected (ratio: {imbalance_ratio:.3f})")
        print(f"Consider using class weights or sampling techniques")
    elif imbalance_ratio < 0.3:
        print(f"Moderate class imbalance detected (ratio: {imbalance_ratio:.3f})")
    else:
        print(f"Reasonable class balance (ratio: {imbalance_ratio:.3f})")
else:
    print(f"Unexpected number of classes: {len(label_counts)}")

print(f"\nFinal dataset shape: {features_df.shape}")
print(f"Features: {features_df.shape[1] - 1}")  # Subtract 1 for label column
print(f"Samples: {features_df.shape[0]}")

Creating binary classification labels (Normal vs Abnormal)...
Labels created successfully
Label distribution:
  0 (Normal): 21 (70.0%)
  1 (Abnormal): 9 (30.0%)
Reasonable class balance (ratio: 0.429)

Final dataset shape: (30, 217)
Features: 216
Samples: 30


In [40]:
print("Label Analysis:")
print(f"Label column type: {features_df['label'].dtype}")
print(f"Unique labels: {sorted(features_df['label'].unique())}")
print(f"Label statistics:")
print(features_df['label'].describe())

print(f"\nFirst 10 labels:")
for i, (idx, label) in enumerate(features_df['label'].head(10).items()):
    label_name = "Normal" if label == 0 else "Abnormal"
    print(f"  ECG {idx}: {label} ({label_name})")

# Verify no missing labels
print(f"\nLabel validation:")
print(f"Missing labels: {features_df['label'].isnull().sum()}")
print(f"Invalid labels (not 0 or 1): {(~features_df['label'].isin([0, 1])).sum()}")

Label Analysis:
Label column type: int64
Unique labels: [np.int64(0), np.int64(1)]
Label statistics:
count    30.000000
mean      0.300000
std       0.466092
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: label, dtype: float64

First 10 labels:
  ECG 1: 0 (Normal)
  ECG 2: 0 (Normal)
  ECG 3: 0 (Normal)
  ECG 4: 0 (Normal)
  ECG 5: 0 (Normal)
  ECG 6: 0 (Normal)
  ECG 7: 0 (Normal)
  ECG 8: 1 (Abnormal)
  ECG 9: 0 (Normal)
  ECG 10: 0 (Normal)

Label validation:
Missing labels: 0
Invalid labels (not 0 or 1): 0


In [41]:
# === Final Dataset Overview ===

print("Complete Dataset Summary:")
print(f"Shape: {features_df.shape}")
print(f"Features: {features_df.shape[1] - 1} (excluding label)")
print(f"Samples: {features_df.shape[0]}")
print(f"Memory: {features_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Show column structure
feature_cols = [col for col in features_df.columns if col != 'label']
print(f"\n Feature columns (first 10):")
for i, col in enumerate(feature_cols[:10]):
    print(f"  {i+1:2d}. {col}")
if len(feature_cols) > 10:
    print(f"     ... and {len(feature_cols)-10} more features")

# Data quality check

total_values = features_df.shape[0] * features_df.shape[1]
null_values = features_df.isnull().sum().sum()
inf_values = np.isinf(features_df.select_dtypes(include=[np.number])).sum().sum()

print(f"  Null values: {null_values} ({null_values/total_values*100:.2f}%)")
print(f"  Infinite values: {inf_values}")
print(f"  Data completeness: {(1 - null_values/total_values)*100:.1f}%")

# Show sample of data

sample_cols = feature_cols[:5] + ['label']
sample_data = features_df[sample_cols].head(3)
print(sample_data.round(4))



try:
    output_path = os.path.join(BASE, "processed_ecg_features.csv")
    features_df.to_csv(output_path)
    
except Exception as e:
    print(f"Warning: Could not save dataset: {e}")

features_df.head()

Complete Dataset Summary:
Shape: (30, 217)
Features: 216 (excluding label)
Samples: 30
Memory: 0.0 MB

 Feature columns (first 10):
   1. L0_mean
   2. L0_std
   3. L0_min
   4. L0_max
   5. L0_median
   6. L0_range
   7. L0_skew
   8. L0_kurtosis
   9. L0_rms
  10. L0_var
     ... and 206 more features
  Null values: 0 (0.00%)
  Infinite values: 0
  Data completeness: 100.0%
        L0_mean  L0_std  L0_min  L0_max  L0_median  label
ecg_id                                                   
1          -0.0     1.0 -1.8025  6.4619    -0.0964      0
2           0.0     1.0 -2.5993  4.9928    -0.3059      0
3          -0.0     1.0 -1.3303  6.8463    -0.2824      0


Unnamed: 0_level_0,L0_mean,L0_std,L0_min,L0_max,L0_median,L0_range,L0_skew,L0_kurtosis,L0_rms,L0_var,...,L11_var,L11_q25,L11_q75,L11_iqr,L11_bandpower_total,L11_dominant_freq,L11_bandpower_low,L11_bandpower_mid,L11_bandpower_high,label
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.705303e-16,1.0,-1.802458,6.461912,-0.096384,8.264371,2.346514,10.101222,1.0,1.0,...,1.0,-0.685034,0.31556,1.000594,0.737153,3.2,0.305328,0.378149,0.048817,0
2,9.237056000000001e-17,1.0,-2.599301,4.992821,-0.305926,7.592123,1.960769,4.740093,1.0,1.0,...,1.0,-0.521839,0.115798,0.637638,1.076266,0.8,0.28531,0.48555,0.292536,0
3,-4.9737990000000006e-17,1.0,-1.330285,6.84631,-0.282427,8.176595,3.694111,17.659544,1.0,1.0,...,1.0,-0.637029,0.48602,1.123049,0.71334,3.2,0.1816,0.368032,0.154126,0
4,4.9737990000000006e-17,1.0,-4.156208,3.673837,-0.168618,7.830046,0.022307,3.891946,1.0,1.0,...,1.0,-0.597286,0.30902,0.906307,0.773249,1.2,0.175284,0.392417,0.194564,0
5,-4.9737990000000006e-17,1.0,-3.182715,5.365592,-0.232521,8.548308,1.974023,6.902985,1.0,1.0,...,1.0,-0.515773,0.050236,0.566009,0.913503,3.2,0.213653,0.491305,0.197951,0


In [42]:
# Model

In [43]:
from sklearn.model_selection import train_test_split


X = features_df.drop(columns='label')
y = features_df['label']

print(f"Features: {X.shape},  Labels: {y.shape}")
print("Class counts:", y.value_counts().to_dict())


const_cols = [c for c in X if X[c].nunique() <= 1]
if const_cols:
    X = X.drop(columns=const_cols)
    print(f"Dropped {len(const_cols)} constant columns; {X.shape[1]} features remain.")


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

print(f"Train: {X_train.shape[0]}  |  Test: {X_test.shape[0]}")
print("Train class counts:", y_train.value_counts().to_dict())
print("Test  class counts:", y_test.value_counts().to_dict())


Features: (30, 216),  Labels: (30,)
Class counts: {0: 21, 1: 9}
Train: 24  |  Test: 6
Train class counts: {0: 17, 1: 7}
Test  class counts: {0: 4, 1: 2}


In [45]:
import time
class_w = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_w = dict(zip(np.unique(y_train), class_w))

rf = RandomForestClassifier(
    n_estimators     = 100,
    max_depth        = 10,
    min_samples_split= 5,
    min_samples_leaf = 2,
    class_weight     = class_w,
    random_state     = 42,
    n_jobs           = -1
)

t0 = time.time()
rf.fit(X_train, y_train)
print(f"Training finished in {time.time() - t0:.2f}s")
cv_scores = cross_val_score(rf, X_train, y_train,
                            cv=5, scoring='accuracy', n_jobs=-1)
print(f"CV accuracy: {cv_scores.mean():.4f}  ± {cv_scores.std():.4f}")
print("Fold scores:", cv_scores.round(4))

Training finished in 0.11s
CV accuracy: 0.9100  ± 0.1114
Fold scores: [0.8  1.   1.   1.   0.75]
