# V3: Heart Sound Classification using Pre-processed Wavelet Features

This notebook implements a machine learning pipeline to classify heart sounds. It uses statistical features derived from **pre-processed Discrete Wavelet Transform (DWT) coefficients** stored in the `wavelet/` directory under the project root (`/workspace/wavelet` with the default configuration).

The workflow is as follows:

1.  **Configuration**: Set up paths and parameters.
2.  **Load Labels**: Load all labels from the `REFERENCE.csv` files into a single dictionary.
3.  **Load Data & Extract Features**: Load the `.npz` files from the `wavelet/` directory under the project root and compute statistical features (mean, standard deviation, skewness, kurtosis, and energy) from the DWT coefficients.
4.  **Model Training**: Train a RandomForestClassifier on the extracted features.
5.  **Evaluation**: Evaluate the model's performance using an accuracy score.

In [1]:
import os
import numpy as np
import pandas as pd
import pywt
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- 1. Configuration ---
# Project root inside the Docker container; change it to match your file layout.
BASE_PROJECT_DIR = '/workspace'
# Directory that holds the pre-processed wavelet coefficient files.
WAVELET_DATA_DIR = os.path.join(BASE_PROJECT_DIR, 'wavelet')
# Names of the per-set sub-directories: 'training-a' through 'training-f'.
TRAINING_SETS = ['training-' + suffix for suffix in 'abcdef']

In [2]:
# --- 2. Load All Labels ---
# Maps the first REFERENCE.csv column (recording id) to a binary label (0 or 1).
# If the same id appears in several sets, the last set read wins.
all_labels_dict = {}
print("Loading labels from all training sets...")
for set_id in TRAINING_SETS:
    labels_path = os.path.join(BASE_PROJECT_DIR, set_id, 'REFERENCE.csv')
    if not os.path.exists(labels_path):
        print(f"Labels file not found for {set_id}, skipping.")
        continue

    # REFERENCE.csv has no header row: column 0 is the id, column 1 the raw label.
    labels_df = pd.read_csv(labels_path, header=None, names=['filename', 'label'])
    # Binarize the labels: a raw value of 1 stays 1, anything else (e.g. -1) becomes 0.
    # NOTE(review): in the PhysioNet/CinC 2016 convention -1 means normal and 1 means
    # abnormal, so after this mapping 1 = abnormal and 0 = normal. Confirm that this
    # dataset follows that convention before interpreting the classes.
    labels_df['label'] = (labels_df['label'] == 1).astype(int)
    # Bulk-insert the (id, label) pairs instead of iterating row by row with iterrows().
    all_labels_dict.update(zip(labels_df['filename'], labels_df['label'].tolist()))

print(f"Total labels loaded: {len(all_labels_dict)}")

Loading labels from all training sets...
Total labels loaded: 3240


In [4]:
def extract_features_from_coeffs(coeffs_list):
    """Calculates statistical features from a list of coefficient arrays.

    For every array in ``coeffs_list`` five statistics are appended, in this
    order: mean, standard deviation (NumPy default, ``ddof=0``), skewness and
    kurtosis (SciPy defaults), and energy (sum of squared values).

    Each array is flattened and converted to float64 before the statistics are
    computed. This guarantees one scalar per statistic even for
    multi-dimensional inputs (``skew``/``kurtosis`` would otherwise reduce
    along axis 0 only and return arrays), and it prevents integer overflow
    when squaring integer-typed coefficients.

    Args:
        coeffs_list: Iterable of array-likes holding numeric coefficients.

    Returns:
        1-D ``numpy.ndarray`` of length ``5 * len(coeffs_list)``.
    """
    features = []
    for coeffs in coeffs_list:
        # One flat float64 view per coefficient set; see the docstring for why.
        values = np.asarray(coeffs, dtype=np.float64).ravel()
        features.extend([
            np.mean(values),
            np.std(values),
            skew(values),
            kurtosis(values),
            np.sum(np.square(values))  # Energy
        ])
    return np.array(features)

# --- 3. Load Data & Extract Features ---
# Parallel lists: all_labels[i] is the label of the feature vector all_features[i].
all_features = []
all_labels = []

print(f"Loading data from: {WAVELET_DATA_DIR}")
for filename in sorted(os.listdir(WAVELET_DATA_DIR)):
    # Only .npz archives are considered.
    if not filename.endswith('.npz'):
        continue

    # The file name without its extension is the key into the label dictionary.
    file_id, _extension = os.path.splitext(filename)
    if file_id not in all_labels_dict:
        # No label known for this file, so it cannot be used.
        continue

    try:
        # Read the two stored coefficient arrays
        # (presumably DWT approximation and detail coefficients; confirm with the producer).
        with np.load(os.path.join(WAVELET_DATA_DIR, filename)) as data:
            cA, cD = data['cA'], data['cD']

        # Five statistics per coefficient array.
        features = extract_features_from_coeffs([cA, cD])

        all_features.append(features)
        all_labels.append(all_labels_dict[file_id])
    except Exception as e:
        # Best effort: report the broken file and carry on with the rest.
        print(f"Error processing file {filename}: {e}")

print("Feature extraction complete.")

Loading data from: /workspace/wavelet
Feature extraction complete.


In [5]:
# --- 4. Model Training ---
# Stack the per-file feature vectors and labels into arrays.
X = np.array(all_features)
y = np.array(all_labels)

if X.shape[0] == 0:
    print("No features were loaded. Please check the data paths and file contents.")
else:
    print(f"Total samples: {X.shape[0]}, Features per sample: {X.shape[1]}")

    # Hold out 20% of the samples for testing; stratify so both splits keep
    # the same class proportions, and fix the seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Random forest with 100 trees, a fixed seed, and all available cores.
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train_scaled, y_train)

    print("Model training complete.")

Total samples: 3240, Features per sample: 10
Model training complete.


In [6]:
# --- 5. Evaluation ---
# Relies on X, model, X_test_scaled and y_test created by the training cell.
if X.shape[0] == 0:
    print("Cannot evaluate model as no data was loaded.")
else:
    # Predict on the held-out split and compare against the true labels.
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    print("--- Model Evaluation ---")
    # accuracy is a fraction in [0, 1]; report it as a percentage.
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

--- Model Evaluation ---
Model Accuracy: 92.13%
