# V3-NN: Heart Sound Classification using a Neural Network

This notebook trains a **Neural Network (Multi-Layer Perceptron)** to classify heart sounds using **pre-processed wavelet features** stored in the `wavelet/` directory.

The workflow is as follows:

1.  **Configuration**: Set up the path to the pre-processed data.
2.  **Data Loading**: Load the `.npz` files containing wavelet coefficients and corresponding labels.
3.  **Data Preparation**: Split the data into training and testing sets and apply feature scaling.
4.  **Model Definition**: Define and build a sequential neural network with multiple layers using TensorFlow/Keras.
5.  **Model Training & Evaluation**: Compile, train, and evaluate the model's performance.

In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

2025-07-29 10:21:58.584994: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-29 10:21:58.595708: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-29 10:21:58.660724: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-29 10:21:58.707908: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753784518.759107    7294 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753784518.77

In [None]:
# --- 1. Configuration ---
# Adjust the path based on the Docker container's file structure if needed
WAVELET_DATA_DIR = '/workspace/wavelet_v2/'

In [None]:
# --- 2. Data Loading ---
X = []
y = []

print(f"Loading data from {WAVELET_DATA_DIR}...")

# Check if the directory exists
if not os.path.isdir(WAVELET_DATA_DIR):
    print(f"Error: Directory not found at {os.path.abspath(WAVELET_DATA_DIR)}")
    print("Please run the preprocessing script first: python preprocess_wavelet.py")
else:
    files = os.listdir(WAVELET_DATA_DIR)
    npz_files = [f for f in files if f.endswith('.npz')]
    
    if not npz_files:
        print(f"No .npz files found in {WAVELET_DATA_DIR}. Please run preprocessing first.")
    else:
        print(f"Found {len(npz_files)} .npz files")
        
        # Load all wavelet feature files
        for file_name in npz_files:
            path = os.path.join(WAVELET_DATA_DIR, file_name)
            try:
                data = np.load(path)
                
                # Load features and labels using the new format
                if 'features' in data and 'label' in data:
                    X.append(data['features'])
                    y.append(data['label'])
                else:
                    print(f"Warning: Expected keys 'features' and 'label' not found in {file_name}")
                    
            except Exception as e:
                print(f"Error loading {file_name}: {e}")
        
        if X:
            # Convert to numpy arrays
            X = np.array(X)
            y = np.array(y)
            
            print(f"Successfully loaded {len(X)} samples.")
            print(f"Feature matrix shape: {X.shape}")
            print(f"Label distribution: Normal (0): {np.sum(y == 0)}, Abnormal (1): {np.sum(y == 1)}")
            print(f"Class balance: {np.sum(y == 0)/(len(y))*100:.1f}% normal, {np.sum(y == 1)/(len(y))*100:.1f}% abnormal")
        else:
            print("No valid data could be loaded.")

Loading data from /workspace/wavelet/...
Examining file: a0001.npz
Available keys: ['cA', 'cD']
Key 'cA': shape = (35669,), dtype = float32
Key 'cD': shape = (35669,), dtype = float32

Attempting to load data...


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3240,) + inhomogeneous part.

In [None]:
# --- 3. Data Preparation ---
if len(X) > 0:
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f'Training set shape: {X_train_scaled.shape}')
    print(f'Test set shape: {X_test_scaled.shape}')
else:
    print("Skipping model training as no data was loaded.")

In [None]:
# --- 4. Model Definition ---
# We define the model inside a condition to avoid errors if data loading failed
if len(X) > 0:
    model = Sequential([
        # Input layer: Dense with ReLU activation. The input_shape must match the number of features.
        Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        # Dropout layer to prevent overfitting
        Dropout(0.5), 
        # Hidden layer
        Dense(32, activation='relu'),
        # Another dropout layer
        Dropout(0.5),
        # Output layer: Dense with a single neuron and sigmoid activation for binary classification
        Dense(1, activation='sigmoid')
    ])

    model.summary()

In [None]:
# --- 5. Model Training & Evaluation ---
if len(X) > 0:
    # Compile the model
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Define early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    print("Starting model training...")
    history = model.fit(
        X_train_scaled, 
        y_train, 
        epochs=100, # Increased epochs, with early stopping
        batch_size=32, 
        validation_data=(X_test_scaled, y_test),
        callbacks=[early_stopping],
        verbose=1
    )

    print("--- Final Model Evaluation ---")
    loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

    # --- Plotting Training History ---
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    plt.grid(True)
    plt.gca().set_ylim(0, 1) # Set the y-axis range to [0, 1]
    plt.title('Model Training History')
    plt.xlabel('Epochs')
    plt.ylabel('Value')
    plt.show()