In [None]:
# Environment setup for Google Colab: mount Drive, install dependencies, import libraries.
# The statement order matters: the packages are installed before they are imported below.
from google.colab import drive
import zipfile
import os

# Mount Google Drive at /content/drive (the dataset archive is read from there later).
drive.mount('/content/drive')

# Install required packages with correct names.
# NOTE(review): versions are not pinned, so results may differ between sessions.
# The import names differ from the install names for two of them: PyWavelets is imported
# as `pywt`, and EMD-signal is presumably the distribution providing `PyEMD` (see imports below).
!pip install wfdb PyWavelets scikit-learn tensorflow keras numpy pandas matplotlib seaborn
!pip install EMD-signal

# Import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import wfdb
import pywt  # This imports from PyWavelets package
from PyEMD import EMD
import warnings
# Silences every Python warning for the rest of the session, including numerical
# warnings (e.g. division by zero) raised by the feature-extraction code.
warnings.filterwarnings('ignore')

# Reaching this line only means the imports above did not raise; the pip output is not checked.
print("All packages installed successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
All packages installed successfully!


In [None]:
# Extract the apnea-ecg.zip file from Google Drive
zip_path = '/content/drive/MyDrive/apnea-ecg.zip'
extract_path = '/content/apnea_ecg_data/'

# Create extraction directory
os.makedirs(extract_path, exist_ok=True)

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("Dataset extracted successfully!")

# List the contents to verify extraction.
# The archive may unpack into a sub-folder, in which case the top-level directory
# holds no files at all. Keep walking until a directory that actually contains
# files has been shown, so the listing proves that something was extracted.
for root, dirs, files in os.walk(extract_path):
    print(f"Directory: {root}")
    print(f"Files: {sorted(files)[:10]}")  # Show first 10 files (sorted for a stable listing)
    if files:
        break
else:
    # The walk finished without finding a single file anywhere under extract_path.
    print(f"Warning: no files found under {extract_path} - check the archive contents")


Dataset extracted successfully!
Directory: /content/apnea_ecg_data/
Files: []


In [None]:
class AdvancedECGPreprocessor:
    """Hand-crafted feature extraction for fixed-length ECG segments.

    Two feature groups are produced per segment:

    * ``wavelet_emd_decomposition`` - Hjorth parameters of the EMD components of
      each wavelet sub-band, flattened to ``N_WAVELET_EMD_FEATURES`` values.
    * ``extract_rr_features`` - ``N_RR_FEATURES`` time-domain statistics of the
      R-R intervals.

    Both methods always return a vector of the advertised length and fall back
    to zeros when a segment cannot be analysed.
    """

    # Fixed output sizes so every segment yields vectors of identical length.
    N_WAVELET_EMD_FEATURES = 45
    N_RR_FEATURES = 10

    def __init__(self, sampling_rate=100):
        """Store the sampling rate (Hz) and create the EMD decomposer."""
        self.fs = sampling_rate
        self.emd = EMD()

    def extract_hjorth_features(self, signal):
        """Extract Hjorth parameters: Activity, Mobility, Complexity.

        Args:
            signal: 1-D sequence of samples.

        Returns:
            list: ``[activity, mobility, complexity]``. ``[0, 0, 0]`` is
            returned for a zero-variance signal or on any error; complexity is
            0 when it cannot be computed (too few samples or a constant first
            difference).
        """
        try:
            # Activity (variance)
            activity = np.var(signal)
            if activity == 0:
                return [0, 0, 0]

            # Mobility: sqrt(var(first difference) / var(signal))
            diff1 = np.diff(signal)
            var_diff1 = np.var(diff1)
            mobility = np.sqrt(var_diff1 / activity)

            # Complexity: mobility of the first difference relative to the signal's mobility
            diff2 = np.diff(diff1)
            if len(diff2) == 0 or var_diff1 == 0 or mobility == 0:
                return [activity, mobility, 0]

            complexity = np.sqrt(np.var(diff2) / var_diff1) / mobility

            return [activity, mobility, complexity]
        except Exception:
            # Best effort: an unusable input contributes neutral features.
            # (Exception, not a bare except, so Ctrl-C still interrupts long runs.)
            return [0, 0, 0]

    def wavelet_emd_decomposition(self, signal):
        """Perform Wavelet-EMD decomposition with error handling.

        The signal is split with a level-3 ``db4`` wavelet transform; every
        sub-band longer than 10 samples is decomposed with EMD and the Hjorth
        parameters of each component longer than 5 samples are appended. If EMD
        fails for a sub-band, the Hjorth parameters of the sub-band itself are
        used instead.

        NOTE(review): the number of components returned per sub-band can vary,
        and the flat list is zero-padded or truncated to a fixed length, so a
        given position in the output is not guaranteed to describe the same
        (sub-band, component) pair for every segment.

        Args:
            signal: 1-D sequence of samples.

        Returns:
            numpy.ndarray: float vector of length ``N_WAVELET_EMD_FEATURES``
            (all zeros when nothing could be extracted).
        """
        n_out = self.N_WAVELET_EMD_FEATURES
        try:
            # Wavelet decomposition into 4 sub-bands (1 approximation + 3 details)
            coeffs = pywt.wavedec(signal, 'db4', level=3)

            features = []

            # Process each wavelet sub-band
            for coeff in coeffs:
                if len(coeff) > 10:  # Ensure sufficient length for EMD
                    try:
                        # EMD decomposition, asking for at most 5 IMFs
                        imfs = self.emd(coeff, max_imf=5)

                        # Extract Hjorth features from each component
                        for imf in imfs:
                            if len(imf) > 5:
                                features.extend(self.extract_hjorth_features(imf))
                    except Exception:
                        # Fallback: use original coefficient Hjorth features
                        features.extend(self.extract_hjorth_features(coeff))

            # Truncate, then zero-pad, to the fixed length. dtype is forced to float so
            # the result type does not depend on which branches produced the values.
            features = np.asarray(features[:n_out], dtype=float)
            if len(features) < n_out:
                features = np.pad(features, (0, n_out - len(features)), 'constant')

            return features
        except Exception:
            return np.zeros(n_out)  # Return default features on error

    def extract_rr_features(self, ecg_signal):
        """Extract time-domain R-R interval features from one ECG segment.

        R-peaks are detected on the z-scored signal (height >= 0.5, at least
        0.5 s apart). The returned vector holds, in order:

            0. mean RR (ms)
            1. SDNN - standard deviation of RR (ms)
            2. RMSSD - root mean square of successive RR differences (ms)
            3. pNN50 - percentage (0-100) of successive differences > 50 ms
            4. RR range (ms)
            5. RR 25th percentile (ms)
            6. RR 75th percentile (ms)
            7. RR variance (ms^2)
            8. detected beats per second of signal
            9. mean absolute successive RR difference (ms)

        Args:
            ecg_signal: 1-D sequence of ECG samples recorded at ``self.fs`` Hz.

        Returns:
            numpy.ndarray: vector of length ``N_RR_FEATURES``; all zeros when
            the signal is empty, constant, has fewer than two detected peaks,
            or an error occurs. Non-finite values are replaced by 0.
        """
        n_out = self.N_RR_FEATURES
        try:
            from scipy.signal import find_peaks

            ecg_signal = np.asarray(ecg_signal, dtype=float)
            if ecg_signal.size == 0:
                return np.zeros(n_out)

            # A flat (or non-finite) segment cannot be z-scored; no peaks can be found in it.
            sigma = np.std(ecg_signal)
            if not np.isfinite(sigma) or sigma == 0:
                return np.zeros(n_out)

            # Normalize signal
            ecg_normalized = (ecg_signal - np.mean(ecg_signal)) / sigma

            # Find R-peaks; the 0.5 s refractory distance scales with the sampling
            # rate (50 samples at the default 100 Hz).
            min_distance = max(1, int(round(0.5 * self.fs)))
            peaks, _ = find_peaks(ecg_normalized, height=0.5, distance=min_distance)

            if len(peaks) < 2:
                return np.zeros(n_out)  # Return zeros if insufficient peaks

            # Calculate R-R intervals
            rr_intervals = np.diff(peaks) / self.fs * 1000  # Convert to milliseconds

            # Successive differences; empty when only two peaks were found.
            rr_diff = np.diff(rr_intervals)
            abs_diff = np.abs(rr_diff)
            has_diff = len(rr_diff) > 0

            # pNN50 is a proportion, not a count: share of successive differences > 50 ms.
            pnn50 = 100.0 * np.mean(abs_diff > 50) if has_diff else 0

            # Time domain features
            features = [
                np.mean(rr_intervals),                         # Mean RR
                np.std(rr_intervals),                          # SDNN
                np.sqrt(np.mean(rr_diff ** 2)) if has_diff else 0,  # RMSSD
                pnn50,                                         # pNN50 (%)
                np.max(rr_intervals) - np.min(rr_intervals),   # Range
                np.percentile(rr_intervals, 25),               # Q1
                np.percentile(rr_intervals, 75),               # Q3
                np.var(rr_intervals),                          # Variance
                len(peaks) / (len(ecg_signal) / self.fs),      # Beats per second
                np.mean(abs_diff) if has_diff else 0,          # Mean absolute successive difference
            ]

            # Ensure no NaN or infinite values
            features = [f if np.isfinite(f) else 0 for f in features]

            return np.array(features)
        except Exception as e:
            print(f"    RR feature extraction error: {e}")
            return np.zeros(n_out)

# Create the notebook-level feature extractor with the default sampling rate
# (sampling_rate=100); the data-loading function below reads this global.
preprocessor = AdvancedECGPreprocessor()
print("Fixed advanced preprocessor initialized!")


Fixed advanced preprocessor initialized!


In [None]:
def load_physionet_data_fixed(data_path, max_files=15):
    """Load PhysioNet Apnea-ECG data with proper segment-annotation alignment.

    Every ``.dat`` recording under ``data_path`` that has an ECG channel and an
    ``apn`` annotation file is cut into one-minute segments. Each segment starts
    at the sample position of its annotation, so signal and label stay aligned
    even if annotations are missing or do not start at sample 0. Features are
    computed with the notebook-level ``preprocessor``.

    Args:
        data_path: Directory that is searched recursively for ``.dat`` files.
        max_files: Maximum number of ``.dat`` files to consider, taken from the
            alphabetically sorted list. ``None`` processes every file.

    Returns:
        tuple: ``(X_features, X_raw, y_labels)`` as numpy arrays - per-segment
        feature vectors, the raw segments, and labels (1 for symbol ``'A'``,
        0 otherwise). All three are empty when nothing could be loaded.
    """
    # Find all .dat files. Sorted so the same records are selected on every run
    # (os.walk order depends on the filesystem).
    ecg_files = sorted(
        os.path.join(root, file)
        for root, dirs, files in os.walk(data_path)
        for file in files
        if file.endswith('.dat')
    )

    print(f"Found {len(ecg_files)} ECG files")

    X_features = []
    X_raw = []
    y_labels = []

    selected_files = ecg_files if max_files is None else ecg_files[:max_files]

    for file_path in selected_files:
        try:
            # Strip only the extension (str.replace would also alter directory names
            # that happen to contain '.dat').
            record_path = os.path.splitext(file_path)[0]
            record_name = os.path.basename(record_path)

            print(f"Processing {record_name}...")

            # Read the record and pick the ECG channel by name. Some recordings in the
            # dataset hold other signals only; their first channel must not be used as ECG.
            record = wfdb.rdrecord(record_path)
            sig_names = list(record.sig_name or [])
            ecg_channels = [
                idx for idx, name in enumerate(sig_names) if 'ECG' in str(name).upper()
            ]
            if not ecg_channels:
                print(f"  Skipping {record_name} - no ECG channel (signals: {sig_names})")
                continue
            ecg_signal = record.p_signal[:, ecg_channels[0]]

            # Read annotations
            try:
                annotation = wfdb.rdann(record_path, 'apn')
            except Exception:
                print(f"No annotations found for {record_name}, skipping...")
                continue
            apnea_labels = annotation.symbol

            # Calculate segments and ensure alignment
            segment_length = 6000  # 1 minute at 100Hz
            num_signal_segments = len(ecg_signal) // segment_length
            num_annotation_labels = len(apnea_labels)

            # Pair every label with the sample where its minute begins and keep only
            # the minutes that lie completely inside the recorded signal.
            labelled_minutes = [
                (int(start), symbol)
                for start, symbol in zip(annotation.sample, apnea_labels)
                if start >= 0 and start + segment_length <= len(ecg_signal)
            ]
            num_segments_to_process = len(labelled_minutes)

            print(f"  Signal segments: {num_signal_segments}, Annotation labels: {num_annotation_labels}")
            print(f"  Processing {num_segments_to_process} segments")

            if num_segments_to_process == 0:
                print(f"  Skipping {record_name} - no valid segments")
                continue

            segments_processed = 0
            # Process segments
            for i, (start_idx, symbol) in enumerate(labelled_minutes):
                segment = ecg_signal[start_idx:start_idx + segment_length]

                try:
                    # Extract advanced features using the fixed preprocessor
                    wavelet_emd_features = preprocessor.wavelet_emd_decomposition(segment)
                    rr_features = preprocessor.extract_rr_features(segment)

                    # Combine all features
                    combined_features = np.concatenate([wavelet_emd_features, rr_features])

                    # Skip if features are invalid (empty, NaN or infinite)
                    if combined_features.size == 0 or not np.all(np.isfinite(combined_features)):
                        print(f"    Skipping segment {i} - invalid features")
                        continue

                    X_features.append(combined_features)
                    X_raw.append(segment)

                    # Label encoding (A=apnea, N=normal)
                    y_labels.append(1 if symbol == 'A' else 0)
                    segments_processed += 1

                except Exception as e:
                    print(f"  Error processing segment {i}: {e}")
                    continue

            print(f"  Successfully processed {segments_processed} segments from {record_name}")

        except Exception as e:
            # Best effort: one unreadable record must not abort the whole load.
            print(f"Error processing {file_path}: {e}")
            continue

    return np.array(X_features), np.array(X_raw), np.array(y_labels)

# Run the loader on the extracted dataset and report what came back.
print("Loading PhysioNet data with fixed alignment...")
X_features, X_raw, y = load_physionet_data_fixed(extract_path)

# Overall counts and array shapes.
print("\nFinal Results:")
print(f"Loaded {len(X_features)} segments")
for caption, loaded_array in (
    ("Feature shape", X_features),
    ("Raw signal shape", X_raw),
    ("Labels shape", y),
):
    print(f"{caption}: {loaded_array.shape}")

# Class balance, or a hint when the loader returned nothing.
if len(y) == 0:
    print("No segments loaded - check data path and files")
else:
    normal_count = np.sum(y == 0)
    apnea_count = np.sum(y == 1)
    print(f"Apnea ratio: {np.mean(y):.2%}")
    print(f"Normal segments: {normal_count}")
    print(f"Apnea segments: {apnea_count}")


Loading PhysioNet data with fixed alignment...
Found 78 ECG files
Processing a01r...
  Signal segments: 492, Annotation labels: 489
  Processing 489 segments
    Skipping segment 7 - invalid features
    Skipping segment 226 - invalid features
    Skipping segment 351 - invalid features
  Successfully processed 486 segments from a01r
Processing c01...
  Signal segments: 483, Annotation labels: 484
  Processing 483 segments
  Successfully processed 483 segments from c01
Processing a04...
  Signal segments: 496, Annotation labels: 492
  Processing 492 segments
  Successfully processed 492 segments from a04
Processing x06...
No annotations found for x06, skipping...
Processing b01...
  Signal segments: 486, Annotation labels: 487
  Processing 486 segments
  Successfully processed 486 segments from b01
Processing b05...
  Signal segments: 432, Annotation labels: 433
  Processing 432 segments
  Successfully processed 432 segments from b05
Processing a01...
  Signal segments: 492, Annotation

In [None]:
# Save the loaded dataset (raw segments, features, labels, metadata) to Google Drive
import os
import pickle

import numpy as np

SAVE_DIR = '/content/drive/MyDrive'
raw_save_path = os.path.join(SAVE_DIR, 'X_raw_complete.npy')
features_save_path = os.path.join(SAVE_DIR, 'X_features_complete.npy')
labels_save_path = os.path.join(SAVE_DIR, 'y_labels_complete.npy')
info_save_path = os.path.join(SAVE_DIR, 'dataset_info.pkl')

print("Saving dataset to Google Drive...")

save_succeeded = False
try:
    # Save arrays
    np.save(raw_save_path, X_raw)
    np.save(features_save_path, X_features)
    np.save(labels_save_path, y)

    # Save dataset info. Values are converted to plain Python numbers so the
    # pickle can be read without depending on numpy scalar types.
    dataset_info = {
        'total_segments': len(y),
        'feature_shape': X_features.shape,
        'raw_shape': X_raw.shape,
        'apnea_ratio': float(np.mean(y)) if len(y) else float('nan'),
        'normal_segments': int(np.sum(y == 0)),
        'apnea_segments': int(np.sum(y == 1))
    }

    with open(info_save_path, 'wb') as f:
        pickle.dump(dataset_info, f)

    save_succeeded = True

    print("✅ Dataset saved successfully!")
    print(f"Files saved:")
    print(f"  - X_raw_complete.npy: {X_raw.shape}")
    print(f"  - X_features_complete.npy: {X_features.shape}")
    print(f"  - y_labels_complete.npy: {y.shape}")
    print(f"  - dataset_info.pkl: metadata")

except Exception as e:
    print(f"❌ Error saving dataset: {e}")

# Verify the save worked. This only runs after a successful save: otherwise a file
# left over from an earlier run could be read back and reported as a success.
if save_succeeded:
    try:
        # mmap_mode='r' reads the header and maps the data instead of loading the
        # whole array into memory a second time.
        test_load = np.load(raw_save_path, mmap_mode='r')
        if test_load.shape != X_raw.shape:
            raise ValueError(
                f"saved shape {test_load.shape} does not match in-memory shape {X_raw.shape}"
            )
        print(f"✅ Verification: Successfully saved {len(test_load)} segments")
    except Exception as e:
        print(f"❌ Verification failed - data may not be saved properly: {e}")
else:
    print("❌ Verification skipped - the dataset was not saved")


Saving dataset to Google Drive...
✅ Dataset saved successfully!
Files saved:
  - X_raw_complete.npy: (5289, 6000)
  - X_features_complete.npy: (5289, 55)
  - y_labels_complete.npy: (5289,)
  - dataset_info.pkl: metadata
✅ Verification: Successfully saved 5289 segments
