In [None]:
!pip install numpy pandas scikit-learn tensorflow matplotlib seaborn



In [None]:
import os
import numpy as np
import pandas as pd
from scipy import signal
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from collections import deque
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')

# Notebook-wide settings, looked up by key.
CONFIG = dict(
    sampling_rate=360,           # presumably Hz — TODO confirm against the dataset
    window_size=180,             # same value as the loader's default `window`
    risk_window_minutes=10,
    prediction_horizon_hours=6,
)

# Class index -> class name; the index is the position in the tuple below.
CLASSES = dict(enumerate(('Normal', 'SVE', 'VEB', 'Fusion', 'Unknown')))

# Weight per class name, listed in the same order as CLASSES.
RISK_WEIGHTS = dict(zip(CLASSES.values(), (0.0, 2.0, 5.0, 8.0, 3.0)))


In [None]:
import kagglehub

# Fetch the dataset through kagglehub; the call returns the local directory
# that holds the downloaded files.
dataset_handle = "taejoongyoon/mitbit-arrhythmia-database"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

import os
import numpy as np
import pandas as pd

def _read_mitbih_annotations(ann_path):
    """Parse one ``<record>annotations.txt`` file by field position.

    The header row carries names with embedded spaces ("Sample #", "Sub Chan"),
    so splitting it on whitespace yields more header cells than data fields and
    no column is ever called ``'Sample #'``. The header is therefore discarded
    and each data row is read positionally: field 0 is the time, field 1 the
    sample index and field 2 the annotation type.

    Args:
        ann_path: path to the annotation text file.
    Returns:
        (samples, symbols): integer array of sample indices and an array of the
        matching annotation type strings, in file order.
    """
    samples, symbols = [], []
    with open(ann_path, 'r') as fh:
        next(fh, None)  # discard the header row
        for line in fh:
            fields = line.split()
            # Skip blank or malformed rows instead of aborting the whole record.
            if len(fields) < 3 or not fields[1].isdigit():
                continue
            samples.append(int(fields[1]))
            symbols.append(fields[2])
    return np.array(samples, dtype=int), np.array(symbols)


def load_mitbih_kaggle_easy(data_dir, window=180, lead=1):
    """
    Load fixed-length beat windows from MIT-BIH Kaggle files where:
    - Signal CSVs have a header row and columns e.g. ['sample #', 'MLII', 'V5']
    - Annotation TXT files are named '<record>annotations.txt', have a header
      row, and are separated by spaces/tabs

    One window is cut around every annotation row; no filtering by annotation
    type is done here, so any non-beat markers present in the file are returned
    as well. Windows that would extend past either end of the signal are
    skipped (a message is printed for each).

    Args:
        data_dir: folder path to Kaggle MIT-BIH files
        window: samples per beat to extract (180 by default). The window starts
            ``window // 2`` samples before the annotated sample and always
            spans exactly ``window`` samples, including for odd values.
        lead: column index of the ECG channel in the signal CSV
            (default 1 = MLII based on sample; column 0 is the sample number)
    Returns:
        beats: np.ndarray (num_beats, window)
        labels: np.ndarray (num_beats,) of annotation type strings
    Raises:
        ValueError: if ``lead`` is not a valid column index for a signal CSV.
    """
    beats = []
    labels = []

    suffix = '.csv'
    records = sorted(
        fname[:-len(suffix)]
        for fname in os.listdir(data_dir)
        if fname.endswith(suffix)
    )
    print(f"Found {len(records)} records: {records[:5]}...")

    half = window // 2
    for rec in records:
        sig_path = os.path.join(data_dir, f"{rec}.csv")
        ann_path = os.path.join(data_dir, f"{rec}annotations.txt")

        if not os.path.exists(sig_path):
            print(f"Missing {sig_path}, skipping.")
            continue
        if not os.path.exists(ann_path):
            print(f"Missing {ann_path}, skipping.")
            continue

        # Read CSV signal file with header row
        sig = pd.read_csv(sig_path, header=0)
        if lead >= len(sig.columns):
            raise ValueError(f"Lead index {lead} out of range in {sig_path}")
        ecg = sig.iloc[:, lead].to_numpy()

        # Sample indices and annotation types, read by position (see helper).
        rpeaks, annots = _read_mitbih_annotations(ann_path)

        rec_beats = 0
        for idx, label in zip(rpeaks, annots):
            start = idx - half
            end = start + window  # exactly `window` samples, even when window is odd
            if start < 0 or end > len(ecg):
                print(f"Skipping beat idx {idx} (window out of range)")
                continue
            beats.append(ecg[start:end])
            labels.append(label)
            rec_beats += 1
        print(f"Loaded {rec_beats} beats from record {rec}")

    if len(beats) == 0:
        print("No beats loaded! Please check file contents and paths.")
        return np.zeros((0, window)), np.array([])

    return np.stack(beats), np.array(labels)



Using Colab cache for faster access to the 'mitbit-arrhythmia-database' dataset.
Path to dataset files: /kaggle/input/mitbit-arrhythmia-database


In [None]:
# Derive the data folder from the download location returned by kagglehub
# (`path`) instead of hard-coding an absolute path, and take the window length
# from CONFIG so it stays in sync with the rest of the notebook.
data_dir = os.path.join(path, "mitbih_database")
beats, labels = load_mitbih_kaggle_easy(data_dir, window=CONFIG['window_size'], lead=1)

# Every extracted window must have exactly one label.
assert len(beats) == len(labels), "beats and labels are misaligned"

print(f'Beats shape: {beats.shape}, Labels shape: {labels.shape}')
print('Sample label examples:', labels[:10])

Found 48 records: ['100', '101', '102', '103', '104']...


KeyError: 'Sample #'