# Beaver Tutorial 6: ECG Arrhythmia Classification (Data Owner)

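Host the real ECG data privately and publish a synthetic mock of it, so analyses can be developed and iterated on safely before they are run against the real data.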

Run this alongside `06-ecg-ds.ipynb` in a separate tab.

## Step 1: Setup

In [None]:
# Install the third-party packages used in this tutorial (-q keeps the installer output quiet).
# NOTE(review): scikit-learn and matplotlib are not imported in this notebook; presumably
# they are needed when running the analysis submitted by the peer — confirm.
!uv pip install pandas numpy scikit-learn matplotlib sdv -q

In [1]:
# Uncomment for quick local testing without SyftBox
# import os
# import tempfile
# from beaver import Twin
# import beaver
# # Create temp folder for session
# temp_dir = tempfile.mkdtemp()

# # Set environment for local mode
# os.environ["BEAVER_LOCAL_MODE"] = "1"
# os.environ["BEAVER_USER"] = "alice@example.com"
# os.environ["BEAVER_SESSION_ID"] = "test_session"
# os.environ["BEAVER_LOCAL_SESSION_DIR"] = temp_dir
# os.environ["BEAVER_AUTO_ACCEPT"] = "1"

# print(f"Session dir: {temp_dir}")
# bv = beaver.ctx()
# session = bv.active_session()

In [None]:
import beaver
from beaver import Twin

# Obtain the Beaver context and the session that is currently active for it.
bv = beaver.ctx()
session = bv.active_session()
# NOTE(review): reset(force=True) presumably clears existing session state so the tutorial
# starts clean — confirm before running against a session whose state matters.
session.reset(force=True)

# Show which user this notebook runs as and who the session peer is.
print(f"You: {bv.user}")
print(f"Peer: {session.peer}")

## Step 2: Load Real Data and Build Synthetic Mock with SDV

In [None]:
# Install the local package located at ../python into the active environment.
# NOTE(review): presumably this is the `beaver` package itself; the Step 1 cell already
# imports beaver, so on a fresh environment this install may need to run first — confirm.
!uv pip install ../python

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer

In [3]:
from beaver import sample_data

In [4]:
# Download the real ECG arrhythmia sample dataset.
# `real_path` is the location of the downloaded file; it is read with pd.read_csv below.
ds = sample_data.ecg_arrhythmia
real_path = ds.real.download()

✓ Already downloaded: /Users/madhavajay/.biovault/cache/beaver/sample-data/ecg_arrhythmia/MIT-BIH Arrhythmia Database.csv


In [5]:
# Name of the column holding the class label for each row.
label_col = "type"


def build_mock_from_real(real_df, seed=42, sample_size=5000, max_feature_cols=20):
    """Derive a reduced real table and a same-sized synthetic mock of it.

    Steps:
      1. If ``real_df`` has more than ``sample_size`` rows, draw a random
         subsample of that size (``seed`` is used only for this draw).
      2. Keep ``"record"`` and the label column (when present) plus the first
         ``max_feature_cols`` remaining columns, in their original order.
      3. Fit an SDV ``GaussianCopulaSynthesizer`` on that reduced table and
         sample as many synthetic rows as the reduced table has.

    Args:
        real_df: DataFrame with the real data; must contain the ``label_col`` column.
        seed: ``random_state`` passed to ``DataFrame.sample`` when subsampling.
        sample_size: Maximum number of real rows to keep.
        max_feature_cols: Cap on the number of feature columns; ``None`` keeps all.

    Returns:
        Tuple ``(real_subset, mock_df, feature_cols)``: the reduced real frame
        with a fresh 0..n-1 index, the synthetic frame produced by SDV, and the
        list of feature column names that were kept.
    """
    table = "ecg"
    id_and_label = ("record", label_col)

    # Cap the number of rows the synthesizer is fitted on.
    if len(real_df) > sample_size:
        real_df = real_df.sample(sample_size, random_state=seed).reset_index(drop=True)

    # Everything that is neither the record id nor the label counts as a feature.
    feature_cols = [name for name in real_df.columns if name not in id_and_label]
    if max_feature_cols is not None:
        feature_cols = feature_cols[:max_feature_cols]

    # Id and label first, then the features; skip any column the frame lacks.
    cols = [name for name in ["record", label_col, *feature_cols] if name in real_df.columns]
    subset = real_df[cols].copy()

    # Auto-detect column types, then force the label (and record id, if present)
    # to be modelled as categorical.
    metadata = Metadata.detect_from_dataframe(data=subset, table_name=table)
    categorical_cols = [label_col]
    if "record" in subset.columns:
        categorical_cols.append("record")
    for column_name in categorical_cols:
        metadata.update_column(table_name=table, column_name=column_name, sdtype="categorical")

    # Fit on the reduced real table and draw one synthetic row per real row.
    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(subset)
    mock_df = synthesizer.sample(len(subset))

    real_subset = real_df[cols].reset_index(drop=True)
    return real_subset, mock_df, feature_cols

In [6]:
# Load the downloaded CSV and drop every row that contains a missing value.
raw_df = pd.read_csv(real_path).dropna()
# Display the frame as the cell output (pandas truncates the rendering of large tables).
raw_df

Unnamed: 0,record,type,0_pre-RR,0_post-RR,0_pPeak,0_tPeak,0_rPeak,0_sPeak,0_qPeak,0_qrs_interval,...,1_qPeak,1_qrs_interval,1_pq_interval,1_qt_interval,1_st_interval,1_qrs_morph0,1_qrs_morph1,1_qrs_morph2,1_qrs_morph3,1_qrs_morph4
0,101,N,76,313.0,0.074347,-0.160548,1.036401,-0.285662,-0.026824,41,...,0.025930,2,18,22,2,0.025930,0.025930,0.025930,0.025436,0.025436
1,101,N,313,315.0,-0.052079,-0.264784,0.886597,-0.366298,-0.059710,21,...,-0.042009,26,27,62,9,-0.042009,-0.029498,0.005012,0.030892,0.002986
2,101,N,315,321.0,-0.062151,-0.296983,0.991859,-0.410306,-0.065686,22,...,0.009528,3,8,12,1,0.009528,0.009528,0.008786,0.008786,0.008368
3,101,N,321,336.0,-0.063322,-0.281386,1.034903,-0.403880,-0.071750,22,...,-0.020536,6,9,16,1,-0.020536,-0.020257,-0.018965,-0.016968,-0.014555
4,101,N,336,344.0,-0.062915,1.046914,1.046408,1.046408,-0.074639,11,...,0.016053,16,5,31,10,0.016053,0.006742,0.002782,-0.007798,-0.051155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100684,234,N,240,241.0,-0.110230,1.420339,1.403346,1.403346,-0.115682,14,...,-0.069234,5,6,20,9,-0.069234,-0.068378,-0.065373,-0.060779,-0.054383
100685,234,N,241,243.0,-0.119014,1.424782,1.420328,1.420328,-0.134406,13,...,-0.069530,7,5,20,8,-0.069530,-0.069290,-0.066317,-0.053437,-0.045120
100686,234,N,243,252.0,0.173524,-0.443214,1.436878,-0.714642,-0.136431,38,...,-0.089396,9,7,24,8,-0.089396,-0.088155,-0.078611,-0.066763,-0.058650
100687,234,N,252,244.0,-0.119820,-0.401522,1.337051,-0.688560,-0.125076,27,...,-0.065487,3,1,12,8,-0.065487,-0.065487,-0.064941,-0.064941,-0.061120


In [7]:
# Build the reduced real frame, its synthetic mock and the kept feature list,
# using the function's default seed, sample size and feature-column cap.
real_df, mock_df, feature_cols = build_mock_from_real(raw_df)


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [8]:
# Sanity-check the mock against the real subset: row counts, number of feature
# columns, and the per-class label counts of each frame.
print(f"Real rows: {len(real_df)} | Mock rows: {len(mock_df)}")
print(f"Features: {len(feature_cols)}")
print(f"Real classes: {real_df[label_col].value_counts().to_dict()}")
print(f"Mock classes: {mock_df[label_col].value_counts().to_dict()}")

Real rows: 5000 | Mock rows: 5000
Features: 20
Real classes: {'N': 4509, 'VEB': 324, 'SVEB': 138, 'F': 28, 'Q': 1}
Mock classes: {'N': 4539, 'VEB': 308, 'SVEB': 117, 'F': 35, 'Q': 1}


## Step 3: Publish Twin (Mock Public, Real Private)

In [9]:
# Install pyarrow before publishing the Twin.
# NOTE(review): presumably needed to serialize the DataFrames when publishing — confirm.
!uv pip install pyarrow

[2mUsing Python 3.13.5 environment at: /Users/madhavajay/dev/biovault-beaver/workspace1/.venv[0m
[2mAudited [1m1 package[0m [2min 2ms[0m[0m


In [10]:
# Wrap both frames in a Twin: the synthetic mock is the public side,
# the real data is the private side.
ecg = Twin(public=mock_df, private=real_df, name="ecg")
# Register the Twin under the key "ecg" in the session's remote variables.
session.remote_vars["ecg"] = ecg

print("Published ECG Twin (mock public, real private)")

🌍 Using PUBLIC data from Twin 'ecg...'
📢 Published Twin 'ecg' (public side available at: data/bd288016e232495d809006c88523845b.beaver)
Published ECG Twin (mock public, real private)


## Step 4: Wait for Analysis Request

In [11]:
# Wait for an incoming request on the `ecg` Twin (timeout=600; presumably seconds — confirm).
request = bv.wait_for_request(ecg, timeout=600)
# Display the received request as the cell output so it can be inspected.
request

⏳ Waiting for request on 'ecg'...


KeyboardInterrupt: 

## Step 5: Review Mock vs Real and Approve

In [None]:
# Run the request via run_both(); the result exposes a public and a private output.
# NOTE(review): presumably "both" means the mock (public) and real (private) sides — confirm.
result = request.run_both()

# Fall back to empty dicts so the .get() lookups below still work when a side is empty/None.
mock_metrics = result.data.public or {}
real_metrics = result.data.private or {}

# Compare the reported accuracy on each side ('N/A' when the key is absent).
print(f"Mock accuracy: {mock_metrics.get('accuracy', 'N/A')}")
print(f"Real accuracy: {real_metrics.get('accuracy', 'N/A')}")

# Show the figures from the private side, then approve the result.
# NOTE(review): approve() runs unconditionally in the same cell as the review output, so
# executing this cell approves without a pause for inspection — confirm this is intended.
result.data.show_figures("private")
result.approve()
print("Approved results")