In [None]:
import polars as pl, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
import matplotlib.image as mpimg
import random

!ls /kaggle/input/physionet-ecg-image-digitization

## Config

In [None]:
# Root directory of the competition dataset; every path below is built from it
ROOT = Path('/kaggle/input/physionet-ecg-image-digitization')
# Fixed seed so the randomly chosen example ids are the same on every run
SEED = 10304
random.seed(SEED)

In [None]:
# Load both metadata tables that sit at the dataset root
train_df, test_df = (pl.read_csv(ROOT / csv_name) for csv_name in ('train.csv', 'test.csv'))

# Pick one example id per split (train first, then test, so the seeded
# random stream is consumed in a fixed order)
train_ids = train_df['id']
test_ids = test_df['id']
random_train_id = random.choice(train_ids)
random_test_id = random.choice(test_ids)

train_df.head()

In [None]:
# Preview the first rows of the test metadata table
test_df.head()

## Sampling frequency distribution

In [None]:
# `fs` is treated as a discrete variable, so count every distinct value exactly.
# Equal-width histogram bins over the value range would not line up with the
# individual sampling frequencies: close values can fall into the same bin while
# other bins stay empty.
fs_values, fs_counts = np.unique(train_df['fs'].to_numpy(), return_counts=True)

plt.bar(
    [str(value) for value in fs_values],  # categorical x axis: one bar per distinct fs
    fs_counts,
    edgecolor='black'
)
plt.title("Histogram of `fs` column")
plt.xlabel("fs")
plt.ylabel("Frequency")
plt.show()

## Show a random image

### Train

In [None]:
# Image variant number -> description of the variant.
# The number is also the zero-padded 4-digit suffix of the image file name.
n = {
    1: 'Original color ECG image generated by ECG-image-kit',
    3: 'Image printed in color and scanned in color',
    4: 'Image printed in color and scanned in black and white',
    5: 'Mobile photo of color printed image',
    6: 'Mobile photo of ECG on the screen of laptop',
    9: 'Mobile photo of stained and soaked printed ECG',
    10: 'Mobile photo of printed ECG with extensive damage',
    11: 'Scan of printed ECG image with mold in color',
    12: 'Scan of printed ECG image with mold in black and white',
}

fig, axes = plt.subplots(3, 3, figsize=(12, 8))
fig.suptitle(f"ECG Variations for ID {random_train_id}", fontsize=14, weight='bold')

# All variants of one record live in a folder named after the record id
train_image_dir = ROOT / 'train' / str(random_train_id)

# One panel per variant, filled row by row
for (i, caption), ax in zip(n.items(), axes.ravel()):
    variant_image = mpimg.imread(train_image_dir / f'{random_train_id}-{i:04d}.png')
    ax.imshow(variant_image)
    ax.set_title(caption, fontsize=9)
    ax.axis('off')

### Test

In [None]:
# Test images are stored flat as <id>.png under the test folder
test_image_path = ROOT / 'test' / f'{random_test_id}.png'
test_image = mpimg.imread(test_image_path)

plt.imshow(test_image)
plt.show()

### Training dataframe

In [None]:
# Metadata row of the sampled record, looked up once and read by position
record_meta = train_df.filter(pl.col("id") == random_train_id).row(0)
fs = record_meta[1]          # sampling frequency
sig_len = record_meta[2]     # total number of samples

# Per-record signal table plus a time axis: sample index divided by sampling frequency
signal_with_time = pl.read_csv(
    ROOT / "train" / str(random_train_id) / f"{random_train_id}.csv"
).with_columns(pl.Series("time", np.arange(sig_len) / fs))

# Put "time" first and cast every column to Float64 in a single select.
# A non-strict cast leaves nulls as nulls (and turns unconvertible values into
# nulls), so no explicit when/then/otherwise guard is required.
ordered_columns = ["time"] + [c for c in signal_with_time.columns if c != "time"]
random_train_id_df = signal_with_time.select([
    pl.col(c).cast(pl.Float64, strict=False)
    for c in ordered_columns
])

random_train_id_df

### Non-null range per column

In [None]:
def get_non_null_ranges(df: pl.DataFrame):
    """Find, for every column, the span from its first to its last non-null row.

    Returns a dict keyed by column name. Each value is ``(start, end, length)``:
    ``start`` is the index of the first non-null row, ``end`` is one past the
    last non-null row, and ``length`` is ``end - start``. A column without any
    non-null value maps to ``None``.

    Only the outer bounds are reported; nulls located between ``start`` and
    ``end`` are not detected.
    """
    total_rows = df.height
    spans = {}

    for name in df.columns:
        present = df[name].is_not_null().to_numpy()
        if not present.any():
            spans[name] = None
            continue
        first = present.argmax()                    # argmax of a bool array = first True
        stop = total_rows - present[::-1].argmax()  # one past the last True
        spans[name] = (first, stop, stop - first)

    return spans


ranges = get_non_null_ranges(random_train_id_df)
for col, span in ranges.items():
    # Columns with no non-null value are mapped to None, which cannot be
    # unpacked into (start, end, length) -- report them separately.
    if span is None:
        print(f"{col:>4s}: no non-null values")
        continue
    start, end, length = span
    # `end` is exclusive, so the last non-null row is end - 1
    print(f"{col:>4s}: non-null from {start} to {end-1} (length={length})")

### Null count

In [None]:
# One expression per column, each counting that column's nulls under the same name
null_count_exprs = [
    pl.col(name).null_count().alias(name)
    for name in random_train_id_df.columns
]

# Displayed as a single-row table
random_train_id_df.select(null_count_exprs)

## Plot (train)

In [None]:
def extract_non_null_df(df: pl.DataFrame, col_name: str) -> pl.DataFrame:
    """Cut ``df`` down to the rows between the first and last non-null ``col_name``.

    The result holds only the ``"time"`` column and ``col_name``. Rows inside
    the span are kept even when ``col_name`` is null there. If ``col_name``
    has no non-null value at all, ``None`` is returned instead of a frame.
    """
    present = df[col_name].is_not_null().to_numpy()
    if present.any():
        first = present.argmax()                      # first non-null row
        stop = len(present) - present[::-1].argmax()  # one past the last non-null row
        return df.slice(first, stop - first).select(["time", col_name])
    return None

# Every column except the shared time axis is handled as its own signal
signal_columns = [name for name in random_train_id_df.columns if name != "time"]

# Trim each signal to its non-null span (None when the column is entirely null)
trimmed_by_column = {
    name: extract_non_null_df(random_train_id_df, name)
    for name in signal_columns
}

# Keep only the columns for which a trimmed frame exists
split_dfs = {
    name: trimmed
    for name, trimmed in trimmed_by_column.items()
    if trimmed is not None
}

fig, axes = plt.subplots(6, 2, figsize=(14, 12))
axes = axes.flatten()  # flatten the 6x2 grid into a 1-D sequence of 12 axes

for i, (lead, df_lead) in enumerate(split_dfs.items()):
    # Convert to numpy arrays for plotting
    time_vals = df_lead["time"].to_numpy()
    signal_vals = df_lead[lead].to_numpy()

    # Plot each lead in its own panel
    ax = axes[i]
    ax.plot(time_vals, signal_vals, linewidth=0.8)
    ax.set_title(lead)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("mV")
    ax.grid(True)

# Hide any unused subplots (in case fewer than 12). The slice is derived from
# len(split_dfs) rather than the loop variable, so it is also correct when
# split_dfs is empty and the loop above never assigns `i`.
for unused_ax in axes[len(split_dfs):]:
    unused_ax.axis("off")

plt.tight_layout()
plt.show()