## Takeaways

1) Some of the printout images are rotated relative to the usual image orientation. For example, in some images the signals run vertically, so they have to be read from top to bottom.  
2) Extending point *1)*, there are more than two combinations of signal direction and image aspect ratio. For example: 1. signals run from left to right & image width is larger than height, 2. signals run from left to right & image height is larger than width, 3. signals run from top to bottom & image height is larger than width (e.g. `'/kaggle/input/physionet-ecg-image-digitization/train/2042290760/2042290760-0006.png'`). A 4th case (presumably signals running from top to bottom & image width larger than height) may also exist, but I have not come across it yet. These cases appear especially in mobile-captured images.  
3) Some of the signals in `0001` images extend down to the `25mm/s` and `10mm/mV` text and up to the top of the image. Thus, cropping the upper and lower parts might not be a reasonable pre-processing step. In particular, the `V6` lead seems to contain a very pronounced outlier among its values, as shown in `Section 7)`.  

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
from PIL import Image
from matplotlib import pyplot as plt
import seaborn as sns

import os 
import random

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1) Set the random seed

In [None]:
# Reference: https://www.kaggle.com/code/rhythmcam/random-seed-everything?scriptVersionId=77985442&cellId=2
# basic random seed
DEFAULT_RANDOM_SEED = 1105


def seedBasic(seed=DEFAULT_RANDOM_SEED):
    """Seed the standard-library and NumPy global random generators.

    Also exports ``PYTHONHASHSEED`` with the same value. Note that the
    variable is only read at interpreter start-up, so setting it here
    affects child processes rather than the running interpreter.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    
# tensorflow random seed 
import tensorflow as tf 
def seedTF(seed=DEFAULT_RANDOM_SEED):
    """Set TensorFlow's global random seed to ``seed``."""
    tf.random.set_seed(seed)
    
# torch random seed
import torch
def seedTorch(seed=DEFAULT_RANDOM_SEED):
    """Seed PyTorch (CPU and CUDA) and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed passed to ``torch.manual_seed`` and
            ``torch.cuda.manual_seed``.
    """
    # Disable the cuDNN auto-tuner and request deterministic kernels so that
    # repeated runs pick the same algorithms.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
      
# basic + tensorflow + torch 
def seedEverything(seed=DEFAULT_RANDOM_SEED):
    """Seed every random generator used in this notebook with one value.

    Calls ``seedBasic`` (Python / NumPy / PYTHONHASHSEED), ``seedTF``
    (TensorFlow) and ``seedTorch`` (PyTorch) in that order.

    Args:
        seed: Integer seed forwarded to each helper.
    """
    seedBasic(seed)
    seedTF(seed)
    seedTorch(seed)


# Actually apply the seed: the functions above only define the seeding logic,
# and without this call the random image sampling later in the notebook
# produces a different selection on every run.
seedEverything()

## 2) Set the paths

In [None]:
# Root of the competition data set on Kaggle
root_str = "/kaggle/input/physionet-ecg-image-digitization"
root = Path(root_str)

# Sub-folders holding the per-record files
folder_train, folder_test = root / "train", root / "test"

# Top-level metadata tables
path_train_csv, path_test_csv = root / "train.csv", root / "test.csv"

df_train = pd.read_csv(path_train_csv)

## 3) Load data

In [None]:
def load_file_paths(root_folder: Path, suffix: str = None):
    file_paths = root_folder.glob("**/*")
    result = []
    for p in file_paths:

        # files only
        if p.is_file():

            # if condition not met, skip
            if suffix is not None and not p.name.endswith(suffix):
                continue
            result.append(p)
    return result                

In [None]:
def plot_images(
    paths: list[Path | str],
    num_row: int,
    num_col: int,
    fig_size: tuple[float, float] | None = None
) -> None:
    """Show the images at ``paths`` in a ``num_row`` x ``num_col`` grid.

    Images are placed row by row: cell ``(i, j)`` shows
    ``paths[i * num_col + j]``. Cells without a corresponding path are
    left blank instead of raising.

    Args:
        paths: Image files to display, in display order.
        num_row: Number of grid rows.
        num_col: Number of grid columns.
        fig_size: Figure size in inches as ``(width, height)``. Defaults to
            3.5 inches per cell.
    """
    fig, axes = plt.subplots(
        nrows=num_row,
        ncols=num_col,
        # always return a 2-D array of axes, even for a single row / column
        squeeze=False,
    )
    if fig_size is None:
        factor = 3.5
        fig_size = (factor * num_col, factor * num_row)
    fig.set_size_inches(fig_size)

    # ``axes.flat`` walks the grid in row-major order, which matches the
    # index ``i * num_col + j`` of cell (i, j).
    for img_idx, ax in enumerate(axes.flat):
        if img_idx >= len(paths):
            # fewer images than cells: hide the unused axis
            ax.axis("off")
            continue
        # close the file as soon as the pixels have been handed to matplotlib
        with Image.open(paths[img_idx]) as img:
            ax.imshow(img)

    plt.show()

In [None]:
# One list of training image paths per image type; the type is the 4-digit
# suffix that precedes ".png" in the file name.
(
    paths_0001,
    paths_0003,
    paths_0004,
    paths_0005,
    paths_0006,
    paths_0009,
    paths_0010,
    paths_0011,
    paths_0012,
) = (
    load_file_paths(folder_train, f"{image_type}.png")
    for image_type in (
        "0001", "0003", "0004", "0005", "0006",
        "0009", "0010", "0011", "0012",
    )
)

# Every CSV file found below the training folder
paths_csv = load_file_paths(folder_train, ".csv")

## 4) Analysis

#### Shape consistency

In [None]:
# Data preparation
# Per image type: list of (height, width) tuples and list of height/width ratios
shapes = {}
aspect_ratios = {}
paths_group = [
    paths_0001, paths_0003, paths_0004, paths_0005, paths_0006,
    paths_0009, paths_0010, paths_0011, paths_0012
]

for idx, paths in enumerate(paths_group):
    if paths:
        # The image type is the 4 characters before ".png", taken from the
        # first file of the group (e.g. "...-0001.png" -> "0001").
        tmp_type = paths[0].name[-8:-4]
        type_shapes = shapes[tmp_type] = []
        type_ars = aspect_ratios[tmp_type] = []

        for p in paths:
            # Only the header is needed for the size; the context manager
            # releases the file handle right away instead of leaking one
            # handle per image.
            with Image.open(p) as tmp_img:
                height, width = tmp_img.height, tmp_img.width
            type_shapes.append((height, width))
            type_ars.append(height / width)
    # progress indicator: one number per finished group
    print(f"{idx}", end=" ")
print("Getting shape information done!")

In [None]:
# set the variables
# Number of distinct shapes / aspect ratios per image type. Counting per type
# (rather than via a DataFrame) also works when the types do not all contain
# the same number of images.
nunique_shapes = pd.Series({k: len(set(v)) for k, v in shapes.items()})
nunique_ars = pd.Series({k: len(set(v)) for k, v in aspect_ratios.items()})
# All aspect ratios flattened into one 1-D array for the histogram
ars = np.concatenate([np.asarray(vs, dtype=float) for vs in aspect_ratios.values()])
min_val = min(nunique_shapes.min(), nunique_ars.min()) - 1
max_val = max(nunique_shapes.max(), nunique_ars.max()) + 1

# plot
fig, axes = plt.subplots(2, 1)
fig.set_size_inches(10, 8)

# -- 1) Overview
nunique_shapes.plot(kind="bar", color="orange", ax=axes[0], label="Shape")
nunique_ars.plot(kind="bar", color="skyblue", ax=axes[0], alpha=0.5, label="AR")
axes[0].set_xlabel("Image Type")
axes[0].set_ylabel("Unique Count")
axes[0].set_yticks(range(min_val, max_val + 1))
axes[0].legend()
# A count of 1 means every image of that type has the same shape / AR
axes[0].axhline(1, color="red", linestyle="--", alpha=0.5)
axes[0].text(len(nunique_shapes), 1, "Above this line has multiple kinds within its type", color="red", alpha=0.7)
axes[0].set_title("Unique Count Per Image Type")

# -- 2) Aspect Ratio distribution
# The label gives the legend below an entry; without it matplotlib draws an
# empty legend and emits a warning.
axes[1].hist(ars, bins=500, label="AR")
axes[1].set_xlabel("AR values")
ar_ticks = np.linspace(ars.min(), ars.max(), 20).round(2)
axes[1].set_xticks(
    ticks=ar_ticks,
    labels=ar_ticks,
    rotation=45
)
axes[1].set_ylabel("AR Count")
axes[1].set_title("AR Distribution")
axes[1].legend()
plt.tight_layout()
plt.show()

Many images have an AR (height / width) of `1.33`, i.e. they are taller than they are wide. This means there are rotated images.

#### Image examples

In [None]:
# Show a random sample of "0001" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0001)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0003" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0003)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0004" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0004)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0005" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0005)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0006" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0006)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0009" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0009)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0010" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0010)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0011" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0011)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

In [None]:
# Show a random sample of "0012" images in a 2-row grid
num_to_show = 8
selected_paths = np.asarray(paths_0012)
# Sample without replacement so the same image is not shown twice; fall back
# to sampling with replacement only if there are fewer images than slots.
selected_indices = np.random.choice(
    len(selected_paths),
    num_to_show,
    replace=len(selected_paths) < num_to_show,
)
plot_images(
    selected_paths[selected_indices],
    2,
    (num_to_show + 1) // 2,  # ceil(num_to_show / 2) columns
)

# Print the size of the first sampled image
rep_idx = selected_indices[0]
# NOTE: PIL's ``size`` is (width, height); the file is closed after reading it
with Image.open(selected_paths[rep_idx]) as rep_img:
    rep_size = rep_img.size
    rep_ar = rep_img.height / rep_img.width
print()
print(f"Selected indices: {selected_indices}")
print(f"Shape and Aspect Ratio for the image with index [{rep_idx}]")
print(f"Shape: {rep_size}", f"Aspect Ratio: {rep_ar:.3f}")

## 5) Signal Footprints

In [None]:
# set the example
# Reference image (greyscale); its shape defines the shape of the footprint mask
with Image.open(paths_0001[0]) as img:
    rep_grey = img.convert("L")
heatmap = np.zeros(np.asarray(rep_grey).shape, dtype=bool)
# Grey values below this threshold are treated as signal (dark) pixels
threshold_from_bg = 75

# gather information: a pixel is marked once it is dark in at least one image.
# All images must share the reference shape, otherwise NumPy raises here.
for p in paths_0001:
    # the context manager closes each file instead of leaking one handle per image
    with Image.open(p) as img:
        tmp_img_arr = np.asarray(img.convert("L")) < threshold_from_bg
    heatmap |= tmp_img_arr

# display the example: keep dark pixels, paint everything else white (255)
t = np.array(rep_grey)
t = np.where(t < threshold_from_bg, t, 255)
plt.figure()
plt.title("Example of How To Extract Signals From BG")
plt.imshow(t)
plt.show()

In [None]:
# Invert the boolean footprint mask: pixels that were below the threshold in
# any image become 0, all remaining pixels become 1.
heatmap_reverse = 1 - heatmap.astype(int)

plt.figure(figsize=(20, 10))
plt.title("All Footprints of Signals From 0001 Images")
sns.heatmap(heatmap_reverse)
plt.show()

## 6) How To Construct `0001` From the Raw Signals

In [None]:
# set the example id
# Record used as the worked example: its "0001" image and its raw-signal CSV
example_id = "1063816858"
example_dir = folder_train / example_id
example_img = example_dir / f"{example_id}-0001.png"
example_csv = example_dir / f"{example_id}.csv"

df_example = pd.read_csv(example_csv)

In [None]:
# gather data
# Leads concatenated per plotted row, from left to right; the last row holds
# lead II on its own.
lead_order = [
    ["I", "aVR", "V1", "V4"],
    ["II", "aVL", "V2", "V5"],
    ["III", "aVF", "V3", "V6"],
    ["II"],
]

# presumably every record is 10 s long, making this samples per second -- TODO confirm
example_fs = len(df_example) / 10
half_sig_len = int(example_fs * 5)
is_fs_odd = example_fs % 2 == 1  # currently not used below

example = []
for row_idx, row in enumerate(lead_order):
    row_signal = []
    for lead in row:
        # keep only the samples that are actually present for this lead
        samples = df_example[lead].dropna()
        if row_idx == 1 and lead == "II":
            # In the second row lead II contributes only its first
            # ``half_sig_len // 2`` samples (presumably because the CSV holds
            # the whole lead II recording, which the last row plots in full).
            samples = samples.iloc[: half_sig_len // 2]
        row_signal.extend(samples.to_list())
    example.append(row_signal)

In [None]:
# plot them
# Layout: panels "0".."3" are stacked in the left column (one per signal row),
# panel "4" fills the whole right column (the original image).
fig, axes = plt.subplot_mosaic("04;14;24;34")
fig.set_size_inches(16, 6)
y_lim_min, y_lim_max = -2, 2

# left column: reconstructed rows, all on the same y-range
for i in range(4):
    signal_ax = axes[str(i)]
    signal_ax.plot(example[i])
    signal_ax.set_ylim(y_lim_min, y_lim_max)
axes["0"].set_title("Signals From Raw Data")

# right column: the image the rows are compared against
image_ax = axes["4"]
image_ax.imshow(Image.open(example_img))
image_ax.set_title("Signals From Original Image")

plt.tight_layout()
plt.show()

## 7) Min, Max Values For Each Lead

In [None]:
ending_leads = [row[-1] for row in lead_order]
min_dict = {}
max_dict = {}

# Leads to scan: every row except the one at index 3, which is skipped as a
# redundant check.
leads_to_scan = [
    lead
    for row_idx, row in enumerate(lead_order)
    if row_idx != 3
    for lead in row
]

# For each CSV, record the per-lead minimum and maximum value
for p in paths_csv:
    df_tmp = pd.read_csv(p)
    for lead in leads_to_scan:
        min_dict.setdefault(lead, []).append(df_tmp[lead].min())
        max_dict.setdefault(lead, []).append(df_tmp[lead].max())

In [None]:
print("Min and Max Values for Each Lead\n")

# Same lead selection as when the values were collected: skip the row at index 3
reported_leads = [
    lead
    for row_idx, row in enumerate(lead_order)
    if row_idx != 3
    for lead in row
]
for lead in reported_leads:
    # overall extremes across all scanned CSV files
    print(f"{lead.rjust(4)} = Min : {min(min_dict[lead])} / Max : {max(max_dict[lead])}")