In [None]:
import os

import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from scipy.signal import resample

## 1. Data Preparation

In [None]:
# Test manifest; the prediction code below reads its `id`, `lead` and
# `number_of_rows` columns.
TEST_CSV_PATH = '/kaggle/input/physionet-ecg-image-digitization/test.csv'
df_test = pd.read_csv(TEST_CSV_PATH)

In [None]:
# Display the test manifest as the cell output.
df_test

## 2. Prediction

In [None]:
# Crop boxes are (x1, y1, x2, y2) pixel tuples, passed as-is to Image.crop.
# The short strips sit on a regular 3-row x 4-column grid, so every boundary
# is written once and the boxes are assembled from the grid.
_STRIP_X_RANGES = ((125, 605), (615, 1095), (1105, 1590), (1600, 2085))
_STRIP_Y_RANGES = ((580, 850), (850, 1130), (1130, 1410))
_STRIP_LEADS = (
    ('I', 'aVR', 'V1', 'V4'),    # row 1
    (None, 'aVL', 'V2', 'V5'),   # row 2 (first cell, the short 'II' strip at (125, 850, 605, 1130), is not used)
    ('III', 'aVF', 'V3', 'V6'),  # row 3
)

CROP_COORD_DICT = {
    lead: (x1, y1, x2, y2)
    for (y1, y2), leads_in_row in zip(_STRIP_Y_RANGES, _STRIP_LEADS)
    for (x1, x2), lead in zip(_STRIP_X_RANGES, leads_in_row)
    if lead is not None
}
# row 4: 'II' is taken from the full-width strip below the grid instead.
CROP_COORD_DICT['II'] = (125, 1410, 2085, 1620)

# Per-lead factor that adjust_scale multiplies each extracted trace by.
# NOTE(review): the two literals look like "reference amplitude / measured pixel
# amplitude" for lead I -- confirm where they were obtained.
I_REF_SCALE = 0.719 / 51.41695713533565

_ROW_1_LEADS = ('I', 'aVR', 'V1', 'V4')
_I_CROP_HEIGHT = CROP_COORD_DICT['I'][3] - CROP_COORD_DICT['I'][1]

# Row-1 leads use the reference scale unchanged; every other lead multiplies it
# by (lead-I crop height) / (own crop height).
# NOTE(review): extract_y_coord returns row positions in pixels, not values
# normalised by the crop height, so it is unclear why the factor should depend
# on the crop height -- confirm this ratio is intended.
CALIB_SCALE_DICT = {
    lead: (
        I_REF_SCALE
        if lead in _ROW_1_LEADS
        else I_REF_SCALE * _I_CROP_HEIGHT / (bottom - top)
    )
    for lead, (_left, top, _right, bottom) in CROP_COORD_DICT.items()
}


def clean_and_resample(y: np.ndarray, num: int):
    """Fill the gaps in a trace and resample it to ``num`` samples.

    NaN entries are filled by ``Series.interpolate`` (applied in both
    directions), followed by a forward and a backward fill as a last resort.
    The filled trace is then handed to ``scipy.signal.resample``.

    Args:
        y: 1-D trace, possibly containing NaN.
        num: number of samples in the returned signal.

    Returns:
        The resampled trace as returned by ``scipy.signal.resample``.
    """
    gap_free = (
        pd.Series(y)
        .interpolate(limit_direction="both")
        .ffill()
        .bfill()
        .to_numpy()
    )
    return resample(gap_free, num)


def min_max_scale(y: np.ndarray):
    """Rescale ``y`` linearly so its minimum maps to 0 and its maximum to 1.

    The divisor is floored at 1e-6, so a constant input yields all zeros
    instead of a division by zero.
    """
    lowest = y.min()
    value_range = y.max() - lowest
    return (y - lowest) / max(value_range, 1e-6)


def adjust_scale(y: np.ndarray, lead: str):
    """Apply the per-lead calibration factor and centre the trace on zero.

    The factor is looked up in ``CALIB_SCALE_DICT`` (an unknown ``lead`` raises
    ``KeyError``); the mean of the scaled trace is then subtracted from it.
    """
    factor = CALIB_SCALE_DICT[lead]
    calibrated = factor * y
    return calibrated - calibrated.mean()


def extract_y_coord(pil_image_cropped, lead) -> tuple:
    """Extract one vertical trace position per pixel column of a cropped strip.

    A pixel counts as part of the trace when every one of its (up to three)
    colour channels is below 60. For each column the mean row index of those
    pixels is taken; columns with no such pixel are NaN.

    Args:
        pil_image_cropped: the cropped strip; anything ``np.asarray`` turns into
            an (H, W, C) or (H, W) array. Channels beyond the third are ignored.
        lead: lead name; its length sets the width of the lower-left region
            that is excluded from the mask.

    Returns:
        ``(y, mask)`` where ``y`` is a float array of length W holding the
        negated mean row index per column (negated so that pixels nearer the
        top of the image give larger values) and ``mask`` is the (H, W) boolean
        mask that was used. If no column has a trace pixel, a warning is
        printed and ``y`` is all zeros.
    """
    pixels = np.asarray(pil_image_cropped)
    if pixels.ndim == 2:
        # Single-channel (grayscale) input: add a channel axis so the slicing
        # and thresholding below work instead of raising IndexError.
        pixels = pixels[:, :, None]
    cropped_np = pixels[:, :, :3]
    mask = (cropped_np < 60).all(axis=-1)

    # Blank out the lower-left corner of the strip (presumably where the lead
    # label is printed -- confirm) so dark pixels there are not read as trace.
    text_region_mask = np.ones_like(mask)
    if lead != 'II':
        text_region_mask_max_x = int(len(lead)*0.035*text_region_mask.shape[1])
    else:
        # The 'II' crop is the full-width strip, hence the extra 0.25 factor.
        text_region_mask_max_x = int(0.25*len(lead)*0.035*text_region_mask.shape[1])
    text_region_mask_min_y = int(0.56*text_region_mask.shape[0])
    text_region_mask[text_region_mask_min_y:, :text_region_mask_max_x] = 0
    mask &= text_region_mask

    # Mean row index of the masked pixels in each column.
    y_idx = np.arange(mask.shape[0])[:, None]
    sum_y = (mask * y_idx).sum(axis=0)
    count_y = mask.sum(axis=0)
    has_trace = count_y > 0
    y_coords = np.full(mask.shape[1], np.nan, dtype=float)
    y_coords[has_trace] = sum_y[has_trace] / count_y[has_trace]
    if np.all(np.isnan(y_coords)):
        print("[Warning] extract_y_coord: all NaN detected, filling with zeros.")
        y_coords[:] = 0.0
    return -y_coords, mask


def predict_single_id(base_id: int, df_test: pd.DataFrame = df_test, debug: bool = False) -> pd.DataFrame:
    """Digitise every lead requested in ``df_test`` for one test image.

    For each row of ``df_test`` whose ``id`` equals ``base_id``, the lead's box
    from ``CROP_COORD_DICT`` is cropped out of ``test/{base_id}.png``, the trace
    is extracted, resampled to the row's ``number_of_rows`` and calibrated.

    Args:
        base_id: record id; also the stem of the image file name.
        df_test: manifest with ``id``, ``lead`` and ``number_of_rows`` columns.
            The default is bound to the module-level ``df_test`` at the time
            this function is defined.
        debug: if True, plot the extracted signal and the mask, and display the
            cropped strip for every lead.

    Returns:
        A frame with columns ``id`` (``'{base_id}_{row_id}_{lead}'``) and
        ``value``; the per-lead frames are concatenated without resetting the
        index.

    Raises:
        KeyError: if a requested lead has no entry in ``CROP_COORD_DICT``.
        ValueError: from ``pd.concat`` when ``df_test`` has no row for ``base_id``.
    """
    df_test_tgt_id = df_test[df_test['id'] == base_id]
    df_pred_list = []
    # The context manager closes the image file on exit, even when no lead is
    # processed or an error occurs part-way through.
    with Image.open(f"/kaggle/input/physionet-ecg-image-digitization/test/{base_id}.png") as pil_image:
        for _, sr_row in df_test_tgt_id.iterrows():
            lead = sr_row.lead
            n_rows = sr_row.number_of_rows
            crop_bbox = CROP_COORD_DICT[lead]
            pil_image_cropped = pil_image.crop(crop_bbox)
            y_coords, mask = extract_y_coord(pil_image_cropped, lead)
            y_processed = clean_and_resample(y_coords, n_rows)
            y_processed = adjust_scale(y_processed, lead)
            if debug:
                plt.plot(y_processed)
                plt.title(f'extracted signal / {base_id}_{lead}')
                plt.show()
                plt.imshow(mask)
                plt.title(f'mask / {base_id}_{lead}')
                plt.show()
                display(pil_image_cropped)
                print(100*'=')
            df_pred = pd.DataFrame({
                'id': [f'{base_id}_{row_id}_{lead}' for row_id in range(len(y_processed))],
                'value': y_processed
            })
            df_pred_list.append(df_pred)
    df_pred = pd.concat(df_pred_list)
    return df_pred

In [None]:
# Distinct record ids listed in the test manifest.
uniq_ids = df_test['id'].unique().tolist()

# Debug plots are switched off when KAGGLE_IS_COMPETITION_RERUN is set to a
# non-empty value, and on otherwise.
debug = not os.getenv('KAGGLE_IS_COMPETITION_RERUN')
per_id_predictions = [predict_single_id(base_id, debug=debug) for base_id in uniq_ids]
df_pred = pd.concat(per_id_predictions)

In [None]:
# Display the combined predictions as the cell output.
df_pred

## 3. Submission

In [None]:
# Write the predictions to the working directory, leaving out the frame index.
submission_path = 'submission.csv'
df_pred.to_csv(submission_path, index=False)

In [None]:
# Read the written file back and display it to check its contents.
pd.read_csv('submission.csv')

## 4. Train Sample Evaluation

In [None]:
def predict_single_id_train(base_id: int, df_test: pd.DataFrame = df_test, debug: bool = False) -> pd.DataFrame:
    """Digitise every lead in ``CROP_COORD_DICT`` for one training record.

    The target length of each lead is the number of non-NaN values in that
    lead's column of ``train/{base_id}/{base_id}.csv``; the image read is
    ``train/{base_id}/{base_id}-0001.png``.

    Args:
        base_id: training record id (directory and file-name stem).
        df_test: not used by this function; kept so the signature mirrors
            ``predict_single_id``.
        debug: if True, plot the extracted signal and display the cropped strip
            for every lead.

    Returns:
        A frame with columns ``id`` (``'{base_id}_{row_id}_{lead}'``),
        ``value``, ``base_id`` and ``lead``; the per-lead frames are
        concatenated without resetting the index.

    Raises:
        KeyError: if the ground-truth CSV has no column for one of the leads.
    """
    df_train_tgt_id = pd.read_csv(f'/kaggle/input/physionet-ecg-image-digitization/train/{base_id}/{base_id}.csv')
    df_pred_list = []
    # The context manager closes the image file on exit, even if processing
    # one of the leads raises.
    with Image.open(f"/kaggle/input/physionet-ecg-image-digitization/train/{base_id}/{base_id}-0001.png") as pil_image:
        for lead, crop_bbox in CROP_COORD_DICT.items():
            n_rows = len(df_train_tgt_id[lead].dropna())
            pil_image_cropped = pil_image.crop(crop_bbox)
            y_coords, mask = extract_y_coord(pil_image_cropped, lead)
            y_processed = clean_and_resample(y_coords, n_rows)
            y_processed = adjust_scale(y_processed, lead)
            if debug:
                plt.plot(y_processed)
                plt.title(f'{base_id}_{lead}')
                plt.show()
                display(pil_image_cropped)
                print(100*'=')
            df_pred = pd.DataFrame({
                'id': [f'{base_id}_{row_id}_{lead}' for row_id in range(len(y_processed))],
                'value': y_processed
            })
            df_pred['base_id'] = base_id
            df_pred['lead'] = lead
            df_pred_list.append(df_pred)
    df_pred = pd.concat(df_pred_list)
    return df_pred

In [None]:
# Training record used for the visual comparison below; swap in one of the
# commented-out ids to inspect a different record.
sample_base_id = 1006427285
# sample_base_id = 1220748004
# sample_base_id = 1475354244
# sample_base_id = 1643754023

In [None]:
# Ground-truth CSV of the sample record; its per-lead columns are plotted
# against the predictions below.
sample_gt_csv_path = f'/kaggle/input/physionet-ecg-image-digitization/train/{sample_base_id}/{sample_base_id}.csv'
df_gt_tgt_id = pd.read_csv(sample_gt_csv_path)

In [None]:
# Digitise every lead of the sample training record (debug plots off).
df_pred_tgt_id = predict_single_id_train(base_id=sample_base_id)

In [None]:
# One figure per lead: predicted trace overlaid on the ground-truth trace,
# both re-indexed from 0.
for lead in CROP_COORD_DICT:
    is_lead = df_pred_tgt_id['lead'] == lead
    pred_values = df_pred_tgt_id.loc[is_lead, 'value'].reset_index(drop=True)
    pred_values.plot(label='pred')
    gt_values = df_gt_tgt_id[lead].dropna().reset_index(drop=True)
    gt_values.plot(label='gt')
    plt.legend()
    plt.title(lead)
    plt.show()

## 