In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import cv2
import os
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from scipy.signal import medfilt

In [None]:
# Lead names handled by the notebook; a lead's position in this list is used
# as its channel index in targets, predictions and the knowledge graph.
LEADS = [
    'I', 'II', 'III',
    'aVR', 'aVL', 'aVF',
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
]

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Size (in pixels) every preprocessed image is resized to before entering the model.
IMG_HEIGHT = 256
IMG_WIDTH = 512

# Optimisation hyper-parameters.
BATCH_SIZE = 8
EPOCHS = 5
LR = 1e-4

# Weight of the knowledge-graph term added to the reconstruction loss.
LAMBDA_KG = 0.1

In [None]:
# Undirected lead-to-lead relations, written once per pair.
# build_kg_edge_index() expands each pair into both directions.
KG_EDGES = [
    # Pairs among I, II and III.
    ('I', 'II'), ('I', 'III'), ('II', 'III'),
    # Pairs among aVR, aVL and aVF.
    ('aVR', 'aVL'), ('aVR', 'aVF'), ('aVL', 'aVF'),
    # Pairs linking the two groups above.
    ('I', 'aVL'), ('II', 'aVF'), ('III', 'aVF'),
    # Consecutive V leads.
    ('V1', 'V2'), ('V2', 'V3'), ('V3', 'V4'), ('V4', 'V5'), ('V5', 'V6'),
    # V leads two positions apart.
    ('V1', 'V3'), ('V2', 'V4'), ('V3', 'V5'), ('V4', 'V6'),
    # Pairs linking I / aVL with V5 / V6.
    ('I', 'V6'), ('aVL', 'V5'), ('aVL', 'V6'),
]

In [None]:
def build_kg_edge_index(leads_list, edges=None):
    """Translate lead-name pairs into a tensor of directed index pairs.

    Args:
        leads_list: Sequence of lead names; a lead's position in the sequence
            is the index used in the returned tensor.
        edges: Iterable of ``(lead_a, lead_b)`` name pairs. Defaults to the
            module-level ``KG_EDGES``.

    Returns:
        A ``torch.long`` tensor of shape ``(num_directed_edges, 2)``. Every
        pair whose two leads are both present in ``leads_list`` contributes two
        rows, ``(a, b)`` and ``(b, a)``. Pairs naming an unknown lead are
        skipped. When nothing matches, the result has shape ``(0, 2)``.
    """
    if edges is None:
        edges = KG_EDGES

    lead_to_idx = {lead: idx for idx, lead in enumerate(leads_list)}

    edge_pairs = []
    for lead1, lead2 in edges:
        if lead1 in lead_to_idx and lead2 in lead_to_idx:
            idx1 = lead_to_idx[lead1]
            idx2 = lead_to_idx[lead2]
            # Store both directions so the relation is symmetric.
            edge_pairs.append((idx1, idx2))
            edge_pairs.append((idx2, idx1))

    if not edge_pairs:
        # torch.tensor([]) would be 1-D with shape (0,), which breaks
        # ``edge_index[:, 0]`` downstream; keep the two-column layout instead.
        return torch.empty((0, 2), dtype=torch.long)
    return torch.tensor(edge_pairs, dtype=torch.long)

In [None]:
def get_adaptive_threshold(ima):
    """Choose a binarisation threshold from the brightness of channel 2.

    Args:
        ima: Image array indexed as ``[row, column, channel]`` with at least
            three channels. Channel index 2 is the red channel if the image
            uses the BGR layout produced by ``cv2.imread``.

    Returns:
        ``170`` when the mean of channel 2 is above 200, ``160`` when it is
        below 150, and ``None`` for anything in between (both bounds
        inclusive) so the caller can apply its own default.
    """
    channel_mean = np.mean(ima[:, :, 2])

    if channel_mean > 200:
        return 170
    if channel_mean < 150:
        return 160
    # Mid-range brightness: no opinion, the caller decides.
    return None

In [None]:
def preprocess_ecg_image(image_path):
    """Load an ECG image and turn it into a fixed-size float mask.

    Steps: read the file with OpenCV, drop the top rows, threshold channel 2,
    and resize the resulting 0/1 mask to ``(IMG_HEIGHT, IMG_WIDTH)``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``float32`` array of shape ``(IMG_HEIGHT, IMG_WIDTH)``. Before the
        resize a pixel is ``1.0`` where channel 2 exceeds the threshold and
        ``0.0`` otherwise; the resize uses OpenCV's default interpolation.
        Returns ``None`` when the file cannot be read or when the image is not
        taller than the crop offset, so nothing would remain after cropping.
    """
    ima = cv2.imread(image_path)
    if ima is None:
        return None

    # Images that are exactly 1700 rows tall lose 420 rows from the top,
    # every other size loses 400.
    crop_top = 420 if ima.shape[0] == 1700 else 400
    if ima.shape[0] <= crop_top:
        # The crop would be empty and cv2.resize rejects empty input; report
        # the image as unusable the same way an unreadable file is reported.
        return None

    # Compute the adaptive threshold once (it averages the whole image) and
    # fall back to 165 when it has no opinion.
    threshold = get_adaptive_threshold(ima)
    if threshold is None:
        threshold = 165

    ima_crop = (ima[crop_top:, :, 2] > threshold).astype(np.float32)
    # cv2.resize takes the target size as (width, height).
    return cv2.resize(ima_crop, (IMG_WIDTH, IMG_HEIGHT))

In [None]:
class ECGDataset(Dataset):
    """Dataset pairing a preprocessed ECG image with its per-lead signals.

    Each record ``<id>`` is expected under ``<data_dir>/<id>/`` as the image
    ``<id>-0001.png`` and, in training mode, the signal table ``<id>.csv``
    with one column per lead name.

    Args:
        df: DataFrame with an ``id`` column; one row per record.
        data_dir: Directory containing one sub-directory per record id.
        is_train: When True, items are ``(image, target)``; otherwise
            ``(image, id)``.
        target_length: Number of samples every lead is resampled to. Must
            match the model's output length. Defaults to 2500.
    """

    def __init__(self, df, data_dir, is_train=True, target_length=2500):
        self.df = df
        self.data_dir = data_dir
        self.is_train = is_train
        self.target_length = target_length

    def __len__(self):
        """Return the number of records."""
        return len(self.df)

    @staticmethod
    def _normalize_and_resample(values, target_length):
        """Z-score a 1-D signal and linearly resample it to ``target_length``.

        An empty input (for example a lead column that was entirely NaN before
        ``dropna``) yields an all-zero signal instead of making ``np.interp``
        raise on empty sample points.
        """
        signal = np.asarray(values, dtype=np.float32)
        if signal.size == 0:
            return np.zeros(target_length, dtype=np.float32)
        # The epsilon keeps a constant signal from dividing by zero.
        signal = (signal - signal.mean()) / (signal.std() + 1e-8)
        resampled = np.interp(
            np.linspace(0, 1, target_length),
            np.linspace(0, 1, len(signal)),
            signal,
        )
        return resampled.astype(np.float32)

    def __getitem__(self, idx):
        """Return ``(image, target)`` in training mode, else ``(image, id)``.

        ``image`` is a float tensor of shape ``(1, IMG_HEIGHT, IMG_WIDTH)``;
        an image that cannot be preprocessed is replaced by zeros. ``target``
        is a float tensor of shape ``(len(LEADS), target_length)`` with rows in
        ``LEADS`` order; leads missing from the CSV are all zeros.
        """
        row = self.df.iloc[idx]
        ecg_id = row['id']

        img_path = os.path.join(self.data_dir, str(ecg_id), f"{ecg_id}-0001.png")
        img = preprocess_ecg_image(img_path)
        if img is None:
            img = np.zeros((IMG_HEIGHT, IMG_WIDTH), dtype=np.float32)
        # Add the channel dimension expected by the convolutional encoder.
        img_tensor = torch.from_numpy(img).unsqueeze(0).float()

        if not self.is_train:
            return img_tensor, ecg_id

        csv_path = os.path.join(self.data_dir, str(ecg_id), f"{ecg_id}.csv")
        df_signal = pd.read_csv(csv_path)
        signals = []
        for lead in LEADS:
            if lead in df_signal.columns:
                signals.append(
                    self._normalize_and_resample(
                        df_signal[lead].dropna().values, self.target_length
                    )
                )
            else:
                signals.append(np.zeros(self.target_length, dtype=np.float32))
        target = torch.from_numpy(np.stack(signals)).float()
        return img_tensor, target

In [None]:
class DoubleConv(nn.Module):
    """Two stacked ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU`` stages.

    The padding keeps the spatial size unchanged; only the channel count goes
    from ``in_ch`` to ``out_ch``.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        stages = []
        current_ch = in_ch
        # First stage maps in_ch -> out_ch, second stage maps out_ch -> out_ch.
        for _ in range(2):
            stages.extend([
                nn.Conv2d(current_ch, out_ch, 3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(inplace=True),
            ])
            current_ch = out_ch
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply both convolution stages to ``x``."""
        return self.conv(x)

In [None]:
class ECGUNetWithKG(nn.Module):
    """U-Net style encoder/decoder with one regression head per ECG lead.

    The decoder output is pooled to a ``1 x (output_length // 10)`` map,
    flattened, and fed to ``n_leads`` independent MLP heads that each predict
    one signal of ``output_length`` samples. A second set of per-lead MLPs maps
    every predicted signal to an ``embedding_dim`` vector; those embeddings are
    only computed when ``return_embeddings=True``.

    Args:
        n_leads: Number of leads, i.e. prediction and embedding heads.
        output_length: Number of samples predicted per lead.
        embedding_dim: Size of each per-lead embedding.
        dropout: Dropout probability used in both the lead heads and the
            embedding heads. Defaults to 0.3.

    Input height and width should be multiples of 32: the encoder downsamples
    by 2, 2, 2 and 4, and the upsampled maps must match the skip connections
    they are concatenated with.
    """

    def __init__(self, n_leads=12, output_length=2500, embedding_dim=128, dropout=0.3):
        super().__init__()
        self.n_leads = n_leads
        self.output_length = output_length
        self.embedding_dim = embedding_dim

        # Encoder: channels double at each level while resolution shrinks.
        self.enc1 = DoubleConv(1, 64)
        self.pool1 = nn.MaxPool2d(2)
        self.enc2 = DoubleConv(64, 128)
        self.pool2 = nn.MaxPool2d(2)
        self.enc3 = DoubleConv(128, 256)
        self.pool3 = nn.MaxPool2d(2)
        self.enc4 = DoubleConv(256, 512)
        self.pool4 = nn.MaxPool2d(4)
        self.bottleneck = DoubleConv(512, 1024)

        # Decoder: each transposed convolution undoes the matching pooling
        # step; the DoubleConv input is twice as wide because the skip
        # connection is concatenated on the channel axis.
        self.up1 = nn.ConvTranspose2d(1024, 512, 4, stride=4)
        self.dec1 = DoubleConv(1024, 512)
        self.up2 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec2 = DoubleConv(512, 256)
        self.up3 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec3 = DoubleConv(256, 128)
        self.up4 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec4 = DoubleConv(128, 64)

        pooled_width = output_length // 10
        self.global_pool = nn.AdaptiveAvgPool2d((1, pooled_width))

        self.lead_heads = nn.ModuleList([
            nn.Sequential(
                nn.Linear(64 * pooled_width, 512),
                nn.ReLU(),
                # Was ``nn.Dropout(0,3)``, i.e. p=0 with a truthy ``inplace``
                # argument, which silently disabled dropout in these heads.
                nn.Dropout(dropout),
                nn.Linear(512, output_length),
            ) for _ in range(n_leads)
        ])
        self.embedding_layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(output_length, 256),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(256, embedding_dim),
            ) for _ in range(n_leads)
        ])

    def forward(self, x, return_embeddings=False):
        """Predict all lead signals for a batch of images.

        Args:
            x: Tensor of shape ``(batch, 1, height, width)``.
            return_embeddings: When True, also return the per-lead embeddings.

        Returns:
            ``predictions`` of shape ``(batch, n_leads, output_length)``, or the
            tuple ``(predictions, embeddings)`` with ``embeddings`` of shape
            ``(batch, n_leads, embedding_dim)`` when ``return_embeddings`` is
            True.
        """
        # Encoder path; keep every level for the skip connections.
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool1(e1))
        e3 = self.enc3(self.pool2(e2))
        e4 = self.enc4(self.pool3(e3))
        b = self.bottleneck(self.pool4(e4))

        # Decoder path with skip connections.
        d1 = self.dec1(torch.cat([self.up1(b), e4], dim=1))
        d2 = self.dec2(torch.cat([self.up2(d1), e3], dim=1))
        d3 = self.dec3(torch.cat([self.up3(d2), e2], dim=1))
        d4 = self.dec4(torch.cat([self.up4(d3), e1], dim=1))

        # Collapse the height axis and flatten to one vector per sample.
        features = torch.flatten(self.global_pool(d4), 1)

        predictions = torch.stack([head(features) for head in self.lead_heads], dim=1)
        if not return_embeddings:
            return predictions

        embeddings = torch.stack(
            [emb_layer(predictions[:, i, :]) for i, emb_layer in enumerate(self.embedding_layers)],
            dim=1,
        )
        return predictions, embeddings
        

In [None]:
def knowledge_graph_loss(embeddings, edge_index):
    """Mean Euclidean distance between the embeddings of connected leads.

    Args:
        embeddings: Tensor of shape ``(batch, n_leads, embedding_dim)``.
        edge_index: Long tensor of shape ``(num_edges, 2)``; each row holds the
            indices of two leads whose embeddings should be close.

    Returns:
        A scalar tensor: the L2 distance of every edge, averaged over the edges
        of each sample and then over the batch. Returns a zero tensor on the
        embeddings' device when there are no edges or the batch is empty.
    """
    # ``edge_index.size() == 0`` compared a torch.Size with an int and was
    # always False, so the empty case was never caught; numel() is the
    # reliable emptiness check.
    if edge_index.numel() == 0 or embeddings.size(0) == 0:
        return torch.tensor(0.0, device=embeddings.device)

    # Gather both endpoints for the whole batch at once: (batch, edges, dim).
    source_embeddings = embeddings[:, edge_index[:, 0]]
    target_embeddings = embeddings[:, edge_index[:, 1]]
    distances = torch.norm(source_embeddings - target_embeddings, p=2, dim=-1)
    # Mean over edges per sample, then mean over the batch.
    return distances.mean(dim=1).mean()
        

In [None]:
def _run_epoch(model, loader, criterion, edge_index, lambda_kg, optimizer=None, desc=None):
    """Run one pass over ``loader`` and return the mean ``(recon, kg, total)`` losses.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one it is put in eval mode and gradients are disabled.
    ``desc`` enables a tqdm progress bar with that label.
    """
    is_training = optimizer is not None
    model.train(is_training)

    recon_sum = kg_sum = total_sum = 0.0
    n_batches = 0
    batches = tqdm(loader, desc=desc) if desc is not None else loader
    with torch.set_grad_enabled(is_training):
        for imgs, targets in batches:
            imgs = imgs.to(DEVICE)
            targets = targets.to(DEVICE)
            if is_training:
                optimizer.zero_grad()
            predictions, embeddings = model(imgs, return_embeddings=True)
            recon_loss = criterion(predictions, targets)
            kg_loss = knowledge_graph_loss(embeddings, edge_index)
            total_loss = recon_loss + lambda_kg * kg_loss
            if is_training:
                total_loss.backward()
                optimizer.step()
            recon_sum += recon_loss.item()
            kg_sum += kg_loss.item()
            total_sum += total_loss.item()
            n_batches += 1

    # Guard against an empty loader so the averages do not divide by zero.
    n_batches = max(n_batches, 1)
    return recon_sum / n_batches, kg_sum / n_batches, total_sum / n_batches


def train(model,train_loader,val_loader,edge_index,epochs=EPOCHS,lambda_kg=LAMBDA_KG):
    """Train ``model`` and checkpoint the weights with the best validation loss.

    The objective is ``MSE(predictions, targets) + lambda_kg * knowledge_graph_loss``.
    After every epoch the model is evaluated on ``val_loader``, the
    ``ReduceLROnPlateau`` scheduler is stepped with the validation total loss,
    and the state dict is written to ``best_unet_kg_model.pth`` whenever that
    loss improves.

    Args:
        model: Network whose forward accepts ``return_embeddings=True`` and
            returns ``(predictions, embeddings)``.
        train_loader: Iterable of ``(images, targets)`` training batches.
        val_loader: Iterable of ``(images, targets)`` validation batches.
        edge_index: Tensor of lead index pairs passed to
            ``knowledge_graph_loss``.
        epochs: Number of passes over ``train_loader``.
        lambda_kg: Weight of the knowledge-graph term.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    reconstruction_criterion = nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)
    edge_index = edge_index.to(DEVICE)
    best_val_loss = float('inf')

    for epoch in range(epochs):
        # Everything below runs once per epoch. Previously the averaging,
        # validation, scheduler step and checkpointing sat inside the batch
        # loop, so they ran after every batch and left the model in eval mode
        # for the remainder of the epoch.
        train_recon_loss, train_kg_loss, train_total_loss = _run_epoch(
            model, train_loader, reconstruction_criterion, edge_index, lambda_kg,
            optimizer=optimizer, desc=f"Epoch{epoch+1}/{epochs}",
        )
        val_recon_loss, val_kg_loss, val_total_loss = _run_epoch(
            model, val_loader, reconstruction_criterion, edge_index, lambda_kg,
        )

        scheduler.step(val_total_loss)
        print(f'Epoch {epoch+1}:')
        print(f'  Train - Recon: {train_recon_loss:.4f}, KG: {train_kg_loss:.4f}, Total: {train_total_loss:.4f}')
        print(f'  Val   - Recon: {val_recon_loss:.4f}, KG: {val_kg_loss:.4f}, Total: {val_total_loss:.4f}')

        if val_total_loss < best_val_loss:
            best_val_loss = val_total_loss
            torch.save(model.state_dict(), 'best_unet_kg_model.pth')
            print(f'Model saved (val_total_loss={val_total_loss:.4f})')
        

In [None]:
def predict_and_save(model, test_df, output_path,
                     test_dir='/kaggle/input/physionet-ecg-image-digitization/test'):
    """Predict every requested lead segment and write the submission CSV.

    Args:
        model: Trained network; called as ``model(x, return_embeddings=False)``
            on a ``(1, 1, IMG_HEIGHT, IMG_WIDTH)`` tensor moved to ``DEVICE``.
        test_df: DataFrame with the columns ``id``, ``lead`` and
            ``number_of_rows``. Rows that share an ``id`` should be adjacent so
            the image is only processed once per record.
        output_path: Destination CSV; overwritten if it already exists.
        test_dir: Directory holding the test images as ``<id>.png``.

    The output has the columns ``id`` and ``value`` with one line per sample,
    where ``id`` is ``<record id>_<sample index>_<lead name>``. Each predicted
    signal is linearly resampled to ``number_of_rows`` samples and smoothed
    with a 5-tap median filter. Rows naming an unknown lead print ``unknown``
    and are skipped; rows requesting no samples are skipped as well.
    """
    model.eval()
    old_id = None
    cached_signals = None

    # Open the file once (mode 'w' truncates any previous content) instead of
    # re-opening it in append mode for every row of test_df.
    with open(output_path, 'w', newline='', encoding='utf-8') as out_file:
        pd.DataFrame(columns=["id", "value"]).to_csv(out_file, index=False)

        for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing"):
            current_id = row["id"]
            if current_id != old_id:
                # New record: run the model once and cache all lead signals.
                old_id = current_id
                img_path = os.path.join(test_dir, f"{current_id}.png")
                img = preprocess_ecg_image(img_path)
                if img is None:
                    img = np.zeros((IMG_HEIGHT, IMG_WIDTH), dtype=np.float32)
                img_tensor = torch.from_numpy(img).unsqueeze(0).unsqueeze(0).float().to(DEVICE)
                with torch.no_grad():
                    outputs = model(img_tensor, return_embeddings=False)
                    cached_signals = outputs[0].cpu().numpy()

            lead_name = row['lead']
            # iterrows() can hand back a float for numeric columns; linspace
            # and range need an int.
            number_of_rows = int(row["number_of_rows"])
            if lead_name not in LEADS:
                print("unknown")
                continue
            if number_of_rows <= 0:
                continue

            signal = cached_signals[LEADS.index(lead_name)]
            signal_resampled = np.interp(
                np.linspace(0, 1, number_of_rows),
                np.linspace(0, 1, len(signal)),
                signal,
            )

            # Replace NaNs before filtering so the median filter cannot smear
            # them over neighbouring samples.
            mean_val = np.nanmean(signal_resampled)
            if np.isnan(mean_val):
                mean_val = 0.0
            signal_resampled = np.nan_to_num(signal_resampled, nan=mean_val)
            signal_resampled = medfilt(signal_resampled, kernel_size=5)

            chunk = pd.DataFrame({
                "id": [f"{current_id}_{t}_{lead_name}" for t in range(number_of_rows)],
                "value": signal_resampled.astype(float),
            })
            chunk.to_csv(out_file, index=False, header=False)

    print(f"✓ Submission saved to {output_path}")

In [None]:
if __name__ == '__main__':
    # Seed the RNGs so weight initialisation and DataLoader shuffling are
    # repeatable across "Restart & Run All".
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Single place for the dataset location instead of repeating the literal.
    data_root = '/kaggle/input/physionet-ecg-image-digitization'
    train_dir = os.path.join(data_root, 'train')

    train_df = pd.read_csv(os.path.join(data_root, 'train.csv'))
    test_df = pd.read_csv(os.path.join(data_root, 'test.csv'))

    print(f"Train DF shape: {train_df.shape}")
    print(f"Test DF shape: {test_df.shape}")

    kg_edge_index = build_kg_edge_index(LEADS)
    print(f"Knowledge Graph: {len(KG_EDGES)} unique edges, {kg_edge_index.size(0)} directed edges")

    # Positional 80/20 split: the first 80% of rows train, the rest validate.
    split_idx = int(0.8 * len(train_df))
    train_data = train_df[:split_idx]
    val_data = train_df[split_idx:]

    # The validation set also needs targets, hence is_train=True for both.
    train_dataset = ECGDataset(train_data, train_dir, is_train=True)
    val_dataset = ECGDataset(val_data, train_dir, is_train=True)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    model = ECGUNetWithKG(n_leads=12, output_length=2500, embedding_dim=128).to(DEVICE)
    print(f'Model parameters: {sum(p.numel() for p in model.parameters()):,}')

    train(model, train_loader, val_loader, kg_edge_index, epochs=EPOCHS, lambda_kg=LAMBDA_KG)

    # Reload the best checkpoint written by train(); map_location keeps the
    # load working when the checkpoint device differs from the current one.
    model.load_state_dict(torch.load('best_unet_kg_model.pth', map_location=DEVICE))
    predict_and_save(model, test_df, 'submission.csv')