In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import json
import cv2
import numpy as np

In [2]:
device = torch.device("cuda")
device

device(type='cuda')

Creating torch Dataset

In [3]:
class KeypointsData(Dataset):
    def __init__(self, img_dir, data_file):
        self.img_dir = img_dir
        with open(data_file, 'r') as f:
            self.data = json.load(f)

        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.458, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        item = self.data[idx]
        img = cv2.imread(f"{self.img_dir}/{item['id']}.png")
        h, w = img.shape[:2]

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = self.transform(img)
        kps = np.array(item['kps']).flatten()
        kps = kps.astype(np.float32)

        kps[::2] *= 224.0 / w
        kps[1::2] *= 224.0 / h

        return img, kps

In [4]:
train_dataset = KeypointsData('data/images', 'data/data_train.json')
val_dataset = KeypointsData('data/images', 'data/data_val.json')

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_laoder = DataLoader(val_dataset, batch_size=8, shuffle=True)

Model

In [5]:
model = models.resnet50(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, 14*2)



In [6]:
model = model.to(device)

In [7]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [8]:
epochs = 20
for epoch in range(epochs):
    for i, (imgs, kps) in enumerate(train_loader):
        imgs = imgs.to(device)
        kps = kps.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, kps)
        loss.backward()
        optimizer.step()
        
        if i%10 == 0:
            print(f"Epoch {epoch}, iter {i}, loss: {loss.item()}")

Epoch 0, iter 0, loss: 14877.4228515625
Epoch 0, iter 10, loss: 15018.525390625
Epoch 0, iter 20, loss: 14354.2421875
Epoch 0, iter 30, loss: 13765.30859375
Epoch 0, iter 40, loss: 13376.8251953125
Epoch 0, iter 50, loss: 13272.4169921875
Epoch 0, iter 60, loss: 12993.8779296875
Epoch 0, iter 70, loss: 12034.55859375
Epoch 0, iter 80, loss: 12132.248046875
Epoch 0, iter 90, loss: 11596.3603515625
Epoch 0, iter 100, loss: 11310.21484375
Epoch 0, iter 110, loss: 10999.0908203125
Epoch 0, iter 120, loss: 10990.232421875
Epoch 0, iter 130, loss: 9436.390625
Epoch 0, iter 140, loss: 9965.013671875
Epoch 0, iter 150, loss: 8889.2978515625
Epoch 0, iter 160, loss: 8830.0703125
Epoch 0, iter 170, loss: 9116.833984375
Epoch 0, iter 180, loss: 8249.5830078125
Epoch 0, iter 190, loss: 7854.21142578125
Epoch 0, iter 200, loss: 7942.404296875
Epoch 0, iter 210, loss: 8030.85400390625
Epoch 0, iter 220, loss: 8091.71484375
Epoch 0, iter 230, loss: 6526.54150390625
Epoch 0, iter 240, loss: 7013.46289

In [9]:
torch.save(model.state_dict(), "keypoint_20epoch.pth")

In [10]:
import torch
import numpy as np
from torch.utils.data import DataLoader
import torchvision.models as models

# 1) Device & model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=False)
model.fc = torch.nn.Linear(model.fc.in_features, 14*2)
model.load_state_dict(torch.load('models/keypoints_model.pth', map_location='cpu'))
model.to(device).eval()

# 2) Val loader
val_ds = KeypointsData('data/images', 'data/data_val.json')
val_loader = DataLoader(val_ds, batch_size=16, shuffle=False)

mse_criterion = torch.nn.MSELoss(reduction='sum')
total_se = 0.0       # sum of squared errors
total_coords = 0     # number of coordinate predictions
all_euc = []         # to accumulate euclidean errors per keypoint

with torch.no_grad():
    for imgs, gt_kps in val_loader:
        imgs    = imgs.to(device)
        gt_kps  = gt_kps.to(device)   # shape [B, 28]

        pred_kps = model(imgs)        # [B, 28]

        # --- MSE ---
        se = mse_criterion(pred_kps, gt_kps).item()
        total_se    += se
        total_coords += gt_kps.numel()

        # --- Euclidean per-keypoint ---
        pk = pred_kps.cpu().numpy().reshape(-1, 14, 2)
        gk = gt_kps   .cpu().numpy().reshape(-1, 14, 2)
        euc = np.linalg.norm(pk - gk, axis=2)  # [B, 14]
        all_euc.append(euc)

# Compute metrics
mse  = total_se / total_coords
rmse = np.sqrt(mse)
mean_euc = np.vstack(all_euc).mean()

print(f"Val MSE per coordinate:      {mse:.4f}")
print(f"Val RMSE per coordinate:     {rmse:.2f} pixels")
print(f"Mean Euclidean error:        {mean_euc:.2f} pixels")




Val MSE per coordinate:      12.2960
Val RMSE per coordinate:     3.51 pixels
Mean Euclidean error:        2.94 pixels


In [7]:
import numpy as np

# Assuming you still have `all_euc` from your validation loop:
# all_euc is a list of arrays, each [batch_size, 14]
errors = np.vstack(all_euc)   # shape [N_images × B, 14]

mean_per_kp = errors.mean(axis=0)   # one mean per keypoint
for i, err in enumerate(mean_per_kp):
    print(f"KP{i:2d}: {err:.2f} px")


KP 0: 3.37 px
KP 1: 3.98 px
KP 2: 3.42 px
KP 3: 3.52 px
KP 4: 3.06 px
KP 5: 3.17 px
KP 6: 3.56 px
KP 7: 3.21 px
KP 8: 2.67 px
KP 9: 3.12 px
KP10: 1.76 px
KP11: 2.22 px
KP12: 2.31 px
KP13: 1.74 px
