In [None]:
import h5py

# Specify the path to your .h5 file
file_path = r"C:\Users\20200337\Desktop\PROGRAMMATION\02_DOG_RACE_PREDICTION\1000_images_web_dataset.h5"

# Open the HDF5 file
with h5py.File(file_path, 'r') as h5_file:
    # Function to recursively explore groups and datasets
    def explore_h5_group(group, indent=0):
        for key in group:
            item = group[key]
            print("  " * indent + f"- {key}: {type(item)}")
            if isinstance(item, h5py.Group):
                # Recursively explore groups
                explore_h5_group(item, indent + 1)
            elif isinstance(item, h5py.Dataset):
                # Print dataset shape and dtype
                print("  " * (indent + 1) + f"Shape: {item.shape}, Data type: {item.dtype}")

    # Start exploration from the root group
    print("Exploring HDF5 file structure:")
    explore_h5_group(h5_file)


Exploring HDF5 file structure:
- class_names: <class 'h5py._hl.dataset.Dataset'>
  Shape: (120,), Data type: object
- images: <class 'h5py._hl.dataset.Dataset'>
  Shape: (705334, 224, 224, 3), Data type: float32
- labels: <class 'h5py._hl.dataset.Dataset'>
  Shape: (705334,), Data type: int32


---

In [5]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Cell 1: Imports
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from tqdm import tqdm
from PIL import Image
import logging

In [8]:
import torch

In [9]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[DEBUG] Using device:", DEVICE)

[DEBUG] Using device: cpu


In [10]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))

[]


In [17]:
!pip uninstall facenet-pytorch


^C


In [14]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124
INFO: pip is looking at multiple versions of torchaudio to determine which version is compatible with other requirements. This could take a while.
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.5.1%2Bcu124-cp311-cp311-win_amd64.whl (4.1 MB)
     ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
     ----------------------------------- ---- 3.7/4.1 MB 19.8 MB/s eta 0:00:01
     ---------------------------------------- 4.1/4.1 MB 19.0 MB/s eta 0:00:00
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.5.0%2Bcu124-cp311-cp311-win_amd64.whl (4.1 MB)
     ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
     -------------------------------- ------- 3.4/4.1 MB 20.2 MB/s eta 0:00:01
     ---------------------------------------- 4.1/4.1 MB 17.7 MB/s eta 0:00:00
  Downloading https://download.pytorch.org/whl/cu124/torchaudio-2.4.1%2Bcu124-cp311-cp31

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
facenet-pytorch 2.6.0 requires torch<2.3.0,>=2.2.0, but you have torch 2.5.1+cu124 which is incompatible.
facenet-pytorch 2.6.0 requires torchvision<0.18.0,>=0.17.0, but you have torchvision 0.20.1+cu124 which is incompatible.


In [1]:
import torch

print("[DEBUG] CUDA availability:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("[DEBUG] GPU Name:", torch.cuda.get_device_name(0))
else:
    print("[ERROR] CUDA is not available. Check your installation.")


[DEBUG] CUDA availability: True
[DEBUG] GPU Name: NVIDIA GeForce RTX 4070 Laptop GPU


In [2]:
# %% Cell 1: Imports and setup
import os
import h5py
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data.dataset import Subset
import torch.backends.cudnn as cudnn
from tqdm.auto import tqdm

# Ensure reproducibility (though not perfect due to CUDA)
torch.manual_seed(42)
np.random.seed(42)
cudnn.benchmark = True  # Can speed up training

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [3]:
# %% Cell 2: Custom Dataset class for HDF5 data
class DogH5Dataset(Dataset):
    def __init__(self, h5_file_path):
        self.h5_file_path = h5_file_path
        # We open the file in __init__ to load class_names (static info)
        # but images/labels can be lazily accessed if needed.
        self.h5_file = h5py.File(self.h5_file_path, 'r')
        self.images = self.h5_file['images']  # shape: (N, 224, 224, 3), float16
        self.labels = self.h5_file['labels']  # shape: (N,), int64
        self.class_names = self.h5_file['class_names'][:]
        
        # Convert class_names from object arrays if needed
        self.class_names = [cn.decode('utf-8') if isinstance(cn, bytes) else cn for cn in self.class_names]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # images are float16, convert to float32 and transpose
        img = self.images[idx].astype(np.float32)  # shape (224,224,3)
        # PyTorch expects (C,H,W)
        img = np.transpose(img, (2, 0, 1))  # (3,224,224)
        label = self.labels[idx]  # int
        # Convert to tensor
        img_t = torch.from_numpy(img)
        label_t = torch.tensor(label, dtype=torch.long)
        return img_t, label_t

    def close(self):
        self.h5_file.close()


In [4]:
# %% Cell 3: Define a custom CNN model
class CustomCNN(nn.Module):
    def __init__(self, num_classes=120):
        super(CustomCNN, self).__init__()
        # A relatively simple architecture inspired by classical CNNs:
        # Input: 3 x 224 x 224
        # We'll do a few convolutional blocks and then fully connected layers.

        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),  # (64,112,112)
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),       # (64,56,56)

            nn.Conv2d(64, 128, kernel_size=3, padding=1),           # (128,56,56)
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # (128,28,28)

            nn.Conv2d(128, 256, kernel_size=3, padding=1),          # (256,28,28)
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # (256,14,14)

            nn.Conv2d(256, 512, kernel_size=3, padding=1),          # (512,14,14)
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((7,7))                             # (512,7,7)
        )

        # Flatten: 512 * 7 * 7 = 512 * 49 = 25088
        self.classifier = nn.Sequential(
            nn.Linear(512*7*7, 1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


In [5]:
# %% Cell 4: Load dataset and create dataloaders
file_path = r"C:\Users\20200337\Desktop\PROGRAMMATION\02_DOG_RACE_PREDICTION\dog_dataset_no_aug.h5"
dataset = DogH5Dataset(file_path)

# Let's do a train/val split
dataset_size = len(dataset)  # should be 17586
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 128
num_workers = 0

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

num_classes = len(dataset.class_names)


In [6]:
# %% Cell 5: Define training/evaluation routines and early stopping

def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    running_corrects = 0
    total_samples = 0

    pbar = tqdm(dataloader, desc="Training", leave=False)
    for images, labels in pbar:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs, 1)
        running_loss += loss.item() * images.size(0)
        running_corrects += torch.sum(preds == labels)
        total_samples += images.size(0)
    
    epoch_loss = running_loss / total_samples
    epoch_acc = running_corrects.double().item() / total_samples
    return epoch_loss, epoch_acc

def evaluate(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    total_samples = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * images.size(0)
            running_corrects += torch.sum(preds == labels)
            total_samples += images.size(0)
    epoch_loss = running_loss / total_samples
    epoch_acc = running_corrects.double().item() / total_samples
    return epoch_loss, epoch_acc

def save_checkpoint(model, epoch, val_loss, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'val_loss': val_loss
    }, path)

# Early stopping parameters
patience = 50  # you can choose an appropriate patience


In [7]:
# %% Cell 6: Main training loop with early stopping and checkpointing

model = CustomCNN(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

best_val_loss = float('inf')
epochs_no_improve = 0
max_epochs = 1000

checkpoint_dir = "./checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(1, max_epochs+1):
    print(f"Epoch {epoch}/{max_epochs}")
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")

    # Evaluate on validation only every 10 epochs to save time
    if epoch % 10 == 0:
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
        
        # Check for improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            # Save checkpoint
            checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch:04d}_valloss_{val_loss:.4f}.pth")
            save_checkpoint(model, epoch, val_loss, checkpoint_path)
            print(f"Checkpoint saved: {checkpoint_path}")
        else:
            epochs_no_improve += 1
            print(f"No improvement for {epochs_no_improve} validation checks.")

        # Early stopping
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break


Epoch 1/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 4.7418, Train Acc: 0.0183
Epoch 2/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 4.5191, Train Acc: 0.0363
Epoch 3/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 4.3262, Train Acc: 0.0467
Epoch 4/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 4.1672, Train Acc: 0.0625
Epoch 5/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 4.0389, Train Acc: 0.0725
Epoch 6/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.9091, Train Acc: 0.0857
Epoch 7/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.8038, Train Acc: 0.1032
Epoch 8/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.7067, Train Acc: 0.1144
Epoch 9/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.6049, Train Acc: 0.1244
Epoch 10/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.4976, Train Acc: 0.1436
Val Loss: 3.6825, Val Acc: 0.1208
Checkpoint saved: ./checkpoints\model_epoch_0010_valloss_3.6825.pth
Epoch 11/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.3952, Train Acc: 0.1609
Epoch 12/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.3041, Train Acc: 0.1726
Epoch 13/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.1816, Train Acc: 0.1966
Epoch 14/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 3.0847, Train Acc: 0.2118
Epoch 15/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.9975, Train Acc: 0.2255
Epoch 16/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.9161, Train Acc: 0.2394
Epoch 17/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.8235, Train Acc: 0.2656
Epoch 18/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.7165, Train Acc: 0.2778
Epoch 19/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.6152, Train Acc: 0.2994
Epoch 20/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.5315, Train Acc: 0.3209
Val Loss: 3.1830, Val Acc: 0.2123
Checkpoint saved: ./checkpoints\model_epoch_0020_valloss_3.1830.pth
Epoch 21/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.4645, Train Acc: 0.3322
Epoch 22/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.3614, Train Acc: 0.3563
Epoch 23/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.2850, Train Acc: 0.3670
Epoch 24/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.2013, Train Acc: 0.3875
Epoch 25/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.1121, Train Acc: 0.4039
Epoch 26/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 2.0376, Train Acc: 0.4232
Epoch 27/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.9742, Train Acc: 0.4421
Epoch 28/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.8878, Train Acc: 0.4611
Epoch 29/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.7879, Train Acc: 0.4894
Epoch 30/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.7359, Train Acc: 0.4969
Val Loss: 2.9044, Val Acc: 0.2820
Checkpoint saved: ./checkpoints\model_epoch_0030_valloss_2.9044.pth
Epoch 31/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.6378, Train Acc: 0.5277
Epoch 32/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.5760, Train Acc: 0.5415
Epoch 33/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.5170, Train Acc: 0.5517
Epoch 34/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.4490, Train Acc: 0.5713
Epoch 35/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.3665, Train Acc: 0.5965
Epoch 36/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.2904, Train Acc: 0.6175
Epoch 37/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.2442, Train Acc: 0.6262
Epoch 38/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.1938, Train Acc: 0.6434
Epoch 39/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.1081, Train Acc: 0.6645
Epoch 40/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

Train Loss: 1.0644, Train Acc: 0.6818
Val Loss: 3.4900, Val Acc: 0.2300
No improvement for 1 validation checks.
Epoch 41/1000


Training:   0%|          | 0/110 [00:00<?, ?it/s]

KeyboardInterrupt: 