In [None]:
import pandas as pd
import numpy as np
import os
from joblib import Parallel, delayed
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms

# Preprocessing pipeline for the ResNet input (224x224): resize, convert to a
# tensor, then normalise each of the three channels with a fixed mean/std.
# NOTE(review): the mean/std values look like the usual ImageNet statistics — confirm.
_resize_step = transforms.Resize((224, 224))
_to_tensor_step = transforms.ToTensor()
_normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
my_transform = transforms.Compose([_resize_step, _to_tensor_step, _normalize_step])


class MyImageDataset(Dataset):
    """Image dataset indexed by a Parquet file.

    The Parquet file lists one sample per row: a ``filepath`` column with the
    image location and a ``label`` column with the class id.  Rows whose file
    is missing or empty on disk are dropped once, at construction time.
    """

    # Columns the Parquet index must provide.
    REQUIRED_COLUMNS = ('filepath', 'label')

    def __init__(self, parquet_path, transform=None):
        """
        parquet_path: path to the Parquet file with columns [filepath, label].
        transform: optional torchvision transforms.

        Raises:
            KeyError: if a required column is missing from the Parquet file.
        """
        # Load the Parquet file into a DataFrame
        frame = pd.read_parquet(parquet_path)

        # Fail early with a readable message instead of a KeyError deep inside
        # a DataLoader worker the first time a sample is fetched.
        missing_columns = [c for c in self.REQUIRED_COLUMNS if c not in frame.columns]
        if missing_columns:
            raise KeyError(
                f"{parquet_path} is missing required column(s): {missing_columns}"
            )
        initial_count = len(frame)

        # Function to check if a file is valid (exists and > 0 bytes).
        # Any failure (bad path type, permission error, ...) counts as invalid;
        # this is a deliberate best-effort filter.
        def is_valid(fp):
            try:
                return os.path.exists(fp) and os.path.getsize(fp) > 0
            except Exception:
                return False

        # Use Joblib to parallelize the file validity checks over all filepaths
        valid_flags = Parallel(n_jobs=-1)(
            delayed(is_valid)(fp) for fp in frame['filepath']
        )

        # Filter with an explicit boolean array.  A plain Python list is only
        # treated as a mask when it is non-empty; an empty list would be read
        # as "select no columns" and silently drop the schema.
        keep_mask = np.asarray(valid_flags, dtype=bool)
        self.data = frame.loc[keep_mask].reset_index(drop=True)
        filtered_count = len(self.data)
        print(f"Filtered dataset: kept {filtered_count} out of {initial_count} entries "
              f"(removed {initial_count - filtered_count}).")

        self.transform = transform

    def __len__(self):
        """Number of samples that survived the validity filter."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``.

        The image is opened, converted to RGB and passed through
        ``self.transform`` when one was given; the label is returned as a
        ``torch.long`` tensor.
        """
        row = self.data.iloc[index]
        img_path = row['filepath']
        label = row['label']  # 0 or 1

        # Open the image and convert it to RGB.  The context manager closes
        # the underlying file deterministically; convert() yields a separate
        # in-memory image, so it stays usable after the file is closed.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        # Apply transforms if provided (explicit None check: a transform
        # object must not be skipped just because it evaluates as falsy).
        if self.transform is not None:
            image = self.transform(image)

        # Convert label to a tensor
        label = torch.tensor(label, dtype=torch.long)

        return image, label

In [None]:
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

# Parquet index files (the ones you created) for each split.
TRAIN_PARQUET = 'train_data_path_test.parquet'
VAL_PARQUET = 'val_data_path_test.parquet'

BATCH_SIZE = 32    # adjust based on GPU memory
NUM_WORKERS = 8    # use multiple CPU cores to speed up loading
# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only
# machines, where PyTorch would just emit a warning.
PIN_MEMORY = torch.cuda.is_available()


def make_loader(dataset, shuffle):
    """Build a DataLoader with the shared batch/worker settings.

    dataset: the Dataset to iterate over.
    shuffle: True to randomise sample order each epoch, False to keep the
             dataset order.
    """
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
    )


train_dataset = MyImageDataset(TRAIN_PARQUET, transform=my_transform)
train_loader = make_loader(train_dataset, shuffle=True)

val_dataset = MyImageDataset(VAL_PARQUET, transform=my_transform)
# Validation metrics are averaged over the whole split, so sample order does
# not matter; a fixed order keeps evaluation deterministic.
val_loader = make_loader(val_dataset, shuffle=False)

In [None]:
import time
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet34
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import CosineAnnealingLR

# -----------------------------
# 1) Hyperparameters & Setup
# -----------------------------
EPOCHS = 200
EARLY_STOP_PATIENCE = 10
LR = 1e-4
WEIGHT_DECAY = 1e-5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CHECKPOINT_DIR = './checkpoints_simple'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Create a TensorBoard writer
writer = SummaryWriter(log_dir='runs/resnet34_simple')

# -----------------------------
# 2) Model, Loss, Optimizer, Scheduler
# -----------------------------
model = resnet34(pretrained=False)
# ResNet34 default final layer has 1000 outputs; replace with 2 for binary classification
model.fc = nn.Linear(model.fc.in_features, 2)
model.to(DEVICE)

criterion = nn.CrossEntropyLoss().to(DEVICE)  # Moved criterion to the same device
# Pass WEIGHT_DECAY explicitly: without it AdamW silently uses its own library
# default and the hyperparameter declared above has no effect.
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Cosine schedule between LR and eta_min; stepped once per epoch by the
# training loop.
scheduler = CosineAnnealingLR(optimizer, T_max=15, eta_min=1e-5)

# -----------------------------
# 3) Helper to compute accuracy
# -----------------------------
def compute_accuracy(outputs, labels):
    """
    Fraction of samples whose highest-scoring class matches the label.

    outputs: (batch_size, 2) raw logits
    labels: (batch_size,) ground truth in {0,1}
    returns accuracy in [0..1]; 0.0 for an empty batch (instead of raising
    ZeroDivisionError).
    """
    total = labels.size(0)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    preds = outputs.argmax(dim=1)
    correct = (preds == labels).sum().item()
    return correct / total

# -----------------------------
# 4) Training & Validation Loop
# -----------------------------
# Lowest validation loss seen so far and the number of consecutive epochs
# without improvement (drives early stopping).
best_val_loss = float('inf')
epochs_no_improve = 0

# try/finally so the TensorBoard writer is flushed and closed even when the
# run is interrupted (e.g. KeyboardInterrupt) or fails mid-epoch.
try:
    for epoch in range(1, EPOCHS + 1):
        start_time = time.time()

        # ========== TRAIN ==========
        model.train()
        train_loss, train_correct, train_samples = 0.0, 0, 0

        for images, labels in train_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)

            # Forward
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Metrics (loss is a batch mean, so weight it by the batch size
            # to get a correct per-sample average over the epoch)
            batch_size = labels.size(0)
            train_loss += loss.item() * batch_size
            train_correct += (outputs.argmax(dim=1) == labels).sum().item()
            train_samples += batch_size

        avg_train_loss = train_loss / train_samples
        avg_train_acc  = train_correct / train_samples

        # ========== VALIDATION ==========
        model.eval()
        val_loss, val_correct, val_samples = 0.0, 0, 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(DEVICE), labels.to(DEVICE)
                outputs = model(images)
                loss = criterion(outputs, labels)

                batch_size = labels.size(0)
                val_loss += loss.item() * batch_size
                val_correct += (outputs.argmax(dim=1) == labels).sum().item()
                val_samples += batch_size

        avg_val_loss = val_loss / val_samples
        avg_val_acc  = val_correct / val_samples

        # ========== LOGGING ==========
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Val',   avg_val_loss,   epoch)
        writer.add_scalar('Acc/Train',  avg_train_acc,  epoch)
        writer.add_scalar('Acc/Val',    avg_val_acc,    epoch)

        elapsed = time.time() - start_time
        print(f"Epoch {epoch}/{EPOCHS} "
              f"Train Loss: {avg_train_loss:.4f} Acc: {avg_train_acc:.4f} | "
              f"Val Loss: {avg_val_loss:.4f} Acc: {avg_val_acc:.4f} | "
              f"Time: {elapsed:.1f}s")

        # ========== BEST-MODEL / EARLY-STOPPING BOOKKEEPING ==========
        # Update the tracking state BEFORE building the checkpoint so the
        # saved 'best_val_loss' includes this epoch's result.
        improved = avg_val_loss < best_val_loss
        if improved:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        # Step the scheduler at the end of each epoch, also BEFORE saving:
        # the checkpoint then holds the learning-rate state for the next
        # epoch, so a resumed run does not repeat this epoch's learning rate.
        scheduler.step()

        # ========== CHECKPOINT (Saving More Info) ==========
        checkpoint_dict = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'best_val_loss': best_val_loss
        }
        checkpoint_path = os.path.join(CHECKPOINT_DIR, f"resnet34_epoch{epoch}.pth")
        torch.save(checkpoint_dict, checkpoint_path)
        print(f"  --> Saved checkpoint to {checkpoint_path}")

        # ========== SAVE BEST MODEL ==========
        if improved:
            # Same payload as the per-epoch checkpoint, under a fixed name.
            torch.save(checkpoint_dict, os.path.join(CHECKPOINT_DIR, "best_model.pth"))
            print(f"  --> New best model saved (val_loss={avg_val_loss:.4f})")

        # ========== EARLY STOPPING ==========
        if epochs_no_improve >= EARLY_STOP_PATIENCE:
            print(f"Early stopping at epoch {epoch}. Best val_loss={best_val_loss:.4f}")
            break
finally:
    # End of training (normal, early-stopped, or interrupted)
    writer.close()

print("Training complete!")

2025-03-25 08:40:11.510037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742892012.152548    4168 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742892012.365831    4168 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-25 08:40:14.591239: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/200 Train Loss: 0.7105 Acc: 0.5048 | Val Loss: 0.7597 Acc: 0.4985 | Time: 140.9s
  --> Saved checkpoint to ./checkpoints_simple/resnet34_epoch1.pth
  --> New best model saved (val_loss=0.7597)
Epoch 2/200 Train Loss: 0.6926 Acc: 0.5454 | Val Loss: 0.7198 Acc: 0.5000 | Time: 106.0s
  --> Saved checkpoint to ./checkpoints_simple/resnet34_epoch2.pth
  --> New best model saved (val_loss=0.7198)
Epoch 3/200 Train Loss: 0.6651 Acc: 0.6028 | Val Loss: 0.7895 Acc: 0.5035 | Time: 104.6s
  --> Saved checkpoint to ./checkpoints_simple/resnet34_epoch3.pth
Epoch 4/200 Train Loss: 0.6240 Acc: 0.6478 | Val Loss: 0.7445 Acc: 0.4955 | Time: 110.7s
  --> Saved checkpoint to ./checkpoints_simple/resnet34_epoch4.pth


KeyboardInterrupt: 

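After your code finishes (or even while it’s still training), open a terminal (or a separate notebook cell) and run the command below. The `--logdir` path is resolved relative to the directory you launch TensorBoard from, so adjust the `Downloads/` prefix to wherever this notebook’s `runs/resnet34_simple` folder actually lives: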
tensorboard --logdir=Downloads/runs/resnet34_simple
Then, in your web browser, go to:
http://localhost:6006
You’ll see the TensorBoard UI, where you can view the train/val loss and train/val accuracy curves under the “Scalars” tab.

Loading a Checkpoint to Resume Training

In [None]:
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet34
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter

# 1) Decide which device to use
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
CHECKPOINT_DIR = './checkpoints_simple'

# 2) Load the checkpoint from disk
# NOTE: torch.load unpickles the file, so only load checkpoints you created
# yourself or otherwise trust.
checkpoint_path = os.path.join(CHECKPOINT_DIR, 'resnet34_epoch6.pth')
checkpoint = torch.load(checkpoint_path, map_location=DEVICE)

# 3) Rebuild the same model architecture
model_resumed = resnet34(pretrained=False)
model_resumed.fc = nn.Linear(model_resumed.fc.in_features, 2)  # 2-class final layer
model_resumed.to(DEVICE)

# 4) Rebuild optimizer & schedule
# The constructor arguments only need to produce matching parameter groups;
# the actual optimizer/scheduler state is overwritten in step 5.
criterion_resumed = nn.CrossEntropyLoss().to(DEVICE)
optimizer_resumed = optim.AdamW(model_resumed.parameters(), lr=1e-4)
scheduler_resumed = CosineAnnealingLR(optimizer_resumed, T_max=15, eta_min=1e-5)

# 5) Load the saved states
model_resumed.load_state_dict(checkpoint['model_state_dict'])
optimizer_resumed.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler_resumed.load_state_dict(checkpoint['scheduler_state_dict'])

start_epoch = checkpoint['epoch'] + 1
best_val_loss = checkpoint['best_val_loss']

print(f"Resuming training from epoch {start_epoch}, best_val_loss so far: {best_val_loss:.4f}")

# 6) Set up TensorBoard writer
# You can change 'runs/resnet34_resume' to any folder you like
writer = SummaryWriter(log_dir='runs/resnet34_resume')

# 7) Continue training loop
EPOCHS = 10  # or however many total epochs you want to run now

# try/finally so the TensorBoard writer is closed even if the loop is
# interrupted or raises.
try:
    for epoch in range(start_epoch, EPOCHS + 1):
        start_time = time.time()

        # ---------- TRAIN ----------
        model_resumed.train()
        train_loss, train_correct, train_samples = 0.0, 0, 0

        for images, labels in train_loader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)

            outputs = model_resumed(images)
            loss = criterion_resumed(outputs, labels)

            optimizer_resumed.zero_grad()
            loss.backward()
            optimizer_resumed.step()

            # loss is a batch mean; weight by batch size for a per-sample average
            batch_size = labels.size(0)
            train_loss += loss.item() * batch_size
            train_correct += (outputs.argmax(dim=1) == labels).sum().item()
            train_samples += batch_size

        avg_train_loss = train_loss / train_samples
        avg_train_acc  = train_correct / train_samples

        # ---------- VALID ----------
        model_resumed.eval()
        val_loss, val_correct, val_samples = 0.0, 0, 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(DEVICE), labels.to(DEVICE)

                outputs = model_resumed(images)
                loss = criterion_resumed(outputs, labels)

                batch_size = labels.size(0)
                val_loss += loss.item() * batch_size
                val_correct += (outputs.argmax(dim=1) == labels).sum().item()
                val_samples += batch_size

        avg_val_loss = val_loss / val_samples
        avg_val_acc  = val_correct / val_samples

        elapsed = time.time() - start_time
        print(f"Epoch {epoch}/{EPOCHS} | "
              f"Train Loss: {avg_train_loss:.4f}, Acc: {avg_train_acc:.4f} | "
              f"Val Loss: {avg_val_loss:.4f}, Acc: {avg_val_acc:.4f} | "
              f"Time: {elapsed:.1f}s")

        # ---------- TensorBoard Logging ----------
        writer.add_scalar('Loss/Train', avg_train_loss, epoch)
        writer.add_scalar('Loss/Val',   avg_val_loss,   epoch)
        writer.add_scalar('Acc/Train',  avg_train_acc,  epoch)
        writer.add_scalar('Acc/Val',    avg_val_acc,    epoch)

        # ---------- Update best-loss tracking ----------
        # Done BEFORE the checkpoint dict is built so that both the per-epoch
        # file and best_model.pth record the up-to-date best_val_loss.
        improved = avg_val_loss < best_val_loss
        if improved:
            best_val_loss = avg_val_loss

        # ---------- Step scheduler ----------
        # Also before saving: the checkpoint then carries the learning-rate
        # state for the NEXT epoch, which is what a later resume needs.
        scheduler_resumed.step()

        # ---------- Save a checkpoint each epoch ----------
        checkpoint_dict = {
            'epoch': epoch,
            'model_state_dict': model_resumed.state_dict(),
            'optimizer_state_dict': optimizer_resumed.state_dict(),
            'scheduler_state_dict': scheduler_resumed.state_dict(),
            'best_val_loss': best_val_loss
        }

        epoch_ckpt_path = os.path.join(CHECKPOINT_DIR, f"resnet34_epoch{epoch}.pth")
        torch.save(checkpoint_dict, epoch_ckpt_path)
        print(f"  --> Saved checkpoint: {epoch_ckpt_path}")

        # ---------- If improved, save best model ----------
        if improved:
            print(f"  --> New best val_loss: {best_val_loss:.4f}")
            torch.save(checkpoint_dict, os.path.join(CHECKPOINT_DIR, "best_model.pth"))
            print(f"  --> Saved best model: {os.path.join(CHECKPOINT_DIR, 'best_model.pth')}")

    print("Resumed training complete!")
finally:
    writer.close()
    print("TensorBoard writer closed.")

2025-03-25 16:19:35.249203: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742919575.921325    4173 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742919576.248892    4173 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-25 16:19:39.067784: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Resuming training from epoch 7, best_val_loss so far: 0.7198
Epoch 7/10 | Train Loss: 0.2856, Acc: 0.8812 | Val Loss: 1.5615, Acc: 0.4950 | Time: 132.4s
  --> Saved checkpoint: ./checkpoints_simple/resnet34_epoch7.pth
Epoch 8/10 | Train Loss: 0.1934, Acc: 0.9204 | Val Loss: 1.5751, Acc: 0.4940 | Time: 114.0s
  --> Saved checkpoint: ./checkpoints_simple/resnet34_epoch8.pth
Epoch 9/10 | Train Loss: 0.1288, Acc: 0.9502 | Val Loss: 2.0548, Acc: 0.5005 | Time: 123.5s
  --> Saved checkpoint: ./checkpoints_simple/resnet34_epoch9.pth
Epoch 10/10 | Train Loss: 0.0557, Acc: 0.9798 | Val Loss: 2.0109, Acc: 0.4835 | Time: 119.8s
  --> Saved checkpoint: ./checkpoints_simple/resnet34_epoch10.pth
Resumed training complete!
TensorBoard writer closed.
