# Notebook for training and evaluating the DenseNet169 benchmark model on the traffic-sign dataset

In [1]:
!pip install tensorboard

Collecting tensorboard
  Using cached tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Using cached Markdown-3.6-py3-none-any.whl.metadata (7.0 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Using cached tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Using cached werkzeug-3.0.3-py3-none-any.whl.metadata (3.7 kB)
Using cached tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
Using cached Markdown-3.6-py3-none-any.whl (105 kB)
Using cached tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl (6.6 MB)
Using cached werkzeug-3.0.3-py3-none-any.whl (227 kB)
Installing collected packages: werkzeug, tensorboard-data-server, markdown, tensorboard
Successfully installed markdown-3.6 tensorboard-2.16.2 tensorboard-data-server-0.7.2 werkzeug-3.0.3


In [2]:
import os
from datetime import datetime
import zipfile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split, DataLoader, Subset, ConcatDataset
from torchvision import datasets, transforms, models
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import numpy as np
from sklearn.metrics import confusion_matrix
from google.cloud import storage
import random
import math

In [3]:
def set_seed(seed=420):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, PyTorch
            (CPU and all CUDA devices) and exported as ``PYTHONHASHSEED``.
    """
    # Exported for Python's hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and PyTorch (CPU + CUDA) generators all receive the same seed
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def worker_init_fn(worker_id):
    """Give each DataLoader worker its own, reproducible NumPy seed.

    The first word of the current NumPy generator state is used as the base
    seed and offset by ``worker_id`` so that workers do not share a stream.

    Args:
        worker_id: Integer index of the worker process, supplied by DataLoader.
    """
    # Convert to a Python int first: the state word is a uint32 and adding to it
    # directly can leave the [0, 2**32 - 1] range that np.random.seed accepts.
    base_seed = int(np.random.get_state()[1][0])
    np.random.seed((base_seed + worker_id) % 2**32)

# Seed all RNGs up front so the cells below start from a fixed state
set_seed(420)

In [4]:
# Google Cloud project ID; passed to storage.Client in upload_directory_to_gcs
project_id = 'deep-learning-420208'

In [5]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [6]:
class AugmentationTrafficSignLoader:
    """Serves the augmentation variants stored as sub-folders of ``root``.

    Every immediate sub-directory of ``root`` is treated as one augmentation
    variant and loaded with ``datasets.ImageFolder``. Normalisation statistics
    are estimated separately for each variant.
    """

    def __init__(self, root):
        """Collect the names of all immediate sub-directories of ``root``.

        Args:
            root: Directory whose sub-directories each hold one augmentation variant.
        """
        self.root = root
        # os.listdir returns entries in arbitrary order; sorting keeps the order in
        # which variants are processed (and the seeded RNG is consumed) reproducible.
        self.augmentations = sorted(
            folder for folder in os.listdir(root)
            if os.path.isdir(os.path.join(root, folder))
        )

    def calculate_mean_and_variance(self, training_root, percentage_of_whole):
        """Estimate per-channel mean and standard deviation from a random sample.

        Despite the method name, the second returned value is a standard
        deviation: the square root of the per-image variance averaged over the
        sampled images.

        Args:
            training_root: Directory in ``ImageFolder`` layout to draw images from.
            percentage_of_whole: Fraction of the dataset to sample, in (0, 1].

        Returns:
            Tuple ``(mean, std)`` of tensors with one entry per image channel.

        Raises:
            ValueError: If ``percentage_of_whole`` is not in (0, 1].
        """
        if not 0 < percentage_of_whole <= 1:
            raise ValueError(
                f"percentage_of_whole must be in (0, 1], got {percentage_of_whole}"
            )

        transform = transforms.Compose([
            transforms.Resize((96, 96)),
            transforms.ToTensor()
        ])

        dataset = datasets.ImageFolder(root=training_root, transform=transform)

        # Always keep at least one image so the averages below never divide by zero
        sample_size = max(1, math.floor(len(dataset) * percentage_of_whole))
        indices = random.sample(range(len(dataset)), k=sample_size)
        loader = DataLoader(Subset(dataset, indices))

        mean = 0.0
        variance = 0.0
        total_images = 0

        for images, _ in loader:
            # Flatten the spatial dimensions: [B, C, H, W] -> [B, C, H * W]
            images = images.view(images.size(0), images.size(1), -1)
            total_images += images.size(0)
            # Per-image channel statistics, summed over the batch
            mean += images.mean(2).sum(0)
            variance += images.var(2).sum(0)

        # Average the accumulated per-image statistics
        mean /= total_images
        variance /= total_images

        return mean, variance.sqrt()

    def augmentation_generator(self, sample_fraction=0.25):
        """Yield one normalised dataset per augmentation variant, in sorted name order.

        Iterates over a copy of ``self.augmentations``, so the list is left
        intact and a new generator can be created again later.

        Args:
            sample_fraction: Fraction of each variant used to estimate its
                normalisation statistics.

        Yields:
            Tuple ``(name, dataset, mean, std)``: the variant's folder name, an
            ``ImageFolder`` resized to 96x96 and normalised with the variant's
            own statistics, and those statistics.
        """
        for current_aug in list(self.augmentations):
            current_aug_path = os.path.join(self.root, current_aug)
            mean, std = self.calculate_mean_and_variance(current_aug_path, sample_fraction)
            transform = transforms.Compose([
                transforms.Resize((96, 96)),
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std)
            ])
            yield os.path.basename(current_aug), datasets.ImageFolder(root=current_aug_path, transform=transform), mean, std

In [10]:
class FineTuneDenseNet169(nn.Module):
    """DenseNet169 with ImageNet weights and a replacement two-layer classifier head.

    No parameters are frozen here.
    """

    def __init__(self, num_classes):
        """Build the network.

        Args:
            num_classes: Number of output logits produced by the new head.
        """
        super().__init__()

        # Backbone initialised from the IMAGENET1K_V1 weights
        backbone = models.densenet169(weights=models.DenseNet169_Weights.IMAGENET1K_V1)

        # Width of the features fed into the stock classifier
        feature_dim = backbone.classifier.in_features

        # Replacement head: feature_dim -> 256 -> num_classes
        hidden_layer = nn.Linear(feature_dim, 256)
        activation = nn.ReLU(inplace=True)
        output_layer = nn.Linear(256, num_classes)
        backbone.classifier = nn.Sequential(hidden_layer, activation, output_layer)

        self.densenet169 = backbone

    def forward(self, x):
        """Return the class logits for the image batch ``x``."""
        return self.densenet169(x)

In [11]:
def upload_directory_to_gcs(bucket_name, source_directory, destination_blob_prefix):
    """Upload a local directory tree to a Google Cloud Storage bucket.

    Every file below ``source_directory`` is uploaded to
    ``<destination_blob_prefix>/<path relative to source_directory>``.

    Args:
        bucket_name: Name of the target bucket.
        source_directory: Local directory to walk recursively.
        destination_blob_prefix: Prefix prepended to every blob name.
    """
    client = storage.Client(project=project_id)
    bucket = client.bucket(bucket_name)

    for dirpath, _, filenames in os.walk(source_directory):
        for filename in filenames:
            local_file_path = os.path.join(dirpath, filename)
            relative_path = os.path.relpath(local_file_path, source_directory)
            # Blob names use '/' on every platform; os.path.join would introduce
            # backslashes on Windows. This is a no-op where os.sep is already '/'.
            blob_path = os.path.join(destination_blob_prefix, relative_path).replace(os.sep, '/')
            bucket.blob(blob_path).upload_from_filename(local_file_path)

In [None]:
# (display name, model class) pairs; each entry is trained and evaluated in turn
benchmarks = [('FineTuneDenseNet169', FineTuneDenseNet169)]

for bm_name, bm in benchmarks:

    # Root of the augmentation variants used for training/validation
    TRAINING_DATA_ROOT = 'benchmark-reduced/synthetic'
    # Root of the held-out test images
    TEST_DATA_ROOT = 'benchmark-reduced/test_data'
    
    def calculate_accuracy(model, data_loader, device):
        """Return the percentage of samples in ``data_loader`` classified correctly.

        The model is put into eval mode and is left in that mode on return.

        Args:
            model: Module mapping an image batch to per-class scores.
            data_loader: Iterable of ``(images, labels)`` batches.
            device: Device the batches are moved to before the forward pass.
        """
        model.eval()
        n_seen, n_right = 0, 0
        with torch.no_grad():
            for batch_images, batch_labels in data_loader:
                batch_images = batch_images.to(device)
                batch_labels = batch_labels.to(device)
                scores = model(batch_images)
                # Predicted class is the index of the largest score along dim 1
                predicted = torch.max(scores, 1).indices
                n_right += (predicted == batch_labels).sum().item()
                n_seen += batch_labels.size(0)
        return 100 * n_right / n_seen

    def evaluate_and_save_results(model, loader, device, classes, bucket_name, prefix, aug_name):
        """Evaluate the model and write its results to the local results folder.

        The confusion matrix, the predictions and the model's state dict are
        saved under ``<prefix>/results/<aug_name>``. Nothing is uploaded here.

        Args:
            model: Trained module to evaluate; it is switched to eval mode.
            loader: Iterable of ``(images, labels)`` batches.
            device: Device used for the forward pass.
            classes: Sized collection with one entry per class; only its length
                is used, to fix the confusion matrix dimensions.
            bucket_name: Unused; kept so existing callers keep working.
            prefix: Root output directory of the current run.
            aug_name: Augmentation name, used in the folder and file names.

        Returns:
            The confusion matrix as returned by ``confusion_matrix``.
        """
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for images, labels in loader:
                outputs = model(images.to(device))
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                # Labels are only collected, so they never need to visit the device
                all_labels.extend(labels.cpu().numpy())

        # Compute the confusion matrix over every class index, including absent ones
        cm = confusion_matrix(all_labels, all_preds, labels=np.arange(len(classes)))

        # Folder prefix including augmentation name; exist_ok keeps re-runs from failing
        full_prefix = f"{prefix}/results/{aug_name}"
        os.makedirs(full_prefix, exist_ok=True)

        # Save confusion matrix
        cm_path = f"{full_prefix}/confusion_matrix_{aug_name}.npy"
        np.save(cm_path, cm)

        # Save predictions
        preds_path = f"{full_prefix}/predictions_{aug_name}.npy"
        np.save(preds_path, np.array(all_preds))

        # Save the model
        model_path = f"{full_prefix}/model_{aug_name}.pth"
        torch.save(model.state_dict(), model_path)

        return cm

    def train_and_evaluate(model, train_loader, val_loader, test_loader, criterion, optimizer, device, writer, prefix, epochs=5, patience=20, min_delta=0.001):
        """Train with early stopping, then evaluate the best checkpoint on the test set.

        After every epoch the validation loss is compared with the best value seen
        so far. An improvement of more than ``min_delta`` saves a checkpoint to
        ``<prefix>/best_model.pth``; ``patience`` consecutive epochs without such
        an improvement stop training. The best checkpoint is reloaded into
        ``model`` before the final test evaluation.

        Args:
            model: Module to train; modified in place.
            train_loader: DataLoader of training batches.
            val_loader: DataLoader of validation batches.
            test_loader: DataLoader of test batches.
            criterion: Loss function taking ``(outputs, labels)``.
            optimizer: Optimizer bound to ``model``'s parameters.
            device: Device the batches are moved to.
            writer: TensorBoard ``SummaryWriter``; closed before returning.
            prefix: Directory in which the best checkpoint is stored.
            epochs: Maximum number of training epochs.
            patience: Epochs without improvement tolerated before stopping.
            min_delta: Minimum decrease in validation loss that counts as improvement.

        Returns:
            Test accuracy (percent) of the model that is left loaded in ``model``.
        """
        best_val_loss = float('inf')
        epochs_no_improve = 0
        # Tracks whether this call wrote a checkpoint, so a stale file is never loaded
        checkpoint_saved = False
        # exist_ok keeps re-runs into the same folder from failing
        os.makedirs(prefix, exist_ok=True)
        best_model_path = f'{prefix}/best_model.pth'

        for epoch in tqdm(range(epochs)):
            model.train()
            running_loss = 0.0
            for images, labels in train_loader:
                images, labels = images.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # Weight by batch size so the epoch loss is a per-sample average
                running_loss += loss.item() * images.size(0)

            train_loss = running_loss / len(train_loader.dataset)

            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for images, labels in val_loader:
                    images, labels = images.to(device), labels.to(device)
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item() * images.size(0)
            val_loss /= len(val_loader.dataset)

            train_accuracy = calculate_accuracy(model, train_loader, device)
            val_accuracy = calculate_accuracy(model, val_loader, device)
            # Reported for monitoring only; model selection uses the validation loss
            test_accuracy = calculate_accuracy(model, test_loader, device)

            writer.add_scalars('Loss', {'Train': train_loss, 'Validation': val_loss}, epoch)
            writer.add_scalars('Accuracy', {'Train': train_accuracy, 'Validation': val_accuracy}, epoch)

            # Printed before the stopping check so the last epoch is reported too
            print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%, Test Acc: {test_accuracy:.2f}%')

            if val_loss < best_val_loss - min_delta:
                # Record the new best; without this every epoch would count as an
                # improvement and early stopping could never trigger
                best_val_loss = val_loss
                epochs_no_improve = 0
                torch.save(model.state_dict(), best_model_path)  # Save the best model checkpoint
                checkpoint_saved = True
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= patience:
                    print(f"Early stopping triggered after {epoch + 1} epochs.")
                    break

        if checkpoint_saved:
            model.load_state_dict(torch.load(best_model_path, map_location=device))
        else:
            # Happens with epochs=0 or when the validation loss was never finite
            print("No checkpoint was saved; evaluating the model's current weights.")
        test_accuracy = calculate_accuracy(model, test_loader, device)
        print(test_accuracy)
        writer.add_scalar('Accuracy/test', test_accuracy, 1)
        writer.close()
        return test_accuracy


    # Results are written locally under PREFIX and mirrored to this bucket
    BUCKET_NAME = 'sign-recognition-metrics'
    # One timestamped output tree per benchmark run
    PREFIX = f'metrics_{bm_name}/' + datetime.now().strftime('%Y%m%d-%H%M%S')

    train_augmentation_generator = AugmentationTrafficSignLoader(root=TRAINING_DATA_ROOT).augmentation_generator()
    for aug_name, aug_variant, mean, std in train_augmentation_generator:
        # Ensure GPU memory is clean before starting the setup
        torch.cuda.empty_cache()
        print(aug_name)

        # Data loading setup
        num_classes = len(aug_variant.classes)

        # Model initialization and setup: a fresh model per augmentation variant
        model = bm(num_classes)
        model.to(device)

        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-3)
        criterion = nn.CrossEntropyLoss()

        # 80/20 train/validation split of the current variant
        train_size = int(0.8 * len(aug_variant))
        val_size = len(aug_variant) - train_size
        train_dataset, val_dataset = random_split(aug_variant, [train_size, val_size])
        train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=16, worker_init_fn=worker_init_fn)
        val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=16, worker_init_fn=worker_init_fn)

        # Load the test dataset, normalised with the training variant's statistics
        transform = transforms.Compose([
                    transforms.Resize((96, 96)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=mean, std=std)
                ])
        test_dataset = datasets.ImageFolder(root=TEST_DATA_ROOT, transform=transform)

        # Both datasets derive label indices from their folder names; if a test class
        # maps to a different index than in training, every metric would be wrong.
        misaligned = sorted(
            name for name, idx in test_dataset.class_to_idx.items()
            if aug_variant.class_to_idx.get(name) != idx
        )
        if misaligned:
            raise ValueError(
                f"Test classes {misaligned} do not map to the same label indices as in '{aug_name}'"
            )

        idx_to_class = {v: k for k, v in test_dataset.class_to_idx.items()}
        test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=16, worker_init_fn=worker_init_fn)

        # TensorBoard Setup
        writer_dir = os.path.join(PREFIX, 'runs', aug_name)
        writer = SummaryWriter(writer_dir)

        model_dir = f"{PREFIX}/models/{aug_name}"
        # Training and Evaluation
        test_accuracy = train_and_evaluate(model, train_loader, val_loader, test_loader, criterion, optimizer, device, writer, model_dir, epochs=50)
        cm = evaluate_and_save_results(model, test_loader, device, idx_to_class, BUCKET_NAME, PREFIX, aug_name)

        # Upload only this augmentation's output folders. Uploading all of PREFIX on
        # every iteration would re-send the files of every earlier augmentation.
        # The results path mirrors the folder used by evaluate_and_save_results.
        results_dir = f"{PREFIX}/results/{aug_name}"
        for output_dir in (writer_dir, model_dir, results_dir):
            upload_directory_to_gcs(BUCKET_NAME, output_dir, output_dir)

        # Clear memory
        torch.cuda.empty_cache()