# Benchmarking the performance of Pelican part2

## Dataset
[ImageNet](https://www.kaggle.com/c/imagenet-object-localization-challenge/overview)
Using this [script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh) to prepare the data first. Then train it using ResNet50.

## Hardware
Google Colab T4 GPU with high RAM

In [1]:
!pip install torchdata
!pip install pelicanfs fsspec

Collecting torchdata
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2->torchdata)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2->torchdata)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=2->torchdata)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x8

In [2]:
import torch
print(torch.cuda.is_available())


True


In [3]:
import os
from PIL import Image
import torch
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import Dataset, DataLoader
import fsspec
import numpy as np
import pandas as pd
from pelicanfs.core import PelicanFileSystem
import time


class RemoteImageFolder(Dataset):
    def __init__(self, root, transform=None):
        self.root = root
        self.transform = transform
        self.fs = PelicanFileSystem()  # Initialize PelicanFileSystem
        self.samples = self.make_dataset()
        print(f"Found {len(self.samples)} samples in {self.root}")

    def make_dataset(self):
        samples = []
        for rootpath, _, dirnames in self.fs.walk(self.root):
          for dirctory in dirnames:
            dirpath = os.path.join(rootpath, dirctory)
        for subpath, _, filenames in self.fs.walk(dirpath):
            for image in filenames:
                if image.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'gif')):
                    samples.append((os.path.join(subpath, image), os.path.basename(dirctory)))
        return samples

    def default_loader(self, path):
        with self.fs.open(path, 'rb') as f:
            return Image.open(f).convert('RGB')

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, target = self.samples[idx]
        sample = self.default_loader(path)
        if self.transform is not None:
            sample = self.transform(sample)
        return sample, target

# Define transformations for training and validation
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

val_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

start_time = time.time()
# Define the Pelican paths
trainfile_path = "pelican://osg-htc.org/chtc/PUBLIC/hzhao292/ILSVRC/Data/CLS-LOC/train"
valfile_path = "pelican://osg-htc.org/chtc/PUBLIC/hzhao292/ILSVRC/Data/CLS-LOC/val"

# Load the datasets
train_dataset = RemoteImageFolder(root=trainfile_path, transform=train_transforms)
val_dataset = RemoteImageFolder(root=valfile_path, transform=val_transforms)

# Create the dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0, pin_memory=True)
end_time = time.time()
elapsed_time = end_time - start_time
print(f'Perparing data time: {elapsed_time:.2f} seconds')


Found 1300 samples in pelican://osg-htc.org/chtc/PUBLIC/hzhao292/ILSVRC/Data/CLS-LOC/train
Found 50 samples in pelican://osg-htc.org/chtc/PUBLIC/hzhao292/ILSVRC/Data/CLS-LOC/val
Perparing data time: 4.33 seconds


In [4]:
import torch.nn as nn
import torch.optim as optim
from torchvision import models

import torch.multiprocessing as mp
import time

# Set multiprocessing start method to 'spawn'
mp.set_start_method('spawn', force=True)

# Load a pre-trained model (e.g., ResNet-50)
model = models.resnet50(pretrained=True)

# Modify the final layer to match the number of classes in ImageNet
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 1000)  # ImageNet has 1000 classes

# Move the model to the GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

def train_model(model, dataloaders, criterion, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        start_time = time.time()
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                data_loader = train_loader
            else:
                model.eval()   # Set model to evaluate mode
                data_loader = val_loader

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in data_loader:
                inputs = inputs.to(device)
           #     labels = labels.tensor(labels, dtype=torch.long).to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                # Track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
        #            loss = criterion(outputs, labels)

                    # Backward + optimize only if in training phase
                    if phase == 'train':
 #                       loss.backward()
                        optimizer.step()

                # Statistics
  #              running_loss += loss.item() * inputs.size(0)
          #      running_corrects += torch.sum(preds == labels.data)

#            epoch_loss = running_loss / len(data_loader.dataset)
 #           epoch_acc = running_corrects.double() / len(data_loader.dataset)

 #           print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f'Elapsed time: {elapsed_time:.2f} seconds')

    return model

# Train the model
model = train_model(model, {'train': train_loader, 'val': val_loader}, criterion, optimizer, num_epochs=25)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 190MB/s]


Epoch 0/24
----------
Elapsed time: 2365.96 seconds
Epoch 1/24
----------
Elapsed time: 151.37 seconds
Epoch 2/24
----------
Elapsed time: 151.33 seconds
Epoch 3/24
----------
Elapsed time: 152.03 seconds
Epoch 4/24
----------
Elapsed time: 152.25 seconds
Epoch 5/24
----------
Elapsed time: 152.44 seconds
Epoch 6/24
----------
Elapsed time: 152.33 seconds
Epoch 7/24
----------
Elapsed time: 152.55 seconds
Epoch 8/24
----------
Elapsed time: 152.16 seconds
Epoch 9/24
----------
Elapsed time: 152.95 seconds
Epoch 10/24
----------


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), 'resnet50_imagenet.pth')
