# Dataloading benchmarking using Fashion-MNIST 

We are using [Fashion-MNIST](https://www.kaggle.com/datasets/zalando-research/fashionmnist) dataset for benchmarking our datapipe performance.

Model is from this [kaggle notebook](https://www.kaggle.com/code/pankajj/fashion-mnist-with-pytorch-93-accuracy)

## Dataset Size:
| Size       | File Size | Zipped size |
|:-----------|:----------|:-----------------|
| Train data | 127M      | 33M              |
| Test data  | 22M       | 5.4M             |


The benchmark results were run using a MacBook Pro 2021 equipped with a chip Apple M1 Pro and 16 GB of memory.

# Section1 : Preparation

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, TensorDataset
import time
import fsspec
import numpy as np
import pandas as pd
from pelicanfs.core import PelicanFileSystem
from zipfile import ZipFile 


## Defining the model

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class FashionDataset(Dataset):  
    def __init__(self, data, transform = None):
        """Method to initilaize variables.""" 
        self.fashion_MNIST = list(data.values)
        self.transform = transform
        
        label = []
        image = []
        
        for i in self.fashion_MNIST:
             # first column is of labels.
            label.append(i[0])
            image.append(i[1:])
        self.labels = np.asarray(label)
        # Dimension of Images = 28 * 28 * 1. where height = width = 28 and color_channels = 1.
        self.images = np.asarray(image).reshape(-1, 28, 28, 1).astype('float32')

    def __getitem__(self, index):
        label = self.labels[index]
        image = self.images[index]
        
        if self.transform is not None:
            image = self.transform(image)

        return image, label

    def __len__(self):
        return len(self.images)

class FashionCNN(nn.Module):
    
    def __init__(self):
        super(FashionCNN, self).__init__()
        
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.fc1 = nn.Linear(in_features=64*6*6, out_features=600)
        self.drop = nn.Dropout(0.25)
        self.fc2 = nn.Linear(in_features=600, out_features=120)
        self.fc3 = nn.Linear(in_features=120, out_features=10)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.drop(out)
        out = self.fc2(out)
        out = self.fc3(out)
        
        return out

    
model = FashionCNN()
model.to(device)

error = nn.CrossEntropyLoss()

learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [3]:
def training():
    num_epochs = 3
    count = 0
    # Lists for visualization of loss and accuracy 
    loss_list = []
    iteration_list = []
    accuracy_list = []

    # Lists for knowing classwise accuracy
    predictions_list = []
    labels_list = []

    for epoch in range(num_epochs):
        start_time = time.time()
        for images, labels in train_loader:
            # Transfering images and labels to GPU if available
            images, labels = images.to(device), labels.to(device)
    
            # Use the actual batch size when reshaping
            batch_size = images.size(0)  # Get the current batch size
            train = Variable(images.view(batch_size, 1, 28, 28))
        
            # Forward pass 
            outputs = model(train)
            loss = error(outputs, labels)
        
            # Initializing a gradient as 0 so there is no mixing of gradient among the batches
            optimizer.zero_grad()
        
            #Propagating the error backward
            loss.backward()
        
            # Optimizing the parameters
            optimizer.step()
    
            count += 1
    
            # Testing
            if not (count % 50):    # It's same as "if count % 50 == 0"
                total = 0
                correct = 0
        
                for images, labels in test_loader:
                    images, labels = images.to(device), labels.to(device)
                    labels_list.append(labels)
                    
                    batch_size = images.size(0)
                    test = Variable(images.view(batch_size, 1, 28, 28))
            
                    outputs = model(test)
            
                    predictions = torch.max(outputs, 1)[1].to(device)
                    predictions_list.append(predictions)
                    correct += (predictions == labels).sum()
            
                    total += len(labels)
            
                accuracy = correct * 100 / total
                loss_list.append(loss.data)
                iteration_list.append(count)
                accuracy_list.append(accuracy)
        
        if not (count % 500):
            print("Iteration: {}, Loss: {:.2f}, Accuracy: {:2f}%".format(count, loss.data, accuracy))
        end_time = time.time()
        print(f"Time of {epoch+1}/{num_epochs} epoch: {end_time-start_time:.2f}s.")

## Read from Local

In [4]:
s_time = time.time()

train_csv = pd.read_csv("input/fashion-mnist_train.csv")
test_csv = pd.read_csv("input/fashion-mnist_test.csv")

train_set = FashionDataset(train_csv, transform=transforms.Compose([transforms.ToTensor()]))
test_set = FashionDataset(test_csv, transform=transforms.Compose([transforms.ToTensor()]))

train_loader = DataLoader(train_set, batch_size=128)
test_loader = DataLoader(train_set, batch_size=128)

e_time = time.time()

print(f"Preparing before training: {e_time - s_time:.2f}")

training()

Preparing before training: 4.46
Time of 1/3 epoch: 144.34s.
Time of 2/3 epoch: 133.69s.
Time of 3/3 epoch: 145.90s.


## Read from Pelican using Pelicanfs

In [5]:
s_time = time.time()

fs = PelicanFileSystem("pelican://osg-htc.org")
train_csv = pd.read_csv(fs.open('/chtc/PUBLIC/hzhao292/fashion-mnist_train.csv', 'rb'))
test_csv = pd.read_csv(fs.open('/chtc/PUBLIC/hzhao292/fashion-mnist_test.csv', 'rb'))

train_set = FashionDataset(train_csv, transform=transforms.Compose([transforms.ToTensor()]))
test_set = FashionDataset(test_csv, transform=transforms.Compose([transforms.ToTensor()]))

train_loader = DataLoader(train_set, batch_size=128)
test_loader = DataLoader(train_set, batch_size=128)

e_time = time.time()

print(f"Preparing before training: {e_time - s_time:.2f}")

training()

Preparing before training: 8.83
Time of 1/3 epoch: 133.68s.
Time of 2/3 epoch: 135.55s.
Time of 3/3 epoch: 149.93s.


# Reading from Pelican with local cache

In [7]:
s_time = time.time()

fs = fsspec.filesystem("filecache", target_protocol='osdf', cache_storage='tmp/files/')

train_csv = pd.read_csv(fs.open('/chtc/PUBLIC/hzhao292/fashion-mnist_train.csv', 'rb'))
test_csv = pd.read_csv(fs.open('/chtc/PUBLIC/hzhao292/fashion-mnist_test.csv', 'rb'))

train_set = FashionDataset(train_csv, transform=transforms.Compose([transforms.ToTensor()]))
test_set = FashionDataset(test_csv, transform=transforms.Compose([transforms.ToTensor()]))

train_loader = DataLoader(train_set, batch_size=128)
test_loader = DataLoader(train_set, batch_size=128)

e_time = time.time()

print(f"Preparing before training: {e_time - s_time:.2f}")

training()

Preparing before training: 8.44
Time of 1/3 epoch: 150.61s.
Time of 2/3 epoch: 141.30s.
Time of 3/3 epoch: 169.34s.


# Downloading zip file from pelican

In [10]:
import zipfile
from io import BytesIO

s_time = time.time()

fs = PelicanFileSystem("pelican://osg-htc.org")

def read_csv_from_zipped(fs, path):
    # Read the zip file from Pelican into a BytesIO buffer
    with fs.open(path, 'rb') as file:
        zip_buffer = BytesIO(file.read())

    # Create a ZipFile object from the buffer
    with zipfile.ZipFile(zip_buffer, 'r') as zipf:
        csv_file_name = zipf.namelist()[0]
        with zipf.open(csv_file_name) as csv_file:
            # Read the CSV file content into a pandas DataFrame
            df = pd.read_csv(csv_file)
    return df

train_csv = read_csv_from_zipped(fs, '/chtc/PUBLIC/hzhao292/fashion-mnist_train.zip')
test_csv = read_csv_from_zipped(fs, '/chtc/PUBLIC/hzhao292/fashion-mnist_test.zip')

train_set = FashionDataset(train_csv, transform=transforms.Compose([transforms.ToTensor()]))
test_set = FashionDataset(test_csv, transform=transforms.Compose([transforms.ToTensor()]))

train_loader = DataLoader(train_set, batch_size=128)
test_loader = DataLoader(train_set, batch_size=128)

e_time = time.time()

print(f"Preparing before training: {e_time - s_time:.2f}")

training()

Preparing before training: 6.27
Time of 1/3 epoch: 151.06s.
Time of 2/3 epoch: 124.90s.
Time of 3/3 epoch: 133.65s.


In [11]:
import zipfile
from io import BytesIO

s_time = time.time()
fs = PelicanFileSystem("pelican://osg-htc.org")

def stream_large_zip(fs, path):
    with fsspec.open(path, 'rb') as file:
        with zipfile.ZipFile(file) as zipf:
            csv_file_name = zipf.namelist()[0]
            with zipf.open(csv_file_name) as csv_file:
                df = pd.read_csv(csv_file)
    return df


train_csv = read_csv_from_zipped(fs, '/chtc/PUBLIC/hzhao292/fashion-mnist_train.zip')
test_csv = read_csv_from_zipped(fs, '/chtc/PUBLIC/hzhao292/fashion-mnist_test.zip')

train_set = FashionDataset(train_csv, transform=transforms.Compose([transforms.ToTensor()]))
test_set = FashionDataset(test_csv, transform=transforms.Compose([transforms.ToTensor()]))

train_loader = DataLoader(train_set, batch_size=128)
test_loader = DataLoader(train_set, batch_size=128)

e_time = time.time()

print(f"Preparing before training: {e_time - s_time:.2f}")

training()

Preparing before training: 6.37
Time of 1/3 epoch: 125.07s.
Time of 2/3 epoch: 123.13s.
Time of 3/3 epoch: 135.19s.
