# ResNet Model

In [4]:
'''ResNet in PyTorch.

For Pre-activation ResNet, see 'preact_resnet.py'.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Deep Residual Learning for Image Recognition. arXiv:1512.03385
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion *
                               planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion*planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion*planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

def ResNet10():
    return ResNet(BasicBlock, [1,1,1,1])

def ResNet12():
    return ResNet(BasicBlock, [1,2,1,1])

def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


def ResNet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])


def ResNet50():
    return ResNet(Bottleneck, [3, 4, 6, 3])


def ResNet101():
    return ResNet(Bottleneck, [3, 4, 23, 3])


def ResNet152():
    return ResNet(Bottleneck, [3, 8, 36, 3])


def test():
    net = ResNet18()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())

# test()


# Pre_Activation ResNet Model

In [5]:
'''Pre-activation ResNet in PyTorch.

Reference:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
'''
import torch
import torch.nn as nn
import torch.nn.functional as F


class PreActBlock(nn.Module):
    '''Pre-activation version of the BasicBlock.'''
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)

        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out += shortcut
        return out


class PreActBottleneck(nn.Module):
    '''Pre-activation version of the original Bottleneck module.'''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(PreActBottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)

        if stride != 1 or in_planes != self.expansion*planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out = self.conv3(F.relu(self.bn3(out)))
        out += shortcut
        return out


class PreActResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(PreActResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

def PreActResNet10():
    return PreActResNet(PreActBlock, [1,1,1,1])

def PreActResNet12():
    return PreActResNet(PreActBlock, [1,2,1,1])

def PreActResNet18():
    return PreActResNet(PreActBlock, [2,2,2,2])

def PreActResNet34():
    return PreActResNet(PreActBlock, [3,4,6,3])

def PreActResNet50():
    return PreActResNet(PreActBottleneck, [3,4,6,3])

def PreActResNet101():
    return PreActResNet(PreActBottleneck, [3,4,23,3])

def PreActResNet152():
    return PreActResNet(PreActBottleneck, [3,8,36,3])


def test():
    net = PreActResNet18()
    y = net((torch.randn(1,3,32,32)))
    print(y.size())

# test()


# Model Parameters num

In [6]:
import torch
# from utils import progress_bar
from torchsummary import summary

model = PreActResNet152()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  

summary(model, input_size=(3,32, 32)) 

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,728
       BatchNorm2d-2           [-1, 64, 32, 32]             128
            Conv2d-3          [-1, 256, 32, 32]          16,384
            Conv2d-4           [-1, 64, 32, 32]           4,096
       BatchNorm2d-5           [-1, 64, 32, 32]             128
            Conv2d-6           [-1, 64, 32, 32]          36,864
       BatchNorm2d-7           [-1, 64, 32, 32]             128
            Conv2d-8          [-1, 256, 32, 32]          16,384
  PreActBottleneck-9          [-1, 256, 32, 32]               0
      BatchNorm2d-10          [-1, 256, 32, 32]             512
           Conv2d-11           [-1, 64, 32, 32]          16,384
      BatchNorm2d-12           [-1, 64, 32, 32]             128
           Conv2d-13           [-1, 64, 32, 32]          36,864
      BatchNorm2d-14           [-1, 64,

# Load Data
two transformers for train data, 

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import pickle
import numpy as np
import os
from PIL import Image
import multiprocessing
from torch.utils.data import random_split


# Unpickle function
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

class CIFARDataset(Dataset):
    def __init__(self, data_files, transform=None):
        self.data = []
        self.labels = []
        self.transform = transform

        for file in data_files:
            batch = unpickle(file)
            images = batch[b'data']
            labels = batch[b'labels']

            images = images.reshape(-1, 3, 32, 32).astype(np.uint8)  # Keep uint8 (0-255)
            self.data.append(images)
            self.labels.extend(labels)

        self.data = np.vstack(self.data)
        self.labels = np.array(self.labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        image = self.data[idx]  # Still a NumPy array (H, W, C)
        label = self.labels[idx]

        # Convert NumPy array to PIL image before applying transforms
        image = Image.fromarray(np.transpose(image, (1, 2, 0)))  # Convert (C, H, W) -> (H, W, C)
        
        if self.transform:
            image = self.transform(image)  # Now transform works correctly

        return image, label




# Define paths
data_dir = "deep-learning-spring-2025-project-1/cifar-10-python/cifar-10-batches-py"  # Change this if your path is different
train_files = [os.path.join(data_dir, f"data_batch_{i}") for i in range(1, 6)]
test_file = os.path.join(data_dir, "test_batch")

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  # Randomly crop 32x32 with padding of 4
    transforms.RandomHorizontalFlip(),  # Flip the image horizontally with 50% probability
    transforms.RandomRotation(10),
    # transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),  # Convert image to a PyTorch tensor
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),  # Normalize with CIFAR-10 mean/std
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3)),
])

# transform_train = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
# ])

"""
Q: Do we need trans-test?

"""
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


# Automatically determine optimal num_workers
num_workers = min(8, multiprocessing.cpu_count() // 2)  # Use half of CPU cores, max 8
pin_memory = True if torch.cuda.is_available() else False  # Enable only if using GPU



# Load datasets
train_dataset = CIFARDataset(train_files, transform=transform_train)
test_dataset = CIFARDataset([test_file], transform=transform_test)

train_size = int(0.9 * len(train_dataset))  
val_size = len(train_dataset) - train_size 
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

# train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
# test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)



In [8]:
# Check Data
# data = unpickle("deep-learning-spring-2025-project-1/cifar-10-python/cifar-10-batches-py/data_batch_1" )
# print(data.keys())  
# print(len(data[b'labels']))

# Model Initialization

In [None]:
from torch.optim.lr_scheduler import SequentialLR, LinearLR, CosineAnnealingLR
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model
model = PreActResNet152().to(device)


# Training function with validation
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=200, patience=10, save_dir="checkpoints"):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    best_val_acc = 0.0  # Store the best validation accuracy
    early_stop_counter = 0  # Early stopping counter
    best_val_loss = float("inf") 

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

        scheduler.step()  # Update learning rate

        # Compute training loss and accuracy
        train_loss = running_loss / len(train_loader)
        train_acc = 100 * correct / total

        # Compute validation loss and accuracy
        val_loss, val_acc = validate(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

        # # Save the best model based on validation accuracy
        # if val_acc > best_val_acc:
        #     best_val_acc = val_acc
        #     torch.save(model.state_dict(), os.path.join(save_dir, "best_model.pth"))
        #     print("Best model saved!")

        # # Early stopping logic
        # if val_loss > train_loss:
        #     early_stop_counter += 1
        #     if early_stop_counter >= patience:
        #         print("Early stopping triggered!")
        #         break

        # --- Save the best model based on val_acc ---
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), os.path.join(save_dir, "best_model.pth"))
            print(f"Best model saved (based on val_acc = {val_acc:.2f}%)!")

        # --- Early Stopping based on val_loss ---
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered! Validation loss did not improve.")
                break

# Validation function
def validate(model, val_loader, criterion):
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = 100 * correct / total
    model.train()  # Switch back to training mode
    return avg_val_loss, val_acc

# Test function
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)

    print(f"Test Accuracy: {100 * correct / total:.2f}%")


# Set hyperparameters
"""
when epochs = 350
warmup_total_iters = 10
"""
epochs = 200
warmup_total_iters = 5
patience = 2000  # Number of epochs to wait before early stopping
save_dir = "checkpoints"

# Define loss function
criterion = nn.CrossEntropyLoss()
# Define optimizer

# optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4, nesterov=True)
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# Define learning rate scheduler
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)

# Learning rate scheduler with warmup
warmup_scheduler = LinearLR(optimizer, start_factor=0.1, total_iters=warmup_total_iters)
cosine_scheduler = CosineAnnealingLR(optimizer, T_max=epochs-warmup_total_iters)
scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, cosine_scheduler], milestones=[5])


# Train the model
train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs, patience, save_dir)

# Test the model
test(model, test_loader)


Epoch 1/200, Train Loss: 1.7874, Train Acc: 33.92%, Val Loss: 1.5252, Val Acc: 44.20%
Best model saved (based on val_acc = 44.20%)!
Epoch 2/200, Train Loss: 1.4275, Train Acc: 48.33%, Val Loss: 1.4562, Val Acc: 49.96%
Best model saved (based on val_acc = 49.96%)!
Epoch 3/200, Train Loss: 1.1612, Train Acc: 58.55%, Val Loss: 1.0614, Val Acc: 61.74%
Best model saved (based on val_acc = 61.74%)!
Epoch 4/200, Train Loss: 0.9705, Train Acc: 65.95%, Val Loss: 1.0456, Val Acc: 64.52%
Best model saved (based on val_acc = 64.52%)!




Epoch 5/200, Train Loss: 0.8637, Train Acc: 70.25%, Val Loss: 0.9723, Val Acc: 66.30%
Best model saved (based on val_acc = 66.30%)!
Epoch 6/200, Train Loss: 0.8041, Train Acc: 72.09%, Val Loss: 0.9667, Val Acc: 68.84%
Best model saved (based on val_acc = 68.84%)!
Epoch 7/200, Train Loss: 0.7395, Train Acc: 74.49%, Val Loss: 0.7535, Val Acc: 74.16%
Best model saved (based on val_acc = 74.16%)!
Epoch 8/200, Train Loss: 0.6971, Train Acc: 75.75%, Val Loss: 0.9112, Val Acc: 70.48%
Epoch 9/200, Train Loss: 0.6634, Train Acc: 76.98%, Val Loss: 0.6933, Val Acc: 76.06%
Best model saved (based on val_acc = 76.06%)!
Epoch 10/200, Train Loss: 0.6426, Train Acc: 77.75%, Val Loss: 0.8073, Val Acc: 72.96%
Epoch 11/200, Train Loss: 0.6271, Train Acc: 78.42%, Val Loss: 0.6889, Val Acc: 76.46%
Best model saved (based on val_acc = 76.46%)!
Epoch 12/200, Train Loss: 0.6112, Train Acc: 78.91%, Val Loss: 0.7610, Val Acc: 74.22%
Epoch 13/200, Train Loss: 0.6014, Train Acc: 79.18%, Val Loss: 0.7039, Val Acc:

# Save Model

In [10]:
PreActResNet152_2_model = model

In [11]:
torch.save(PreActResNet152_2_model.state_dict(), "PreActResNet152_2_model.pth")

# Run Kaggal Test

In [12]:
import pickle
import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Load the test dataset
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

# Load the no-label test dataset
test_data = unpickle("deep-learning-spring-2025-project-1/cifar-10-python/cifar-10-batches-py/cifar_test_nolabel.pkl")

# Extract image data and IDs
test_images = test_data[b'data']  # Shape (N, 3072)
test_ids = test_data[b'ids']  # Image IDs for submission


In [13]:
test_images = test_images.transpose(0,3,1,2)
# test_images.shape

In [14]:

# Reshape to (N, 3, 32, 32)
# test_images = test_images.reshape(-1, 3, 32, 32).astype(np.uint8)
# test_images = test_images.transpose(0,3,2,1)
class TestCIFARDataset(Dataset):
    def __init__(self, images, transform=None):
        self.images = images
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        image = Image.fromarray(np.transpose(image, (1, 2, 0)))  # Convert (C, H, W) → (H, W, C)

        if self.transform:
            image = self.transform(image)

        return image

# Define test transforms (same as training but without augmentation)
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Create DataLoader for inference
test_dataset = TestCIFARDataset(test_images, transform=transform_test)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=8, pin_memory=True)


In [15]:
def run_inference(model, test_loader, device):
    model.eval()
    predictions = []

    with torch.no_grad():  # Disable gradients for faster inference
        for images in test_loader:
            images = images.to(device)

            outputs = model(images)
            _, predicted = outputs.max(1)  # Get the class index with max probability
            predictions.extend(predicted.cpu().numpy())  # Move to CPU for saving

    return predictions


In [16]:
import pandas as pd

# Run inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predictions = run_inference(model, test_loader, device)

# Create a DataFrame
submission_df = pd.DataFrame({'ID': test_ids, 'Labels': predictions})

# Save to CSV (without index column)
submission_df.to_csv("newopt_resnet10.csv", index=False)

print("Submission file saved as submission.csv")


Submission file saved as submission.csv
