In [1]:
#JV

I will develop the code for the assignment in this notebook, since it makes quick testing (and even unit testing) easy.

Once a module/part is bug-free, I will move it to the .py file.



In [121]:
import os
import shutil
import numpy as np
import re
import torch
import torchvision

In [86]:
def make_dir(dir,returnIfDirAlreadyExists=False):
    """
    Create the directory `dir` if it doesn't exist.

    Only the last path component is created (os.mkdir), so the parent directory must already exist.

    Params:

        dir : Path of the directory to create.

        returnIfDirAlreadyExists : When True, the function returns True if the path already existed.

    Returns :

        True if the path already existed and `returnIfDirAlreadyExists` is True; None in every other case.
        Any other error (missing parent, permissions, ...) is printed instead of being raised.
    """
    try:
        os.mkdir(dir)
    except FileExistsError:
        # Match on the exception type instead of searching the message for "File exists":
        # the message text depends on the platform, the exception type does not.
        if returnIfDirAlreadyExists:
            return True
    except Exception as e:
        # Deliberately best-effort: report the problem and let the notebook carry on.
        print(e)

In [78]:
seed = 76 # Seed to use wherever randomness is involved; same value as the default `seed` of train_validation_split

In [143]:
## Train data downloaded from the given source (https://storage.googleapis.com/wandb_datasets/nature_12K.zip)

"""
Now, the goal is to split 20% of train data, in "train" folder to get validation data.
"""



# Root directory of the unzipped dataset (a relative path, written with a trailing "/").
data_base_dir = 'inaturalist_12K/'

def train_validation_split(base_dir,seed = 76,val_fraction = 0.2):
    """
    Function to move a fraction (20% by default) of the train data into a validation folder, Uniformly At Random (UAR).
    Needs os, shutil and numpy (as np) to be imported.

    Note  : Instead of sampling from the entire train data, the fraction is taken from each class separately (UAR),
    so that the balance between the number of samples per class is kept.

    Effects on disk, all inside `base_dir`:

        * "val/" (the name the test data has in the downloaded archive) is renamed to "test/", if that was not done already.
        * "validation/<class>/" is created for every class folder found in "train/".
        * int(val_fraction * n) of the n files of each class are moved from "train/<class>/" to "validation/<class>/".

    If "validation/" already contains something the function prints a message and moves nothing, because a second run
    would take a further share out of the remaining train data.

    Params:

        base_dir : The path to the directory in which the "train/" and "val/" directories are present after unzipping.
                   A "/" at the end is accepted but not required.

        seed : The seed used in the random number generator, default : 76.

        val_fraction : Fraction of each class that is moved to the validation data, between 0 and 1, default : 0.2.

    Returns :

        None.

    Raises :

        ValueError : If `val_fraction` is not between 0 and 1.
    """

    if not 0.0 <= val_fraction <= 1.0:
        raise ValueError(f"val_fraction must be between 0 and 1, got {val_fraction}")

    # Every path is derived from the `base_dir` argument, never from a notebook-level variable.
    train_base_dir = os.path.join(base_dir, "train")
    val_base_dir = os.path.join(base_dir, "validation")
    downloaded_test_dir = os.path.join(base_dir, "val")
    test_base_dir = os.path.join(base_dir, "test")

    ## Class folders only: skip hidden entries (names starting with ".") and stray files.
    ## Sorted, because os.listdir returns names in arbitrary order and the seed can only
    ## reproduce the split if the order is fixed.
    train_data_class_dirs = sorted(
        name for name in os.listdir(train_base_dir)
        if not name.startswith(".") and os.path.isdir(os.path.join(train_base_dir, name))
    )

    ## Test data is called as val, which is confusing, hence renaming it to test
    if os.path.isdir(downloaded_test_dir) and not os.path.exists(test_base_dir):
        os.rename(downloaded_test_dir, test_base_dir)

    ## Never split twice
    if os.path.isdir(val_base_dir) and os.listdir(val_base_dir):
        print(f"'{val_base_dir}' already contains data; skipping the split so that no further files are moved.")
        return

    ## validation dir
    os.makedirs(val_base_dir, exist_ok=True)

    random_num_generator = np.random.RandomState(seed)

    ## Iterate over each class and move the requested share of its files, picked at random
    for class_label in train_data_class_dirs:

        cur_train_dir = os.path.join(train_base_dir, class_label)
        cur_validation_dir = os.path.join(val_base_dir, class_label)

        current_class_train_filenames = sorted(
            name for name in os.listdir(cur_train_dir) if not name.startswith(".")
        )
        num_of_files = len(current_class_train_filenames)
        num_of_validation_files = int(val_fraction*num_of_files)

        ## Nothing to sample for an empty (or very small) class
        if num_of_validation_files > 0:
            validation_indices = random_num_generator.choice(num_of_files,num_of_validation_files,replace=False)
        else:
            validation_indices = []

        ##create class dir in validation dir
        os.makedirs(cur_validation_dir, exist_ok=True)

        for i in validation_indices:
            filename = current_class_train_filenames[i]
            shutil.move(os.path.join(cur_train_dir, filename),os.path.join(cur_validation_dir, filename))

        print(f"Validation Split for {class_label} is Done!")



In [146]:
"""
Careful perform this train-validation split only once in the entire lifetime, that too on the unzipped dataset.

"""

# Dataset root: the argument of train_validation_split and the prefix of the train/validation/test paths.
base_data_dir = "inaturalist_12K/"

# Left commented out on purpose: the call moves files on disk, so it must not run again on a dataset that is already split.
#train_validation_split(base_data_dir)

Validation Split for Plantae is Done!
Validation Split for Aves is Done!
Validation Split for Amphibia is Done!
Validation Split for Insecta is Done!
Validation Split for Animalia is Done!
Validation Split for Mollusca is Done!
Validation Split for Fungi is Done!
Validation Split for Arachnida is Done!
Validation Split for Reptilia is Done!
Validation Split for Mammalia is Done!


In [147]:
""" Create loader for train, test and validation data """

# Number of images per batch, shared by the three loaders.
LOADER_BATCH_SIZE = 16


def _build_image_loader(root, shuffle, batch_size=LOADER_BATCH_SIZE):
    """
    Build an ImageFolder dataset over `root` and a DataLoader on top of it.

    Params:

        root : Directory with one sub-folder per class.

        shuffle : Whether the loader reshuffles the samples at every epoch.

        batch_size : Number of images per batch.

    Returns :

        (dataset, loader)
    """
    # NOTE(review): ToTensor is the only transform, so the default batching can only stack
    # images that all have the same size; add a Resize here if the images differ - TODO confirm.
    dataset = torchvision.datasets.ImageFolder(root=root,transform=torchvision.transforms.ToTensor())
    loader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,shuffle=shuffle,num_workers=0)
    return dataset, loader


## Train data (shuffled, so that every epoch sees the samples in a new order)
train_path = base_data_dir+"train/"
train_dataset, train_loader = _build_image_loader(train_path, shuffle=True)

## Validation data (fixed order: shuffling is not needed for evaluation and a fixed order keeps runs comparable)
val_path = base_data_dir+"validation/"
val_dataset, val_loader = _build_image_loader(val_path, shuffle=False)

## Test data (fixed order, same reason as for the validation data)
test_path = base_data_dir+"test/"
test_dataset, test_loader = _build_image_loader(test_path, shuffle=False)

### References:

1. https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
2. https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#:~:text=PyTorch%20provides%20two%20data%20primitives,easy%20access%20to%20the%20samples.
3. https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel