# Data Collection and Preprocessing

This notebook is divided into 6 sections:

1. Collect Data
2. Create Data For Transfer Learning
3. Create Augmented Data
4. Create Data For SemiSupervised Learning
5. Create inference data
6. Create directories for plots, logs and models

In [1]:
# import packages
import os
import pandas as pd
import random
import shutil
from PIL import Image

### Collect Data

1. Here we collect the data from the given URL.
2. Unzip them and delete the zip file

In [5]:
# data URL
URL = \
"https://he-public-data.s3-ap-southeast-1.amazonaws.com/HE_Challenge_data.zip"

# name under which `curl -O` stores the download (last segment of URL)
ZIP_NAME = "HE_Challenge_data.zip"

# Download, then unzip. Stop at the first command that fails, so that a
# broken download is never unzipped and the archive is never deleted without
# having been extracted.
# --fail makes curl exit non-zero on an HTTP error instead of saving the
# error page as the zip file; --location follows redirects.
for cmd in ("curl --fail --location -O " + URL, "unzip " + ZIP_NAME):
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError("command failed (status %s): %s" % (status, cmd))

# delete Zip file (only reached when both commands succeeded)
os.remove(ZIP_NAME)

0

### Create Data for Transfer Learning

1. Here we split the original train data into 80% Train and 20% Valid data using stratified random sampling.
2. Also we segregate images into different directories based on their categories.

In [12]:
# get path
path = os.getcwd()

# root of the Transfer learning data and folder holding the original images
tl_root = os.path.join(path, 'data', 'data_TL')
tl_src_dir = os.path.join(path, 'data', 'train')

# import train information
df_train = pd.read_csv(os.path.join(path, 'data', 'train.csv'))

# Categories in order of first appearance, converted to native Python values.
# `.unique()` yields numpy scalars, and random.seed() rejects e.g. numpy.int64
# on Python 3.11+; native ints/strs seed identically and are always accepted.
tl_categories = df_train['category'].drop_duplicates().tolist()

# create train/ and val/ with one sub-directory per category
# (exist_ok=True so the cell can be re-run without FileExistsError)
for split in ('train', 'val'):
    os.makedirs(os.path.join(tl_root, split), exist_ok=True)
    for cat in tl_categories:
        os.makedirs(os.path.join(tl_root, split, str(cat)), exist_ok=True)

# stratified random sampling to split data b/w train and val
for cat in tl_categories:

    # make list of images belonging to the category
    image_ids = list(df_train[df_train['category'] == cat]['image_id'])

    # randomly shuffle the list, seeded per category for reproducibility
    random.seed(cat)
    random.shuffle(image_ids)

    # first 80% (floored) goes to train, the remainder to val
    cut = int(len(image_ids)*0.8)

    # copy training images, then validation images
    for split, ids in (('train', image_ids[:cut]), ('val', image_ids[cut:])):
        for image_id in ids:
            fname = str(image_id) + '.jpg'
            shutil.copyfile(os.path.join(tl_src_dir, fname),
                            os.path.join(tl_root, split, str(cat), fname))

### Create Augmented Data

1. Here we take all images from train.
2. Rotate each of them by 4 randomly chosen angles (multiples of 45°), keeping the unrotated image as well.
3. Then split the data into 80% Train and 20% Valid data using stratified random sampling.

In [36]:
# get path
path = os.getcwd()

# root directory of the augmented data
aug_root = os.path.join(path, 'data', 'data_Aug')

# import train information
df_train = pd.read_csv(os.path.join(path, 'data', 'train.csv'))

# categories in order of first appearance, as native Python values
aug_categories = df_train['category'].drop_duplicates().tolist()

# 'aug' is a temporary directory: start it empty, so files left behind by an
# interrupted earlier run are not mixed into the next split
shutil.rmtree(os.path.join(aug_root, 'aug'), ignore_errors=True)

# create train/, val/ and the temporary aug/ directory, each with one
# sub-directory per category (exist_ok=True keeps the cell re-runnable)
for sub in ('train', 'val', 'aug'):
    os.makedirs(os.path.join(aug_root, sub), exist_ok=True)
    for cat in aug_categories:
        os.makedirs(os.path.join(aug_root, sub, str(cat)), exist_ok=True)

In [7]:
# candidate rotation angles (degrees); 4 of them are used per image
ROTATION_ANGLES = [45, 90, 135, 180, 225, 270, 315]

aug_root = os.path.join(path, 'data', 'data_Aug')
aug_src_dir = os.path.join(path, 'data', 'train')

# NOTE(review): the 80/20 split below is done over the augmented files, so
# rotated copies of one source image can land in both train and val --
# confirm this is intended.

# stratified random sampling to split data b/w train and val
# (.tolist() gives native Python values; random.seed() rejects numpy
# integer scalars on Python 3.11+)
for cat in df_train['category'].drop_duplicates().tolist():

    cat_aug_dir = os.path.join(aug_root, 'aug', str(cat))

    # augment all images of the category and store them in aug
    for image_id in list(df_train[df_train['category'] == cat]['image_id']):

        # randomly choose 4 angles, seeded per image so the choice is
        # reproducible; a fresh list is shuffled every time
        angles = list(ROTATION_ANGLES)
        random.seed(image_id)
        random.shuffle(angles)

        # the context manager closes the source file once we are done
        with Image.open(os.path.join(aug_src_dir, str(image_id)+'.jpg')) as img:
            # unrotated image, prefixed with 0_
            img.save(os.path.join(cat_aug_dir, '0_'+str(image_id)+'.jpg'))
            # rotated copies, prefixed with the angle
            for angle in angles[0:4]:
                img.rotate(angle).save(
                    os.path.join(cat_aug_dir,
                                 str(angle)+'_'+str(image_id)+'.jpg'))

    # list of all images in aug folder; sorted because os.listdir returns
    # names in arbitrary order, which would make the seeded shuffle
    # irreproducible
    files = sorted(os.listdir(cat_aug_dir))

    # randomly shuffle the list
    random.seed(cat)
    random.shuffle(files)

    # first 80% (floored) goes to train, the remainder to val
    cut = int(len(files)*0.8)

    # copy training images, then validation images
    for split, names in (('train', files[:cut]), ('val', files[cut:])):
        for name in names:
            if '.jpg' in name:
                shutil.copyfile(os.path.join(cat_aug_dir, name),
                                os.path.join(aug_root, split, str(cat), name))

# delete aug from data_Aug
shutil.rmtree(os.path.join(aug_root, 'aug'))

### Create Data for Semi Supervised Learning

1. Here we use all images from train and test.
2. Rotate them by 180° to make two categories: Upright (`u`) and Inverted, i.e. rotated by 180° (`r`).
3. Then split the data in these two categories into 80% Train and 20% Valid data using stratified random sampling.

In [24]:
# get path
path = os.getcwd()

# create directory for Semi Supervised learning data together with its
# train/ and val/ sub-directories; makedirs creates missing parents and
# exist_ok=True lets the cell be re-run without FileExistsError
for split in ('train', 'val'):
    os.makedirs(os.path.join(path, 'data', 'data_SSL', split), exist_ok=True)

In [25]:
# create directories for the two categories ('u' and 'r') in the train and
# val folders; exist_ok=True lets the cell be re-run without FileExistsError
for split in ('train', 'val'):
    for label in ('u', 'r'):
        os.makedirs(os.path.join(path, 'data', 'data_SSL', split, label),
                    exist_ok=True)

In [32]:
# os.listdir returns names in arbitrary order; sort them so that the seeded
# shuffles applied to these lists give the same split on every run

# make list of train images
lis_train = sorted(os.listdir(path+'/data/train/'))

# make list of test images
lis_test = sorted(os.listdir(path+'/data/test/'))

In [33]:
### split the original images and save in 'u' directory,
### then split the rotated images and save in 'r' directory

ssl_root = os.path.join(path, 'data', 'data_SSL')
ssl_train_src = os.path.join(path, 'data', 'train')
ssl_test_src = os.path.join(path, 'data', 'test')


def _split_80_20(names):
    """Return (first 80%, remaining 20%) of names; the cut index is floored."""
    cut = int(len(names)*0.8)
    return names[:cut], names[cut:]


def _copy_upright(src_dir, names, dst_dir):
    """Copy each '.jpg' entry of names from src_dir to dst_dir as '0_<name>'."""
    for name in names:
        if '.jpg' in name:
            shutil.copyfile(os.path.join(src_dir, name),
                            os.path.join(dst_dir, '0_'+name))


def _save_rotated(src_dir, names, dst_dir):
    """Save each '.jpg' entry of names, rotated by 180 degrees, as '1_<name>'."""
    for name in names:
        if '.jpg' in name:
            # the context manager closes the source file after saving
            with Image.open(os.path.join(src_dir, name)) as img:
                img.rotate(180).save(os.path.join(dst_dir, '1_'+name))


# Shuffle copies: lis_train / lis_test keep their order, so re-running this
# cell produces the same split instead of shuffling an already shuffled list.
ssl_train_pool = list(lis_train)
ssl_test_pool = list(lis_test)

# (seed, category directory, writer); the second pass re-shuffles the pools
# left by the first pass, each pass with its own seed
for seed, label, write in ((199, 'u', _copy_upright),
                           (299, 'r', _save_rotated)):

    # randomly shuffle the lists (train first, then test)
    random.seed(seed)
    random.shuffle(ssl_train_pool)
    random.shuffle(ssl_test_pool)

    # images from data/train are written before images from data/test
    for src_dir, pool in ((ssl_train_src, ssl_train_pool),
                          (ssl_test_src, ssl_test_pool)):
        t_names, v_names = _split_80_20(pool)
        # training images
        write(src_dir, t_names, os.path.join(ssl_root, 'train', label))
        # validation images
        write(src_dir, v_names, os.path.join(ssl_root, 'val', label))

### Create Inference Data

Simply copy images in data/test/ folder into data/inference/test/.

We do this to make it compatible for pytorch dataloader.

In [8]:
# get path
path = os.getcwd()

# destination of the copied test images
inference_test_dir = os.path.join(path, 'data', 'inference', 'test')

# create directory for inference (exist_ok=True keeps the cell re-runnable)
os.makedirs(os.path.join(path, 'data', 'inference'), exist_ok=True)

# copytree refuses to copy into an existing directory, so drop the copy made
# by a previous run first
if os.path.isdir(inference_test_dir):
    shutil.rmtree(inference_test_dir)

# copy images from test to inference (the returned destination path is the
# cell output)
shutil.copytree(os.path.join(path, 'data', 'test'), inference_test_dir)

'/scratch/skp454/SemiSupervised_Image_Classification/data/inference/test/'

### Create Other Useful Directories

In [2]:
# get path
path = os.getcwd()

# create directories for plots, logs, models and outputs; makedirs creates
# missing parents and exist_ok=True lets the cell be re-run without
# FileExistsError
for name in ('plots', 'logs', 'models', 'outputs'):
    os.makedirs(os.path.join(path, 'data', name), exist_ok=True)