The labelled training data is split into a training set and a validation set so that the model can be evaluated locally, using custom evaluation metrics, without having to submit test-set predictions to Kaggle.

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [None]:
import sys
sys.path.append("../")
from config import DATASETS_PATH

In [None]:
# Load the label table that is split into training and validation parts below.
labels_filename = 'train_validation_labels.csv'
csv_labels_path = os.path.join(DATASETS_PATH, labels_filename)
df_labels = pd.read_csv(filepath_or_buffer=csv_labels_path)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(df_labels, test_size=0.2, random_state=10)
train_set, validation_set = split_parts

Evaluate whether the labels are evenly distributed between the training and validation sets.

In [None]:
def calculate_distrbution(df: pd.DataFrame, *, label_start: int = 2) -> pd.Series:
    """Return each label column's sum divided by the number of rows in *df*.

    For 0/1 indicator columns this is the fraction of rows that carry the
    label.

    Args:
        df: Table whose label columns occupy every column position from
            ``label_start`` to the end.
        label_start: Zero-based position of the first label column. The
            default of 2 skips the two leading non-label columns and
            matches the layout this notebook has always assumed.

    Returns:
        A Series indexed by label column name holding ``column sum / len(df)``.
    """
    # Positional slicing: everything left of label_start is metadata, not a label.
    label_columns = df.iloc[:, label_start:]
    return label_columns.sum() / len(df)

In [None]:
# Column-wise sums of the validation set's label columns (every column from
# position 2 onwards); left as the last expression so the notebook displays it.
validation_label_columns = validation_set.iloc[:, 2:]
validation_label_columns.sum()

In [None]:
# Compute the per-label distribution of the full table and of each split so
# they can be compared against one another.
original_distribution, train_distribution, validation_distribution = (
    calculate_distrbution(subset)
    for subset in (df_labels, train_set, validation_set)
)

In [None]:
# Percentage difference in label distribution: original set -> new training set.
# Stack the two distributions as rows; row 1 of pct_change() then holds the
# change of the training distribution relative to the original one.
original_vs_train = pd.concat([original_distribution, train_distribution], axis=1).T
diff_training = original_vs_train.pct_change().iloc[1, :] * 100

In [None]:
# Percentage difference in label distribution: original set -> validation set.
# Stack the two distributions as rows; row 1 of pct_change() then holds the
# change of the validation distribution relative to the original one.
original_vs_validation = pd.concat([original_distribution, validation_distribution], axis=1).T
diff_validation = original_vs_validation.pct_change().iloc[1, :] * 100

In [None]:
# One row per label: its original share plus the percentage deviation of
# each split from that share. `keys` supplies the column titles directly.
df_diff = pd.concat(
    [original_distribution, diff_training, diff_validation],
    axis=1,
    keys=['original distribution', '% diff. training', '% diff. validation'],
)

In [None]:
# Keep three decimal places so the exported table stays readable.
df_diff_round = df_diff.round(3)

In [None]:
# Emit the rounded comparison table as LaTeX source.
latex_table = df_diff_round.to_latex()
print(latex_table)

Move the images of the original training set into new directories corresponding to the new training and validation sets.

In [None]:
# Pull the image_name column out of each split as a Series.
training_images = train_set['image_name']
validation_images = validation_set['image_name']

In [None]:
# NOTE: everything between the triple quotes below is a string literal, not
# live code, so none of it runs. It preserves the one-off script that created
# 'train' and 'validation' sub-directories under DATASETS_PATH/train and moved
# each '<image_name>.tif' file into the sub-directory matching its split.
"""
CODE IS NOT NEEDED ANYMORE ONCE THE FILES HAVE BEEN MOVED

# create sub directories
original_imgs_dir = os.path.join(DATASETS_PATH, 'train')
train_imgs_dir = os.path.join(original_imgs_dir, 'train')
validation_imgs_dir = os.path.join(original_imgs_dir, 'validation')

if not os.path.exists(train_imgs_dir):
    os.makedirs(train_imgs_dir)
    
if not os.path.exists(validation_imgs_dir):
    os.makedirs(validation_imgs_dir)

# move images to appropriate directories
for img in training_images:
    file_name = f'{img}.tif'
    old = os.path.join(original_imgs_dir, file_name)
    new = os.path.join(train_imgs_dir, file_name)
    os.rename(old, new)
    
for img in validation_images:
    file_name = f'{img}.tif'
    old = os.path.join(original_imgs_dir, file_name)
    new = os.path.join(validation_imgs_dir, file_name)
    os.rename(old, new)
"""

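Reorder the dataframes by their row index so that the rows are back in the order of the original label file (file name numerical order). This is important during training, when the image features of the training data must be matched to the corresponding labels.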

In [None]:
# The random split shuffled the rows; sorting by index puts each split back
# into ascending row-index order.
train_set_sorted, validation_set_sorted = (
    subset.sort_index() for subset in (train_set, validation_set)
)

Write the training and validation labels to CSV files.

In [None]:
# Write each sorted split to its own CSV under DATASETS_PATH; index=False
# keeps the row index out of the files.
for split_frame, csv_name in (
    (train_set_sorted, 'train_labels.csv'),
    (validation_set_sorted, 'validation_labels.csv'),
):
    split_frame.to_csv(os.path.join(DATASETS_PATH, csv_name), index=False)