The training data is split into a training set and a validation set so that the model can be evaluated locally, using custom evaluation metrics, without having to submit test-set predictions to Kaggle.

In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [59]:
# Put the parent directory on the import path so that `config` can be imported
# (presumably ../config.py -- confirm). The "../" entry is relative, so it is
# resolved against the kernel's current working directory.
import sys
sys.path.append("../")
from config import DATASETS_PATH

In [60]:
# Read the full label table from the datasets directory given by DATASETS_PATH.
csv_labels_path = os.path.join(DATASETS_PATH, 'indicator_labels.csv')
df_labels = pd.read_csv(csv_labels_path)

In [61]:
# Hold out 20% of the rows for validation. The fixed random_state makes the split
# reproducible. No stratification is requested, so the label balance of the two
# parts is checked further down.
train_set, validation_set = train_test_split(df_labels, test_size=0.2, random_state=10)

In [40]:
def calculate_distrbution(df, label_start=2):
    """Return, for each label column, its column sum divided by the row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table. Columns from position ``label_start`` onward are summed;
        the columns before it are ignored. The result is a per-label frequency
        only if those columns hold 0/1 indicators (assumed -- TODO confirm).
    label_start : int, default 2
        Positional index of the first label column. The default keeps the
        original behaviour of skipping the first two columns.

    Returns
    -------
    pandas.Series
        One value per summed column, indexed by column name.
    """
    n_instances = len(df)
    label_count = df.iloc[:, label_start:].sum()
    return label_count / n_instances

In [41]:
# Sanity check for the split: compute the per-label distribution of the full
# label table and of each part so that they can be compared.
original_distribution, train_distribution, validation_distribution = (
    calculate_distrbution(frame)
    for frame in (df_labels, train_set, validation_set)
)

In [50]:
# Relative change of each label's frequency, going from the original data to the
# new training set. pct_change() yields a fraction (0.01 means 1%); row 1 holds
# the change of the training row relative to the original row.
training_vs_original = pd.concat([original_distribution, train_distribution], axis=1)
diff_training = training_vs_original.T.pct_change().iloc[1]

In [51]:
# Relative change of each label's frequency, going from the original data to the
# validation set. pct_change() yields a fraction (0.01 means 1%); row 1 holds
# the change of the validation row relative to the original row.
validation_vs_original = pd.concat([original_distribution, validation_distribution], axis=1)
diff_validation = validation_vs_original.T.pct_change().iloc[1]

In [54]:
# One row per label: its frequency in the original data next to the relative
# change in each split. The column labels are supplied directly through `keys`.
df_diff = pd.concat(
    [original_distribution, diff_training, diff_validation],
    axis=1,
    keys=['original distribution', '% diff. training', '% diff. validation'],
)

In [55]:
# Display the comparison table. The two "% diff." columns come from pct_change(),
# so they are fractional changes (0.01 means 1%), not percentages.
df_diff

Unnamed: 0,original distribution,% diff. training,% diff. validation
primary,0.926727,-0.000973,0.003894
clear,0.702364,-0.002227,0.008909
agriculture,0.304232,0.001529,-0.006115
road,0.199387,-0.00368,0.014719
water,0.183083,-0.001478,0.005912
partly_cloudy,0.179377,0.001762,-0.007048
cultivation,0.110601,-0.00407,0.01628
habitation,0.090417,-0.013314,0.053253
haze,0.066627,0.010852,-0.043405
cloudy,0.051607,0.010059,-0.040234


Move the images from the original training directory into new sub-directories corresponding to the new training and validation sets.

In [87]:
# Image names of each split, taken from the `image_name` column.
training_images = train_set['image_name']
validation_images = validation_set['image_name']

In [90]:
# Create the sub-directories that will hold the split images; both live inside
# the original image directory.
original_imgs_dir = os.path.join(DATASETS_PATH, 'train')
train_imgs_dir = os.path.join(original_imgs_dir, 'train')
validation_imgs_dir = os.path.join(original_imgs_dir, 'validation')

# exist_ok=True makes this cell safe to re-run and avoids the race between a
# separate existence check and the creation call. It also raises if the path
# exists but is not a directory, instead of silently skipping it.
for split_dir in (train_imgs_dir, validation_imgs_dir):
    os.makedirs(split_dir, exist_ok=True)

In [91]:
def move_images(image_names, src_dir, dst_dir, extension='.tif'):
    """Move ``<name><extension>`` for every name from ``src_dir`` to ``dst_dir``.

    Files that already exist in ``dst_dir`` are skipped, so the function can be
    called again after a completed or interrupted run.

    Parameters
    ----------
    image_names : iterable of str
        File names without extension.
    src_dir, dst_dir : str
        Existing source and destination directories.
    extension : str, default '.tif'
        Suffix appended to each name to build the file name.

    Returns
    -------
    int
        Number of files moved by this call.

    Raises
    ------
    OSError
        From ``os.rename`` when a file is in neither directory (or cannot be
        moved); such problems are not hidden.
    """
    moved = 0
    for img in image_names:
        file_name = f'{img}{extension}'
        new = os.path.join(dst_dir, file_name)
        if os.path.exists(new):
            # Already moved by an earlier run.
            continue
        os.rename(os.path.join(src_dir, file_name), new)
        moved += 1
    return moved


# One-off step: the code is not needed anymore once the files have been moved,
# so it is disabled by default. Set MOVE_IMAGES to True to (re)apply the split
# to the image files on disk.
MOVE_IMAGES = False

if MOVE_IMAGES:
    move_images(training_images, original_imgs_dir, train_imgs_dir)
    move_images(validation_images, original_imgs_dir, validation_imgs_dir)

Write the training and validation labels to CSV files.

In [94]:
# Persist both label tables in the datasets directory. to_csv is left at its
# defaults, so the row index is written as an extra leading, unnamed column.
for labels, csv_name in ((train_set, 'train_labels.csv'),
                         (validation_set, 'validation_labels.csv')):
    labels.to_csv(os.path.join(DATASETS_PATH, csv_name))