The training data is split into a training set and a validation set so that the model can be evaluated locally with custom evaluation metrics, without having to submit test-set predictions to Kaggle.

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [2]:
import sys
sys.path.append("../")
from config import DATASETS_PATH

In [3]:
# Resolve the label table inside the datasets directory, then load it into a DataFrame.
csv_labels_path = os.path.join(DATASETS_PATH, 'indicator_labels.csv')
df_labels = pd.read_csv(filepath_or_buffer=csv_labels_path)

In [4]:
# Hold out 20% of the labelled rows for validation; the fixed random_state keeps
# the split reproducible across runs.
train_set, validation_set = train_test_split(
    df_labels,
    test_size=0.2,
    random_state=10,
)

In [5]:
def calculate_distrbution(df):
    """Return the per-column relative frequency of the label columns.

    Every column from the third one onwards is summed and the sums are
    divided by the number of rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two columns are skipped.
        (Presumably an index column and the image name; verify against the CSV.)

    Returns
    -------
    pandas.Series
        One value per remaining column: column sum / number of rows.
    """
    label_columns = df.iloc[:, 2:]
    return label_columns.sum() / len(df)

In [6]:
# Compute the label distribution of the full data and of both splits so they
# can be compared to check that the split kept the labels balanced.
original_distribution, train_distribution, validation_distribution = (
    calculate_distrbution(frame)
    for frame in (df_labels, train_set, validation_set)
)

In [10]:
# Small scratch frame; NOTE(review): not referenced by any other visible cell — confirm whether it can be removed.
df_test = pd.DataFrame(dict(A=[1, 2], B=[2, 4]))

In [13]:
# Percentage change of each label's frequency in the new training set,
# relative to the original (unsplit) data.
diff_training = (
    pd.concat([original_distribution, train_distribution], axis=1)
    .transpose()       # rows: 0 = original, 1 = training split
    .pct_change()      # row 1 becomes the relative change versus row 0
    .iloc[1]
    * 100
)

In [14]:
# Percentage change of each label's frequency in the validation set,
# relative to the original (unsplit) data.
diff_validation = (
    pd.concat([original_distribution, validation_distribution], axis=1)
    .transpose()       # rows: 0 = original, 1 = validation split
    .pct_change()      # row 1 becomes the relative change versus row 0
    .iloc[1]
    * 100
)

In [15]:
# Side-by-side summary: original frequency plus the % deviation of each split.
df_diff = pd.concat(
    [original_distribution, diff_training, diff_validation],
    axis=1,
    keys=['original distribution', '% diff. training', '% diff. validation'],
)

In [16]:
# Display the comparison table: original label frequency and the percentage
# deviation of the training and validation splits from it.
df_diff

Unnamed: 0,original distribution,% diff. training,% diff. validation
primary,0.926727,-0.097349,0.389384
clear,0.702364,-0.222732,0.890899
agriculture,0.304232,0.152872,-0.611469
road,0.199387,-0.367988,1.471908
water,0.183083,-0.147811,0.591227
partly_cloudy,0.179377,0.176214,-0.704836
cultivation,0.110601,-0.407024,1.628046
habitation,0.090417,-1.331358,5.325267
haze,0.066627,1.085163,-4.340517
cloudy,0.051607,1.005889,-4.023434


Move the images from the original training directory into new subdirectories corresponding to the new training and validation sets.

In [87]:
# Image names (without file extension) belonging to each split
training_images = train_set['image_name']
validation_images = validation_set['image_name']

In [90]:
# Directory layout for the split: the new training and validation folders are
# created inside the original image directory.
original_imgs_dir = os.path.join(DATASETS_PATH, 'train')
train_imgs_dir = os.path.join(original_imgs_dir, 'train')
validation_imgs_dir = os.path.join(original_imgs_dir, 'validation')

# exist_ok=True makes this cell safe to re-run and avoids the check-then-create
# race of os.path.exists() followed by os.makedirs(). It also raises if the path
# exists but is not a directory, instead of silently skipping it.
os.makedirs(train_imgs_dir, exist_ok=True)
os.makedirs(validation_imgs_dir, exist_ok=True)

In [91]:
"""
CODE IS NOT NEEDED ANYMORE ONCE THE FILES HAVE BEEN MOVED
for img in training_images:
    file_name = f'{img}.tif'
    old = os.path.join(original_imgs_dir, file_name)
    new = os.path.join(train_imgs_dir, file_name)
    os.rename(old, new)
    
for img in validation_images:
    file_name = f'{img}.tif'
    old = os.path.join(original_imgs_dir, file_name)
    new = os.path.join(validation_imgs_dir, file_name)
    os.rename(old, new)
"""

Write the training and validation labels to CSV files.

In [94]:
# Persist the label table of each split inside the datasets directory.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed column by
# default — confirm that downstream readers expect it.
for labels_file, split_frame in (
    ('train_labels.csv', train_set),
    ('validation_labels.csv', validation_set),
):
    split_frame.to_csv(os.path.join(DATASETS_PATH, labels_file))