The training data is split into a training set and a validation set so that the model can be evaluated locally, using custom evaluation metrics, without submitting test-set predictions to Kaggle.

In [61]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os

In [62]:
import sys
# Add the parent directory to the module search path before importing `config`
# (presumably config.py lives one level above this notebook — TODO confirm).
sys.path.append("../")
# DATASETS_PATH is the base directory used for every CSV read/write below.
from config import DATASETS_PATH

In [63]:
# Locate and load the label table that is about to be split.
labels_file_name = 'train_validation_labels.csv'
csv_labels_path = os.path.join(DATASETS_PATH, labels_file_name)
df_labels = pd.read_csv(filepath_or_buffer=csv_labels_path)

In [64]:
# Hold out a fixed share of the labelled rows for validation; the fixed seed
# makes the split reproducible across runs.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 10

train_set, validation_set = train_test_split(
    df_labels,
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)

Evaluate whether the labels are distributed similarly across the training and validation sets.

In [6]:
def calculate_distrbution(df, label_start=2):
    """Return the fraction of rows in ``df`` that carry each label.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns from position ``label_start`` onwards are numeric
        label indicators (the leading columns are skipped).
    label_start : int, default 2
        Positional index of the first label column. The default keeps the
        original behaviour of skipping the first two columns.

    Returns
    -------
    pandas.Series
        Column-wise sum of the label columns divided by the number of rows,
        indexed by label column name.
    """
    n_instances = len(df)
    # Column-wise totals of the label indicator columns only.
    label_count = df.iloc[:, label_start:].sum()
    return label_count / n_instances

In [71]:
# Absolute per-label totals in the validation split (leading two columns skipped).
validation_label_counts = validation_set.iloc[:, 2:].sum()
validation_label_counts

primary              7532
clear                5737
agriculture          2448
road                 1638
water                1491
partly_cloudy        1442
cultivation           910
habitation            771
haze                  516
cloudy                401
bare_ground           180
selective_logging      67
artisinal_mine         66
blooming               62
slash_burn             47
conventional_mine      17
blow_down              22
dtype: int64

In [7]:
# Per-label relative frequencies for the full table and for each split, so the
# three can be compared side by side.
original_distribution, train_distribution, validation_distribution = (
    calculate_distrbution(subset)
    for subset in (df_labels, train_set, validation_set)
)

In [8]:
# % difference in label distribution between the original set and the new training set.
# Computed directly as (new / original - 1) * 100, aligned on the label index,
# instead of going through concat -> transpose -> pct_change -> iloc, which
# obscured the formula and depended on pct_change's implicit fill behaviour.
diff_training = (train_distribution / original_distribution - 1) * 100

In [9]:
# % difference in label distribution between the original set and the validation set.
# Computed directly as (new / original - 1) * 100, aligned on the label index,
# instead of going through concat -> transpose -> pct_change -> iloc, which
# obscured the formula and depended on pct_change's implicit fill behaviour.
diff_validation = (validation_distribution / original_distribution - 1) * 100

In [43]:
# Assemble the comparison table; the column labels are supplied through `keys`.
df_diff = pd.concat(
    [original_distribution, diff_training, diff_validation],
    axis=1,
    keys=['original distribution', '% diff. training', '% diff. validation'],
)

In [57]:
# Three decimals are enough for the report table.
df_diff_round = df_diff.round(3)

In [58]:
# Emit the rounded table as LaTeX source; print() keeps the line breaks
# readable instead of showing an escaped string repr.
latex_table = df_diff_round.to_latex()
print(latex_table)

\begin{tabular}{lrrr}
\toprule
{} &  original distribution &  \% diff. training &  \% diff. validation \\
\midrule
primary           &                  0.927 &            -0.097 &               0.389 \\
clear             &                  0.702 &            -0.223 &               0.891 \\
agriculture       &                  0.304 &             0.153 &              -0.611 \\
road              &                  0.199 &            -0.368 &               1.472 \\
water             &                  0.183 &            -0.148 &               0.591 \\
partly\_cloudy     &                  0.179 &             0.176 &              -0.705 \\
cultivation       &                  0.111 &            -0.407 &               1.628 \\
habitation        &                  0.090 &            -1.331 &               5.325 \\
haze              &                  0.067 &             1.085 &              -4.341 \\
cloudy            &                  0.052 &             1.006 &              -4.023 \\
bare

Move the images from the original training directory into new sub-directories corresponding to the new training and validation sets.

In [87]:
# get file names
training_images = train_set[['image_name']].iloc[:, 0]
validation_images = validation_set[['image_name']].iloc[:, 0]

In [91]:
def move_images(image_names, source_dir, target_dir, extension='.tif'):
    """Move the named image files from ``source_dir`` into ``target_dir``.

    Parameters
    ----------
    image_names : iterable of str
        File names without extension.
    source_dir : str
        Directory that currently holds the files.
    target_dir : str
        Destination directory; created if it does not exist.
    extension : str, default '.tif'
        Suffix appended to every name to build the file name.

    Returns
    -------
    int
        Number of files moved by this call.

    Raises
    ------
    OSError
        Propagated from ``os.rename`` when a file is in neither directory
        (``FileNotFoundError``) or cannot be moved.
    """
    os.makedirs(target_dir, exist_ok=True)
    moved = 0
    for img in image_names:
        file_name = f'{img}{extension}'
        new = os.path.join(target_dir, file_name)
        # A file already present at the destination was moved by an earlier
        # run; skipping it makes the function safe to call repeatedly.
        if os.path.exists(new):
            continue
        old = os.path.join(source_dir, file_name)
        os.rename(old, new)
        moved += 1
    return moved


# This step is not needed anymore once the files have been moved, so it is
# disabled by default. Set MOVE_IMAGES = True to (re)run the relocation.
MOVE_IMAGES = False

if MOVE_IMAGES:
    original_imgs_dir = os.path.join(DATASETS_PATH, 'train')
    train_imgs_dir = os.path.join(original_imgs_dir, 'train')
    validation_imgs_dir = os.path.join(original_imgs_dir, 'validation')

    move_images(training_images, original_imgs_dir, train_imgs_dir)
    move_images(validation_images, original_imgs_dir, validation_imgs_dir)

Reorder the dataframes into file-name numerical order. This matters during training, when each image's features must be matched with its corresponding labels.

In [28]:
# The split shuffles the rows; sorting on the index puts each subset back into
# ascending index order (assumed to match file-name order in the source CSV —
# TODO confirm).
train_set_sorted, validation_set_sorted = (
    subset.sort_index() for subset in (train_set, validation_set)
)

Write the training and validation labels to separate CSV files.

In [29]:
# Persist each sorted split next to the source data; the index is dropped
# from the written files.
for labels_csv_name, split_labels in (
    ('train_labels.csv', train_set_sorted),
    ('validation_labels.csv', validation_set_sorted),
):
    split_labels.to_csv(os.path.join(DATASETS_PATH, labels_csv_name), index=False)