## This notebook is used to split the image dataset into training and testing portions

In [1]:
import numpy as np
import shutil
from random import shuffle
import os
import pathlib
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import pandas as pd

### Copy files from an original directory to their new directory 

In [2]:
def copy_files(files,old_directory,new_directory):
  os.makedirs(new_directory,exist_ok=True)
  for i in files:
    os.makedirs(f"{new_directory}/{i}",exist_ok=True)
    for f in files[i]:
      shutil.copyfile(f"{old_directory}/{i}/{f}",f"{new_directory}/{i}/{f}")

In [3]:
def get_dataset_filenames(PATH,SPLIT,TRAIN_PATH,TEST_PATH,BALANCE_DATASET=True):
    
    files = dict()
    train = dict()
    test = dict()

    for i in os.listdir(f'{PATH}'):
        files[i] = os.listdir(f'{PATH}/{i}')
        print(f"{i} {len(files[i])}")


    if BALANCE_DATASET:
      min_categorie = min([len(files[x]) for x in files ])
      for i in files:
        x = files[i]
        shuffle(x)
        print(len(files[i]))
        files[i] = x[:min_categorie]
    
    for i in files:
      list_split = int(SPLIT * len(files[i]))
      train[i] = list(files[i][:list_split])
      test[i]  = list(files[i][list_split:])

    categories = [i for i in files]
    print(categories)
    copy_files(train,PATH,TRAIN_PATH)
    copy_files(test,PATH,TEST_PATH)


    train_files =  sum([len(train[i]) for i in test])
    test_files =  sum([len(test[i]) for i in test])

    return (train_files,test_files,categories)

In [5]:
TRAIN_PERC = 0.8
TEST_PERC  = 0.2

PATH = "resized_images/portugal/MODEL2"
TRAIN_PATH = "image_dataset/MODEL2/train"
TEST_PATH = "image_dataset/MODEL2/test"

get_dataset_filenames(PATH,TRAIN_PERC,TRAIN_PATH,TEST_PATH,BALANCE_DATASET=True)


LEVEL1 25901
LEVEL2 8278
LEVEL3 3740
25901
8278
3740
['LEVEL1', 'LEVEL2', 'LEVEL3']


(8976, 2244, ['LEVEL1', 'LEVEL2', 'LEVEL3'])

### Second dataset version 2 since we use csv generators to load the dataset into keras we separate the csv containing the original labels into test and training csv files by dividing a shuffled version of dataset into  80% and 20% portions

In [4]:
def train_test_split_no_categories(DATASET,TRAIN_PERC,TEST_PERC):
    
    shuffled_dataset = shuffle(DATASET)
    
    train_elements = int(len(shuffled_dataset) * TRAIN_PERC)
    test_elements = int(len(shuffled_dataset) * TEST_PERC)
    
    
    train_df = shuffled_dataset.head(train_elements)
    test_df = shuffled_dataset.tail(test_elements)
    
    return (train_df,test_df)

In [5]:
TRAIN_PERC = 0.8
TEST_PERC  = 0.2

DATASET = pd.read_csv("datasets/MODEL2_LABELS.csv")
DATASET['Unnamed: 0']
train_df,test_df = train_test_split_no_categories(DATASET,TRAIN_PERC,TEST_PERC)

In [6]:
print(len(train_df))
train_df.head()

30335


Unnamed: 0.1,Unnamed: 0,location,label
29582,29583,MODEL2_2/-8.497719530449713_-8.495478218635428...,0.0
8697,8698,MODEL2_2/-8.4192736169497_-8.417032305135415_4...,0.0
22804,22805,MODEL2_2/-8.59185462664973_-8.589613314835447_...,0.0
36850,36851,MODEL2_2/-7.4062006768923805_-7.40395936507809...,0.683061
20394,20395,MODEL2_2/-8.970636323264081_-8.968395011449797...,0.0


In [14]:
print(len(test_df))
test_df.head()

print(train_df['label'])

7583
29582    0.000000
8697     0.000000
22804    0.000000
36850    0.683061
20394    0.000000
           ...   
37099    0.000000
31096    0.000000
9763     0.000000
8638     0.683061
27877    0.000000
Name: label, Length: 30335, dtype: float64


In [24]:
train_df.to_csv("datasets/model2_2_train.csv")

In [25]:
test_df.to_csv("datasets/model2_2_test.csv")

In [15]:
train_m2_v2 = pd.read_csv("datasets/model2_2_train.csv")

In [21]:
train_m2_v2.dtypes

Unnamed: 0        int64
Unnamed: 0.1      int64
location         object
label           float64
dtype: object