# Divide your data into train, val and test sets

Select the *.txt* file that lists all the images and their corresponding classes. The images are then split into training, validation and test files so that the different volume classes are well distributed across the three subsets (e.g. if the dataset is highly imbalanced, the images are split with stratification instead of purely at random).

In [1]:
import sqlite3
from enum import Enum
from datetime import datetime
import os
import hashlib
import multiprocessing
from multiprocessing import Pool
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from sklearn.model_selection import train_test_split
from skimage import io

from collections import Counter
import random as r

In [2]:
# Function that maps weights to their 25 g bin
def get_bin(true_values):
    """Map a weight, or a list/tuple of weights, to its 25 g class label.

    The classes are 100, 125, ..., 375 and each one covers the half-open
    interval ``[label - 12.5, label + 12.5)``, so a value lying exactly on a
    bin edge (e.g. 112.5) is assigned to the upper class instead of being
    left without a class.

    Args:
        true_values: A single value accepted by ``float()`` (number or
            numeric string), or a list/tuple of such values.

    Returns:
        The class label for a single value, or a list of labels (same order
        and length as the input) for a list/tuple.

    Raises:
        ValueError: If a value cannot be converted to ``float`` or falls
            outside the covered range ``[87.5, 387.5)``.
    """
    from bisect import bisect_right

    grams = [100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375]
    bins = [87.5, 112.5, 137.5, 162.5, 187.5, 212.5, 237.5, 262.5, 287.5, 312.5, 337.5, 362.5, 387.5]

    def _label(value):
        try:
            weight = float(value)
        except (TypeError, ValueError) as err:
            raise ValueError("Cannot convert {!r} to a weight".format(value)) from err
        # bisect_right counts the edges <= weight; minus one gives the index
        # of the bin whose lower edge is the closest one at or below weight.
        position = bisect_right(bins, weight) - 1
        # Rejects values below the first edge, at/above the last edge, and NaN
        # (NaN compares false against every edge and lands past the end).
        if not 0 <= position < len(grams):
            raise ValueError(
                "Weight {!r} is outside the supported range [{}, {})".format(value, bins[0], bins[-1])
            )
        return grams[position]

    if isinstance(true_values, (list, tuple)):
        # One label per input so the result stays aligned with the input.
        return [_label(value) for value in true_values]
    return _label(true_values)

In [3]:
# Function that writes the dataset.txt file listing the image folders and
# returns a DataFrame with each folder path and its class (bin)
def generate_dataset_file(fruitDirectory, dataset_file='../data/dataset_files/dataset.txt'):
    """List every ``<variety>/<size>/`` folder and assign it a weight class.

    One line ``<folder path>*<size>`` is written to ``dataset_file`` for each
    size folder, where ``<size>`` is the folder name with ``,`` replaced by
    ``.``. The same folders are returned as a DataFrame whose ``clase`` column
    holds the bin computed by ``get_bin``.

    Args:
        fruitDirectory: Root directory containing one sub-directory per
            variety, each containing one sub-directory per size.
        dataset_file: Path of the listing file to (over)write.

    Returns:
        A ``pandas.DataFrame`` with columns ``path`` (folder path ending in
        ``/``) and ``clase`` (bin as ``float``).
    """
    rows = []
    with open(dataset_file, 'w') as fw:
        # Sorted so the row order, and therefore any seeded split computed
        # from the returned frame, does not depend on filesystem order.
        for variety in sorted(os.listdir(fruitDirectory)):
            varietyDirectory = os.path.join(fruitDirectory, variety) + "/"
            print(varietyDirectory)
            for size_name in sorted(os.listdir(varietyDirectory)):
                size = size_name.replace(",", ".")
                # Trailing "/" is kept: the paths are stored with it.
                size_path = os.path.join(varietyDirectory, size_name) + "/"
                rows.append([size_path, float(get_bin(size))])
                fw.write(size_path + '*' + size + '\n')

    # Built once from the collected rows; DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    return pd.DataFrame(rows, columns=['path', 'clase'])

In [4]:
# Write each image path together with the size taken from its folder name
def write_file(writefile, X, output_dir='../data/dataset_files/'):
    """Write one ``<image path>*<size>`` line per readable image.

    For every directory in ``X`` the size is the directory's own name with
    ``,`` replaced by ``.``. Each entry of the directory is opened with
    ``io.imread``; entries that cannot be read are reported and skipped.

    Args:
        writefile: Name of the output file created inside ``output_dir``.
        X: Iterable of directory paths (e.g. a pandas Series).
        output_dir: Directory in which ``writefile`` is written.

    Raises:
        FileNotFoundError: If a directory entry is not a regular file.
    """
    print("Writing "+ writefile)
    with open(os.path.join(output_dir, writefile), 'w') as fw:
        for directory in X:
            # normpath drops a trailing "/", so the size folder is found
            # whether or not the stored path ends with a separator.
            size = os.path.basename(os.path.normpath(directory)).replace(',', '.')
            for filename in os.listdir(directory):
                f = os.path.join(directory, filename)
                if not os.path.isfile(f):
                    raise FileNotFoundError("File Not found: " + str(f))

                # Only the decode attempt is guarded: unreadable images are
                # skipped, while errors writing the listing still propagate.
                try:
                    io.imread(f)
                except Exception as e:
                    print("Skipping unreadable image:", f, "-", e)
                    continue
                fw.write(str(f) + '*' + str(size) + '\n')

In [None]:
# Stratify the data and write the split files
def gen_data(dataset_dir):
    """Build the dataset listing and write stratified train/val/test files.

    The folders found by ``generate_dataset_file`` are split by class: 30% is
    held out from training, and that held-out part is split again with 67%
    going to test and 33% to validation. Class counts of every subset are
    printed, then ``train.txt``, ``val.txt`` and ``test.txt`` are written
    with ``write_file``.

    Args:
        dataset_dir: Root directory of the dataset, passed unchanged to
            ``generate_dataset_file``.
    """
    # Generate dataset.txt and get the DataFrame [path, clase]
    df = generate_dataset_file(dataset_dir)

    # There are few apples above 300 g, so they are merged into the 300 g
    # bin; likewise everything below 125 g goes into the 125 g bin.
    # A single column assignment replaces the element-wise chained
    # assignment, which pandas does not guarantee to write back into df.
    df['clase'] = df['clase'].astype(float).clip(lower=125, upper=300)

    # Distribute into train, test and val
    X_train, X_2, y_train, y_2 = train_test_split(df.path, df.clase, test_size=0.30, random_state=1, stratify=df.clase)
    print("Train:", Counter(y_train))
    X_test, X_val, y_test, y_val = train_test_split(X_2, y_2, test_size=0.33, random_state=1, stratify=y_2)
    print("Test:",Counter(y_test))
    print("Val:",Counter(y_val))

    write_file('train.txt', X_train)
    write_file('val.txt', X_val)
    write_file('test.txt', X_test)
    
# Build dataset.txt and the train/val/test listing files for the dataset rooted at this directory.
gen_data('/srv/nextcloud/MANZANA/')

/srv/nextcloud/MANZANA/Fuji/
/srv/nextcloud/MANZANA/Golden/
/srv/nextcloud/MANZANA/Granny Smith/
Train: Counter({200.0: 22, 250.0: 17, 150.0: 17, 175.0: 13, 125.0: 12, 275.0: 8, 225.0: 7, 300.0: 4})
Test: Counter({200.0: 6, 250.0: 5, 150.0: 5, 125.0: 3, 275.0: 3, 175.0: 3, 225.0: 2, 300.0: 1})
Val: Counter({200.0: 3, 150.0: 3, 250.0: 2, 175.0: 2, 125.0: 2, 225.0: 1, 300.0: 1, 275.0: 1})
Writing train.txt


# NOTE(review): `data` is not defined in the visible cells; presumably a
# DataFrame with `image` and `clase` columns — confirm before running.
from collections import Counter
# Class distribution of the full dataset.
print(Counter(data.clase))

# Hold out 21.5% of the samples (stratified by class); the rest is the training set.
X_train, X_2, y_train, y_2 = train_test_split(data.image, data.clase, test_size=0.215, random_state=1, stratify=data.clase)
print(Counter(y_train))
print(Counter(y_2))

# Split the held-out samples in half (stratified) into test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(X_2, y_2, test_size=0.5, random_state=1, stratify=y_2)
print(Counter(y_test))
print(Counter(y_val))

# Each split is written as `image*clase` rows without header or index.
# mode='a' appends to an existing file, so re-running this adds the rows again.
train=pd.DataFrame(data={'image': X_train, 'clase': y_train})
train.to_csv(r'/srv/images_classes/Regression/copasVasos_Madrid_LaRioja/train.txt', sep='*', index=None, header=None, mode='a')

val=pd.DataFrame(data={'image': X_val, 'clase': y_val})
val.to_csv(r'/srv/images_classes/Regression/copasVasos_Madrid_LaRioja/val.txt', sep='*', index=None, header=None, mode='a')

test=pd.DataFrame(data={'image': X_test, 'clase': y_test})
test.to_csv(r'/srv/images_classes/Regression/copasVasos_Madrid_LaRioja/test.txt', sep='*', index=None, header=None, mode='a')

# Number of rows in each split (shown as cell output when run in a notebook).
len(train)

len(val)

len(test)