# Divide your data into train, val and test sets

Select the *.txt* file that lists all the images and their corresponding classes. We then split the images into training, validation and test files so that the different volume classes are well distributed across the three sets: the split is stratified by class rather than purely random, which matters when the dataset is highly imbalanced.

In [2]:
import sqlite3
from enum import Enum
from datetime import datetime
import os
import hashlib
import multiprocessing
from multiprocessing import Pool
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from sklearn.model_selection import train_test_split
from skimage import io

from collections import Counter
import random as r

In [3]:
# Map measured weights (grams) onto the nominal 25 g weight classes.
def _bin_for_value(value, grams, edges):
    """Return the nominal class of one weight, or None if it fits no bin."""
    try:
        weight = float(value)
    except (TypeError, ValueError):
        # Not a number at all (e.g. a stray file name).
        return None
    for low, high, label in zip(edges, edges[1:], grams):
        # Half-open interval [low, high): a weight lying exactly on an edge
        # goes to the upper bin instead of matching nothing.
        if low <= weight < high:
            return label
    return None


def get_bin(true_values):
    """Assign weights to their nominal weight class.

    The classes are 100, 125, ..., 375 and each one covers the 25-wide
    interval centred on it, so the accepted range is [87.5, 387.5).

    Args:
        true_values: a single weight (number or numeric string) or a
            ``list`` of weights.

    Returns:
        For a single weight, its class. For a list, the list of classes of
        the weights that could be binned; weights that cannot be binned are
        reported with ``print("Error:", value)`` and left out, so the result
        may be shorter than the input.

    Raises:
        ValueError: if a single weight is not numeric or lies outside
            [87.5, 387.5).
    """
    grams = [100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375]
    bins = [87.5, 112.5, 137.5, 162.5, 187.5, 212.5, 237.5, 262.5, 287.5, 312.5, 337.5, 362.5, 387.5]

    if isinstance(true_values, list):
        result = []
        for e in true_values:
            label = _bin_for_value(e, grams, bins)
            if label is None:
                # Best effort for batches: report and skip the value.
                print("Error:", e)
                continue
            result.append(label)
        return result

    label = _bin_for_value(true_values, grams, bins)
    if label is None:
        # Fail with a clear message rather than an unbound-variable error.
        raise ValueError(
            "Cannot assign a weight bin to %r (expected a number in [%s, %s))"
            % (true_values, bins[0], bins[-1])
        )
    return label

In [4]:
# Write dataset.txt listing the image folders and return a dataframe with
# each folder's path, weight bin and variety.
def generate_dataset_file(fruitDirectory):
    """Index every weight folder found under ``fruitDirectory``.

    The tree is walked as ``<fruitDirectory>/<variety>/<weight>/``, where the
    name of the innermost folder is the weight, written with either ``.`` or
    ``,`` as decimal mark.

    Side effects:
        Overwrites ``../data/dataset_files/dataset.txt`` with one
        ``<folder path>*<weight>`` line per weight folder and prints each
        variety folder as it is visited.

    Args:
        fruitDirectory: root folder of one fruit, with or without a trailing
            slash.

    Returns:
        A DataFrame with columns ``path`` (weight folder, with trailing
        slash), ``clase`` (weight bin from ``get_bin``, as float) and
        ``variety`` (name of the variety folder).
    """
    rows = []
    with open('../data/dataset_files/dataset.txt', 'w') as fw:
        # Sorted listings make the row order (and therefore any seeded split
        # computed from the dataframe) independent of the filesystem.
        for variety in sorted(os.listdir(fruitDirectory)):
            varietyDirectory = os.path.join(fruitDirectory, variety) + "/"
            if not os.path.isdir(varietyDirectory):
                # Stray files next to the variety folders are not data.
                continue
            print(varietyDirectory)
            for size_name in sorted(os.listdir(varietyDirectory)):
                sizeDirectory = varietyDirectory + size_name + "/"
                if not os.path.isdir(sizeDirectory):
                    continue
                # Folder names may use a decimal comma ("270,41").
                weight_text = size_name.replace(",", ".")
                # The variety is the folder name itself, so the result does
                # not depend on how deep fruitDirectory sits in the tree.
                rows.append([sizeDirectory, float(get_bin(weight_text)), variety])
                fw.write(sizeDirectory + '*' + weight_text + '\n')

    # Build the frame once; DataFrame.append no longer exists in pandas 2.
    return pd.DataFrame(rows, columns=['path', 'clase', 'variety'])

In [24]:
# Write the path of every image and the weight taken from its folder name.
def write_file(writefile, X):
    """Write one ``<image path>*<weight>`` line per readable image.

    Args:
        writefile: name of the file created (overwritten) inside
            ``../data/dataset_files/``.
        X: iterable of folder paths ending in ``/`` (a pandas Series in the
            callers visible here); the last path component is the weight,
            optionally written with a decimal comma.

    Raises:
        Exception: if a folder contains an entry that is not a regular file.
    """
    print("Writing "+ writefile)
    with open('../data/dataset_files/'+writefile, 'w') as fw:
        for directory in X:
            # ".../270,41/" -> "270.41"
            weight_text = directory.split('/')[-2].replace(',','.')
            for filename in os.listdir(directory+"/"):
                f = os.path.join(directory, filename)
                if not os.path.isfile(f):
                    raise Exception("File Not found: " + str(f))

                # Only the decode attempt is guarded, so a failure while
                # writing the list is not mistaken for an unreadable image.
                try:
                    io.imread(f)
                except Exception as e:
                    # Best effort: report the image and why it was skipped.
                    print("Skipping unreadable image:", f, "(" + str(e) + ")")
                    continue

                fw.write(str(f) + '*' + str(weight_text) + '\n')

In [26]:
# Stratify the data and write the train / val / test list files.
def gen_data(dataset_dir):
    """Split the fruit folders into train, val and test and write the lists.

    Steps:
        1. ``generate_dataset_file`` writes dataset.txt and returns the
           folder table (path, clase, variety).
        2. Weight bins are clamped to the range 125..300.
        3. 30 % of the folders are held out, stratified by bin; about a
           third of that hold-out (``test_size=0.33``) becomes val and the
           rest becomes test.
        4. ``write_file`` writes train.txt, val.txt and test.txt.

    Args:
        dataset_dir: root folder of the fruit, passed to
            ``generate_dataset_file``.
    """
    # Generate dataset.txt and get the dataframe [path, clase, variety].
    df = generate_dataset_file(dataset_dir)

    # There are few apples above 300 g, so they are merged into the 300 g
    # bin; bins below 125 g are likewise merged into 125 g. A vectorised clip
    # replaces the element-wise chained assignment (df.clase[i] = ...), which
    # pandas warns about and which does not reliably write back into df.
    df['clase'] = df['clase'].astype(float).clip(lower=125, upper=300)

    # Distribute into train, test and val.
    print(df)
    X_train, X_2, y_train, y_2 = train_test_split(df.path, df.clase, test_size=0.30, random_state=1, stratify=df.clase)
    print("Train:", Counter(y_train))

    X_test, X_val, y_test, y_val = train_test_split(X_2, y_2, test_size=0.33, random_state=1, stratify=y_2)
    print("Test:",Counter(y_test))
    print("Val:",Counter(y_val))

    # Train
    write_file('train.txt', X_train)
    # Val
    write_file('val.txt', X_val)
    # Test
    write_file('test.txt', X_test)
    
# Generate the split files for the apple dataset stored under /storage/MANZANA/.
gen_data('/storage/MANZANA/')

/storage/MANZANA/Fuji/
/storage/MANZANA/Golden/
/storage/MANZANA/Granny Smith/
                                      path  clase       variety
0            /storage/MANZANA/Fuji/108.63/  125.0          Fuji
1            /storage/MANZANA/Fuji/109.11/  125.0          Fuji
2            /storage/MANZANA/Fuji/129.11/  125.0          Fuji
3            /storage/MANZANA/Fuji/129.84/  125.0          Fuji
4            /storage/MANZANA/Fuji/131.07/  125.0          Fuji
..                                     ...    ...           ...
172  /storage/MANZANA/Granny Smith/270,41/  275.0  Granny Smith
173  /storage/MANZANA/Granny Smith/272,28/  275.0  Granny Smith
174  /storage/MANZANA/Granny Smith/276,01/  275.0  Granny Smith
175  /storage/MANZANA/Granny Smith/283,04/  275.0  Granny Smith
176  /storage/MANZANA/Granny Smith/295,00/  300.0  Granny Smith

[177 rows x 3 columns]
Train: Counter({200.0: 30, 250.0: 19, 150.0: 18, 225.0: 15, 125.0: 12, 275.0: 12, 175.0: 12, 300.0: 5})
Test: Counter({200.0: 9, 

In [70]:
import random as rand
# Write the path of every image and the weight taken from its folder name.
def write_file_df(writefile, df):
    """Write one ``<image path>*<weight>`` line per image listed in ``df``.

    Args:
        writefile: name of the file created inside ``../data/dataset_files/``.
        df: dataframe whose ``path`` column holds the image folders; a folder
            that appears several times is written several times.
    """
    # Same job as write_file, which takes the folder column directly, so
    # delegate instead of keeping a second copy of the loop.
    write_file(writefile, df.path)
            

In [71]:
def balance_df(df, n_varieties=3):
    """Oversample folders so that every weight class has the same row count.

    The target size of each class is the frequency of the best represented
    variety inside the most common class, multiplied by ``n_varieties``.
    Classes below that size get extra rows: one at a time, a folder is drawn
    at random (``rand.choice``) from the currently least represented variety
    of the class, counting the repetitions already added, and its row is
    repeated. Progress is printed for every class.

    Args:
        df: dataframe with the columns ``path``, ``clase`` and ``variety``.
        n_varieties: number of varieties used to compute the target size.

    Returns:
        A new dataframe: the rows of ``df`` followed by the repeated rows,
        re-indexed from 0 when rows were added. ``df`` is not modified.
    """
    class_counts = Counter(df.clase).most_common()
    if not class_counts:
        # Nothing to balance.
        return df.copy()

    # Most represented class (weight).
    class_most_common_name = class_counts[0][0]

    # That class may itself be unbalanced by variety, so take the frequency
    # of its best represented variety.
    freq_variety_top = Counter(df.loc[df.clase == class_most_common_name].variety).most_common()[0][1]
    # Number of folders per class: freq_variety_top x number of varieties.
    total_freq_for_balanced_class = freq_variety_top * n_varieties

    # Repeated rows are collected here and concatenated once at the end
    # (DataFrame.append no longer exists in pandas 2 and copied the frame on
    # every call).
    extra_rows = []

    for clase, lenght in class_counts:
        repetir_rand = total_freq_for_balanced_class - lenght
        class_rows = df.loc[df.clase == clase]
        # Running count per variety, original rows plus repetitions.
        variety_counts = Counter(class_rows.variety)
        # Candidate folders per variety, taken from the original rows only.
        paths_by_variety = {}
        for variety, path in zip(class_rows.variety, class_rows.path):
            paths_by_variety.setdefault(variety, []).append(path)

        print("###Clase: ",clase,"###")
        print("lenght:", lenght, "rand:",repetir_rand)
        print("DF INICIAL: ",variety_counts.most_common())
        # A non-positive repetir_rand simply adds nothing.
        for _ in range(repetir_rand):
            # Least represented variety, repetitions included.
            less_common_variety = variety_counts.most_common()[-1][0]
            # Pick one apple folder at random to repeat all its images.
            choosen = rand.choice(paths_by_variety[less_common_variety])
            extra_rows.append({'path': choosen, 'clase': clase, 'variety': less_common_variety})
            variety_counts[less_common_variety] += 1

        print("DF RESULTADO: ",variety_counts.most_common())
        print("TAMAÑO FINAL: ",total_freq_for_balanced_class)
        print("###########")

    if extra_rows:
        df_repetidas = pd.concat([df, pd.DataFrame(extra_rows)], ignore_index=True)
    else:
        df_repetidas = df.copy()

    print("tamaño df_repetidas:", len(df_repetidas))
    print(Counter(df_repetidas.clase).most_common())
    print(Counter(df_repetidas.variety).most_common())
    print(df_repetidas)

    return df_repetidas
    

In [72]:
# Stratify the data and write the balanced training list file.
def gen_data(dataset_dir):
    """Split the fruit folders and write a class-balanced train.txt.

    Steps:
        1. ``generate_dataset_file`` writes dataset.txt and returns the
           folder table (path, clase, variety).
        2. Weight bins are clamped to the range 125..300.
        3. 30 % of the folders are held out, stratified by bin, and that
           hold-out is split again into test and val (``test_size=0.33``).
        4. The training rows are oversampled with ``balance_df`` and written
           with ``write_file_df``.

    Only train.txt is written; the val/test writes are commented out below.

    Args:
        dataset_dir: root folder of the fruit, passed to
            ``generate_dataset_file``.
    """
    # Generate dataset.txt and get the dataframe [path, clase, variety].
    df = generate_dataset_file(dataset_dir)

    # There are few apples above 300 g, so they are merged into the 300 g
    # bin; bins below 125 g are likewise merged into 125 g. A vectorised clip
    # replaces the element-wise chained assignment (df.clase[i] = ...), which
    # pandas warns about and which does not reliably write back into df.
    df['clase'] = df['clase'].astype(float).clip(lower=125, upper=300)

    # Distribute into train, test and val.
    print(df)
    X_train, X_2, y_train, y_2 = train_test_split(df.path, df.clase, test_size=0.30, random_state=1, stratify=df.clase)
    print("Train:", Counter(y_train))
    # Full rows (with variety) of the training folders, needed by balance_df.
    df_train = df.loc[X_train.index]

    X_test, X_val, y_test, y_val = train_test_split(X_2, y_2, test_size=0.33, random_state=1, stratify=y_2)
    print(X_test)
    # print("Test:",Counter(y_test))
    # print("Val:",Counter(y_val))

    # Train
    write_file_df('train.txt',balance_df(df_train))
    # Val
    #write_file('val.txt', X_val)
    # Test
    #write_file('test.txt', X_test)
    
# Generate the balanced training list for the apple dataset stored under /storage/MANZANA/.
gen_data('/storage/MANZANA/')

/storage/MANZANA/Fuji/
/storage/MANZANA/Golden/
/storage/MANZANA/Granny Smith/
                                      path  clase       variety
0            /storage/MANZANA/Fuji/108.63/  125.0          Fuji
1            /storage/MANZANA/Fuji/109.11/  125.0          Fuji
2            /storage/MANZANA/Fuji/129.11/  125.0          Fuji
3            /storage/MANZANA/Fuji/129.84/  125.0          Fuji
4            /storage/MANZANA/Fuji/131.07/  125.0          Fuji
..                                     ...    ...           ...
173  /storage/MANZANA/Granny Smith/270,41/  275.0  Granny Smith
174  /storage/MANZANA/Granny Smith/272,28/  275.0  Granny Smith
175  /storage/MANZANA/Granny Smith/276,01/  275.0  Granny Smith
176  /storage/MANZANA/Granny Smith/283,04/  275.0  Granny Smith
177  /storage/MANZANA/Granny Smith/295,00/  300.0  Granny Smith

[178 rows x 3 columns]
Train: Counter({200.0: 30, 250.0: 19, 150.0: 18, 225.0: 16, 175.0: 12, 125.0: 12, 275.0: 12, 300.0: 5})
124    /storage/MANZANA/G

from collections import Counter
# Class distribution of the whole dataset before splitting.
print(Counter(data.clase))

# First split: hold out 21.5 % of the samples, stratified by class.
X_train, X_2, y_train, y_2 = train_test_split(
    data.image,
    data.clase,
    test_size=0.215,
    random_state=1,
    stratify=data.clase,
)
for labels in (y_train, y_2):
    print(Counter(labels))

# Second split: divide the hold-out evenly into test and val, again stratified.
X_test, X_val, y_test, y_val = train_test_split(
    X_2,
    y_2,
    test_size=0.5,
    random_state=1,
    stratify=y_2,
)
for labels in (y_test, y_val):
    print(Counter(labels))

# One two-column frame (image, clase) per subset.
train = pd.DataFrame({'image': X_train, 'clase': y_train})
val = pd.DataFrame({'image': X_val, 'clase': y_val})
test = pd.DataFrame({'image': X_test, 'clase': y_test})

# Append each subset to its list file as "image*clase" lines, without header
# or index. mode='a' adds to whatever the file already contains.
for frame, target in (
    (train, r'/srv/images_classes/Regression/copasVasos_Madrid_LaRioja/train.txt'),
    (val, r'/srv/images_classes/Regression/copasVasos_Madrid_LaRioja/val.txt'),
    (test, r'/srv/images_classes/Regression/copasVasos_Madrid_LaRioja/test.txt'),
):
    frame.to_csv(target, sep='*', index=None, header=None, mode='a')

# Subset sizes.
len(train)

len(val)

len(test)