# Sort images
## Extra images
This notebook is based on sort_images.ipynb and is adapted to sort the extra images.

In [1]:
#importing the necessary modules
import shutil
from imutils import paths
from random import shuffle
import os
from datetime import datetime
import numpy as np
import pathlib
import pandas as pd

## Sort train images into train and validation folders

In [2]:
# Read the metadata table of the extra images.
train_data = pd.read_csv('../data/extra_images_loc.csv')

# Turn every image id into a file name: surrounding whitespace is stripped
# and the ".jpg" extension is appended.
train_data['image_id'] = train_data['image_id'].apply(lambda raw_id: raw_id.strip()+".jpg")

# Composite "<turtle_id>-<...>" keys built from the columns above.
turtle_prefix = train_data['turtle_id'] + "-"
train_data['turtle_id-image_id'] = turtle_prefix + train_data['image_id']
train_data['turtle_id-image_location'] = turtle_prefix + train_data['image_location']

In [3]:
# Display the loaded table together with the derived composite-key columns
train_data

Unnamed: 0,image_id,image_location,turtle_id,turtle_id-image_id,turtle_id-image_location
0,ID_Y0KYE5XD.jpg,left,t_id_he7JTQxO,t_id_he7JTQxO-ID_Y0KYE5XD.jpg,t_id_he7JTQxO-left
1,ID_8JTIQ4UI.jpg,top,t_id_he7JTQxO,t_id_he7JTQxO-ID_8JTIQ4UI.jpg,t_id_he7JTQxO-top
2,ID_LSXPZYSN.jpg,left,t_id_he7JTQxO,t_id_he7JTQxO-ID_LSXPZYSN.jpg,t_id_he7JTQxO-left
3,ID_SHZ2HDSP.jpg,left,t_id_he7JTQxO,t_id_he7JTQxO-ID_SHZ2HDSP.jpg,t_id_he7JTQxO-left
4,ID_6TOFB06E.jpg,top,t_id_xry0Yg2j,t_id_xry0Yg2j-ID_6TOFB06E.jpg,t_id_xry0Yg2j-top
...,...,...,...,...,...
10653,ID_PLYDY39S.jpg,right,t_id_9YXAIhtI,t_id_9YXAIhtI-ID_PLYDY39S.jpg,t_id_9YXAIhtI-right
10654,ID_1EJCP0DF.jpg,left,t_id_9YXAIhtI,t_id_9YXAIhtI-ID_1EJCP0DF.jpg,t_id_9YXAIhtI-left
10655,ID_WXIGYROR.jpg,top,t_id_ajlHbN2F,t_id_ajlHbN2F-ID_WXIGYROR.jpg,t_id_ajlHbN2F-top
10656,ID_UAU4GUNK.jpg,right,t_id_ajlHbN2F,t_id_ajlHbN2F-ID_UAU4GUNK.jpg,t_id_ajlHbN2F-right


In [4]:
# Folder holding the source pictures. File names are appended to this string
# by plain concatenation further down, so the trailing slash is required.
image_dir = '../images/'

# Sorted list of the image paths reported by imutils for image_dir
imagePaths = sorted(paths.list_images(image_dir))

In [5]:
# turtle_id column of train_data: one entry per image row, so an id repeats
# for every picture of the same turtle
turtle_ids = train_data.turtle_id
turtle_ids

0        t_id_he7JTQxO
1        t_id_he7JTQxO
2        t_id_he7JTQxO
3        t_id_he7JTQxO
4        t_id_xry0Yg2j
             ...      
10653    t_id_9YXAIhtI
10654    t_id_9YXAIhtI
10655    t_id_ajlHbN2F
10656    t_id_ajlHbN2F
10657    t_id_ajlHbN2F
Name: turtle_id, Length: 10658, dtype: object

## Sort images
The sorted images are written to a folder named "sorted_extra_images" one level above this notebook; the next cell creates it if it does not exist yet.

In [6]:
# Create two (initially empty) folders for every individual turtle:
# one below train/ for the training dataset and one below val/ for the
# validation dataset.

# Folder in which the training and validation datasets will be placed.
# Kept as a plain string because later cells build paths from it by
# string concatenation.
root_dir="../sorted_extra_images"
os.makedirs(root_dir, exist_ok=True)

# turtle_ids holds one entry per image, so the same id shows up many times.
# Iterating over the distinct ids avoids thousands of redundant filesystem
# checks and does not rely on the Series having a default 0..n-1 index.
for turtle in turtle_ids.unique():
    for split in ("train", "val"):
        # exist_ok=True makes the cell safe to re-run: folders that are
        # already present are left untouched
        os.makedirs(os.path.join(root_dir, split, turtle), exist_ok=True)



In [7]:
# These lines would create a folder called "new_turtle" in our train and validation sub-folders (currently commented out).
#This is necessary for our specific task. (Reminder: if the image most likely does not belong to any turtle_id, the model needs to output "new_turtle".)
#These folders will not contain any pictures.
#os.makedirs("../sorted_images/train/new_turtle")
#os.makedirs("../sorted_images/val/new_turtle")

In [15]:
#Number of rows in train_data, i.e. number of relevant pictures
len(train_data)

10658

In [16]:
# Split the pictures into a validation set and a training set.
# The first row found for each turtle_id goes to validation; every later row
# of the same turtle goes to training. Each turtle therefore contributes
# exactly one validation picture.
# NOTE(review): the earlier description asked for one validation picture per
# turtle_id *and* image_location, but the split only keys on turtle_id (the
# 'turtle_id-image_location' column is never consulted) - confirm the intent.
# As of now nothing prevents very similar pictures from ending up in both
# datasets, which could result in overfitting the CNN going unnoticed.

# True for the first occurrence of every turtle_id, False for all repeats.
# Vectorised replacement for scanning a growing Python list on every row.
first_of_turtle = ~train_data['turtle_id'].duplicated()

# "<turtle_id>-<image_id>" keys of the pictures that are going to be copied
# to the validation and training folders (original row order is preserved)
validation_pictures = train_data.loc[first_of_turtle, 'turtle_id-image_id'].tolist()
training_pictures = train_data.loc[~first_of_turtle, 'turtle_id-image_id'].tolist()

# turtle ids that already supplied a validation picture, in order of first appearance
turtle_id = train_data.loc[first_of_turtle, 'turtle_id'].tolist()

# number of validation and training pictures, derived from the split itself
# so the figures cannot drift out of sync with the data
N_val_pics = len(validation_pictures)
N_train_pics = len(training_pictures)

In [13]:
#Check the number of validation pictures (one per distinct turtle_id)
len(validation_pictures)

2231

In [14]:
#Check the number of training pictures
len(training_pictures)

8427

In [17]:
def copy_pictures(picture_keys, split_name):
    """Copy pictures from image_dir into root_dir/<split_name>/<turtle_id>/.

    Parameters
    ----------
    picture_keys : list of str
        Keys of the form "<turtle_id>-<image file name>".
    split_name : str
        Name of the dataset sub-folder, "train" or "val".

    Returns
    -------
    list of str
        The image file names that were copied, in input order.

    Raises
    ------
    OSError
        Propagated from shutil.copy, e.g. when the source picture or the
        target folder does not exist.
    """
    copied_names = []
    for key in picture_keys:
        # split at the FIRST dash only, so a dash inside the image file name
        # cannot cut the name short
        turtle, _, image_name = key.partition('-')

        source_path = image_dir + image_name
        target_path = root_dir + "/" + split_name + "/" + turtle + "/" + image_name

        # the picture is copied (not moved); the source folder stays intact
        shutil.copy(source_path, target_path)
        copied_names.append(image_name)
    return copied_names


# file names of the pictures placed in the validation and training folders
val_images = copy_pictures(validation_pictures, "val")
train_images = copy_pictures(training_pictures, "train")

## Create dataframes with new folder structure

In [19]:
import os
import pandas as pd

In [18]:
def list_jpg_files(base_dir):
    """Return a DataFrame listing every .jpg file found below base_dir.

    Parameters
    ----------
    base_dir : str
        Folder that is walked recursively.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns 'folder' (name of the directory
        that contains the file) and 'image_id' (the walked directory joined
        with the file name). Rows are ordered by directory, then file name.
        An empty frame with the same columns is returned when nothing is found.
    """
    records = []
    for current_dir, sub_dirs, file_names in os.walk(base_dir):
        # sorting in place makes os.walk descend into sub-folders alphabetically
        sub_dirs.sort()
        # normpath drops a trailing slash so basename yields the folder name
        folder = os.path.basename(os.path.normpath(current_dir))
        for file_name in sorted(file_names):
            # match the extension only, not ".jpg" anywhere inside the name
            if file_name.endswith(".jpg"):
                records.append((folder, os.path.join(current_dir, file_name)))
    return pd.DataFrame(records, columns=['folder', 'image_id'])


train_dir="../sorted_extra_images/train/"
df_train = list_jpg_files(train_dir)

val_dir="../sorted_extra_images/val/"
df_val = list_jpg_files(val_dir)


In [20]:
# Keep only the path column of each listing; from here on df_train and df_val
# are Series, so this cell cannot be run a second time without rebuilding them.
df_train = df_train.loc[:, 'image_id']
df_val = df_val.loc[:, 'image_id']

In [21]:
# Inspect the collected training image paths
df_train

0       ../sorted_extra_images/train/t_id_01N0gNOT/ID_...
1       ../sorted_extra_images/train/t_id_01N0gNOT/ID_...
2       ../sorted_extra_images/train/t_id_01N0gNOT/ID_...
3       ../sorted_extra_images/train/t_id_01N0gNOT/ID_...
4       ../sorted_extra_images/train/t_id_01N0gNOT/ID_...
                              ...                        
8422    ../sorted_extra_images/train/t_id_zqaXahAn/ID_...
8423    ../sorted_extra_images/train/t_id_zqaXahAn/ID_...
8424    ../sorted_extra_images/train/t_id_zqaXahAn/ID_...
8425    ../sorted_extra_images/train/t_id_zxFUnL5e/ID_...
8426    ../sorted_extra_images/train/t_id_zxFUnL5e/ID_...
Name: image_id, Length: 8427, dtype: object

In [22]:
# Reduce every stored path to its bare file name.
# os.path.basename is used instead of a fixed-width slice so that file names
# of any length are extracted intact, and iterating over the values avoids
# depending on the Series having a default 0..n-1 index.
train = [os.path.basename(image_path) for image_path in df_train]
val = [os.path.basename(image_path) for image_path in df_val]

In [23]:
# Peek at the last extracted training file name
train[-1]

'ID_9DUMXX40.jpg'

In [24]:
# Attach the image_location to every training image and save the table.

# Single-pass lookup table image file name -> image_location. This replaces a
# nested loop over all (training image, metadata row) pairs and matches names
# exactly rather than by substring. Should an image_id occur more than once in
# train_data, the last row wins.
location_by_image = dict(zip(train_data['image_id'], train_data['image_location']))

# A KeyError here means a file in the train folder has no row in train_data.
image_location_train = [location_by_image[image_name] for image_name in train]

d_train = {'image_id':train,'image_location':image_location_train}
df_train = pd.DataFrame(d_train)
df_train.to_csv('../data/df_sorted_extra_train.csv', index = False)

In [25]:
# Attach the image_location to every validation image and save the table.

# Single-pass lookup table image file name -> image_location. This replaces a
# nested loop over all (validation image, metadata row) pairs and matches names
# exactly rather than by substring. Should an image_id occur more than once in
# train_data, the last row wins.
location_by_image = dict(zip(train_data['image_id'], train_data['image_location']))

# A KeyError here means a file in the val folder has no row in train_data.
image_location_val = [location_by_image[image_name] for image_name in val]

d_val = {'image_id':val,'image_location':image_location_val}
df_val = pd.DataFrame(d_val)
df_val.to_csv('../data/df_sorted_extra_val.csv', index = False)