In [None]:
# Notebook to transform segmented images into np array and clean CSV

In [29]:
# Python ≥3.5 is required

import sys
assert sys.version_info >= (3, 5)

import random
import numpy as np
import pandas as pd
import os

import matplotlib.image as mpimg

from os import listdir
from os.path import isfile, join
import re

from PIL import Image as PIL_Image
import timeit




# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
# Specifying paths for output

In [3]:
# Where to save the figures from pre-processing

# Base output folder and sub-folder name used to build the figure path
PROJECT_ROOT_DIR = "output"
CHAPTER_ID = "segmentation"

# Figures are written to <PROJECT_ROOT_DIR>/figures/<CHAPTER_ID>; the folder is
# created up front (no error if it already exists) so save_fig can write into it
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "figures", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure into the IMAGES_PATH folder.

    The figure is stored as ``<IMAGES_PATH>/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    fig_id : str
        File name of the figure without extension.
    tight_layout : bool, default True
        If truthy, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        File extension; also handed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Handed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [4]:
# Specifying paths and switch for image data

In [33]:
# Switch between the sample (dummy) image data and the full image data
full_data_switch_on = False # set to True to use the full data set; False uses the sample data set

In [None]:
#train_switch = True  # Set False for Test
# Depending on whether we receive a test CSV, we have to split the train set into test & train ourselves


In [34]:
# Resolve the input locations for the selected data set (sample vs. full).
# All inputs live below ROOT_PATH_DATA. The root and the relative part are
# passed to os.path.join as separate arguments; joining one pre-concatenated
# string is a no-op and would silently break if the trailing "/" were removed
# from the root.
ROOT_PATH_DATA = "input/"

if full_data_switch_on == False:
    # sample data set: only train images and their labels are available
    PATH_DATA_TRAIN = os.path.join(ROOT_PATH_DATA, "03_segmented/train/sample_segmented_images/seg_train_sample_100")
    LABEL_DATA_PATH = os.path.join(ROOT_PATH_DATA, "01_raw/train_sample_100.csv")
elif full_data_switch_on == True:
    # full data set: train and test images, labels for train only
    PATH_DATA_TRAIN = os.path.join(ROOT_PATH_DATA, "03_segmented/train/full_set_segmented_images")
    PATH_DATA_TEST = os.path.join(ROOT_PATH_DATA, "03_segmented/test/full_set_segmented_images")
    LABEL_DATA_PATH = os.path.join(ROOT_PATH_DATA, "01_raw/train.csv")
    # csv for test not available yet
else:
    raise ValueError("Full data switch is not correctly defined")

In [7]:
## Getting an overview of the data - checking out CSV

In [23]:
# Load the label table from the raw CSV, which is comma-separated.
# NOTE: the separator differs between the raw CSV and the segmented CSV.
label_train_df = pd.read_csv(LABEL_DATA_PATH, sep = ',')

In [24]:
label_train_df.head(10)

Unnamed: 0,image,species,individual_id
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9
1,000562241d384d.jpg,humpback_whale,1a71fbb72250
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392
5,000a8f2d5c316a.jpg,bottlenose_dolphin,b9907151f66e
6,000be9acf46619.jpg,beluga,afb9b3978217
7,000bef247c7a42.jpg,humpback_whale,444d8894ccc8
8,000c3d63069748.jpg,beluga,df94b15285b9
9,000c476c11bad5.jpg,bottlenose_dolphin,b11b2404c7e3


In [25]:
label_train_df['species']. value_counts()

bottlenose_dolphin           9664
beluga                       7443
humpback_whale               7392
blue_whale                   4830
false_killer_whale           3326
dusky_dolphin                3139
spinner_dolphin              1700
melon_headed_whale           1689
minke_whale                  1608
killer_whale                 1493
fin_whale                    1324
gray_whale                   1123
bottlenose_dolpin            1117
kiler_whale                   962
southern_right_whale          866
spotted_dolphin               490
sei_whale                     428
short_finned_pilot_whale      367
common_dolphin                347
cuviers_beaked_whale          341
pilot_whale                   262
long_finned_pilot_whale       238
white_sided_dolphin           229
brydes_whale                  154
pantropic_spotted_dolphin     145
globis                        116
commersons_dolphin             90
pygmy_killer_whale             76
rough_toothed_dolphin          60
frasiers_dolph

In [11]:
#We have some typos in the species:

#E.g., there are both bottlenose_dolphin and bottlenose_dolpin, and both killer_whale and kiler_whale.
#These are obviously typos. Also, globis and pilot_whale both refer to short_finned_pilot_whale.

#We thus need to correct the labels:

In [26]:
# Map misspelled / alias species labels onto their canonical names.
# The cleaned column is assigned back to the DataFrame instead of calling
# replace(..., inplace=True) on the attribute-accessed column: under pandas
# Copy-on-Write that in-place call only changes a temporary Series and leaves
# label_train_df untouched. The assignment also makes the cell safe to re-run.
label_train_df["species"] = label_train_df["species"].replace({
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
    "kiler_whale": "killer_whale",
    "bottlenose_dolpin": "bottlenose_dolphin",
})

In [27]:
label_train_df['species']. value_counts()

bottlenose_dolphin           10781
beluga                        7443
humpback_whale                7392
blue_whale                    4830
false_killer_whale            3326
dusky_dolphin                 3139
killer_whale                  2455
spinner_dolphin               1700
melon_headed_whale            1689
minke_whale                   1608
fin_whale                     1324
gray_whale                    1123
southern_right_whale           866
short_finned_pilot_whale       745
spotted_dolphin                490
sei_whale                      428
common_dolphin                 347
cuviers_beaked_whale           341
long_finned_pilot_whale        238
white_sided_dolphin            229
brydes_whale                   154
pantropic_spotted_dolphin      145
commersons_dolphin              90
pygmy_killer_whale              76
rough_toothed_dolphin           60
frasiers_dolphin                14
Name: species, dtype: int64

In [14]:
def store_csv(label_df):
    """Write the cleaned label table to ``input/04_cleaned/train`` as a ';'-separated CSV.

    The file name depends on the notebook-level ``full_data_switch_on`` flag:
    ``clean_train.csv`` for the full data set and ``clean_sample_train.csv``
    for the sample data set.

    Parameters
    ----------
    label_df : pandas.DataFrame
        Label table to store; it is written including its index.

    Raises
    ------
    ValueError
        If ``full_data_switch_on`` equals neither True nor False.
    """
    output_dir = "input/04_cleaned/train"

    if full_data_switch_on == True:
        file_name = "clean_train.csv"
    elif full_data_switch_on == False:
        file_name = "clean_sample_train.csv"
    else:
        raise ValueError("Full data switch is not correctly defined")

    # to_csv does not create missing folders, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)
    label_df.to_csv(output_dir + "/" + file_name, sep=';')
    print("dataframe successfully stored in: input/04_cleaned/train")

In [28]:
store_csv(label_train_df)

dataframe successfully stored in: input/04_cleaned/


In [30]:
# Function to transform images into np.array

In [39]:
# Defining function to list all pictures to include
def list_files(dir):
    """Recursively collect the paths of all files below ``dir``.

    Parameters
    ----------
    dir : str
        Folder to walk; files in sub-folders are included. The name shadows
        the built-in ``dir`` but is kept so keyword callers keep working.

    Returns
    -------
    list of str
        One path per file (walk root joined with the file name), sorted
        lexicographically. A folder that does not exist yields an empty list.
    """
    found = [
        os.path.join(root, name)
        for root, _subdirs, files in os.walk(dir)
        for name in files
    ]
    # os.walk reports entries in arbitrary, file-system dependent order.
    # Sorting makes the row order of the image arrays built from this list
    # reproducible across runs and machines.
    found.sort()
    return found

In [40]:
# Collect the test image paths. PATH_DATA_TEST is only defined for the full
# data set, so filepaths_test exists only when the full-data switch is on.
if full_data_switch_on == True:
    filepaths_test = list_files(PATH_DATA_TEST)

In [41]:
# Collect the train image paths (sample or full set, depending on PATH_DATA_TRAIN)
filepaths_train = list_files(PATH_DATA_TRAIN)

In [42]:
# Defining function that performs translation of jpg files into numerical representation
def _load_images_as_rows(filepaths, target_pixel):
    """Load images, resize them to target_pixel x target_pixel and flatten each one into a row."""
    array_length = target_pixel * target_pixel * 3  # times 3 as we have 3 values (RGB) per pixel

    # Allocate the result once. Stacking inside the loop copies the complete
    # array for every image, which makes the run-time quadratic.
    loaded_pics = np.empty([len(filepaths), array_length])

    for row, filename in enumerate(filepaths):
        # the context manager closes the image file once its pixels are read
        with PIL_Image.open(filename) as pic:
            # Force three channels so that grayscale / RGBA / palette images
            # also produce exactly array_length values per row.
            pic_resized = pic.convert("RGB").resize((target_pixel, target_pixel))
            # flatten the (target_pixel, target_pixel, 3) array into 1D
            loaded_pics[row] = np.asarray(pic_resized).reshape(-1)

    return loaded_pics


def pixel_transformation(target_pixel, train_switch):
    """Translate the train or test image files into one numerical array.

    Every image is converted to RGB, resized to ``target_pixel`` x
    ``target_pixel`` and flattened. Start and end messages including the
    run-time are printed.

    Parameters
    ----------
    target_pixel : int
        Edge length in pixels of the resized (square) images.
    train_switch : bool
        True processes the files in the notebook-level ``filepaths_train``,
        False those in ``filepaths_test`` (only defined for the full data set).

    Returns
    -------
    numpy.ndarray
        Array of shape (number of images, target_pixel * target_pixel * 3)
        with one flattened image per row, in the order of the file list.

    Raises
    ------
    ValueError
        If ``train_switch`` equals neither True nor False.
    """
    if train_switch == True:
        filepaths, subset = filepaths_train, "train"
    elif train_switch == False:
        filepaths, subset = filepaths_test, "test"
    else:
        # fail loudly instead of printing and silently returning None
        raise ValueError("train switch is not defined correctly")

    # Run-time information
    start_time = timeit.default_timer()
    print(">... Starting pixel transformation for " + subset + " images for resolution: ", target_pixel, "x", target_pixel)

    loaded_pics = _load_images_as_rows(filepaths, target_pixel)

    # End run-time information
    elapsed = timeit.default_timer() - start_time
    print("Finished", target_pixel, "x", target_pixel, "pixel transformation for " + subset + " images. Run-time in seconds: ", round(elapsed, 2))

    return loaded_pics

In [None]:
# make transformation for test data, train data, sample data
# depending on the full_data_switch_on configuration, the transformation is done for the FULL data set (test or train) or for the SAMPLE data set (train only)
# if the function is called, the train_switch must be set to True or False

In [43]:
image_data_train_sample = pixel_transformation(224, train_switch=True)


>... Starting pixel transformation for train images for resolution:  224 x 224
Finished 224 x 224 pixel transformation for train images. Run-time in seconds:  6.45


In [None]:
# set full_data_switch_on to True and re-run the cells above before running this cell
image_data_full_train = pixel_transformation(224, train_switch=True)
image_data_full_test = pixel_transformation(224, train_switch=False)

In [None]:
# define function to store np.arrays in respective data folders

In [44]:
from numpy import savez_compressed

In [47]:
def store_dict_train(image_df, target_pixel):
    """Store the train image array as a compressed .npz file in ``input/04_cleaned/train``.

    The file is named ``img_data_full_<target_pixel>.npz`` or
    ``img_data_sample_<target_pixel>.npz`` depending on the notebook-level
    ``full_data_switch_on`` flag. The array is passed positionally, so numpy
    stores it under the key ``arr_0``.

    Parameters
    ----------
    image_df : numpy.ndarray
        Image data to store (despite the name, an array, not a DataFrame).
    target_pixel : int
        Resolution used for the transformation; only used in the file name.

    Raises
    ------
    ValueError
        If ``full_data_switch_on`` equals neither True nor False.
    """
    if full_data_switch_on == True:
        data_set = "full"
    elif full_data_switch_on == False:
        data_set = "sample"
    else:
        raise ValueError("Full data switch is not correctly defined")

    output_dir = "input/04_cleaned/train"
    output_file = output_dir + "/img_data_" + data_set + "_" + str(target_pixel) + ".npz"

    # numpy does not create missing folders, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)
    # np.savez_compressed is reached through the top-level numpy import, so the
    # function does not rely on a later import cell having been executed
    np.savez_compressed(output_file, image_df)
    print("file successfully stored in: " + output_file)

In [48]:
def store_dict_test(image_df, target_pixel):
    """Store the test image array as a compressed .npz file in ``input/04_cleaned/test``.

    The file is named ``img_data_full_<target_pixel>.npz``. The array is
    passed positionally, so numpy stores it under the key ``arr_0``.

    Parameters
    ----------
    image_df : numpy.ndarray
        Image data to store (despite the name, an array, not a DataFrame).
    target_pixel : int
        Resolution used for the transformation; only used in the file name.
    """
    output_dir = "input/04_cleaned/test"
    output_file = output_dir + "/img_data_full_" + str(target_pixel) + ".npz"

    # numpy does not create missing folders, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)
    # np.savez_compressed is reached through the top-level numpy import, so the
    # function does not rely on a later import cell having been executed
    np.savez_compressed(output_file, image_df)
    print("file successfully stored in: " + output_file)


In [49]:
store_dict_train(image_data_train_sample,target_pixel=224)

file successfully stored in: input/04_cleaned/train/img_data_sample_224.npz
