## 1. Preprocess-GroupImages

### Import packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import os
import zipfile
import pickle
from PIL import Image
from shutil import copy2

### Unzip files

In [2]:
def Unzip(data_path, zip_name):
    """Extract ``zip_name`` into ``data_path`` unless it was extracted before.

    The archive name minus its extension (e.g. ``train.zip`` -> ``train``,
    ``labels.csv.zip`` -> ``labels.csv``) is used as the marker of a previous
    extraction: if a file or folder with that name already exists under
    ``data_path``, nothing is done.

    Args:
        data_path: Directory that holds the archive and receives its content.
        zip_name: Archive file name. It is joined onto ``data_path`` with
            ``os.path.join``, so an absolute path is accepted as well.

    Raises:
        FileNotFoundError: If the archive cannot be found when extraction
            is attempted (raised by ``zipfile.ZipFile``).
    """
    # splitext removes exactly the last extension, whatever its length,
    # instead of blindly cutting the final four characters.
    extract_name = os.path.splitext(zip_name)[0]
    extract_path = os.path.join(data_path, extract_name)
    zip_path = os.path.join(data_path, zip_name)

    # Already extracted (as a single file or as a folder): nothing to do.
    if os.path.isdir(extract_path) or os.path.isfile(extract_path):
        return

    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(data_path)

In [3]:
# The data folder is resolved relative to the notebook's working directory.
cwd = os.getcwd()
data_path = os.path.join(cwd, 'input')

# Unzip joins data_path onto the archive name itself, so only bare file names
# are passed here; pre-joining them only worked because data_path is absolute.
for zip_name in ('labels.csv.zip', 'sample_submission.csv.zip', 'test.zip', 'train.zip'):
    Unzip(data_path, zip_name)

### Group train data by class
**Note: We create the folder structure (one subfolder per class) for `train_datagen.flow_from_directory(...)`.**

In [4]:
# Load the label table from the extracted CSV, then report its size
# and preview the first two rows.
labels_path = os.path.join(data_path, 'labels.csv')
labels = pd.read_csv(labels_path)

print('labels.shape is {0}.'.format(labels.shape))
display(labels.iloc[:2])

labels.shape is (10222, 2).


Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo


In [5]:
# Distinct values of the second column (the breed), in sorted order.
label_classes = sorted(labels.iloc[:, 1].unique())

display('The breeds of dogs is {0}'.format(len(label_classes)))
# The full list is displayed so the breed names can be checked by eye.
display(label_classes)

'The breeds of dogs is 120'

['affenpinscher',
 'afghan_hound',
 'african_hunting_dog',
 'airedale',
 'american_staffordshire_terrier',
 'appenzeller',
 'australian_terrier',
 'basenji',
 'basset',
 'beagle',
 'bedlington_terrier',
 'bernese_mountain_dog',
 'black-and-tan_coonhound',
 'blenheim_spaniel',
 'bloodhound',
 'bluetick',
 'border_collie',
 'border_terrier',
 'borzoi',
 'boston_bull',
 'bouvier_des_flandres',
 'boxer',
 'brabancon_griffon',
 'briard',
 'brittany_spaniel',
 'bull_mastiff',
 'cairn',
 'cardigan',
 'chesapeake_bay_retriever',
 'chihuahua',
 'chow',
 'clumber',
 'cocker_spaniel',
 'collie',
 'curly-coated_retriever',
 'dandie_dinmont',
 'dhole',
 'dingo',
 'doberman',
 'english_foxhound',
 'english_setter',
 'english_springer',
 'entlebucher',
 'eskimo_dog',
 'flat-coated_retriever',
 'french_bulldog',
 'german_shepherd',
 'german_short-haired_pointer',
 'giant_schnauzer',
 'golden_retriever',
 'gordon_setter',
 'great_dane',
 'great_pyrenees',
 'greater_swiss_mountain_dog',
 'groenendael',


In [6]:
## Create data_train folder
data_train_path = os.path.join(data_path, 'data_train')
if os.path.isdir(data_train_path):
    print('{0} is exist!'.format(data_train_path))
else:
    os.mkdir(data_train_path)
    print('{0} created!'.format(data_train_path))

## Create subfolders of data_train folder (one per class).
## This runs even when data_train already existed, so a folder left over from
## an interrupted run still ends up with every class subfolder.
for c in label_classes:
    os.makedirs(os.path.join(data_train_path, c), exist_ok=True)
print(os.listdir(data_train_path))

D:\ref\Kaggle\dog-breed-identification\input\data_train created!
['affenpinscher', 'afghan_hound', 'african_hunting_dog', 'airedale', 'american_staffordshire_terrier', 'appenzeller', 'australian_terrier', 'basenji', 'basset', 'beagle', 'bedlington_terrier', 'bernese_mountain_dog', 'black-and-tan_coonhound', 'blenheim_spaniel', 'bloodhound', 'bluetick', 'border_collie', 'border_terrier', 'borzoi', 'boston_bull', 'bouvier_des_flandres', 'boxer', 'brabancon_griffon', 'briard', 'brittany_spaniel', 'bull_mastiff', 'cairn', 'cardigan', 'chesapeake_bay_retriever', 'chihuahua', 'chow', 'clumber', 'cocker_spaniel', 'collie', 'curly-coated_retriever', 'dandie_dinmont', 'dhole', 'dingo', 'doberman', 'english_foxhound', 'english_setter', 'english_springer', 'entlebucher', 'eskimo_dog', 'flat-coated_retriever', 'french_bulldog', 'german_shepherd', 'german_short-haired_pointer', 'giant_schnauzer', 'golden_retriever', 'gordon_setter', 'greater_swiss_mountain_dog', 'great_dane', 'great_pyrenees', 'gro

In [7]:
## Create data_val folder
data_val_path = os.path.join(data_path, 'data_val')
if os.path.isdir(data_val_path):
    print('{0} is exist!'.format(data_val_path))
else:
    os.mkdir(data_val_path)
    print('{0} created!'.format(data_val_path))

## Create subfolders of data_val folder (one per class).
## This runs even when data_val already existed, so a folder left over from
## an interrupted run still ends up with every class subfolder.
for c in label_classes:
    os.makedirs(os.path.join(data_val_path, c), exist_ok=True)
print(os.listdir(data_val_path))

D:\ref\Kaggle\dog-breed-identification\input\data_val created!
['affenpinscher', 'afghan_hound', 'african_hunting_dog', 'airedale', 'american_staffordshire_terrier', 'appenzeller', 'australian_terrier', 'basenji', 'basset', 'beagle', 'bedlington_terrier', 'bernese_mountain_dog', 'black-and-tan_coonhound', 'blenheim_spaniel', 'bloodhound', 'bluetick', 'border_collie', 'border_terrier', 'borzoi', 'boston_bull', 'bouvier_des_flandres', 'boxer', 'brabancon_griffon', 'briard', 'brittany_spaniel', 'bull_mastiff', 'cairn', 'cardigan', 'chesapeake_bay_retriever', 'chihuahua', 'chow', 'clumber', 'cocker_spaniel', 'collie', 'curly-coated_retriever', 'dandie_dinmont', 'dhole', 'dingo', 'doberman', 'english_foxhound', 'english_setter', 'english_springer', 'entlebucher', 'eskimo_dog', 'flat-coated_retriever', 'french_bulldog', 'german_shepherd', 'german_short-haired_pointer', 'giant_schnauzer', 'golden_retriever', 'gordon_setter', 'greater_swiss_mountain_dog', 'great_dane', 'great_pyrenees', 'groen

In [8]:
## Make sure <data_path>/data_test exists, reporting whether it had to be created
data_test_path = os.path.join(data_path, 'data_test')
if not os.path.isdir(data_test_path):
    os.mkdir(data_test_path)
    print('{0} created!'.format(data_test_path))
else:
    print('{0} is exist!'.format(data_test_path))

## Same for its single 'test' subfolder
data_test_sub_path = os.path.join(data_test_path, 'test')
if os.path.isdir(data_test_sub_path):
    print('{0} is exist!'.format(data_test_sub_path))
else:
    os.mkdir(data_test_sub_path)
    print('{0} created!'.format(data_test_sub_path))

D:\ref\Kaggle\dog-breed-identification\input\data_test created!
D:\ref\Kaggle\dog-breed-identification\input\data_test\test created!


In [9]:
# Sequential split (no shuffling): the first `rate` share of the rows becomes
# the training part, the remaining rows become the validation part.
rate = 0.9
total_count = labels.shape[0]
train_count = int(rate * total_count)

labels_train = labels.iloc[:train_count]
labels_val = labels.iloc[train_count:]

print('total_count = {0}, train_count = {1}, val_count = {2}'.format(
    total_count, len(labels_train), len(labels_val)))

total_count = 10222, train_count = 9199, val_count = 1023


In [10]:
# Preview the first three label rows.
labels.head(3)

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese


In [11]:
# Copy every training image into data_train/<breed>/.
# The copy is skipped when it appears to have been done already; only the
# first class subfolder is inspected for that check.
target_dir = os.path.join(data_path, 'data_train', 'affenpinscher')

if os.listdir(target_dir):
    print(target_dir + ' is not empty, do not need move images again.')
else:
    print('start to move images into data_train.')
    train_image_dir = os.path.join(data_path, 'train')
    # itertuples yields plain (id, breed) tuples; this avoids integer-position
    # lookups such as row[0] on a label-indexed Series, which pandas has
    # deprecated. Assumes labels has exactly the two columns id and breed.
    for image_id, breed in labels_train.itertuples(index=False, name=None):
        image_path = os.path.join(train_image_dir, '{0}.jpg'.format(image_id))
        # The source file is copied (not moved), so 'train' stays intact.
        copy2(image_path, os.path.join(data_path, 'data_train', breed))
print('finish')

start to move images into data_train.
finish


In [12]:
# Copy every validation image into data_val/<breed>/.
# The copy is skipped when it appears to have been done already; only the
# first class subfolder is inspected for that check.
target_dir = os.path.join(data_path, 'data_val', 'affenpinscher')

if os.listdir(target_dir):
    print(target_dir + ' is not empty, do not need move images again.')
else:
    print('start to move images into data_val.')
    train_image_dir = os.path.join(data_path, 'train')
    # itertuples yields plain (id, breed) tuples; this avoids integer-position
    # lookups such as row[0] on a label-indexed Series, which pandas has
    # deprecated. Assumes labels has exactly the two columns id and breed.
    for image_id, breed in labels_val.itertuples(index=False, name=None):
        image_path = os.path.join(train_image_dir, '{0}.jpg'.format(image_id))
        # The source file is copied (not moved), so 'train' stays intact.
        copy2(image_path, os.path.join(data_path, 'data_val', breed))
print('finish')

start to move images into data_val.
finish


In [13]:
# Copy the test images into data_test/test unless that folder already has content.
target_dir = os.path.join(data_path, 'data_test', 'test')

if not os.listdir(target_dir):
    print('start to move images into data_test.')

    # Every entry of <data_path>/test is copied into the test subfolder
    test_source_dir = os.path.join(data_path, 'test')
    for file_name in os.listdir(test_source_dir):
        copy2(os.path.join(test_source_dir, file_name), data_test_sub_path)
else:
    print(target_dir + ' is not empty, do not need move images again.')
print('finish')

start to move images into data_test.
finish


In [14]:
# Final marker: reaching this line means every preceding cell ran through.
print('Done!')

Done!
