In [1]:
# Import all packages & Dependencies

import sys
import os
import tarfile
import shutil
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from utils import *
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from scipy import ndimage, io, misc
from xml.dom import minidom
from matplotlib.pyplot import imshow
%matplotlib inline

In [9]:
# Switch to the project directory; the relative paths used by the later cells
# ('train', 'test', 'cropped', 'Images/', 'Annotation/') are resolved from here.
# NOTE(review): machine-specific absolute path - adjust for your environment.
cd E:\dog-breed-classifier

E:\dog-breed-classifier


## Creating the Path to Directory

In [14]:
def file_create(path):
    """Create the directory ``path`` unless something already exists there.

    Missing parent directories are created as well, so nested paths such as
    ``'cropped/train'`` work even when ``'cropped'`` has not been created yet.

    Args:
        path: Directory path to create (relative or absolute).
    """
    if not os.path.exists(path):
        # exist_ok=True closes the window between the existence check above
        # and the creation, should something else create the directory first.
        os.makedirs(path, exist_ok=True)

In [15]:
# Working directories for the raw train/test splits and their cropped
# counterparts. 'cropped' is listed before its two sub-directories.
for directory in ('train', 'test', 'cropped', 'cropped/train', 'cropped/test'):
    file_create(directory)

# Data Source
url = 'http://vision.stanford.edu/aditya86/ImageNetDogs/'
data_root = '.'  # downloads and extracted archives are placed here

# Last percentage printed; read and updated by download_progress_hook.
last_percent_reported = None

# Dataset and image configuration shared by the cells below.
num_classes = 120
image_size = 224
num_channels = 3

# Seed NumPy's global RNG, which np.random.shuffle in merge_datasets draws from.
np.random.seed(133)

## Download Progress

In [17]:
# Progress reporting for the data download
# Source: stackoverflow

def download_progress_hook(count, blockSize, totalSize):
    """Print download progress to stdout; used as the ``urlretrieve`` reporthook.

    Every time the integer percentage changes, the percentage itself is printed
    when it is a multiple of 5 and a single dot otherwise. The last reported
    percentage is kept in the module-level ``last_percent_reported``.

    Args:
        count: Number of blocks transferred so far.
        blockSize: Size of one block in bytes.
        totalSize: Total size of the file in bytes. When it is not positive
            (size unknown or empty file) nothing is reported.
    """
    global last_percent_reported

    # Without a positive total size no percentage can be computed: 0 would
    # divide by zero and a negative value would yield negative percentages.
    if totalSize <= 0:
        return

    # The final block is usually partial, so count * blockSize can exceed
    # totalSize; cap the value so that no more than 100% is ever reported.
    percent = min(int(count * blockSize * 100 / totalSize), 100)

    if last_percent_reported != percent:
        sys.stdout.write("%s%%" % percent if percent % 5 == 0 else ".")
        sys.stdout.flush()

    last_percent_reported = percent

In [18]:
def maybe_download(filename, expected_bytes, force=False):
    """Fetch ``filename`` from ``url`` into ``data_root`` and verify its size.

    The download is skipped when the destination file already exists, unless
    ``force`` is true. In either case the size of the file on disk is compared
    with ``expected_bytes`` afterwards.

    Args:
        filename: Name of the file, appended to ``url`` and to ``data_root``.
        expected_bytes: Size in bytes the file on disk must have.
        force: Download even if the destination file already exists.

    Returns:
        The path of the verified file inside ``data_root``.

    Raises:
        Exception: If the size of the file on disk differs from ``expected_bytes``.
    """
    target = os.path.join(data_root, filename)

    already_there = os.path.exists(target)
    if force or not already_there:
        print('Attempting to download:', filename)
        urlretrieve(url + filename, target, reporthook=download_progress_hook)
        print('\nDownload Complete!')

    # Guard clause: anything but an exact size match is treated as a failure.
    if os.stat(target).st_size != expected_bytes:
        raise Exception('Failed to verify ' + target + '. Can you get to it with a browser?')

    print('Found and verified', target)
    return target

In [19]:
def maybe_extract(filename, check_classes=True, force=False):
    """Extract the tar archive ``filename`` into ``data_root`` if needed.

    Extraction is skipped when a directory named like the archive without its
    extension already exists, unless ``force`` is true.

    Args:
        filename: Path of the tar archive.
        check_classes: When true, list the sub-directories of the extracted
            root and require exactly ``num_classes`` of them.
        force: Extract even if the root directory already exists.

    Returns:
        The sorted list of class directories (root joined with each
        sub-directory name) when ``check_classes`` is true, otherwise ``None``.

    Raises:
        Exception: If ``check_classes`` is true and the number of
            sub-directories differs from ``num_classes``.
    """
    root = os.path.splitext(filename)[0]  # remove .tar
    if os.path.isdir(root) and not force:
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        sys.stdout.flush()
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive, and the archives here are downloaded from `url`; consider an
        # extraction filter on Python versions that support one.
        with tarfile.open(filename) as tar:
            tar.extractall(data_root)

    data_folders = None
    if check_classes:
        data_folders = [os.path.join(root, d) for d in sorted(os.listdir(root))
                        if os.path.isdir(os.path.join(root, d))]
        if len(data_folders) != num_classes:
            raise Exception('Expected %d folders, one per class. Found %d instead.' % (num_classes, len(data_folders)))

    print('Completed extraction of %s.' % filename)
    return data_folders

## Extracting & Downloading 

In [23]:
# Fetch the three archives from `url` (maybe_download skips the transfer when
# the file is already in data_root). The second argument is the expected file
# size in bytes, which maybe_download checks against the file on disk.
images_filename = maybe_download('images.tar', 793579520)
annotation_filename = maybe_download('annotation.tar', 21852160)
lists_filename = maybe_download('lists.tar', 481280)

Attempting to download: images.tar
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and verified .\images.tar
Attempting to download: annotation.tar
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
Found and verified .\annotation.tar
Attempting to download: lists.tar
0%..5%..10%..15%..20%..25%..30%..35%..40%..45%...................80%..85%..90%..95%..100%
Download Complete!
Found and verified .\lists.tar


In [86]:
# Plain archive names, replacing the paths returned by maybe_download, so that
# maybe_extract derives the roots 'images' and 'annotation'.
images_filename = 'images.tar'
annotation_filename = 'annotation.tar'

# With the default check_classes=True each call returns the per-class
# sub-folders and raises if their number differs from num_classes.
images_folders = maybe_extract(images_filename)
annotation_folders = maybe_extract(annotation_filename)
# lists_filename is not reassigned in this cell: it still holds the value
# returned by maybe_download('lists.tar', ...), so that cell must have run.
maybe_extract(lists_filename, check_classes=False)

Extracting data for images. This may take a while. Please wait.
Completed extraction of images.tar.
annotation already present - Skipping extraction of annotation.tar.
Completed extraction of annotation.tar.
Extracting data for .\lists. This may take a while. Please wait.
Completed extraction of .\lists.tar.


In [87]:
# Cropped Train-Test Images

# Mirror every breed folder under the train/test splits and their cropped
# counterparts.
for folder in images_folders:
    # basename() copes with whichever separator os.path.join used to build the
    # folder path, unlike splitting on a hard-coded backslash.
    breed = os.path.basename(folder)
    for parent in ('train', 'test', 'cropped/train', 'cropped/test'):
        # exist_ok=True lets this cell be re-run once the folders exist.
        os.makedirs(os.path.join(parent, breed), exist_ok=True)

# File lists of the two splits; each is read from the 'file_list' entry of
# the corresponding .mat file.
test_list = io.loadmat('test_list.mat')['file_list']
train_list = io.loadmat('train_list.mat')['file_list']

In [88]:
def move_data_files(image_list, new_folder):
    """Move the images named in ``image_list`` from ``Images/`` into ``new_folder``.

    A file that is neither under ``Images/`` nor already under ``new_folder``
    is reported on stdout and skipped.

    Args:
        image_list: Iterable whose entries hold the relative image path at
            ``entry[0][0]`` (the layout of the ``file_list`` arrays loaded
            with ``io.loadmat`` in this notebook).
        new_folder: Destination root; the sub-directories named in the
            relative paths must already exist below it.

    Returns:
        Sorted list of ``new_folder + '/' + name`` for every sub-directory of
        ``new_folder``.
    """
    for file in image_list:
        relative_path = file[0][0]
        source = 'Images/' + relative_path
        destination = new_folder + '/' + relative_path
        if os.path.exists(source):
            shutil.move(source, destination)
        elif not os.path.exists(destination):
            # Name the missing file itself; the previous message interpolated
            # the boolean result of os.path.exists() instead of the path.
            print('%s does not exist, it may be missing' % source)
    return [new_folder+'/'+d for d in sorted(os.listdir(new_folder)) if os.path.isdir(os.path.join(new_folder, d))]

In [89]:
# Move the images named in each list out of 'Images/' into its split
# directory; the return value is the sorted list of per-breed folders of that
# split.
test_folders = move_data_files(test_list, 'test')
train_folders = move_data_files(train_list, 'train')

In [104]:
# Load Data from Single Breed
# Source: Stackoverflow

def load_breed(folder):
    """Load, crop and resize every image of one breed folder into a 4-D array.

    For each file in ``folder`` the image is read, cropped to the first
    bounding box (``xmin``/``ymin``/``xmax``/``ymax``) found in the matching
    file under ``Annotation/``, resized to ``image_size`` x ``image_size``,
    saved under ``cropped/<folder>/`` and stored in the result array.
    Files that raise ``IOError`` are reported and skipped.

    Args:
        folder: Breed directory written with ``/`` separators, e.g.
            ``'train/n02105855-Shetland_sheepdog'``; its last component selects
            the annotation sub-directory.

    Returns:
        ``float32`` array of shape
        ``(n_loaded, image_size, image_size, num_channels)``.
    """
    # NOTE(review): misc.imread / imresize / imsave are not available in newer
    # SciPy releases - confirm the SciPy version this notebook is pinned to.
    image_files = os.listdir(folder)
    # Allocated for the worst case (every file loads) and trimmed at the end.
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size, num_channels), dtype=np.float32)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = folder + '/' + image
        try:
            image_data = misc.imread(image_file)

            # The annotation file is named like the image, without extension.
            annon_file = 'Annotation' + '/' + folder.split('/')[-1] + '/' + image.split('.')[0]
            annon_xml = minidom.parse(annon_file)
            xmin = int(annon_xml.getElementsByTagName('xmin')[0].firstChild.nodeValue)
            ymin = int(annon_xml.getElementsByTagName('ymin')[0].firstChild.nodeValue)
            xmax = int(annon_xml.getElementsByTagName('xmax')[0].firstChild.nodeValue)
            ymax = int(annon_xml.getElementsByTagName('ymax')[0].firstChild.nodeValue)

            # Keep only the first num_channels channels: an image carrying an
            # extra (alpha) channel would otherwise not fit into the
            # num_channels-wide dataset slot below and abort the whole run.
            new_image_data = image_data[ymin:ymax, xmin:xmax, :num_channels]
            new_image_data = misc.imresize(new_image_data, (image_size, image_size))
            misc.imsave('cropped/' + folder + '/' + image, new_image_data)
            dataset[num_images, :, :, :] = new_image_data
            num_images = num_images + 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    # Drop the unused tail left by skipped files.
    dataset = dataset[0:num_images, :, :, :]

    print('Full dataset tensor:', dataset.shape)
    return dataset

# Check the Img Shape

# Sanity check on one cropped test image: load it through Keras and display
# the shape of the resulting array.
# NOTE(review): target_size is (229, 229) while load_breed resizes to
# image_size (224) - confirm which size is intended.
from keras.preprocessing import image
i=image.load_img('cropped/test/n02085620-Chihuahua/n02085620_588.jpg',target_size=(229,229))
image.img_to_array(i).shape

(229, 229, 3)

## Pickle the data

In [30]:
def maybe_pickle(data_folders, force=False):
    """Pickle the image tensor of each breed folder to ``<folder>.pickle``.

    A folder whose pickle file already exists is skipped unless ``force`` is
    true. A failure while writing a pickle is printed and does not stop the
    loop; the file name is still part of the returned list.

    Args:
        data_folders: Breed folder paths handed to ``load_breed``.
        force: Rebuild pickles even when the file already exists.

    Returns:
        The pickle file names, one per entry of ``data_folders``, in order.
    """
    pickled_paths = []
    for breed_dir in data_folders:
        target = breed_dir + '.pickle'
        pickled_paths.append(target)

        if not force and os.path.exists(target):
            print('%s already present - Skipping pickling.' % target)
            continue

        print('Pickling %s.' % target)
        breed_tensor = load_breed(breed_dir)
        try:
            with open(target, 'wb') as out:
                pickle.dump(breed_tensor, out, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', target, ':', e)

    return pickled_paths

# Convert to 4D Tensor

# Build the image tensor for a single breed and pickle it in the working
# directory (not next to the breed folder, where maybe_pickle writes).
dataset = load_breed('train/n02105855-Shetland_sheepdog')
with open('n02105855-Shetland_sheepdog.pickle', 'wb') as f:
    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

train/n02105855-Shetland_sheepdog
Full dataset tensor: (100, 224, 224, 3)


In [9]:
# Train, Test- pickled Datasets

# Keep directories only: maybe_pickle writes '<folder>.pickle' files next to
# the breed folders, so a plain listing would pick those files up on a re-run.
# Sorting gives train and test the same breed order; merge_datasets derives
# the class label from the position in these lists.
train_folders = ['train' + '/' + d for d in sorted(os.listdir('train'))
                 if os.path.isdir(os.path.join('train', d))]
test_folders = ['test' + '/' + d for d in sorted(os.listdir('test'))
                if os.path.isdir(os.path.join('test', d))]

# force=True rebuilds every pickle even if one already exists.
train_datasets = maybe_pickle(train_folders, force=True)
test_datasets = maybe_pickle(test_folders, force=True)

def make_arrays(nb_rows, img_size):
    """Allocate uninitialised image and label arrays for ``nb_rows`` samples.

    Args:
        nb_rows: Number of samples; a falsy value (``0`` or ``None``) means no
            arrays are needed.
        img_size: Height and width of each square image.

    Returns:
        ``(dataset, labels)`` where ``dataset`` is a ``float32`` array of shape
        ``(nb_rows, img_size, img_size, num_channels)`` and ``labels`` an
        ``int32`` array of length ``nb_rows``; ``(None, None)`` when
        ``nb_rows`` is falsy.
    """
    if not nb_rows:
        return None, None

    image_shape = (nb_rows, img_size, img_size, num_channels)
    images = np.ndarray(image_shape, dtype=np.float32)
    targets = np.ndarray(nb_rows, dtype=np.int32)
    return images, targets

In [10]:
# Merge all datasets

def merge_datasets(pickle_files, train_size, valid_size=0, even_size=True):
    """Merge per-breed pickles into validation and training arrays with labels.

    Each pickle is loaded, shuffled along its first axis, and split: the first
    ``valid_size // len(pickle_files)`` images go to the validation set, the
    following ones to the training set. The label of a breed is its position
    in ``pickle_files``.

    Args:
        pickle_files: Paths of the per-breed pickles (arrays as written from
            ``load_breed`` results, one image per row).
        train_size: Total number of rows allocated for the training set.
        valid_size: Total number of rows allocated for the validation set;
            ``0`` disables the validation split.
        even_size: When true every breed contributes
            ``train_size // len(pickle_files)`` training images. When false
            every image of a breed left after the validation share is used, so
            ``train_size`` must equal the sum of those counts.

    Returns:
        ``(valid_dataset, valid_labels, train_dataset, train_labels)``; the two
        validation entries are ``None`` when ``valid_size`` is falsy.

    Raises:
        Exception: Any error raised while processing a pickle is printed and
            re-raised.
    """
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only feed it pickles written by this notebook.
            with open(pickle_file, 'rb') as f:
                breed_set = pickle.load(f)
            np.random.shuffle(breed_set)

            if not even_size:
                # Use every image of this breed. The training share is what
                # remains after the validation images, so the source slice
                # below and the destination slice have the same length even
                # when valid_size is non-zero.
                end_l = len(breed_set)
                tsize_per_class = end_l - vsize_per_class
                end_t = start_t + tsize_per_class

            if valid_dataset is not None:
                valid_breed = breed_set[:vsize_per_class, :, :, :]
                valid_dataset[start_v:end_v, :, :, :] = valid_breed
                valid_labels[start_v:end_v] = label
                start_v += vsize_per_class
                end_v += vsize_per_class

            train_breed = breed_set[vsize_per_class:end_l, :, :, :]
            train_dataset[start_t:end_t, :, :, :] = train_breed
            train_labels[start_t:end_t] = label
            start_t += tsize_per_class
            end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels

In [26]:
# Train, Test, Validation Size

# Totals handed to merge_datasets, which divides train_size and valid_size
# evenly by the number of pickle files (9600 / 120 = 80 and 2400 / 120 = 20
# images per breed, assuming one pickle per each of the num_classes breeds).
train_size = 9600
valid_size = 2400
# With even_size=False every image of each test pickle is copied, so this must
# equal the total number of test images.
test_size = 8580

valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
  train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size, even_size=False)
# Number of entries in each 'train' sub-folder, skipping names that end in
# 'pickle'. `a` is not used again in the visible part of the notebook.
a=[len(os.listdir("train"+'/'+d)) for d in os.listdir('train') if  not d.endswith('pickle')]

# Save numpy arrays

from utils import *
# Persist the merged arrays. Passing the file name lets np.save open and close
# the file itself; the former open(...) handles were never closed, so the data
# was only flushed whenever the handle happened to be garbage-collected.
np.save('train_dataset.npy', train_dataset)
np.save('train_labels.npy', train_labels)
np.save('valid_dataset.npy', valid_dataset)
np.save('valid_labels.npy', valid_labels)

np.save('test_dataset.npy', test_dataset)
np.save('test_labels.npy', test_labels)