In [1]:
from PIL import Image

import os
import urllib2
import csv
import numpy as np

In [2]:
# Maximum number of images (and therefore labels) to collect; the download
# loop stops as soon as this many images have been saved.
total_items = 15000

In [3]:
# Maps each label string to the integer index assigned to it, in the order
# the labels are first seen.
type_index_dict = dict()

# Anomalies in the data, recorded as 0-based CSV data-row numbers (the header
# line is not counted), so every bad row can be located in the sheet:
#   anom_image - rows whose image url is missing/malformed or whose image
#                could not be downloaded or decoded
#   anom_type  - rows whose label is missing
anom_image = list()
anom_type = list()

# Integer label of every saved image (computers are better with numeric
# values); entry i belongs to img<i>.png. Sized from total_items so the array
# and the loop limit cannot drift apart.
type_labels = np.zeros((total_items,), dtype=int)

# type_index: next unused label index.
# count: number of images saved so far, which is also the id of the next image.
type_index = 0
count = 0

# Invariant: for every row, either both the image and its label are stored,
# or neither of them is.
with open('Footwear Dataset - Sheet1.csv', 'rb') as csvfile:
    reader = csv.DictReader(csvfile)
    for row_number, row in enumerate(reader):
        # stop once the label array is full (avoids writing out of bounds)
        if count >= total_items:
            break

        im_url = row['thumbnail']
        im_type = row['Type']

        # separating rows with a missing or malformed url (RARE)
        if not isinstance(im_url, str) or not im_url.startswith('http'):
            anom_image.append(row_number)
            continue

        # separating rows with no label (FREQUENT)
        if not isinstance(im_type, str) or im_type == '':
            anom_type.append(row_number)
            continue

        try:
            response = urllib2.urlopen(im_url)
            try:
                im = Image.open(response)
                # Image.open only identifies the file; force the pixel data to
                # be read now so that a truncated or corrupt download is
                # caught here instead of aborting the run inside im.save().
                im.load()
            finally:
                response.close()
        except Exception:
            # download or decoding failed: record the row and move on
            anom_image.append(row_number)
            continue

        # Save the image first; the label is registered only afterwards, so a
        # failing save cannot leave a label behind without its image.
        path = './data/%s' % im_type
        if not os.path.exists(path):
            os.makedirs(path)
        im.save(path + '/img%d.png' % count)

        # first occurrence of a label gets the next free index
        if im_type not in type_index_dict:
            type_index_dict[im_type] = type_index
            type_index += 1
        type_labels[count] = type_index_dict[im_type]

        count += 1
        
    

In [5]:
# Convert the anomaly lists to numpy arrays so they can be saved to disk.
# dtype=int is passed explicitly: the lists hold integer indices, and without
# it an empty list would be converted to a float64 array.
anom_image = np.asarray(anom_image, dtype=int)
anom_type = np.asarray(anom_type, dtype=int)

### Saving only final labels

In [32]:
# Keep only the labels of images that were actually saved. `count` is the
# number of saved images, so slicing with it replaces the hard-coded size,
# which padded the result with spurious label-0 entries whenever fewer images
# were saved. The float dtype of the original array is kept so the saved file
# has the same dtype as before; astype() also returns an independent copy.
final_labels = type_labels[:count].astype(float)

### Only 9338 images were found with correct labels

In [33]:
# Write every computed object to its own .npy file in the working directory.
# NOTE: type_index_dict is a plain dict, not an array, so np.save stores it
# via pickle; it has to be read back with
# np.load('type_index_dict.npy', allow_pickle=True).item()
for _filename, _payload in (
    ('bad_image_url.npy', anom_image),
    ('bad_label.npy', anom_type),
    ('type_index_dict.npy', type_index_dict),
    ('final_labels.npy', final_labels),
):
    np.save(_filename, _payload)