## Code for converting CIFAR-10 HDF5 (.h5) files to pickled NumPy arrays
- Not used for the tutorial, provided as a util file to process CIFAR 10 data for Keras
- One hot encodes labels
- Creates smaller dataset with only two classes, airplane and cat
- Assumes data has been downloaded and extracted to 'train.h5' and 'test.h5'

### Note
- This script assumes that train.h5 and test.h5 have been created and saved to this directory using the creatingdatasets_cifar10.py script

In [1]:
import h5py
import pickle
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [47]:
# Open the training split read-only; the trailing expression displays its
# (name, dataset) pairs as the cell output.
f1 = h5py.File('train.h5', mode='r')
[(name, dataset) for name, dataset in f1.items()]

[('data', <HDF5 dataset "data": shape (50000, 3, 32, 32), type "|u1">),
 ('label', <HDF5 dataset "label": shape (50000,), type "<i8">)]

In [48]:
# Open the test split read-only; the trailing expression displays its
# (name, dataset) pairs as the cell output.
f2 = h5py.File('test.h5', mode='r')
[(name, dataset) for name, dataset in f2.items()]

[('data', <HDF5 dataset "data": shape (10000, 3, 32, 32), type "|u1">),
 ('label', <HDF5 dataset "label": shape (10000,), type "<i8">)]

In [49]:
# Print the top-level member names of each open HDF5 file (train, then test).
for h5_file in (f1, f2):
    print(list(h5_file.keys()))

['data', 'label']
['data', 'label']


In [50]:
# Read every dataset fully into memory; indexing an h5py dataset with `[()]`
# returns its entire contents.
train_data, train_labels = f1['data'][()], f1['label'][()]
test_data, test_labels = f2['data'][()], f2['label'][()]

# Report the shape of each loaded array, in train/test and data/label order.
for template, array in (
    ("Train data shape: {}", train_data),
    ("Train labels shape: {}", train_labels),
    ("Test data shape: {}", test_data),
    ("Test labels shape: {}", test_labels),
):
    print(template.format(array.shape))

Train data shape: (50000, 3, 32, 32)
Train labels shape: (50000,)
Test data shape: (10000, 3, 32, 32)
Test labels shape: (10000,)


In [51]:
# Convert labels to one hot encoding.
# The encoder is fitted on the TRAINING labels only and then reused for the
# test labels, so both splits share a single category-to-column mapping.
# With the default handle_unknown='error', a test label never seen during
# fitting raises instead of silently producing a differently shaped encoding.
# The default (sparse) output is densified with .toarray() rather than passing
# `sparse=False`: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so this form runs on
# both old and new releases.
encoder = OneHotEncoder()
# OneHotEncoder expects a 2-D (n_samples, n_features) input, hence the column
# reshape. Later cells rely on the labels keeping this (N, 1) shape.
train_labels = train_labels.reshape(-1, 1)
print(train_labels.shape)
test_labels = test_labels.reshape(-1, 1)
print(test_labels.shape)
train_labels_one_hot = encoder.fit_transform(train_labels).toarray()
test_labels_one_hot = encoder.transform(test_labels).toarray()
print("Train labels shape: {}".format(train_labels_one_hot.shape))
print("Test labels shape: {}".format(test_labels_one_hot.shape))

(50000, 1)
(10000, 1)
Train labels shape: (50000, 10)
Test labels shape: (10000, 10)


In [52]:
# Persist the full 10-class arrays as pickle files in the working directory.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails part-way through.
# protocol=2 is kept from the original; it is the newest pickle protocol that
# Python 2 can read (presumably chosen for that compatibility - confirm).
for pickle_path, array in (
    ("CIFAR_10_train_data.pkl", train_data),
    ("CIFAR_10_train_labels.pkl", train_labels_one_hot),
    ("CIFAR_10_test_data.pkl", test_data),
    ("CIFAR_10_test_labels.pkl", test_labels_one_hot),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(array, pickle_file, protocol=2)

In [111]:
# Create subset of data with only two classes (airplane or cat).
# Label 0 is treated as airplane and label 3 as cat.
airplane = train_labels == 0
cat = train_labels == 3
airandcat_train = airplane | cat
for preview in (airplane, cat, airandcat_train):
    print(preview[:10])

# np.nonzero returns one index array per axis; element 0 holds the row
# indices of the selected samples.
airandcat_train_ix = np.nonzero(airandcat_train)[0]
print(airandcat_train_ix[:10])

# Same selection for the test split.
airandcat_test = (test_labels == 0) | (test_labels == 3)
print(airandcat_test[:10])
airandcat_test_ix = np.nonzero(airandcat_test)[0]
print(airandcat_test_ix[:10])

[[False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]]
[[False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]]
[[False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]]
[ 2  7 12 20 31 33 34 36 39 40]
[[False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [False]]
[ 5  8 12 18 21 31 37 42 48 60]


In [112]:
# Keep only the airplane/cat samples. Indexing the first axis with the integer
# row indices selects whole samples and leaves the remaining axes untouched.
train_data_2classes, train_labels_2classes = (
    train_data[airandcat_train_ix],
    train_labels[airandcat_train_ix],
)
test_data_2classes, test_labels_2classes = (
    test_data[airandcat_test_ix],
    test_labels[airandcat_test_ix],
)

In [113]:
# Sanity-check the two-class subset: for each split print the number of
# selected samples, the mask shape, the full data shape and the shapes of the
# subset data and labels (train first, then test).
for mask, data_full, data_subset, labels_subset in (
    (airandcat_train, train_data, train_data_2classes, train_labels_2classes),
    (airandcat_test, test_data, test_data_2classes, test_labels_2classes),
):
    # Vectorised count along the sample axis. For an (N, 1) mask this prints
    # the same one-element array as the builtin sum() did, without a
    # Python-level loop over every row.
    print(mask.sum(axis=0))
    print(mask.shape)
    print(data_full.shape)
    print(data_subset.shape)
    print(labels_subset.shape)
    # The subset must contain exactly the rows flagged by the mask.
    assert int(mask.sum()) == data_subset.shape[0] == labels_subset.shape[0]

[10000]
(50000, 1)
(50000, 3, 32, 32)
(10000, 3, 32, 32)
(10000, 1)
[2000]
(10000, 1)
(10000, 3, 32, 32)
(2000, 3, 32, 32)
(2000, 1)


In [116]:
# Convert labels to one hot encoding.
# The encoder is fitted on the TRAINING subset only and then reused for the
# test subset, so the two remaining label values map to the same columns in
# both splits. With the default handle_unknown='error', a test label never
# seen during fitting raises instead of silently changing the encoding.
# The default (sparse) output is densified with .toarray() rather than passing
# `sparse=False`: that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so this form runs on
# both old and new releases.
encoder = OneHotEncoder()
# OneHotEncoder expects a 2-D (n_samples, n_features) input, hence the column
# reshape (a no-op if the labels are already (N, 1)).
train_labels_2classes = train_labels_2classes.reshape(-1, 1)
print(train_labels_2classes.shape)
test_labels_2classes = test_labels_2classes.reshape(-1, 1)
print(test_labels_2classes.shape)
train_labels_2classes_one_hot = encoder.fit_transform(train_labels_2classes).toarray()
test_labels_2classes_one_hot = encoder.transform(test_labels_2classes).toarray()
print("Train 2 classes labels shape: {}".format(train_labels_2classes_one_hot.shape))
print("Test 2 classes labels shape: {}".format(test_labels_2classes_one_hot.shape))

(10000, 1)
(2000, 1)
Train 2 classes labels shape: (10000, 2)
Test 2 classes labels shape: (2000, 2)


In [117]:
# Persist the two-class (airplane/cat) arrays as pickle files in the working
# directory. Each file is opened in a `with` block so the handle is flushed
# and closed even if pickling fails part-way through.
# protocol=2 is kept from the original; it is the newest pickle protocol that
# Python 2 can read (presumably chosen for that compatibility - confirm).
for pickle_path, array in (
    ("CIFAR_2_train_data.pkl", train_data_2classes),
    ("CIFAR_2_train_labels.pkl", train_labels_2classes_one_hot),
    ("CIFAR_2_test_data.pkl", test_data_2classes),
    ("CIFAR_2_test_labels.pkl", test_labels_2classes_one_hot),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(array, pickle_file, protocol=2)