### Useful for datasets which don't fit in RAM
### Generate each batch and process it
### Can also do real time augmentation

## NOTE: Brandon used ImageDataGenerator, which is a more high level way of doing fit_generator (i.e. you need to code less)
## This code uses fit_generator, which gives you more control, but is a bit harder to code

In [10]:
import numpy as np
import glob
from keras.utils import Sequence
from keras.applications.resnet50 import ResNet50
from PIL import Image

In [11]:
def process_image(imagefile):
    im = Image.open(imagefile)
    im = im.convert(mode = 'L')
    im = im.resize((350,350))
    # Add augmentation if you want
    return np.asarray(im).reshape(350,350,1)

In [12]:
class BatchSampler(Sequence):
    '''
    See https://keras.io/utils/
    Sequences are a safer way of doing generators
    '''
    
    def __init__(self, data_dir, label_file, batch_size):
        '''
        data_dir: train_image directory
        label_file: train.csv
        '''
        self.batch_size = batch_size
        self.image_files = glob.glob(data_dir+'/*.jpg') #get all files in data directory
        self.labels_dict = {}
        self.label2id = {}
        
        # get all labels
        lines = open(label_file).read().splitlines()
        labels = []
        for line in lines:
            name,label = line.split(',')
            self.labels_dict[name] = label
            labels.append(label)
        
        # create integer labels for each emotion
        count=0
        for l in set(labels):
            self.label2id[l] = count
            count+=1
        
    def __len__(self):
        '''
        number of batches in 1 epoch
        '''
        return int(np.ceil(len(self.image_files) / float(self.batch_size)))

    def __getitem__(self, idx):
        '''
        This is like the generator
        Outputs a batch of samples
        '''
        batch_x = self.image_files[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = []
        for file in batch_x:
            file_name = file.split('/')[-1] #this isolates only the filename
            label = self.labels_dict[file_name] 
            batch_y.append(self.label2id[label])
        return np.array([process_image(file) for file in batch_x]), np.array(batch_y)

In [13]:
bs = BatchSampler('./train_image', './train.csv', 32)
print(len(bs.image_files))
print(bs.label2id)
print(len(bs))

12993
{'neutral': 0, 'fear': 1, 'anger': 2, 'happiness': 3, 'contempt': 4, 'surprise': 5, 'disgust': 6, 'sadness': 7}
407


### Create Keras model

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
model.fit_generator(bs, steps_per_epoch=len(bs), epochs=10)