In [1]:
import numpy as np
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.callbacks import ModelCheckpoint, EarlyStopping


num_samples = 217636
x_shape = (100, 200, 3)
y_shape = (8,)

x_dtype = 'float32'  # Determine the appropriate dtype
y_dtype = 'float32'  # Determine the appropriate dtype

# x_memmap = np.memmap('x_dataset_head_pos.memmap', dtype=x_dtype, mode='w+', shape=(num_samples,) + x_shape)
# y_memmap = np.memmap('y_dataset_head_pos.memmap', dtype=y_dtype, mode='w+', shape=(num_samples,) + y_shape)

In [None]:
import pickle
import glob
import numpy as np

def process_and_combine_pkl_files_to_memmap(directory_path, x_memmap, y_memmap):
    current_index = 0
    
    for file_path in glob.glob(directory_path + '/*.pkl'):
        print(f"Processing file: {file_path}")
        with open(file_path, 'rb') as file:
            data = pickle.load(file)
            
        if isinstance(data, dict) and 'X' in data and 'Y' in data:
            X_data = data['X']
            Y_data = data['Y']
            
            # Flatten Y_data
            flattened_Y_data = []
            for y in Y_data:
                # Make sure to handle both cases where y[0] could be a list or a numpy array
                gaze_data = np.array(y[0], dtype=np.float32) if isinstance(y[0], list) else y[0].astype(np.float32)
                head_pose_data = y[1].astype(np.float32)
                flattened_y = np.concatenate([gaze_data, head_pose_data])
                flattened_Y_data.append(flattened_y)
                
            Y_data = np.array(flattened_Y_data, dtype=np.float32)
        
        else:
            Y_numeric = []  # Initialize an empty list to hold the processed Y data
            for y in data[1]:
                numeric_values = y[:2] + y[3:]  # Adjusted to exclude index 3
                Y_numeric.append([float(val) for val in numeric_values])  # Convert to float
            
            Y_data = np.array(Y_numeric, dtype=np.float32)  # Convert the list to a numpy array of type float32
            X_data = np.array(data[0], dtype=np.float32)  # Ensure X_data is also properly formatted

        num_samples_in_file = len(X_data)
        x_batch = np.array(X_data, dtype=x_memmap.dtype).reshape((num_samples_in_file,) + x_shape)
        
        # No need to reshape Y_data as it is already in the correct shape after flattening
        y_batch = Y_data  # It should already be in the correct shape
        
        # Ensure we do not exceed the allocated memmap size
        if current_index + num_samples_in_file > len(x_memmap):
            raise ValueError("The dataset is larger than expected.")
        
        # Write directly to the memmap files
        x_memmap[current_index:current_index + num_samples_in_file] = x_batch
        y_memmap[current_index:current_index + num_samples_in_file] = y_batch
        
        current_index += num_samples_in_file
        x_memmap.flush()
        y_memmap.flush()

In [None]:
directory_path = './process_MPIIGaze/batches_head_pos/' 
process_and_combine_pkl_files_to_memmap(directory_path, x_memmap, y_memmap)

In [2]:
import cv2
import numpy as np

def augment_image(image):
    # Random rotation between -5 and 5 degrees
    rows, cols = image.shape[0], image.shape[1]
    rotation_angle = np.random.uniform(-3, 3)
    M_rot = cv2.getRotationMatrix2D((cols / 2, rows / 2), rotation_angle, 1)
    
    # Random shift between -2% to 2% of the image size
    max_shift = max(rows, cols) * 0.02
    dx = np.random.uniform(-max_shift, max_shift)
    dy = np.random.uniform(-max_shift, max_shift)
    M_shift = np.float32([[1, 0, dx], [0, 1, dy]])
    
    # Apply the transformations
    dst = cv2.warpAffine(image, M_rot, (cols, rows))
    dst = cv2.warpAffine(dst, M_shift, (cols, rows))

    brightness_factor = np.random.uniform(0.8, 1.2)
    dst = dst * brightness_factor
    dst = np.clip(dst, 0, 255).astype(image.dtype) * brightness_factor
    return dst

In [3]:
def memmap_batch_generator(x_memmap_path, y_memmap_path, batch_size, indices, shuffle=True, augment=False):
    x_memmap = np.memmap(x_memmap_path, dtype=x_dtype, mode='r', shape=(num_samples,) + x_shape)
    y_memmap = np.memmap(y_memmap_path, dtype=y_dtype, mode='r', shape=(num_samples,) + y_shape)
    
    while True:
        if shuffle:
            np.random.shuffle(indices)
        
        for start_idx in range(0, len(indices), batch_size):
            end_idx = min(start_idx + batch_size, len(indices))
            batch_indices = indices[start_idx:end_idx]
            
            x_batch = x_memmap[batch_indices]
            if augment:
                # Apply augmentation to each image in the batch
                x_batch = np.array([augment_image(image) for image in x_batch])
            
            gaze_data = y_memmap[batch_indices, :2]  # Assuming the first 2 values are for gaze
            pose_data = y_memmap[batch_indices, 2:]  # The next 6 values for head pose
            
            # Yielding a batch of data with the correct format for multi-output
            yield x_batch, {'gaze_output': gaze_data, 'pose_output': pose_data}


In [4]:
batch_size = 32

In [5]:
# Paths to the memmap files
x_memmap_path = 'x_dataset_head_pos.memmap'
y_memmap_path = 'y_dataset_head_pos.memmap'

In [6]:
from keras.layers import Input, Flatten, Dense, Dropout
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.applications import VGG16

# Load VGG16 without the top classification layers
vgg_base = VGG16(include_top=False, weights='imagenet', input_shape=(100, 200, 3))

# Flatten the output of the convolutional base
flat1 = Flatten()(vgg_base.output)

# Common dense layers, now directly following the VGG16 output
dense1 = Dense(4096, activation='relu', kernel_regularizer=l2(0.001), kernel_initializer='he_uniform')(flat1)
dropout1 = Dropout(0.5)(dense1)

# Gaze prediction branch (remains unchanged)
gaze_dense = Dense(4096, activation='relu', kernel_initializer='he_uniform')(dropout1)
gaze_dropout = Dropout(0.5)(gaze_dense)
gaze_output = Dense(2, activation='sigmoid', name='gaze_output')(gaze_dropout)

# Head pose estimation branch (remains unchanged)
pose_dense = Dense(4096, activation='relu', kernel_initializer='he_uniform')(dropout1)
pose_dropout = Dropout(0.5)(pose_dense)
pose_output = Dense(6, activation='sigmoid', name='pose_output')(pose_dropout)

# Final model now has only one input, which is the VGG16 input
model = Model(inputs=vgg_base.input, outputs=[gaze_output, pose_output])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.00005), loss='mse', metrics=['mae'])


In [7]:
checkpoint = ModelCheckpoint('eye_gaze_v31_{epoch:02d}.h5', save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

callbacks_list = [checkpoint, early_stopping]

In [8]:
indices = np.arange(num_samples)
np.random.shuffle(indices)

# Split indices into training and validation sets
train_indices = indices[:int(0.85 * num_samples)]  # 85% for training
val_indices = indices[int(0.85 * num_samples):]  # 15% for validation

# Instantiate the generators
train_generator = memmap_batch_generator(x_memmap_path, y_memmap_path, batch_size, train_indices, shuffle=True)
validation_generator = memmap_batch_generator(x_memmap_path, y_memmap_path, batch_size, val_indices, shuffle=False)


# Calculate steps
steps_per_epoch = len(train_indices) // batch_size
validation_steps = len(val_indices) // batch_size

# Train the model
history = model.fit(
    x=train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=100,
    validation_data=validation_generator,
    validation_steps=validation_steps,
    callbacks=callbacks_list
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
   7/5780 [..............................] - ETA: 27:19 - loss: 0.0014 - gaze_output_loss: 7.2564e-04 - pose_output_loss: 4.7655e-04 - gaze_output_mae: 0.0209 - pose_output_mae: 0.0163

KeyboardInterrupt: 

In [None]:
#save the model
model.save('eye_gaze_v30.h5')