In [1]:
import numpy as np

num_samples = 274874  
x_shape = (75, 150, 3)
y_shape = (2,)

x_dtype = 'float16'  # Determine the appropriate dtype
y_dtype = 'float16'  # Determine the appropriate dtype

# x_memmap = np.memmap('x_dataset.memmap', dtype=x_dtype, mode='w+', shape=(num_samples,) + x_shape)
# y_memmap = np.memmap('y_dataset.memmap', dtype=y_dtype, mode='w+', shape=(num_samples,) + y_shape)


In [3]:
# load x_memmap and y_memmap
x_memmap = np.memmap('x_dataset.memmap', dtype=x_dtype, mode='r', shape=(num_samples,) + x_shape)
y_memmap = np.memmap('y_dataset.memmap', dtype=y_dtype, mode='r', shape=(num_samples,) + y_shape)

In [2]:
import pickle
import glob
import numpy as np

def process_and_combine_pkl_files_to_memmap(directory_path, x_memmap, y_memmap):
    current_index = 0
    
    for file_path in glob.glob(directory_path + '/*.pkl'):
        with open(file_path, 'rb') as file:
            data = pickle.load(file)
            
        # Check if data is a dictionary with 'X' and 'Y' keys
        if isinstance(data, dict) and 'X' in data and 'Y' in data:
            X_data = data['X']
            Y_data = data['Y']
        else:
            # Assume that the data is just the raw arrays for X and Y
            X_data = data[0]
            Y_data = data[1]

        num_samples_in_file = len(X_data)
        x_batch = np.array(X_data, dtype=x_memmap.dtype).reshape((num_samples_in_file,) + x_shape)
        y_batch = np.array(Y_data, dtype=y_memmap.dtype).reshape((num_samples_in_file,) + y_shape)
        
        # Ensure we do not exceed the allocated memmap size
        if current_index + num_samples_in_file > len(x_memmap):
            raise ValueError("The dataset is larger than expected.")
        
        # Write directly to the memmap files
        x_memmap[current_index:current_index + num_samples_in_file] = x_batch
        y_memmap[current_index:current_index + num_samples_in_file] = y_batch
        
        current_index += num_samples_in_file
        x_memmap.flush()
        y_memmap.flush()


In [7]:
import numpy as np
import pickle

def append_to_memmap(x_memmap, y_memmap, new_file_path):
    # Determine the current index from the existing memmap files
    current_index = np.where(~x_memmap[:].any(axis=(1, 2, 3)))[0][0]  # Finds the first all-zero sample
    
    # Load the new data
    with open(new_file_path, 'rb') as file:
        X, Y = pickle.load(file)

    Y = Y.reshape(-1, 2)

    x_new_data = np.array(X, dtype=x_memmap.dtype).reshape((-1,) + x_shape)
    y_new_data = np.array(Y, dtype=y_memmap.dtype).reshape((-1,) + y_shape)

    # Check that there is enough space in the memmap files for the new data
    if current_index + len(x_new_data) > len(x_memmap):
        raise ValueError("Not enough space in memmap files to append new data.")
    
    # Append the new data onto the end of the memmap files
    x_memmap[current_index:current_index + len(x_new_data)] = x_new_data
    y_memmap[current_index:current_index + len(y_new_data)] = y_new_data
    
    # Flush changes to disk
    x_memmap.flush()
    y_memmap.flush()
    print(f"Appended new data. New data end index is {current_index + len(x_new_data)}.")

# Usage:
new_file_path = './process_MPIIGaze/all_data_200_100.pkl'
append_to_memmap(x_memmap, y_memmap, new_file_path)


TypeError: list indices must be integers or slices, not tuple

In [3]:
directory_path = './process_MPIIGaze/' 
process_and_combine_pkl_files_to_memmap(directory_path, x_memmap, y_memmap)

In [4]:
# Check the len of memmap files
x_memmap = np.memmap('x_dataset.memmap', dtype=x_dtype, mode='r', shape=(num_samples,) + x_shape)
y_memmap = np.memmap('y_dataset.memmap', dtype=y_dtype, mode='r', shape=(num_samples,) + y_shape)

print(len(x_memmap))
print(len(y_memmap))

274874
274874


In [28]:
import cv2
import numpy as np

# Select the image
image = x_memmap[-1].astype(np.float32)

# Normalize the image based on its dynamic 

# Display the image
cv2.imshow('image', image)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [5]:
import cv2
import numpy as np

# Select the image
image = x_memmap[500].astype(np.float32)

# Normalize the image based on its dynamic 

# Display the image
cv2.imshow('image', image)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [25]:
y_memmap[-3979]

memmap([0.2389, 0.981 ], dtype=float16)

In [None]:
def memmap_batch_generator(x_memmap_path, y_memmap_path, batch_size=16, shuffle=True):
    x_memmap = x_memmap_path
    y_memmap = y_memmap_path
     
    num_samples = len(x_memmap)
    indices = np.arange(num_samples)

    while True:
        if shuffle:
            np.random.shuffle(indices)

        for start_idx in range(0, num_samples, batch_size):
            end_idx = min(start_idx + batch_size, num_samples)
            batch_indices = indices[start_idx:end_idx]

            # Yield a batch of data
            yield x_memmap[batch_indices], y_memmap[batch_indices]


In [None]:
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Dropout, Flatten
from keras.regularizers import l2
# Maxpooling
from keras.layers import MaxPooling2D
#BatchNormalization
from keras.layers import BatchNormalization

# Your model definition
model = Sequential([
    Conv2D(32, (7, 7), activation='relu', input_shape=(75, 150, 3), kernel_regularizer=l2(0.001)),
    
    
    Conv2D(64, (7, 7), activation='relu'),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Dropout(0.15),


    Conv2D(128, (5, 5), activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.15),
    MaxPooling2D((2, 2)),
    

    Conv2D(256, (3, 3), activation='relu'),


    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(2, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.00005), loss='mse', metrics=['mean_squared_error', 'mean_absolute_error'])

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)



In [None]:
batch_size = 16

In [None]:
# Assuming you have already set up your model

# Paths to your memmap files
x_memmap_path = 'x_dataset.memmap'
y_memmap_path = 'y_dataset.memmap'

# Calculate steps per epoch and validation steps
num_train_samples = int(200000 * 0.7)  # Example: 60% for training
num_val_samples = int(200000 * 0.15)   # Example: 15% for validation

steps_per_epoch = num_train_samples // batch_size
validation_steps = num_val_samples // batch_size

# Create generators
train_generator = memmap_batch_generator(x_memmap_path, y_memmap_path, batch_size, shuffle=True)
validation_generator = memmap_batch_generator(x_memmap_path, y_memmap_path, batch_size, shuffle=False)  # Assuming you can use the same for simplicity

# Train the model
model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=100,  # Adjust as needed
    validation_data=validation_generator,
    validation_steps=validation_steps,
    callbacks=[early_stopping]
)


In [None]:
# Example: Evaluate on a subset
x_test_memmap = np.memmap(x_memmap_path, dtype='float16', mode='r', shape=(num_samples) + x_shape)
y_test_memmap = np.memmap(y_memmap_path, dtype='float16', mode='r', shape=(200000, 2))

# Assuming the last 15% of the data is for testing
test_start_index = int(200000 * 0.85)
x_test = x_test_memmap[test_start_index:]
y_test = y_test_memmap[test_start_index:]

model.evaluate(x_test, y_test, batch_size=16)
