In [0]:
# mount the drive containing the dataset
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# unzip the gaze history files
!unzip drive/My\ Drive/PFE/Gaze_txt_files

In [0]:
# create gaze dataset
import numpy as np
import os
from sklearn import preprocessing
import random


raw_data_path = "Gaze_txt_files"
dir_list = sorted(os.listdir(raw_data_path))
p_num = len(dir_list)
p_data_list = [{} for _ in range(p_num)]
# fill p_data
for dir_name in dir_list:
  person_gaze_txt_path = os.path.join(raw_data_path, dir_name)
  pid = int(dir_name.replace('p', ''))
  for video_name in os.listdir(person_gaze_txt_path):
    with open(os.path.join(person_gaze_txt_path,video_name), 'r') as f:
      trajectory_list = f.read().split()
      gaze_video_list = []
      f = 0
      for i in trajectory_list:
        # add only gazes for the sampled frames
        if f%5==0:
          i = i.split(",")
          x = float(i[-2])
          y = float(i[-1])
          gaze_video_list.append((x,y))
        f+=1
      p_data_list[int(pid)-1][int(video_name.split(".")[0])] = np.array(gaze_video_list)

In [0]:
# show some message
print('number of viewers: ', len(p_data_list))
print('number of videos for the first viewer: ', len(p_data_list[1]))
print('number of frames in the first video of first viewer (video 179): ', len(p_data_list[1][179]))
print('maximal value in video gazes : ', p_data_list[1][179].max())
print('minimal value in video gazes : ', p_data_list[1][179].min())

number of viewers:  45
number of videos for the first viewer:  36
number of frames in the first video of first viewer (video 179):  296
maximal value in video gazes :  0.9594789
minimal value in video gazes :  0.008680457


In [0]:
# get all videos's gazes and store them in a list
videos_frames_list = np.array([np.array([j, p_data_list[i][j]]) for i in range(len(p_data_list)) for j in p_data_list[i].keys()])

In [0]:
# show some information
print("number of total gazes history files: ", len(videos_frames_list))
for i in range(1, 100, 11):
  print("number of viewers who watched video",i,": ", np.sum(videos_frames_list[:,0]==i))

number of total gazes history files:  6654
number of viewers who watched video 1 :  33
number of viewers who watched video 12 :  33
number of viewers who watched video 23 :  33
number of viewers who watched video 34 :  33
number of viewers who watched video 45 :  30
number of viewers who watched video 56 :  30
number of viewers who watched video 67 :  30
number of viewers who watched video 78 :  34
number of viewers who watched video 89 :  34


In [0]:
def get_sampled_frames(gazes_history,f_i=5, step=5):
  if f_i - step + 1 < 0 or f_i + step + 1 > len(gazes_history[1]):
    return [], []
  # use the history gaze path in the first step frames to predict the gaze points in next step frames
  frames_X = np.array(gazes_history[1][f_i-step+1:f_i+1])
  frames_Y = np.array(gazes_history[1][f_i+1:f_i+step+1]) - frames_X
  return frames_X, frames_Y

In [0]:
# get the excel file containing the video names used in the training and testing
!cp drive/My\ Drive/PFE/train_test_set.xlsx .

In [0]:
# load train_test videos indices from the excel file 
import pandas as pd
train_ids = pd.read_excel("train_test_set.xlsx",header=None, sheet_name='train_set').to_numpy().flatten()
test_ids = pd.read_excel("train_test_set.xlsx",header=None, sheet_name='test_set').to_numpy().flatten()

In [0]:
def generate_dataset(train_ids, test_ids, videos_frames_list):
  train_x = []
  train_y = []
  test_x = []
  test_y = []
  # create the dataset of training
  for i in train_ids:
    video_all_viewers = videos_frames_list[videos_frames_list[:,0]==i]
    for view in video_all_viewers:
      for frame in range(4, len(view[1])-5):
        sampled_frames = get_sampled_frames(view, frame)
        train_x.append(sampled_frames[0])
        train_y.append(sampled_frames[1])
  # create the dataset of testing
  for i in test_ids:
    video_all_viewers = videos_frames_list[videos_frames_list[:,0]==i]
    # a video can have many viewers
    for view in video_all_viewers:
      # get a gaze trace for each 5 frames
      for frame in range(4, len(view[1])-5):
        sampled_frames = get_sampled_frames(view, frame)
        test_x.append(sampled_frames[0])
        test_y.append(sampled_frames[1])
  return np.array(train_x), np.array(train_y), np.array(test_x), np.array(test_y)

In [0]:
train_x, train_y, test_x, test_y = generate_dataset(train_ids,
                                                    test_ids,
                                                    videos_frames_list)

In [0]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Lambda, concatenate, Reshape, Add
from keras.applications.inception_resnet_v2 import InceptionResNetV2

from keras import backend as K

"""
The trajectory encoder module
"""
# Define an input sequence of previous gazes and process it.
encoder_inputs = Input(shape=(None, 2))
# Define the first lstm layer
lstm1 = LSTM(128, return_sequences=True, return_state=True)
# Define the second lstm layer
lstm2 = LSTM(128)

"""
The Saliency encoder module
"""
# Define input for all spatial and temporal saliency maps and process it.
saliency_inputs = Input(shape=(5, 480*6, 960, 3))
# get  Inception-ResNet-V2 to extract saliency features for gaze prediction followed with a global pooling
inception = InceptionResNetV2(weights='imagenet', include_top=False, input_shape=(480*6, 960, 3), pooling='avg')
#saliency_encoder_outputs

all_displacement_outputs=[]
traj_inputs = encoder_inputs

for i in range(5):
  # apply the first lstm on the trajectory history of the 5 gazes
  lstm1_outputs, state_h, state_c = lstm1(traj_inputs)
  lstm1_states = [state_h, state_c]
  # Set up the second lstm, using 'lstm1_states' as initial state.
  trajectory_encoder_outputs = lstm2(lstm1_outputs,
                                  initial_state=lstm1_states)
  # get only the current saliency input at frame t
  current_saliency_input = Lambda(lambda x: x[:,i])(saliency_inputs)
  # apply inception on the current saliency input
  current_saliency_inception = inception(current_saliency_input)
  """
  Displacement Prediction Module
  """
  # Concatenate s the output of saliency encoder module and trajectory encoder module
  merged = concatenate([current_saliency_inception, trajectory_encoder_outputs])
  # use two fully connected layer to estimate the displacement between the gaze point at time t + 1 and gaze point at time t
  dense_1000 = Dense(1000)(merged)
  displacement_output = Dense(2, activation='sigmoid')(dense_1000)
  # get the current gaze positions at frame t to add to the predicted gaze displacement
  current_gaze_t = Lambda(lambda x: x[:,-1])(traj_inputs)
  traj_inputs = concatenate([traj_inputs, Reshape((1,2))(Add()([current_gaze_t, displacement_output]))], 1)
  # append the current predicted gaze point at t+1 to the list of 5 predicted outputs
  all_displacement_outputs.append(displacement_output)
# concatenate all the element of the output list to get the final output
displacement_outputs = Reshape((5,2))(concatenate(all_displacement_outputs,1))
# Create the proposed model
model = Model([encoder_inputs, saliency_inputs], displacement_outputs)

In [2]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 5, 2880, 960, 0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, None, 2)      0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 2880, 960, 3) 0           input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 128),  67072       input_1[0][0]                    
                                                                 concatenate_2[0][0]        

In [0]:
# Compile the model
from keras.optimizers import SGD
sgd = SGD(lr=0.1, decay=5e-4, momentum=0.9)
model.compile(optimizer=sgd, loss='mean_squared_error')

In [0]:
hist = model.fit(train_x, train_y, validation_split=0.10, batch_size=512, epochs=20)

Train on 783552 samples, validate on 87062 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

In [0]:
test_y[10:11]

array([[[0.5016659, 0.4817111],
        [0.5002428, 0.4873397],
        [0.4950311, 0.4852373],
        [0.4498359, 0.4880946],
        [0.4475695, 0.4842822]]])

In [0]:
model.predict(test_x[10:11])

array([[[0.5145015 , 0.54397964],
        [0.5351348 , 0.541896  ],
        [0.5416983 , 0.53524816],
        [0.54063916, 0.53550714],
        [0.5448361 , 0.52955645]]], dtype=float32)

In [0]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, concatenate
from keras.applications.inception_resnet_v2 import InceptionResNetV2

"""
The trajectory encoder module
"""
# Define an input sequence of previous gazes and process it.
encoder_inputs = Input(shape=(None, 2))
# Define the first lstm layer
lstm1 = LSTM(128, return_sequences=True, return_state=True)
lstm1_outputs, state_h, state_c = lstm1(encoder_inputs)
lstm1_states = [state_h, state_c]
# Define the second lstm layer
lstm2 = LSTM(128)
# Set up the second lstm, using `lstm1_states` as initial state.
trajectory_encoder_outputs = lstm2(lstm1_outputs,
                                initial_state=lstm1_states)
"""
The Saliency encoder module
"""
# Define input for all spatial and temporal saliency maps and process it.
saliency_inputs = Input(shape=(480*6, 960, 3))
# Use  Inception-ResNet-V2 to extract saliency features for gaze prediction followed with a global pooling
saliency_encoder_outputs = InceptionResNetV2(weights='imagenet', include_top=False, pooling='avg')(saliency_inputs)
"""
Displacement Prediction Module
"""
# Concatenate s the output of saliency encoder module and trajectory encoder module
merged = concatenate([saliency_encoder_outputs, trajectory_encoder_outputs])
# use two fully connected layer to estimate the displacement between the gaze point at time t + 1 and gaze point at time t:
x = Dense(1000)(merged)
displacement_outputs = Dense(2)(x)

# Create the proposed model
model = Model([encoder_inputs, saliency_inputs], displacement_outputs)

Using TensorFlow backend.















Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.7/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [0]:
# here is a summary of the global architecture
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, None, 2)      0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 3840, 960, 3) 0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 128),  67072       input_11[0][0]                   
__________________________________________________________________________________________________
inception_resnet_v2 (Model)     (None, 1536)         54336736    input_12[0][0]                   
____________________________________________________________________________________________

In [0]:
# Compile the model
from keras.optimizers import SGD
sgd = SGD(lr=0.1, decay=5e-4, momentum=0.9)
model.compile(optimizer=sgd, loss='categorical_crossentropy')



