# Attention Tracking with ROI

This module implements an attention tracking system using state of the art face detection algorithms (OpenCV and Dlib shape predictor class). This notebook requires proper installation  these library dependencies.

The is sectioned into:

* Head pose CNN Classifier (using a pretrained VGG16 model)
* Physical environment parameters

* Face detector: Dlib face shape detector
* Eye detector: iris tracking reference()

The above functions are exposed to our simulating environment which we refer to in the module named ().




In [30]:
import numpy as np
import pandas as pd
import cv2
import imutils
import imutils.video
from imutils.video import VideoStream
from imutils.video import FPS
from imutils import face_utils
import dlib
import time
import numpy.linalg as LA
import os, os.path, sys
import matplotlib.pyplot as plt

In [3]:
import keras
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, Reshape
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [None]:
import re

train_data = []
data = "../data/clean_data/"
train_dir = "../data/tf_data/"
clean_dups = r"[A-Z]+_[0-9]+"
count = 0 

for i in os.listdir(train_dir):
    m = re.match(clean_dups, i)
    if m is not None:
        match = m.group(0)
        print(match)
        if i not in train_data and match != None:
            train_data.append(i)
            img = cv2.imread(train_dir+i)
            cv2.imwrite(os.path.join(data,str(count+1)+".jpg"), img)
            count+=1
            if count ==50 : break
    
size = len(os.listdir(data))
print(size)

In [8]:
# This function is a spiral function that returns an array of tuple pairs
# used for generating our mouse position (x,y) values
def spiral(X, Y):
    pairs = []
    x = y = 0
    dx = 0
    dy = -5
    for i in range(max(X, Y)**2):
        if (-X/2 < x <= X/2) and (-Y/2 < y <= Y/2):
            pairs.append((x, y))
        if x == y or (x < 0 and x == -y) or (x > 0 and x == 5-y):
            dx, dy = -dy, dx
        x, y = x+dx, y+dy
    return pairs

In [12]:
import pygame, sys, math

#Here, we initialize a pygame environment to compute parameters for our game loop.

pygame.init()
W,H = 800, 800;
x, y = W//2, H//2
radius = 40
color = (255,0,0)

points = spiral(W, H)
screen = pygame.display.set_mode((W, H))
count = 0
run = True
point = iter(points)

while run :
    #dt = clock.tick()/100
    pygame.time.delay(3)
    for event in pygame.event.get():
        if event.type == pygame.QUIT: run = False
        #if event.type == MOUSEBUTTONUP: None
    screen.fill((0,0,0))
    pygame.draw.circle(screen, color, (x, y), radius)
    
    x, y = next(point)
    x, y = x+W//2, y+H//2
    
    pygame.display.update()
    if count == 50:
        break

KeyboardInterrupt: 

In [None]:
import boto3
session = boto3.Session(profile_name='default')                        
client = session.client('rekognition')

W, H = 450,450

def detect_labels_local_file(path):
    # @params: input - path containing all face images
    # @params: output- This function returns the a list of tuples holding the roll, 
    #                  pitch and yaw values of face images in the input path.
    
    label_arr = []
    
    #iterate over n images in input path
    for i in os.listdir(path):
        if i.endswith('.jpg'):
            old_photo = os.path.join(path,i)
            with open(old_photo, 'rb') as image:
                
                #initiate our aws API service for each image
                response = client.detect_faces(Image={'Bytes': image.read()})
                label = response['FaceDetails']
                
                #check for a single face in each image
                if len(label) == 1:
                    for label in label:
                        pose_labels = (int(label['Pose']['Roll']),
                                       int(label['Pose']['Yaw']),
                                       int(label['Pose']['Pitch']))
                        
                        x1, y1, w, h = (int(label['BoundingBox']['Left']*W), 
                                        int(label['BoundingBox']['Top']*H), 
                                        int(label['BoundingBox']['Width']*W),
                                        int(label['BoundingBox']['Height']*H))    
                        
                        #convert boundingbox format
                        x1, y1, x2, y2 = x1, y1, (x1+w), (y1+h)
                        label_arr.extend([i, pose_labels])
                        cropped_photo = cv2.imread(old_photo)
                        
                        #using our boundingbox, crop face of each image
                        new_photo = cropped_photo[y:y2, x:x2]
                        
                        #save our images into the same path as our input images
                        cv2.imwrite(old_photo, new_photo)
                        print(pose_labels)
                else:
                    os.remove(old_photo)
    return label_arr
print(detect_labels_local_file(data))

In [None]:
VGGmodel = keras.applications.VGG16(input_shape=(224, 224, 3),
                                 include_top=False,
                                 weights='imagenet')
VGGmodel.trainable = False

def create_model(model_type):
    model = Sequential()
    model.add(Convolution2D(64, kernel_size=(3,3), activation='relu'))
    model.add(BatchNormalization())
    #model.add(MaxPooling2D()4
    
    #model.add(Reshape(()))
    model.add(Convolution2D(64, kernel_size=(1,1), activation='relu'))
    model.add(BatchNormalization())
    #model.add(MaxPooling2D())
    
    model.add(Convolution2D(128, kernel_size=(1,1), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D())
    
    model.add(Flatten())
    
    if model_type == "head_pose_model":
        model.add(Dense(3))
        pass
    elif model_type == "eye_gaze_model":
        model.add(Dense(2))
    
    return model

hp_model = create_model('head_pose_model')
hp_model = Sequential([VGGmodel,hp_model])

#eg_model = Sequential([VGGmodel,eg_model])
hp_model.summary()

In [None]:
batch_size = 32
img_height = 100
img_width = 100
epochs = 20
size = len(train_data)

labels = pd.read_csv('../data/tf_label/labels.csv')

train_gen = ImageDataGenerator(rescale=1./255)
valid_gen = ImageDataGenerator(resclae=1./255)

train_gen = train_gen.flow_from_directory(batch_size=batch_size,
                                            directory=train_dir,
                                            classes=None,
                                            shuffle=True,
                                            target_size=(img_width, img_width),
                                            class_mode='multi_output')
def getLabel(x,y):
    y = np.array([])
    for i in x:
        y = y.append(labels[i])
    return y
    
def Generator(train_gen, getLabels):
    for x, y in train_gen:
        yield x, getLabels(x,y)
        
gen = Generator(train_gen, getLabels)

model.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=keras.optimizers.RMSprop(),
              metrics=['accuracy'])

history = model.fit_generator(
                    gen, 
                    batch_size,
                    epochs,
                    validation_split=0.2,
                    steps_per_epoch=(len(train_data)//batch_size))


test_scores = model.evaluate(x_test, y_test, verbose=2)

print('Test loss:', test_scores[0])
print('Test accuracy:', test_scores[1])

In [None]:
# We define our eye aspect ratio below where 0=< ratio =< 1 for closed to opened respectively
def Ear(eye):
    # compute the euclidean distances between the two sets of
    # vertical eye landmarks (x, y)-coordinates
    A = LA.norm(eye[1] - eye[5])
    B = LA.norm(eye[2] - eye[4])

    # compute the euclidean distance between the horizontal
    # eye landmark (x, y)-coordinates
    C = LA.norm(eye[0] - eye[3])

    # compute the eye aspect ratio
    ratio = (A + B) / (2.0 * C)

    # return the eye aspect ratio
    return ratio

In [1]:
# Each classroom object is referenced to the World Coordinate of the environment(classroom C)
#floor_plane = 

In [2]:
# We use the function below to get the detection plane by 
# selecting the corners of the 

import pygame.mouse
import sys
import numpy as np
import cv2

color = (0,255,0)
radius = 3
filename = '../data/plane.jpg'
scale = 0.2
W, H = cv2.imread(filename).shape[:2]
W, H = int(W*scale), int(H*scale)

def getCorners():
    # Get shape of window from size of snapshot save in directory
    #W, H = cv2.imread(snapshot).shape[:2]
    pygame.init()
    points = []
    screen = pygame.display.set_mode((H,W))
    pygame.display.set_caption('Detect plane\'s four corners')
    background = pygame.image.load(filename).convert()
    background = pygame.transform.rotozoom(background,-90,scale)
    run = True
    screen.blit(background,[0,0])
    pygame.display.update()
    while run:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:  run = False
            if event.type == pygame.MOUSEBUTTONDOWN:
                pos = pygame.mouse.get_pos()
                points.append(pos)
                for i in points:
                    pygame.draw.circle(screen, color, i, radius)
                pygame.display.update()
                if len(points) >= 4: run = False
    pygame.quit()
    exit()
    return points

def _orderPoints_():
    points = np.array(getCorners()).reshape(4,2)
    OP = np.zeros(points.shape[:2])

    top_left = np.argmin(np.sum(points, axis=1))
    top_right = np.argmin(np.diff(points, axis=1))
    bottonw_right = np.argmax(np.sum(points, axis=1))
    bottonw_left = np.argmax(np.diff(points, axis=1))
    
    OP[0] = points[top_left]
    OP[1] = points[top_right]
    OP[2] = points[bottonw_right]
    OP[3] = points[bottonw_left]
    
    
    def findIntersection(x1,y1,x2,y2,x3,y3,x4,y4):
        px= ( (x1*y2-y1*x2)*(x3-x4)-(x1-x2)*(x3*y4-y3*x4) ) / ( (x1-x2)*(y3-y4)-(y1-y2)*(x3-x4) ) 
        py= ( (x1*y2-y1*x2)*(y3-y4)-(y1-y2)*(x3*y4-y3*x4) ) / ( (x1-x2)*(y3-y4)-(y1-y2)*(x3-x4) )
        return (px, py)
    
    # We compute Vxy as the vanishing point on the ground plane
    Vp = findIntersection(OP[3][0],OP[3][1],OP[0][0],OP[0][1],OP[2][0],OP[2][1],OP[1][0],OP[1][1])
    
    return Vp, tuple(OP)
_orderPoints_()

((266.61531235321746, 416.782527007985),
 (array([211., 501.]),
  array([342., 502.]),
  array([480., 658.]),
  array([106., 660.])))

In [3]:
import cv2
import numpy as np

In [6]:
from collections import namedtuple, OrderedDict

class Classroom:
    __slot__ = ['x', 'y', 'z']
    
    def __init__(self, width, height, depth, offset):

        self.width = width
        self.height = height
        self.depth = depth
        
        # We define offset as the depth distance between image plane and 
        # the ground plane
        self.offset = offset
        
        self.x = width//2
        self.y = height//2
        self.z = 0
        
        detection_plane = _orderPoints_()
        
        self.ROI = None
        
        self.objects =  orderedDict()
        
        #tracker = cv2.MultiTracker_create('CSRT')
        
    def get_object(self, obj):
        #self.objects['obj'] = 
        
        return 
    
    def get_HG(self):
        class_plane = np.array([[0,0],
                        [self.width-1, 0],
                        [self.width-1, self.depth-self.offset-1],
                        [0, self.depth-self.offset-1]],dtype='int32')
        H, _ = cv2.findHomography(self.detection_plane, 
                            class_plane, method=0, 
                            ransacReprojThreshold=3.0, 
                            status=None)
        return H
        
    def addBoard(self, board_obj, position):
        if len(position) == 2:
            self.ROI = (position[0] + board_obj.cx, position[1]+ board_obj.cy)
        else:
            self.ROI = (self.x, self.y)
            
        print('{} Board added at {}'.format(board_obj, self.ROI))
        #self.objects['board_obj'] = (position[0], position[1], self.Point.z)
        return self.ROI
    
class Board:
    def __init__(self, width, height):
        self.height = height
        self.width = width
        self.depth = 0
        self.cx, self.cy = width//2, height//2


In [7]:
def main():
    c_w, c_h, c_d, offset = (400, 150, 300, 50)
    b_w, b_h = (200,100)
    
    csci595 = Classroom(c_w, c_h, c_d, offset)
    board = Board(b_w, b_h)
    
    position = ((c_w-b_w)//2,(c_h-b_h)//2)
    
    csci595.addBoard(board, position)
    
    csci595.HG()
    
    return 0

if __name__ ==main():
    main()

TypeError: 'tuple' object does not support item assignment

In [41]:

class Face:
    def __init__(self):
        self.x = x
        self.y = y
        self.w = w
        self.h = h
    
        #X,Y,Z = getHomography(, Z=0) #move to classroom class
        self.face_gaze = 
        Gaze_total = 
        
    def getPose(self):
        #model_f.run()
        return theta_x, theta_y, theta_z
    
    def show_attr(self):
        print(self.x, self.y, self.w, self.h, theta_x, theta_y)

#We create our eye class which contains eye properties
class Eyes(Face):
    def __init__(self):
        super().__init__()
        eye_aspect_ratio = EAR(self)
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        theta_x, theta_y = getPose(self)
        
    def getPose(self):
        #model_e.run(img[])
        return theta_x, theta_y

SyntaxError: invalid syntax (<ipython-input-41-f76bfaca5ddf>, line 9)

# Let's create our 3D environment using pygame to view

In [4]:
import pygame, sys, math

#Here, we initialize a pygame environment to compute parameters for our game loop.

pygame.init()

W,H = 300,300;
cx,cy = W//2,H//2
screen = pygame.display.set_mode((W, H))
w,h = 100,100

clock = pygame.time.Clock()
verts = (-1,-1,-1),(1,-1,-1),(-1,1,-1),(-1,-1,1),(1,1,-1),(1,-1,1),(-1,1,1),(1,1,1)
edges = (0,1),(1,2),(2,3),(3,0),(4,5),(5,6),(6,7),(7,4),(0,4),(3,7),(2,6),(1,5)
#img = "../data/pygame_ball.jpeg"
#ball = pygame.image.load(img)
#ballrect = ball.get_rect()

while True :
    dt = clock.tick()/100
    for event in pygame.event.get():
        if event.type == pygame.QUIT: False
    screen.fill((255,255,255))
    pygame.draw.rect(screen, color, (cx,cy, w, h))
    pygame.display.update()
    """for x, y, z in verts:
        z+=5
        f = 200/z
        x,y = x*f,y*f
        pygame.draw.circle(screen,(0,0,0),(cx+int(x),cy+int(y)),5)
        
    for edge in edges:
        points = []
        for x,y,z in (verts[edge[0]],verts[edge[1]]):
            z+=5
            f = 200/z
            x,y = x*f, y*f
            points+=[cx+int(x), cy+int(y)] 
        pygame.draw.circle(screen,(0,0,0), points[0],points[1],1)"""
    
    pygame.display.flip()
pygame.quit()



NameError: name 'color' is not defined

In [None]:
"""class ResnetIdentityBlock(tf.keras.Model):
  def __init__(self, kernel_size, filters):vc
    super(ResnetIdentityBlock, self).__init__(name='')
    filters1, filters2, filters3 = filters

    self.conv2a = tf.keras.layers.Conv2D(filters1, (1, 1))
    self.bn2a = tf.keras.layers.BatchNormalization()

    self.conv2b = tf.keras.layers.Conv2D(filters2, kernel_size, padding='same')
    self.bn2b = tf.keras.layers.BatchNormalization()

    self.conv2c = tf.keras.layers.Conv2D(filters3, (1, 1))
    self.bn2c = tf.keras.layers.BatchNormalization()

  def call(self, input_tensor, training=False):
    x = self.conv2a(input_tensor)
    x = self.bn2a(x, training=training)
    x = tf.nn.relu(x)

    x = self.conv2b(x)
    x = self.bn2b(x, training=training)
    x = tf.nn.relu(x)

    x = self.conv2c(x)
    x = self.bn2c(x, training=training)

    x += input_tensor
    return tf.nn.relu(x)


block = ResnetIdentityBlock(1, [1, 2, 3])"""

In [None]:
help(cv2.Tracker)
tracker = cv2.Tracker(img, bbox)

In [None]:
# Instantiate frontal face detector and predictor
face_detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("../iris-tracker-by-contours/shape_predictor_68_face_landmarks.dat")

#Using dlib face shape predictor, we extract the index for the left and right eyes 
lEye_l,lEye_r = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
rEye_l,rEye_r  = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]

In [None]:
# This cell runs on every frame of the stream which runs an 
# object tracker and a face detector at next frame under 
# certain conditions and constrainst as described below.

cam = cv2.VideoCapture(0)
while True:
    
    _,frame = cam.read()
    roi = frame
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    
    # detect dlib face rectangles
    faces = face_detector(gray, 0)
    
    for k, face in enumerate(faces):
        
        eyes = []
        
        # convert dlib rect to a bounding box
        x,y,w,h = face_utils.rect_to_bb(face)
        
        # print(x,y,w,h)
        xc,yc = int(x+w/2),int(y+h/2)
        radius = int((w**2+h**2)**0.5)
        
        cv2.circle(frame,(xc,yc), radius, (255,0,0), thickness=1)
        
        # We c 
        
        _face = predictor(gray, face)
        _face = face_utils.shape_to_np(_face)
        print(_face)
        
        # Eye indexes
        leftEye = _face[lEye_l:lEye_r]
        #leftEye_mid = LA.norm(eye[0], eye[3],1)
        #rightEye_mid = LA.norm(eye[0], eye[3],1)
        rightEye = _face[rEye_l:rEye_r]

        Eye1 = Eye(leftEye)
        Eye2 = Eye(rightEye)
        
        eyes.append(Eye1)  #wrap in a list
        eyes.append(Eye2)
        

        # loop through both eyes
        for index, eye in enumerate(eyes):

            eye_EAR = Ear(eye)
            single_eye_state = []  # first entry is eye index, second entry is the closed or eye direction state

            left_side_eye = eye[0]  # left edge of eye
            right_side_eye = eye[3]  # right edge of eye
            top_side_eye = eye[1]  # top side of eye
            bottom_side_eye = eye[4]  # bottom side of eye

            # calculate height and width of dlib eye keypoints
            eye_width = right_side_eye[0] - left_side_eye[0]
            eye_height = bottom_side_eye[1] - top_side_eye[1]

            # create bounding box with buffer around keypoints
            eye_x1 = int(left_side_eye[0] - 0 * eye_width)  # .25 works well too
            eye_x2 = int(right_side_eye[0] + 0 * eye_height)  # .75 works well too

            eye_y1 = int(top_side_eye[1] - 1 * eye_height)
            eye_y2 = int(bottom_side_eye[1] + 1 * eye_height)

            # draw bounding box around eye roi
            cv2.rectangle(frame,(eye_x1, eye_y1), (eye_x2, eye_y2),(0,255,0),2)

            # draw the circles for the eye landmarks
            for i in eye:
                cv2.circle(frame, tuple(i), 3, (0, 0, 255), -1)
                
            # d=10920.0/float(w)

            roi = frame[eye_y1:eye_y2,eye_x1:eye_x2]

            #  ---------    check if eyes open   -------------  #

            # state is open/close, or direction looking
            eye_state = None

            if eye_EAR > 0.25:

                #  ---------    find center of pupil   -------------  #

                gray=cv2.cvtColor(roi,cv2.COLOR_BGR2GRAY)  # grey scale convert
                blur = cv2.medianBlur(gray,5) # blue image to find the iris better
                equ = cv2.equalizeHist(blur)  # ie, improve contrast by spreading the range over the same window of intensity
                thres=cv2.inRange(equ,0,15)  # threshold the contour edges, higher number means more will be black
                kernel = np.ones((3,3),np.uint8)  # placeholder

            #     #/------- removing small noise inside the white image ---------/#
                dilation = cv2.dilate(thres,kernel,iterations = 2)
            #     #/------- decreasing the size of the white region -------------/#
                erosion = cv2.erode(dilation,kernel,iterations = 3)
            #     #/-------- finding the contours -------------------------------/#
                image, contours, hierarchy = cv2.findContours(erosion,cv2.RETR_TREE,cv2.CHAIN_APPROX_NONE)
            #     #--------- checking for 2 contours found or not ----------------#

                pupil_found = None

                if len(contours)==2 :
                    # print('2 contours found')
                    pupil_found = True

                    img = cv2.drawContours(roi, contours, 1, (0,255,0), 3)
                    #------ finding the centroid of the contour ----------------#
                    M = cv2.moments(contours[1])

                    if M['m00']!=0:
                        cx = int(M['m10']/M['m00'])
                        cy = int(M['m01']/M['m00'])
                        cv2.line(roi,(cx,cy),(cx,cy),(0,0,255),3)
                        # print(cx,cy)
                        
                #we

                if len(contours)==1:
                    pupil_found = True
                    # print('only 1 contour found ------- ')

                    img = cv2.drawContours(roi, contours, 0, (0,255,0), 3)

                    #------- finding centroid of the contour ----#
                    M = cv2.moments(contours[0])
                    if M['m00']!=0:
                        cx = int(M['m10']/M['m00'])
                        cy = int(M['m01']/M['m00'])
                        # print(cx,cy)
                        cv2.line(roi,(cx,cy),(cx,cy),(0,0,255),3)

                if pupil_found:
                    # find ratio of distance from each side of the eye bounding box
                    # to get quantify direction of pupil

                    width_ratio = cx / eye_width
                    height_ratio = cy / (eye_y2 - eye_y1)  # make sure to use bounding box height

                # eyes are opened, but pupils not found
                else:
                    print('Pupil not found')
                    single_eye_state.append(index)
                    single_eye_state.append('No pupil found')


        # end loop for one face

    cv2.imshow("frame",frame)
    # if the `q` key was pressed, break from the loop
    key = cv2.waitKey(1) & 0xFF
    if key == ord("q"):
        break
vs.release()
# print("accurracy=",(float(numerator)/float(numerator+denominator))*100)
cv2.destroyAllWindows()

# Gaze vector estimation and visualization

This notebook is a reference to _Paper: gaze estimation using a camera-based model in a classroom_ for running preliminary analysis on a webcam.
To run this notebook, a built-in webcam and good lighting is required. Meanwhile, some operating systems may require additional firewall settings to gain camera access. We make use of opencv's library to perform video processing and analysis for prelimninary tests on gaze capture.

Our gaze estimation application can be sectioned into:
 - Data pre-processing (face and eye detection)
 - Head pose classifiers (Euler angles: 𝛼,𝜃,𝜔) or reference plane
 - Attention boundary/matrix
 - Depth estimation function
 - Composition and visualization of gaze vector frequencies
Here, our model is primarily based on the relibility of our pre-trained face/eye detection(haarcascades). This dependency may undermine overall perfomance at scale. We aim to detect and identify multiple candidates in each frame of the image in order to estimate head and iris orientation.