# <center>Saliency-Based Visual Attention</center>

<p><small>Sebastian Höffner<br />October 31, 2016</small>

<p><small><b>Laurent Itti, Christof Koch, Ernst Niebur</b>: A Model of Saliency-Based Visual Attention for Rapid Scene Analysis. <i>IEEE Transactions on Pattern Analysis and Machine Intelligence</i>, Vol 20, No 11, pp. 1254&ndash;1259. 1998.

<p><br /></p>
<p><small>Image credit goes to the paper unless otherwise noted. I do not introduce every notation, but I follow the naming in the paper closely, so please refer to the paper if you get lost.</small>

In [None]:
## This cell contains the methods needed and imports, run before we start

%matplotlib inline

import numpy as np
import cv2
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema

def to256(image):
    """Rescale an image linearly so that its maximum maps to 255.

    Negative values are clipped to 0 before scaling, because casting a
    negative float to ``uint8`` does not produce a meaningful gray value.
    If the image contains no positive value, an all-zero image is returned
    instead of dividing by zero.

    Returns a new ``uint8`` array with the same shape as ``image``.
    """
    clipped = np.clip(np.asarray(image, dtype=np.float64), 0, None)
    peak = clipped.max()
    if peak <= 0:
        # Nothing to scale; avoids 0 / 0 -> NaN -> garbage after the cast.
        return np.zeros(clipped.shape, dtype=np.uint8)
    return (255. * clipped / peak).astype(np.uint8)

def imshow(image, conversion=cv2.COLOR_BGR2RGB):
    """Plot an image without axes and hand it back to the caller.

    Two-dimensional images, and any image when ``conversion`` is None,
    are drawn with a gray colormap. Everything else is passed through
    ``cv2.cvtColor`` with ``conversion`` (BGR to RGB by default) first.
    """
    is_single_channel = len(image.shape) == 2
    if is_single_channel or conversion is None:
        plt.imshow(image, cmap='gray')
    else:
        converted = cv2.cvtColor(image, conversion)
        plt.imshow(converted)
    plt.axis('off')
    return image
    
def imshow_pyr(images, name='Pyramid', conversion=cv2.COLOR_BGR2RGB):
    """Show up to the first 9 images of a pyramid in a 3x3 grid.

    Each subplot is titled "<width>x<height>". Two-dimensional images are
    drawn in gray; others are converted with ``conversion`` (BGR to RGB by
    default), or drawn in gray if ``conversion`` is None.

    Returns ``images`` unchanged.
    """
    plt.figure(name)
    # Slicing instead of range(9) so shorter lists do not raise IndexError.
    for position, level in enumerate(images[:9], start=1):
        plt.subplot(3, 3, position)
        # Decide per image; a gray level must not switch off the color
        # conversion for the levels that follow it.
        level_conversion = None if len(level.shape) == 2 else conversion
        if level_conversion is None:
            plt.imshow(level, interpolation='none', cmap='gray')
        else:
            plt.imshow(cv2.cvtColor(level, level_conversion), interpolation='none')
        plt.title("{}x{}".format(level.shape[1], level.shape[0]))
        plt.axis('off')
    plt.tight_layout()
    plt.show()
    return images
    
def gauss_pyramid(image):
    """Build a 9-level Gaussian pyramid (scales 1:2^0 ... 1:2^8).

    Level 0 is a copy of ``image``; every further level is
    ``cv2.pyrDown`` applied to the level before it.
    """
    levels = [image.copy()]
    while len(levels) < 9:
        levels.append(cv2.pyrDown(levels[-1]))
    return levels

def intensity(image):
    """Return the per-pixel intensity: the channel sum divided by 3,
    truncated to ``uint8``."""
    channel_total = image.sum(axis=2)
    averaged = channel_total / 3.
    return averaged.astype(np.uint8)

def hue_from_intensity(image, intensity_image):
    """Divide every color channel by the pixel intensity.

    Pixels whose intensity is at most 10% of the maximum intensity are set
    to 0 in all channels.

    The previous version indexed with ``np.where`` on the (h, w, 1)
    intensity, which only ever addressed channel 0; the other channels were
    returned unchanged. Here the (h, w, 1) mask is broadcast over all
    channels.

    Parameters:
        image: (h, w, channels) array.
        intensity_image: (h, w) array of per-pixel intensities.

    Returns a new array with the shape and dtype of ``image``; the inputs
    are not modified. For integer images the ratios are truncated on the
    cast back, so pass a float image to keep full precision.
    """
    pixels = np.asarray(image)
    luminance = np.asarray(intensity_image, dtype=np.float64)[..., np.newaxis]
    bright = luminance > luminance.max() * .1

    hue = np.zeros(pixels.shape, dtype=np.float64)
    # `where` restricts the division to bright pixels, so dark (possibly
    # zero-intensity) pixels keep their initial 0 and never divide by zero.
    np.divide(pixels, luminance, out=hue, where=bright)
    return hue.astype(pixels.dtype)

def channel_or_zeros(c0, c1, c2):
    """Stack three 2-D channels into one (h, w, 3) image for plotting.

    Any channel passed as None is replaced by zeros of the same shape as
    the first channel that is not None. The stacked image is rescaled
    with ``to256`` before it is returned.

    Raises:
        ValueError: if all three channels are None (there is no shape to
            build the zero channels from).
    """
    channels = (c0, c1, c2)
    template = next((c for c in channels if c is not None), None)
    if template is None:
        raise ValueError("at least one of c0, c1, c2 must not be None")

    zeros = np.zeros(template.shape)
    stacked = np.dstack([c if c is not None else zeros for c in channels])
    return to256(stacked)
    
def get_orientations(image):
    """Return a flat list of Gabor-filtered images.

    For each of 9 scales the image is filtered at 0, 45, 90 and 135
    degrees (in that order), then halved with ``cv2.resize`` for the next
    scale. The result therefore holds 36 images, 4 per scale.

    ``cv2.getGaborKernel`` takes the orientation in radians, so the degree
    values are converted before the call.
    """
    orientations = []
    input_image = image.copy()
    for sigma in range(9):
        # NOTE(review): the first level uses sigma == 0, which looks like a
        # degenerate Gaussian envelope -- confirm whether this should start at 1.
        # NOTE(review): shape is (rows, cols) while ksize is (width, height);
        # the original order is kept here -- confirm it is intended.
        ksize = tuple(int(extent * .1) for extent in input_image.shape)
        gamma = float(input_image.shape[0]) / float(input_image.shape[1])
        for theta in range(0, 180, 45):
            kernel = cv2.getGaborKernel(ksize, sigma, float(np.deg2rad(theta)), 10, gamma)
            orientations.append(cv2.filter2D(input_image, -1, kernel))
        input_image = cv2.resize(input_image, None, fx=.5, fy=.5)
    return orientations

def center_surround_diff_intensity(pyramid, cs=(2, 3, 4), deltas=(3, 4)):
    """Center-surround differences |I(c) - I(s)| for a single pyramid.

    For every center scale ``c`` in ``cs`` and every ``d`` in ``deltas``,
    the surround level ``s = c + d`` is resized to the size of level ``c``
    and subtracted point-wise. Each absolute difference is rescaled with
    ``to256``.

    The subtraction is done in float: with uint8 levels a negative
    difference would wrap around modulo 256 before ``abs`` is applied.

    Returns a list of ``len(cs) * len(deltas)`` maps, ordered by ``c``,
    then ``d``.
    """
    feature_maps = []
    for c in cs:
        center = pyramid[c].astype(np.float64)
        height, width = center.shape[:2]
        for d in deltas:
            # cv2.resize expects the target size as (width, height).
            surround = cv2.resize(pyramid[c + d], (width, height)).astype(np.float64)
            feature_maps.append(to256(np.abs(center - surround)))
    return feature_maps

def center_surround_diff_color(pyramid1, pyramid2, cs=(2, 3, 4), deltas=(3, 4)):
    r"""Center-surround differences for a pair of opponent color pyramids.

    Unlike the intensity variant, two colors are combined: for center
    scale ``c`` and surround scale ``s = c + d`` the map is
    $|(a(c) - b(c)) \ominus (b(s) - a(s))|$ with ``a = pyramid1`` and
    ``b = pyramid2``. The surround difference is resized to the center
    size before the point-wise subtraction, and each result is rescaled
    with ``to256``.

    All differences are computed in float: with uint8 levels a negative
    difference would wrap around modulo 256.

    Returns a list of ``len(cs) * len(deltas)`` maps, ordered by ``c``,
    then ``d``.
    """
    feature_maps = []
    for c in cs:
        image_big = pyramid1[c].astype(np.float32) - pyramid2[c].astype(np.float32)
        height, width = image_big.shape[:2]
        for d in deltas:
            s = c + d
            image_small = pyramid2[s].astype(np.float32) - pyramid1[s].astype(np.float32)
            # cv2.resize expects the target size as (width, height).
            surround = cv2.resize(image_small, (width, height))
            feature_maps.append(to256(np.abs(image_big - surround)))
    return feature_maps

def center_surround_diff_orientation(pyramid, cs=(2, 3, 4), deltas=(3, 4)):
    """Center-surround differences |O(c) - O(s)| for one orientation pyramid.

    For every center scale ``c`` in ``cs`` and every ``d`` in ``deltas``,
    level ``s = c + d`` is subtracted point-wise from level ``c``. If the
    two levels differ in size, the surround level is first resized to the
    center size; without that, levels of different scale cannot be
    subtracted. Each absolute difference is rescaled with ``to256``.

    The subtraction is done in float: with uint8 levels a negative
    difference would wrap around modulo 256.

    Returns a list of ``len(cs) * len(deltas)`` maps, ordered by ``c``,
    then ``d``.
    """
    feature_maps = []
    for c in cs:
        center = pyramid[c].astype(np.float64)
        height, width = center.shape[:2]
        for d in deltas:
            surround = pyramid[c + d]
            if surround.shape != pyramid[c].shape:
                # cv2.resize expects the target size as (width, height).
                surround = cv2.resize(surround, (width, height))
            feature_maps.append(to256(np.abs(center - surround.astype(np.float64))))
    return feature_maps

def normalize(image):
    """Apply the map normalization: rescale to 0..255, then weight the map.

    With ``M`` the global maximum and ``mean`` the average of all other
    local maxima, the rescaled map is multiplied by
    ``((M - mean) / M) ** 2``. A map with one dominant peak keeps
    (almost) its full range, while a map with many comparable peaks is
    suppressed.

    The previous version multiplied by ``(M - mean) ** 2`` and then
    rescaled the product to 0..255, which cancelled the factor again; it
    also produced NaN when no other local maximum existed. Dividing the
    factor by ``M ** 2`` keeps the result in the ``uint8`` range while
    preserving the relative weighting between maps.

    Local maxima are detected with ``argrelextrema(image, np.greater)``,
    i.e. along the first axis only.
    NOTE(review): confirm whether true 2-D local maxima are wanted here.

    Returns a new ``uint8`` array with the same shape as ``image``.
    """
    peak = float(np.max(image))
    if peak <= 0:
        # Empty map: nothing to promote, and avoids dividing by zero.
        return np.zeros(np.shape(image), dtype=np.uint8)

    maxima = image[argrelextrema(image, np.greater)]
    others = maxima[maxima != peak]
    # No competing maxima means the single peak keeps its full weight.
    mean_other = others.mean() if others.size else 0.
    weight = ((peak - mean_other) / peak) ** 2
    return (255. * image / peak * weight).astype(np.uint8)

def sum_up(images, target_size=None):
    """Sum images element-wise after converting them to ``np.uint``.

    If ``target_size`` (width, height) is given, every image is resized
    to it first. Resizing is done on a float32 copy, which keeps the
    original value range. The previous 8-bit round trip quantised the
    values, rescaled them with 256 instead of 255, and broke on all-zero
    maps.

    Parameters:
        images: non-empty sequence of arrays; without ``target_size`` they
            must already share one shape.
        target_size: optional (width, height); entries are coerced to int
            because ``cv2.resize`` does not accept float sizes.

    Returns a new ``np.uint`` array; the inputs are not modified.
    """
    size = None if target_size is None else tuple(int(extent) for extent in target_size)

    def convert(img):
        if size is None:
            return img.astype(np.uint)
        return cv2.resize(img.astype(np.float32), size).astype(np.uint)

    converted = [convert(image) for image in images]
    result = converted[0]
    for extra in converted[1:]:
        result += extra
    return result

def combine_colors(RG, BY):
    """Combine two color pyramids into one pyramid of summed colors.

    Level ``i`` of the result is ``RG[i] + BY[i]``, with both operands
    converted to ``np.uint`` before the addition.
    """
    combined = []
    for level, rg_map in enumerate(RG):
        combined.append(rg_map.astype(np.uint) + BY[level].astype(np.uint))
    return combined

### Saliency: A method for early selection

- bottom-up approach
- scene-dependent rather than task-dependent (exogenous)
- select highly salient regions to filter upon

<br />

<center><b><big>Saliency ⇒ "Where is the fun?!"</big></b></center>

### Model Architecture


<img src="ittikochniebur_model_architecture.png" alt="Model architecture" style="width: 60%;"/>

In [None]:
# Load the input image (OpenCV delivers it in BGR channel order).
image = cv2.imread('image.jpg')
if image is None:
    # cv2.imread does not raise on a missing or unreadable file; it returns
    # None, which would only fail later with an unrelated error.
    raise IOError("Could not read 'image.jpg'")
image = imshow(image)

### Intensity image

<img src="ittikochnieburg_intensity.png" alt="Intensity" style="width: 200px" />

$$\text{int} = \frac{\text{red} + \text{green} + \text{blue}}{3}$$



In [None]:
# Average the color channels into a uint8 intensity image and plot it;
# imshow returns the image it was given.
intensity_image = imshow(intensity(image))

### Gaussian pyramid: intensity

In [None]:
# Build the 9-level Gaussian pyramid of the intensity image and plot it in
# a 3x3 grid; imshow_pyr returns the pyramid it was given.
intensity_pyramid = imshow_pyr(gauss_pyramid(intensity_image), 'Gaussian Pyramid Intensity')

In [None]:
# Same pyramid for the color image, for illustration only; the result is
# assigned to `_` so the notebook does not echo it.
_ = imshow_pyr(gauss_pyramid(image), 'Gaussian Pyramid Original Image')

### Colors

<img src="ittikochnieburg_colors.png" alt="Colors" style="width: 200px" />





### Hue image

$\text{hue}_{x,y} = f(\text{img}, \text{int}, x, y) = \begin{cases} \text{img}_{x,y} \ /\  \text{int}_{x,y} &\quad \text{if } \text{int}_{x,y} > 0.1 \cdot \max(\text{int}) \\ 0 &\quad \text{otherwise} \end{cases}$

Note the $0.1$: at low luminance, hues cannot be perceived well, so they are set to zero there.

In [None]:
# Normalize the colors by the intensity (dark pixels are zeroed) and plot
# the result; imshow returns the image it was given.
hue_image = imshow(hue_from_intensity(image, intensity_image))

In [None]:
# Side by side: original image, intensity image, hue image.
plt.figure('image to hue')
plt.subplot(1, 3, 1); imshow(image)
plt.subplot(1, 3, 2); imshow(intensity_image)
# Assigning to `_` keeps the notebook from echoing the returned array.
plt.subplot(1, 3, 3); _ = imshow(hue_image)

### Hue image to colors

- 4 color maps
- R, G, B, Y (Following the [Opponent process](https://en.wikipedia.org/wiki/Opponent_process))

![Opponent process](https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Opponent_colors.svg/480px-Opponent_colors.svg.png)

<small>Image: wikipedia</small>

In [None]:
# Split the hue image into its channels (OpenCV order: blue, green, red).
# Work in float: on uint8 channels, sums and differences such as `g + b` or
# `r - g` wrap around modulo 256 instead of growing or going negative.
b, g, r = (hue_image[:, :, k].astype(np.float64) for k in range(3))

# Broadly tuned color channels. Negative responses are set to zero before
# the rescale to 0..255, as in the paper.
R = to256(np.maximum(r - (g + b) / 2., 0))
G = to256(np.maximum(g - (r + b) / 2., 0))
B = to256(np.maximum(b - (r + g) / 2., 0))
Y = to256(np.maximum((r + g) / 2. - np.abs(r - g) / 2. - b, 0))

In [None]:
# Plot each color map in "its" color. channel_or_zeros takes the channels
# in (c0, c1, c2) order and imshow converts BGR to RGB by default, so red
# goes into the last slot and yellow fills the green and red slots.
plt.figure('Color channels')
plt.subplot(2, 2, 3); imshow(channel_or_zeros(None, None, R)); plt.title('Red')
plt.subplot(2, 2, 1); imshow(channel_or_zeros(None, G, None)); plt.title('Green')
plt.subplot(2, 2, 4); imshow(channel_or_zeros(B, None, None)); plt.title('Blue')
plt.subplot(2, 2, 2); imshow(channel_or_zeros(None, Y, Y)); _ = plt.title('Yellow')

In [None]:
# One 9-level Gaussian pyramid per color map, built in R, G, B, Y order.
R_pyramid, G_pyramid, B_pyramid, Y_pyramid = [
    gauss_pyramid(channel) for channel in (R, G, B, Y)
]

In [None]:
# Plot the red pyramid; `_` suppresses the notebook echo of the return value.
_ = imshow_pyr(R_pyramid, 'Gauss R')

In [None]:
# Plot the green pyramid; `_` suppresses the notebook echo of the return value.
_ = imshow_pyr(G_pyramid, 'Gauss G')

In [None]:
# Plot the blue pyramid; `_` suppresses the notebook echo of the return value.
_ = imshow_pyr(B_pyramid, 'Gauss B')

In [None]:
# Plot the yellow pyramid; `_` suppresses the notebook echo of the return value.
_ = imshow_pyr(Y_pyramid, 'Gauss Y')

### Orientation

- Gabor cells
- Four different orientations: 0°, 45°, 90°, 135°

In [None]:
def get_orientations(image):
    """Return the Gabor-filtered images and the kernels that produced them.

    For each of 9 scales the image is filtered at 0, 45, 90 and 135
    degrees (in that order), then halved with ``cv2.resize`` for the next
    scale. Both returned lists hold 36 entries, 4 per scale, with matching
    indices.

    ``cv2.getGaborKernel`` takes the orientation in radians, so the degree
    values are converted before the call.

    Returns:
        (orientations, kernels)
    """
    orientations = []
    kernels = []
    input_image = image.copy()
    for sigma in range(9):
        # NOTE(review): the first level uses sigma == 0, which looks like a
        # degenerate Gaussian envelope -- confirm whether this should start at 1.
        # NOTE(review): shape is (rows, cols) while ksize is (width, height);
        # the original order is kept here -- confirm it is intended.
        ksize = tuple(int(extent * .25) for extent in input_image.shape)
        gamma = float(input_image.shape[0]) / float(input_image.shape[1])
        for theta in range(0, 180, 45):
            kernel = cv2.getGaborKernel(ksize, sigma, float(np.deg2rad(theta)), 10, gamma)
            kernels.append(kernel)
            orientations.append(cv2.filter2D(input_image, -1, kernel))
        input_image = cv2.resize(input_image, None, fx=.5, fy=.5)
    return orientations, kernels

# Filter the intensity image; both lists are ordered scale-major with the
# four orientations of a scale next to each other.
orientations, kernels = get_orientations(intensity_image)

In [None]:
# Show all kernels in a 9x4 grid: one row per scale, one column per orientation.
plt.figure('Gabor cells')        
for i, kernel in enumerate(kernels):
    plt.subplot(9, 4, i + 1); 
    imshow(kernel)

In [None]:
# Show a single filter response: index 16 is the first orientation of the
# fifth scale (4 responses per scale).
plt.figure('Gabor results (one)')
_ = imshow(orientations[16])

In [None]:
# Show all filter responses in the same 9x4 layout as the kernels.
plt.figure('Gabor results')
for i, img in enumerate(orientations):
    plt.subplot(9, 4, i + 1);
    imshow(img)

### Feature maps

- 6 for intensity contrast - mammals: dark centers bright surrounds or vice-versa
- 12 for color - mammals: excitation by one color, inhibition by opposite color
- 24 for orientation - mammals: primary visual cortex has layers to detect orientations
- = 42 feature maps

<img src="ittikochniebur_model_architecture.png" alt="Model architecture" style="width: 60%;" />

### Center-surround differences

$$\mathcal{I}(c, s) = \left|I(c) \ominus I(s)\right|$$

$$c \in \left\{2, 3, 4\right\}, s = c + \delta, \delta \in \left\{3, 4\right\}$$

<br />

<center>$a \ominus b$: <i>Interpolate $b$ to the size of $a$ (the bigger image) and do a point-wise subtraction</i></center>


In [None]:
# Six intensity feature maps (3 center scales x 2 deltas), plotted in a 3x2 grid.
intensity_maps = center_surround_diff_intensity(intensity_pyramid)
plt.figure("Center surround differences -- Intensity")
for i in range(6):
    plt.subplot(3,2,i+1)
    imshow(intensity_maps[i])

In [None]:
# Six red/green and six blue/yellow feature maps. idx_RG / idx_BY are the
# subplot positions in a 3x4 grid: RG fills the two left columns, BY the
# two right columns.
RG_maps = center_surround_diff_color(R_pyramid, G_pyramid); idx_RG = [1, 2, 5, 6, 9, 10]
BY_maps = center_surround_diff_color(B_pyramid, Y_pyramid); idx_BY = [3, 4, 7, 8, 11, 12]
plt.figure("Center surround differences -- Color (RG / BY)")
for i in range(6):
    plt.subplot(3,4,idx_RG[i])
    imshow(RG_maps[i])
for i in range(6):
    plt.subplot(3,4,idx_BY[i])
    imshow(BY_maps[i])

In [None]:
# `orientations` holds 4 responses per scale, so the slice [k::4] picks one
# orientation across all 9 scales and can be used like a pyramid. The
# intensity variant is used because it resizes the surround level.
# idx_* are subplot positions in a 6x4 grid (two columns per orientation).
orientation_maps_0   = center_surround_diff_intensity(orientations[0::4]); idx_0   = [1, 2, 9, 10, 17, 18]
orientation_maps_45  = center_surround_diff_intensity(orientations[1::4]); idx_45  = [3, 4, 11, 12, 19, 20]
orientation_maps_90  = center_surround_diff_intensity(orientations[2::4]); idx_90  = [5, 6, 13, 14, 21, 22]
orientation_maps_135 = center_surround_diff_intensity(orientations[3::4]); idx_135 = [7, 8, 15, 16, 23, 24]

In [None]:
# Overview of all 24 orientation feature maps in one 6x4 grid, placed at
# the positions listed in idx_0 ... idx_135.
plt.figure("Center surround differences -- Gabor cells")
for i in range(6):
    plt.subplot(6,4,idx_0[i]); imshow(orientation_maps_0[i])
    plt.subplot(6,4,idx_45[i]); imshow(orientation_maps_45[i])
    plt.subplot(6,4,idx_90[i]); imshow(orientation_maps_90[i])
    plt.subplot(6,4,idx_135[i]); imshow(orientation_maps_135[i])

In [None]:
# The six 0-degree feature maps on their own, in a 3x2 grid.
plt.figure("Center surround differences -- Gabor cells 0")
for i in range(6):
    plt.subplot(3,2,i+1)
    imshow(orientation_maps_0[i])

In [None]:
# The six 45-degree feature maps on their own, in a 3x2 grid.
plt.figure("Center surround differences -- Gabor cells 45")
for i in range(6):
    plt.subplot(3,2,i+1)
    imshow(orientation_maps_45[i])

In [None]:
# The six 90-degree feature maps on their own, in a 3x2 grid.
plt.figure("Center surround differences -- Gabor cells 90")
for i in range(6):
    plt.subplot(3,2,i+1)
    imshow(orientation_maps_90[i])

In [None]:
# The six 135-degree feature maps on their own, in a 3x2 grid.
plt.figure("Center surround differences -- Gabor cells 135")
for i in range(6):
    plt.subplot(3,2,i+1)
    imshow(orientation_maps_135[i])

### Saliency map

<img src="ittikochniebur_model_architecture.png" alt="Model architecture" style="width: 60%;" />

### Normalization

- Find maximum $M$ in image
- Compute average $\bar{m}$ of all other local maxima $m_i$
- Multiply map by $(M-\bar{m})^2$

In [None]:
# Concatenate all 42 feature maps in a fixed order: indices 0-5 intensity,
# 6-11 RG, 12-17 BY, 18-41 orientation (6 per angle). Later cells slice
# norm_maps by these index ranges.
feature_maps = intensity_maps + \
               RG_maps + BY_maps + \
               orientation_maps_0 + orientation_maps_45 + orientation_maps_90 + orientation_maps_135
norm_maps = [normalize(fmap) for fmap in feature_maps]

In [None]:
# All 42 raw feature maps in a 7x6 grid.
plt.figure(); 
for i, m in enumerate(feature_maps): plt.subplot(7,6,i+1); imshow(m)

In [None]:
# All 42 normalized feature maps in the same 7x6 grid, for comparison.
plt.figure(); 
for i, m in enumerate(norm_maps): plt.subplot(7,6,i+1); imshow(m)

### Combination to conspicuity maps

- one map per feature type: intensity, color, orientation
- on scale 4, which is $\text{target_size} = \frac{\text{size}}{2^{(4-1)}}$
- intensity: sum over all scales
- color: sum over scales and maps
- orientation: sum over scales, then normalize again before summation over orientations

In [None]:
# Conspicuity maps live on a coarser scale: (width, height) of the image
# divided by 2**3. Floor division and the int() cast keep the entries plain
# integers; with true division they become floats, which cv2.resize rejects.
target_size = tuple(int(extent) for extent in np.array(image.shape[0:2][::-1]) // (2 ** 3))

conspicuity = [ # intensity
    normalize( 
        sum_up(norm_maps[0:6], target_size))]
conspicuity += [ # colors: RG, BY
    sum_up( 
        combine_colors(norm_maps[6:12], norm_maps[12:18]), target_size)]
conspicuity += [ # orientations
    sum_up( # sum orientations
        [normalize( # normalize it
                sum_up(norm_maps[i:i+6], target_size) # sum pyramid
            ) for i in (18, 24, 30, 36)])]

In [None]:
# Plot the three conspicuity maps side by side, titled in the order in
# which they were appended: intensity, color, orientation.
plt.figure('Conspicuity maps')
for i, (cmap, t) in enumerate(zip(conspicuity, ('intensity', 'color', 'orientation'))):
    plt.subplot(1, 3, i+1); imshow(cmap); plt.title(t)

### Combination to saliency map

$$\mathcal{S} = \frac{1}{3}\left(\mathcal{N}\left(\bar{\mathcal{I}}\right) + \mathcal{N}\left(\bar{\mathcal{C}}\right) + \mathcal{N}\left(\bar{\mathcal{O}}\right) \right)$$

In [None]:
# Saliency map: normalize each conspicuity map, add them up (no resize,
# they already share one size), average over the three maps and rescale
# to 0..255 for display.
S = to256(sum_up([normalize(cmap) for cmap in conspicuity]) / 3.)

In [None]:
# Plot the saliency map; `_` suppresses the notebook echo of the return value.
_ = imshow(S)

### Using the saliency map: the winner takes it all

<img src="ittikochniebur_model_architecture.png" alt="Model architecture" style="width: 60%;" />

<img src="ittikochniebur_wta.png" style="width: 70%" alt="Winner takes it all" />
<center><small>Fig. 3: Example of final winner takes all results.</small></center>

### Results

- robust in very noisy images
- reproduces human performance, in easy and hard tasks
- not too difficult to implement 😊