<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Two: Feature Selection </h2>	


<h4> In this section, we would like you to select between 15 and 20 features to focus your model on. This will require significant exploratory research. The first one is already implemented for you, and the next two are pre-specified.  </h4>

In [None]:
# Import anything you need here
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import skimage
from skimage import io
from skimage import feature
import seaborn as sns
%matplotlib inline

In [None]:
# load the DataFrame generated by NB1 (stored under the key "data" in data.h5)
data = pd.read_hdf("data.h5", "data")

In [None]:
# preview the first rows of the loaded DataFrame
data.head()

In [None]:
def convert_to_rgb(image):
    """
    make sure the image has a channel axis: a 2-D (grayscale) array is
    expanded with skimage.color.gray2rgb, any other array is returned as is
    """
    is_grayscale = len(image.shape) == 2
    return skimage.color.gray2rgb(image) if is_grayscale else image

In [None]:
# run every picture through convert_to_rgb so grayscale images gain a channel axis
data["Pictures"] = data["Pictures"].apply(convert_to_rgb)

In [None]:
# sanity check: display the first picture of the dataset
test_img = data["Pictures"].iloc[0]
io.imshow(test_img)

In [None]:
def feature_size(image):
    """
    return the total number of array elements in the picture
    (the `size` attribute of the image array)
    """
    element_count = image.size
    return element_count

def feature_avg_red(image):
    """
    return the average intensity of the red channel (channel index 0)
    """
    red = image[:, :, 0]
    return red.mean()

def feature_avg_green(image):
    """
    return the average intensity of the green channel (channel index 1)
    """
    green = image[:, :, 1]
    return green.mean()

def feature_avg_blue(image):
    """
    return the average intensity of the blue channel (channel index 2)
    """
    blue = image[:, :, 2]
    return blue.mean()

def feature_std_red(image):
    """
    return the standard deviation of the red channel (channel index 0)
    """
    red = image[:, :, 0]
    return red.std()

def feature_std_green(image):
    """
    return the standard deviation of the green channel (channel index 1)
    """
    green = image[:, :, 1]
    return green.std()

def feature_std_blue(image):
    """
    return the standard deviation of the blue channel (channel index 2)
    """
    blue = image[:, :, 2]
    return blue.std()

def feature_avg_gray(image):
    """
    return mean value of grayscale, where the gray level of a pixel is
    the plain average of its red, green and blue values
    """
    # The mean over all R, G and B samples equals the mean of the per-pixel
    # (R + G + B) / 3 values.  Accumulating in float64 avoids the uint8
    # wrap-around that adding the channel arrays directly would cause.
    return image[:, :, :3].mean(dtype=np.float64)

def feature_aspect_ratio(image):
    """
    return the aspect ratio of the image: number of rows (height)
    divided by number of columns (width)
    """
    height = image.shape[0]
    width = image.shape[1]
    return height / width

def short_side_resize(image, length=256):
    """
    resize the image so that its short side is exactly `length` pixels;
    the long side is scaled by the same factor and rounded down
    """
    height, width, _ = image.shape
    # Set the short side directly and use integer arithmetic for the long
    # side.  Going through a float ratio can land just below the target
    # (e.g. 256 / 49 * 49 evaluates slightly under 256) and truncate to 255.
    if height < width:
        new_height = length
        new_width = width * length // height
    else:
        new_height = height * length // width
        new_width = length
    output_shape = (int(new_height), int(new_width))
    return skimage.transform.resize(image, output_shape, mode='reflect', anti_aliasing=True)

def center_crop(image, length=224):
    """
    crop the center patch of the image with length * length

    if a side of the image is shorter than `length`, that side is kept
    whole instead of being cropped
    """
    height, width, _ = image.shape
    # Clamp the start offsets at 0: a negative start would be interpreted by
    # slicing as "count from the end" and return an off-centre fragment.
    top = max(height // 2 - length // 2, 0)
    left = max(width // 2 - length // 2, 0)
    return image[top:top + length, left:left + length, :]

def feature_harris(image):
    """
    return the fraction of pixels with a positive Harris corner response,
    measured on the center crop of the short-side-resized image
    """
    patch = center_crop(short_side_resize(image))
    # cv2.cornerHarris is fed an 8-bit single-channel image here
    gray_u8 = (skimage.color.rgb2gray(patch) * 255).astype(np.uint8)
    response = cv2.cornerHarris(gray_u8, blockSize=2, ksize=3, k=0.04)
    positive_pixels = np.count_nonzero(response > 0)
    return positive_pixels / response.size

def feature_dog(image):
    """
    return the fraction of pixels whose difference-of-Gaussians response
    exceeds 5% of the maximum response; the two Gaussian filters use
    sigma 0.3 and 0.5
    """
    gray = skimage.color.rgb2gray(image)
    g3 = np.asarray(skimage.filters.gaussian(gray, sigma=0.3))
    g5 = np.asarray(skimage.filters.gaussian(gray, sigma=0.5))
    dog = g3 - g5
    # The mean of a boolean mask is the share of True entries; this replaces
    # the row-by-row Python-level sum(sum(...)) with one vectorised call.
    return (dog > 0.05 * dog.max()).mean()

def feature_avg_y(image):
    """
    return the mean of the luminance (Y) channel after converting
    the image with skimage.color.rgb2ycbcr
    """
    ycbcr = skimage.color.rgb2ycbcr(image)
    luma = ycbcr[:, :, 0]
    return luma.mean()

def feature_avg_cb(image):
    """
    return the mean of the blue chroma (Cb) channel after converting
    the image with skimage.color.rgb2ycbcr
    """
    ycbcr = skimage.color.rgb2ycbcr(image)
    chroma_blue = ycbcr[:, :, 1]
    return chroma_blue.mean()

def feature_avg_cr(image):
    """
    return the mean of the red chroma (Cr) channel after converting
    the image with skimage.color.rgb2ycbcr
    """
    ycbcr = skimage.color.rgb2ycbcr(image)
    chroma_red = ycbcr[:, :, 2]
    return chroma_red.mean()

def feature_std_y(image):
    """
    return the standard deviation of the luminance (Y) channel after
    converting the image with skimage.color.rgb2ycbcr
    """
    ycbcr = skimage.color.rgb2ycbcr(image)
    luma = ycbcr[:, :, 0]
    return luma.std()

def feature_std_cb(image):
    """
    return the standard deviation of the blue chroma (Cb) channel after
    converting the image with skimage.color.rgb2ycbcr
    """
    ycbcr = skimage.color.rgb2ycbcr(image)
    chroma_blue = ycbcr[:, :, 1]
    return chroma_blue.std()

def feature_std_cr(image):
    """
    return the standard deviation of the red chroma (Cr) channel after
    converting the image with skimage.color.rgb2ycbcr
    """
    ycbcr = skimage.color.rgb2ycbcr(image)
    chroma_red = ycbcr[:, :, 2]
    return chroma_red.std()

def feature_avg_hog(image):
    """
    return the mean of the Histogram of Oriented Gradients (HOG)
    descriptor computed by skimage.feature.hog
    """
    rgb_image = convert_to_rgb(image)
    descriptor = skimage.feature.hog(rgb_image)
    return descriptor.mean()

def feature_std_hog(image):
    """
    return the standard deviation of the Histogram of Oriented Gradients
    (HOG) descriptor computed by skimage.feature.hog
    """
    rgb_image = convert_to_rgb(image)
    descriptor = skimage.feature.hog(rgb_image)
    return descriptor.std()

Define more features above, performing any EDA research below. We expect all external sources to be cited, and a couple of significantly different graphs indicating some form of EDA. 

<h4> DataFrame Creation </h4>

In [None]:
def feature_frame(df):
    """
    add one column per feature to `df`, computed from its "Pictures"
    column, then drop "Pictures"; `df` is modified in place and returned
    """
    # (column name, extractor) pairs, listed in the order the columns are added
    extractors = (
        ("size", feature_size),
        ("avg_red", feature_avg_red),
        ("avg_green", feature_avg_green),
        ("avg_blue", feature_avg_blue),
        ("std_red", feature_std_red),
        ("std_green", feature_std_green),
        ("std_blue", feature_std_blue),
        ("avg_gray", feature_avg_gray),
        ("aspect_ratio", feature_aspect_ratio),
        ("harris", feature_harris),
        ("dog", feature_dog),
        ("avg_y", feature_avg_y),
        ("avg_cb", feature_avg_cb),
        ("avg_cr", feature_avg_cr),
        ("std_y", feature_std_y),
        ("std_cb", feature_std_cb),
        ("std_cr", feature_std_cr),
        ("avg_hog", feature_avg_hog),
        ("std_hog", feature_std_hog),
    )
    for column_name, extractor in extractors:
        df[column_name] = df["Pictures"].apply(extractor)
    del df["Pictures"]
    return df

In [None]:
# compute all features; feature_frame modifies `data` in place (adds the
# feature columns, removes "Pictures") and returns that same DataFrame
feature_df = feature_frame(data)

In [None]:
# (number of rows, number of columns) of the feature table
feature_df.shape

In [None]:
# store the feature table in feature.h5 under the key "feature" for easy
# loading in NB3; mode="w" is passed so the file is written from scratch
feature_df.to_hdf("feature.h5", "feature", mode="w")

### Generate and save features for the validation set

In [None]:
test_path = "20_Validation/"

# list the validation files, skipping the hidden .DS_Store entry that macOS may create
test_file_list = [name for name in os.listdir(test_path) if name != ".DS_Store"]

# order the files by the number embedded in "validation_pic (N).jpg"
# so that they match the given encoding
test_file_list.sort(
    key=lambda x: int(x.replace("validation_pic (", "").replace(").jpg", "")))

# read the images one by one, in sorted order
test_image = []
for test_file in test_file_list:
    image_path = os.path.join(test_path, test_file)
    test_image.append(io.imread(image_path))

# wrap the images in a DataFrame with a single "Pictures" column
test_df = pd.DataFrame()
test_df["Pictures"] = test_image

In [None]:
# run every validation picture through convert_to_rgb so grayscale images gain a channel axis
test_df["Pictures"] = test_df["Pictures"].apply(convert_to_rgb)

In [None]:
# compute the same features for the validation set; feature_frame modifies
# `test_df` in place and returns that same DataFrame
test_feature = feature_frame(test_df)

In [None]:
# (number of rows, number of columns) of the validation feature table
test_feature.shape

In [None]:
# store the validation features in test_feature.h5 under the key "feature"
test_feature.to_hdf("test_feature.h5", "feature", mode="w")

<h4> Graphs </h4>

In [None]:
# display names of the 20 categories, in the order used for the x-axis tick labels
# (each name is its own string literal: adjacent literals without a comma
# between them would be silently concatenated into one entry)
category = ["Airplanes", "Bear", "Blimp", "Comet", "Crab", "Dog", "Dolphin", "Giraffe",
            "Goat", "Gorilla", "Kangaroo", "Killer-Whale", "Leopards", "Llama",
            "Penguin", "Porcupine", "Teddy-Bear", "Triceratops", "Unicorn", "Zebra"]

In [None]:
# draw one violin plot per feature column, showing how that feature is
# distributed within each value of "Encoding" (the label column is skipped)
for column in feature_df.columns:
    if column == "Encoding":
        continue
    plt.figure(figsize=(12, 9))
    sns.violinplot(x="Encoding", y=column, data=feature_df)
    plt.title(column)
    # label the x ticks with the category names instead of the raw encoding
    plt.xticks(range(len(category)), category, rotation='vertical')
    plt.show()

We find feature `aspect_ratio` and `avg_y` quite interesting on category "airplanes" and "comet". Further analysis can be found in our report.

<h4> Sources </h4>

1. N. Dalal and B. Triggs. Histograms of oriented gradients for human detection. In 2005 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR’05), volume 1, pages 886–893, June 2005.
2. Christopher G. Harris and Mike Stephens. A combined corner and edge detector. In Alvey Vision Conference, 1988.