# Lab 3

## Conrad Appel & Eric Hawkins

In [None]:
%matplotlib inline
import copy
import os
import _pickle as cPickle
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as p
import seaborn as sns
from scipy import ndimage
from skimage.feature import daisy
from skimage.feature import match_template
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances

# Number of distinct class labels; the per-class cells below loop over range(numlabels).
numlabels = 10


###
# Download the dataset from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz,
# extract the archive, and save all of the extracted files to ./lab3/imgs/
###

In [None]:
# Load the pickled training batches found under ./imgs.
# NOTE(security): unpickling executes code embedded in the file -- only load
# the dataset archive from the official source, never an untrusted copy.
dics = []
for root, directory, files in os.walk('imgs'):
    for f in files:
        if 'data_batch' in f:
            with open(os.path.join(root, f), 'rb') as fo:
                dics.append(cPickle.load(fo, encoding='latin1'))
            # Only the first matching batch file per directory is read
            # (presumably to limit memory use -- TODO confirm this is intended).
            break

# One (pixels, label) pair per image:
#   pixels -- 1D img (1024 R, 1024 G, 1024 B)
#   label  -- int representing the label
rows = [
    (pixels, label)
    for dic in dics
    for pixels, label in zip(dic['data'], dic['labels'])
]

# Fill an explicit object array cell by cell. Calling np.array() on the ragged
# [ndarray, int] rows raises ValueError on NumPy >= 1.24, and an empty input
# would not produce the two columns the DataFrame needs.
total_imgs = np.empty((len(rows), 2), dtype=object)
for idx, (pixels, label) in enumerate(rows):
    total_imgs[idx, 0] = pixels
    total_imgs[idx, 1] = label

imgs_df = p.DataFrame(total_imgs, columns=['oneDColor', 'label'])

In [None]:
# Build a lookup from integer class id to the class name listed under
# 'label_names' in the dataset's batches.meta file.
labels = {}
with open('./imgs/batches.meta', 'rb') as fo:
    labels_tmp = cPickle.load(fo, encoding='latin1')
    labels.update(enumerate(labels_tmp['label_names']))

In [None]:
def toGrayscale(img):
    """Convert a flat, channel-planar RGB image to a flat grayscale image.

    Parameters
    ----------
    img : array-like
        1-D sequence holding all red values, then all green values, then all
        blue values (e.g. 1024 R + 1024 G + 1024 B for a 32x32 picture).

    Returns
    -------
    numpy.ndarray
        Float array with one luminance value per pixel (a third of the
        input length).

    Raises
    ------
    ValueError
        If the input length cannot be split into three equal channels.
    """
    pixels = np.asarray(img)  # also accept plain lists/tuples
    if len(pixels) % 3 != 0:
        raise ValueError(
            'expected three equally sized colour channels, got %d values'
            % len(pixels)
        )
    # Derive the channel size from the input instead of assuming 32x32 images.
    n = len(pixels) // 3
    r, g, b = pixels[:n], pixels[n:2 * n], pixels[2 * n:]
    # Weighted sum of the three channels.
    gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
    return gray

# Add a grayscale version of every colour image as a new 'oneDGray' column.
# It becomes the third column of imgs_df, which apply_daisy reads as row[2].
imgs_df['oneDGray'] = imgs_df['oneDColor'].apply(toGrayscale)

In [None]:
# Show one randomly sampled grayscale image from each class, side by side.
# The panel count follows numlabels so this cell stays in step with the
# other per-class cells instead of hard-coding 10.
f, axes = plt.subplots(1, numlabels)
for label in range(numlabels):
    # plt.subplots returns a bare Axes (not an array) when there is one panel.
    axis = axes[label] if numlabels > 1 else axes
    class_rows = imgs_df.loc[imgs_df['label'] == label]
    picture = class_rows['oneDGray'].sample(n=1).values[0]
    axis.axis('off')
    axis.imshow(picture.reshape((32,32)), cmap=plt.cm.gray)

## Linear Dimensionality Reduction via PCA

In [None]:
# Principal Components Analysis
# Fit a separate PCA model on the grayscale images of every class.
n_comps = 175  # number of principal components kept per class

# One (pixel matrix, fitted PCA model, projected data) tuple per label.
pcas = []
for label in range(numlabels):
    class_rows = imgs_df.loc[imgs_df['label'] == label]

    # Stack the per-image 1024-pixel vectors into one (n_images, 1024) float
    # matrix in a single step rather than copying pixel by pixel in Python.
    x = np.asarray(list(class_rows['oneDGray']), dtype=np.float64).reshape(-1, 1024)

    pca = PCA(n_components=n_comps)
    # fit() returns the estimator itself; fit_transform() fits the model and
    # returns the data projected onto the components, which is what X_pca
    # is meant to hold.
    X_pca = pca.fit_transform(x)
    pcas.append((x, pca, X_pca))

In [None]:
# For every class, project one randomly chosen image onto that class's
# principal components, reconstruct it, and display the reconstruction.
f, axes = plt.subplots(1, numlabels)
for label in range(numlabels):
    # plt.subplots returns a bare Axes (not an array) when there is one panel.
    axis = axes[label] if numlabels > 1 else axes
    class_pixels, class_pca = pcas[label][0], pcas[label][1]
    # Pick a random row by index so the stored pixel matrix is not reordered.
    sample = class_pixels[np.random.randint(len(class_pixels))]
    # Use the model fitted on this class, not whichever model the fitting
    # loop happened to leave behind in a global variable.
    recd = class_pca.inverse_transform(class_pca.transform(sample.reshape(1, -1)))
    axis.axis('off')
    axis.imshow(recd.reshape((32,32)), cmap=plt.cm.gray)

## Non-Linear Dimensionality Reduction via Kernel PCA

In [None]:
# Kernel Principal Component Analysis
# Fit a separate RBF-kernel PCA model on the grayscale images of every class.
# KernelPCA comes from sklearn.decomposition and must be imported at the top
# of the notebook alongside PCA.
n_comps = 175  # number of kernel principal components kept per class

# One (pixel matrix, fitted KernelPCA model, projected data) tuple per label.
kpcas = []
for label in range(numlabels):
    class_rows = imgs_df.loc[imgs_df['label'] == label]

    # Stack the per-image 1024-pixel vectors into one (n_images, 1024) float
    # matrix in a single step rather than copying pixel by pixel in Python.
    x = np.asarray(list(class_rows['oneDGray']), dtype=np.float64).reshape(-1, 1024)

    # fit_inverse_transform=True is required so inverse_transform() can be
    # used later to map projections back to pixel space.
    kpca = KernelPCA(n_components=n_comps, kernel='rbf', fit_inverse_transform=True, gamma=15)
    # fit() returns the estimator itself; fit_transform() returns the data
    # projected onto the kernel components, which is what X_kpca should hold.
    X_kpca = kpca.fit_transform(x)
    kpcas.append((x, kpca, X_kpca))

In [None]:
# For every class, project one randomly chosen image with that class's
# kernel PCA model, reconstruct it, and display the reconstruction.
f, axes = plt.subplots(1, numlabels)
for label in range(numlabels):
    # plt.subplots returns a bare Axes (not an array) when there is one panel.
    axis = axes[label] if numlabels > 1 else axes
    class_pixels, class_kpca = kpcas[label][0], kpcas[label][1]
    # Pick a random row by index so the stored pixel matrix is not reordered.
    sample = class_pixels[np.random.randint(len(class_pixels))]
    # Compute the reconstruction here; otherwise the stale `recd` left over
    # from the linear-PCA cell would be drawn in every panel.
    recd = class_kpca.inverse_transform(class_kpca.transform(sample.reshape(1, -1)))
    axis.axis('off')
    axis.imshow(recd.reshape((32,32)), cmap=plt.cm.gray)

## DAISY

In [None]:
def apply_daisy(row):
    """Return the flattened DAISY descriptors of one image row.

    `row` is indexed positionally: element 2 is taken as the flat grayscale
    image and reshaped to 32x32 before the descriptors are extracted.
    The result is a 1-D array containing every descriptor value.
    """
    gray_image = row[2].reshape((32,32))
    descriptors = daisy(
        gray_image,
        step=10,
        radius=10,
        rings=2,
        histograms=6,
        orientations=8,
        visualize=False,
    )
    return descriptors.reshape((-1))

In [None]:
# Compute the flattened DAISY feature vector of every image and stack them
# into one 2-D array with one row per image.
daisies = np.stack([apply_daisy(image_row) for image_row in imgs_df.values])

In [None]:
# pick a random image and calculate its distance from the other images according to the DAISY features
# random.randint includes BOTH endpoints, so the upper bound must be the last
# valid index; len(daisies) itself would be out of range.
index1 = randint(0, len(daisies) - 1)
# Distances are computed against one query image at a time because the full
# all-pairs distance matrix does not fit in memory.
dist_matrix = pairwise_distances(daisies, daisies[index1].reshape((1, -1)))
# Exclude the query image itself from the nearest-neighbour search.
# (np.inf is used because the np.infty alias was removed in NumPy 2.0.)
dist_matrix[index1] = np.inf
index2 = np.argmin(dist_matrix)

# display random image and its closest match using the DAISY features
f, axes = plt.subplots(1,2)
axes[0].axis('off')
axes[1].axis('off')
# index1/index2 are row positions in `daisies`, so look the images up by position.
tmp = axes[0].imshow(imgs_df['oneDGray'].iloc[index1].reshape((32,32)), cmap=plt.cm.gray)
tmp = axes[1].imshow(imgs_df['oneDGray'].iloc[index2].reshape((32,32)), cmap=plt.cm.gray)

## Differences between instances within classes

In [None]:
# For each class, compute every image's mean DAISY distance to the images of
# the same class. mean_distances[i] holds one value per image of class i.
mean_distances = []
for i in range(numlabels):
    # Row i of `daisies` belongs to row i of imgs_df, so a boolean label mask
    # selects the already-computed features for this class; there is no need
    # to run DAISY on these images a second time.
    in_class = (imgs_df['label'] == i).values
    cur_daisies = daisies[in_class]
    dist_matrix = pairwise_distances(cur_daisies)
    # Column means of the pairwise matrix: one mean distance per image.
    mean_distances.append(dist_matrix.mean(axis=0))

In [None]:
# Draw one box per class from the per-image mean distances and label the
# x axis with the class names.
plt.figure(figsize=(10,6))
ax = sns.boxplot(data=mean_distances)
first_axes = ax.figure.axes[0]
tmp = first_axes.set_xticklabels(labels.values())