In [1]:
import sys
import numpy as np
import pandas as pd

%matplotlib inline

import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# plt.style.use('whitegrid')
sns.set_style('whitegrid')

matplotlib.rcParams.update({'figure.figsize': (10, 6)})
matplotlib.rcParams.update({'font.size': 16})
matplotlib.rcParams.update({'axes.labelsize': 20})
matplotlib.rcParams.update({'xtick.labelsize': 12})
matplotlib.rcParams.update({'ytick.labelsize': 12})
matplotlib.rcParams.update({'font.family': 'Helvetica, Arial, sans-serif'})

%config InlineBackend.figure_format = 'retina'

In [8]:
# Load the diamonds data and keep only the columns used in this analysis.
# Selecting the columns explicitly already leaves out the CSV's 'Unnamed: 0'
# column, so no separate drop is needed (and nothing fails if that column is
# absent). `.copy()` gives an independent frame that is safe to modify below.
data = pd.read_csv('data/diamonds.csv')
data = data[['carat', 'x', 'y', 'z', 'depth', 'table', 'price',
             'clarity', 'color',
             'cut']].copy()

# Clarity/color interaction label ("<clarity>-<color>"). It must be built
# from the original values, before they are replaced by integer codes below.
clarity_color = data['clarity'] + "-" + data['color']

# Replace each categorical column with its integer category code.
for col in ['color', 'clarity', 'cut']:
    data[col] = data[col].astype('category').cat.codes

# Scale every numeric column by its own maximum value.
for col in ['carat', 'x', 'y', 'z', 'depth', 'table', 'price']:
    data[col] = data[col] / data[col].max()

# NOTE: `normalized_data` is a second name for the SAME DataFrame object as
# `data` (not a copy); both refer to the encoded, max-scaled values.
normalized_data = data

# The prepared columns plus one indicator column per clarity-color label.
data_cross = pd.concat([normalized_data, pd.get_dummies(clarity_color)], axis=1)

In [None]:
# Box plot of every column of the prepared frame, drawn on a log-scaled y-axis.
normalized_data.plot.box(logy=True)
plt.show()


In [None]:
# Linear dimensionality reduction: Linear PCA
from sklearn.decomposition import PCA

# [CITE] "04. Dimension Reduction and Images Notebook" by Eric Larson
# Number of principal components (i.e. output dimensions) requested for the
# trial PCA fit; passed to linear_pca() as `n_components`.
test_components_count = 10

def linear_pca(n_components, matrix):
    pca = PCA(n_components=n_components)
    %time pca.fit(matrix)
    return pca


# [CITE] "04. Dimension Reduction and Images Notebook" by Eric Larson

def pca_comp_range(p):
    """Return the zero-based component indices ``0 .. k-1`` of a PCA model.

    ``k`` is the fitted component count ``p.n_components_`` when the model
    exposes it, so the range has the right length even when the model was
    built with a non-integer ``n_components`` (``None``, a variance fraction
    or ``'mle'``). For a model without ``n_components_`` the constructor
    argument ``p.n_components`` is used, as before.

    Parameters
    ----------
    p
        Object with an ``n_components`` attribute and, once fitted,
        an ``n_components_`` attribute.

    Returns
    -------
    numpy.ndarray
        ``np.arange(0, k)``.
    """
    count = getattr(p, 'n_components_', None)
    if count is None:
        # Not fitted (or no fitted count available): use the requested value.
        count = p.n_components
    return np.arange(0, count)

def explained_var(p):
    """Return the ``explained_variance_ratio_`` attribute of ``p`` unchanged."""
    ratios = p.explained_variance_ratio_
    return ratios

def cumulative_explained_var(p):
    """Return the running total of the explained-variance ratios of ``p``.

    Element ``i`` of the result is the sum of the ratios of components
    ``0`` through ``i``.
    """
    per_component = explained_var(p)
    return np.cumsum(per_component)

def find_nearest_x_for_y(x,y,value):
    """Return the element of ``x`` paired with the ``y`` entry closest to ``value``.

    ``x`` and ``y`` are parallel sequences; ``y`` must support elementwise
    subtraction (e.g. a NumPy array). When several ``y`` entries are equally
    close, the first one wins.
    """
    distances = np.abs(y - value)
    closest = distances.argmin()
    return x[closest]

def find_90_percent_dimension_count(p):
    """Return how many leading components are needed to explain 90% of variance.

    The result is a count (1-based), not a component index: it is the smallest
    ``k`` such that the first ``k`` entries of ``p.explained_variance_ratio_``
    sum to at least 0.9.

    Parameters
    ----------
    p
        Object exposing ``explained_variance_ratio_`` (e.g. a fitted PCA).

    Returns
    -------
    int
        The component count. If the available components never reach 0.9 in
        total, the number of available components is returned; if there are
        no components at all, 0 is returned.
    """
    cumulative = np.cumsum(p.explained_variance_ratio_)
    available = int(cumulative.size)
    if available == 0:
        return 0
    # The cumulative sum is non-decreasing, so searchsorted gives the first
    # position whose running total is >= 0.9; +1 converts index to count.
    needed = int(np.searchsorted(cumulative, 0.9, side='left')) + 1
    # Cap at the number of components when the target is never reached.
    return min(needed, available)

def plot_explained_variance(pca):
    """Draw the per-component and cumulative explained variance of ``pca``.

    A new figure is created and shown with ``plt.show()``. It contains:

    * a filled area for each component's own explained-variance ratio,
    * a line for the cumulative ratio,
    * a dashed horizontal reference line at 0.9.

    The x-axis runs from 0 to ``pca.n_components`` and the y-axis from 0 to 1.
    Nothing is returned.
    """
    _, ax = plt.subplots()
    components = pca_comp_range(pca)
    individual = explained_var(pca)
    cumulative = cumulative_explained_var(pca)

    # Filled area: variance ratio explained by each component on its own.
    ax.fill_between(components, 0, individual, color='blue',
                    label='individual explained variance')
    # Line: running total over the components.
    ax.plot(components, cumulative, color='orange', linestyle='-', marker='',
            label='cumulative explained variance')
    # Dashed reference line at the 0.9 level.
    ax.axhline(y=0.9, color='gray', linestyle='--',
               label='90% accuracy target')

    ax.set_xlabel('Principal components')
    ax.set_ylabel('Explained variance ratio')
    ax.set_xlim((0, pca.n_components))
    ax.set_ylim((0, 1))
    ax.set_title('Explained variance of components')
    ax.legend(loc='lower right')
    plt.show()

In [None]:
# Convert the prepared frame to a plain NumPy array for the PCA fit.
data_matrix = np.asarray(normalized_data)

# Fit the trial PCA with the configured number of components.
test_pca = linear_pca(n_components=test_components_count, matrix=data_matrix)

In [None]:
# Show individual vs. cumulative explained variance for the trial PCA fit.
plot_explained_variance(pca=test_pca)