In [1]:
%matplotlib inline

import pyprind

from IPython.display import HTML


import numpy as np
from numpy import genfromtxt

import pandas as pd
from pandas import DataFrame

import warnings

import sklearn as skl
from sklearn.preprocessing import normalize
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import scipy.io as sio
from scipy.spatial import distance

from scipy.misc import imread, imsave, imresize
from scipy.io import savemat, loadmat
import matplotlib.pyplot as plt

In [2]:
def split_data(dataset, train_fraction=0.8):
    """Randomly split a 2-D dataset into training and test parts.

    The last column of ``dataset`` is treated as the label and every other
    column as a feature.

    Parameters
    ----------
    dataset : 2-D array-like
        One sample per row. The caller's array is NOT reordered; a shuffled
        copy is split instead.
    train_fraction : float, optional
        Fraction of rows assigned to the training part. The resulting row
        count is truncated to an integer.

    Returns
    -------
    x_training, x_test, y_training, y_test : numpy.ndarray
        Feature blocks have shape ``(rows, n_columns - 1)``; label blocks are
        kept 2-D with shape ``(rows, 1)``.
    """
    dataset = np.asarray(dataset)

    # Slice bounds must be integers; a float bound raises TypeError on
    # current NumPy versions.
    train_size = int(train_fraction * np.shape(dataset)[0])

    # permutation() shuffles a copy along the first axis, so the array handed
    # in by the caller keeps its original row order.
    shuffled = np.random.permutation(dataset)

    columns = np.shape(shuffled)[1] - 1
    x = shuffled[:, :columns]
    y = shuffled[:, columns:]

    x_training, x_test = x[:train_size, :], x[train_size:, :]
    y_training, y_test = y[:train_size, :], y[train_size:, :]

    return x_training, x_test, y_training, y_test

In [3]:
def eval_RandomForest(data, test_size=.8, folds=10, Num_trees=np.arange(1, 20, 1), crit='gini'):
    """Cross-validate a random forest over several forest sizes.

    The last column of ``data`` is the label; the remaining columns are
    features. Only the leading ``int(len(data) * test_size)`` rows take part
    in cross-validation (despite its name, ``test_size`` is the fraction of
    rows that is cross-validated). Rows are used in their existing order.

    Relies on the legacy ``KFold(n, n_folds=...)`` API imported at the top of
    the notebook from ``sklearn.cross_validation``.

    Parameters
    ----------
    data : 2-D numpy.ndarray
    test_size : float
        Fraction of leading rows handed to the fold splitter.
    folds : int
        Number of folds.
    Num_trees : iterable of int
        Candidate values for ``n_estimators``.
    crit : str
        Split criterion passed to the classifier.

    Returns
    -------
    tuple
        ``(mean fold accuracy, n_estimators)`` for the candidate with the
        highest mean accuracy; the first such candidate wins ties.
    """
    label_col = np.shape(data)[1] - 1
    features = data[:, :label_col]
    labels = data[:, label_col]

    n_cv_rows = int(len(data) * test_size)

    splitter = KFold(n_cv_rows, n_folds=folds)
    forest = RFC(criterion=crit)

    results = []
    for n_trees in Num_trees:
        # The same estimator object is reused; only its size changes.
        forest.n_estimators = n_trees

        fold_scores = []
        for fit_idx, holdout_idx in splitter:
            forest.fit(features[fit_idx], labels[fit_idx])
            fold_scores.append(forest.score(features[holdout_idx], labels[holdout_idx]))

        results.append((np.mean(fold_scores), n_trees))

    # Column 0 of the (score, trees) table holds the scores; argmax along
    # axis 0 gives the row of the best score in its first entry.
    best_row = np.argmax(results, axis=0)[0]
    best = results[best_row]

    print("Cross-Validation done")

    return best

In [4]:
def test_RandomForest(dataset, train_fract=0.8, num_folds=10, recover_model=False,
                      T_range=np.arange(1, 5, 1), criterion='gini'):
    """Choose a forest size by cross-validation, then train and score a forest.

    Parameters
    ----------
    dataset : 2-D numpy.ndarray
        One sample per row; the last column is the label.
    train_fract : float
        Fraction of rows used for cross-validation and for the training part
        of the final split.
    num_folds : int
        Number of cross-validation folds.
    recover_model : bool
        When True the fitted classifier is appended to the returned tuple.
    T_range : iterable of int
        Candidate numbers of trees.
    criterion : str
        Split criterion, used both during cross-validation and for the final
        model.

    Returns
    -------
    tuple
        ``(accuracy, test_x, test_y, predicted, T_opt)`` and, when
        ``recover_model`` is True, the fitted model as a sixth element.
        ``test_y`` is returned exactly as produced by ``split_data``.

    Raises
    ------
    ZeroDivisionError
        If the split leaves no test rows.
    """
    # NOTE(review): cross-validation runs on the leading rows of ``dataset``
    # before split_data() shuffles, so rows that influenced the choice of
    # T_opt can end up in the held-out test part.
    # The CV score itself is not needed here, only the selected tree count.
    _, T_opt = eval_RandomForest(dataset, test_size=train_fract, folds=num_folds,
                                 Num_trees=T_range, crit=criterion)

    train_x, test_x, train_y, test_y = split_data(dataset, train_fract)

    # Train the final model with the same criterion that was cross-validated;
    # previously the default criterion was always used here.
    rfc_model = RFC(n_estimators=T_opt, criterion=criterion)

    # split_data() yields labels as an (n, 1) column; the classifier expects
    # a flat label vector.
    rfc_model.fit(train_x, np.ravel(train_y))

    print("Training done")

    predicted = rfc_model.predict(test_x)

    # Vectorised count of matching predictions.
    num_correct = int(np.sum(predicted == np.ravel(test_y)))
    accuracy = num_correct / float(len(predicted))

    if recover_model:
        return accuracy, test_x, test_y, predicted, T_opt, rfc_model

    return accuracy, test_x, test_y, predicted, T_opt

In [5]:
# Load the three feature tables. Paths are relative to the notebook's working
# directory. Downstream functions treat the LAST column of each table as the
# class label.

# Just PCA on the faces
Sample_1 = pd.read_csv("pca_components.csv")

# Just RPCA on the faces
Sample_2 = pd.read_csv("rpca_components.csv")

# Narrow the RPCA table: drop the columns at positions 164..1022 (slice end is
# exclusive), keeping the first 164 columns plus everything from position 1023
# onward.
# NOTE(review): presumably the trailing column(s) hold the label and the goal
# is to match the width of the PCA tables - confirm against the CSV layout.
Sample_2_165_comp = Sample_2.drop(Sample_2.columns[164:1023], axis=1)

# PCA done on the A matrix produced by RPCA
Sample_3 = pd.read_csv("pca_on_rpca.csv")

#Sample_2_165_comp.shape
Sample_3.shape

(164, 166)

In [6]:
# Build reduced tables that keep the first 22 columns of each source table.
# Columns at positions 22..164 are dropped (slice end is exclusive); anything
# at position 165 or later is kept.
# NOTE(review): presumably the surviving trailing column is the class label -
# confirm against the CSV layout.
Sample_1_22_comp = Sample_1.drop(Sample_1.columns[22:165], axis=1)

#Sample_1_22_comp.head

Sample_2_22_comp = Sample_2_165_comp.drop(Sample_2_165_comp.columns[22:165], axis=1)

Sample_3_22_comp = Sample_3.drop(Sample_3.columns[22:165], axis=1)

# Displayed as the cell result to sanity-check the reduced width.
Sample_2_22_comp.shape

(164, 23)

In [7]:
# Build reduced tables that keep the first 5 columns of each source table.
# Columns at positions 5..164 are dropped (slice end is exclusive); anything
# at position 165 or later is kept.
# NOTE(review): presumably the surviving trailing column is the class label -
# confirm against the CSV layout.
Sample_1_5_comp = Sample_1.drop(Sample_1.columns[5:165], axis=1)

#Sample_1_22_comp.head

Sample_2_5_comp = Sample_2_165_comp.drop(Sample_2_165_comp.columns[5:165], axis=1)

Sample_3_5_comp = Sample_3.drop(Sample_3.columns[5:165], axis=1)

# Displayed as the cell result to sanity-check the reduced width.
Sample_3_5_comp.shape

(164, 6)

In [8]:
# Random forest on the full RPCA table (Sample_2), cross-validating forest
# sizes 1..164 with 10 folds and an 80% training fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_2.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.7272727272727273
28




In [9]:
# Random forest on the full PCA table (Sample_1), cross-validating forest
# sizes 1..164 with 10 folds and an 80% training fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_1.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.3333333333333333
162




In [10]:
# Random forest on the narrowed RPCA table (Sample_2_165_comp),
# cross-validating forest sizes 1..164 with 10 folds and an 80% training
# fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_2_165_comp.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.42424242424242425
27




In [11]:
# Random forest on the PCA-on-RPCA table (Sample_3), cross-validating forest
# sizes 1..164 with 10 folds and an 80% training fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_3.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.3939393939393939
76




In [18]:
# Random forest on the 22-column PCA table (Sample_1_22_comp),
# cross-validating forest sizes 1..164 with 10 folds and an 80% training
# fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_1_22_comp.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.696969696969697
157




In [13]:
# Random forest on the 22-column RPCA table (Sample_2_22_comp). Unlike the
# neighbouring cells, this one cross-validates forest sizes 1..199.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_2_22_comp.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 200),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.42424242424242425
7




In [14]:
# Random forest on the 22-column PCA-on-RPCA table (Sample_3_22_comp),
# cross-validating forest sizes 1..164 with 10 folds and an 80% training
# fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_3_22_comp.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.696969696969697
142




In [15]:
# Random forest on the 5-column PCA table (Sample_1_5_comp), cross-validating
# forest sizes 1..164 with 10 folds and an 80% training fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_1_5_comp.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.36363636363636365
67




In [16]:
# Random forest on the 5-column RPCA table (Sample_2_5_comp), cross-validating
# forest sizes 1..164 with 10 folds and an 80% training fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_2_5_comp.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.3333333333333333
137




In [17]:
# Random forest on the 5-column PCA-on-RPCA table (Sample_3_5_comp),
# cross-validating forest sizes 1..164 with 10 folds and an 80% training
# fraction.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(
    Sample_3_5_comp.values,
    train_fract=0.8,
    num_folds=10,
    recover_model=False,
    T_range=np.arange(1, 165),
    criterion='gini'
)

print(accuracy)
print(T_opt)

Cross-Validation done
Training done
0.6363636363636364
31


