In [2]:
%matplotlib inline

In [3]:
import pyprind

from IPython.display import HTML


import numpy as np
from numpy import genfromtxt

import pandas as pd
from pandas import DataFrame

import warnings

import sklearn as skl
from sklearn.preprocessing import normalize
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier

import scipy.io as sio
from scipy.spatial import distance

from scipy.misc import imread, imsave, imresize
from scipy.io import savemat, loadmat
import matplotlib.pyplot as plt

In [4]:
warnings.filterwarnings('ignore')

In [5]:
def split_data(dataset, train_fraction=0.8):
    
    train_size = (train_fraction * np.shape(dataset)[0])
    
    np.random.shuffle(dataset)
    
    columns = np.shape(dataset)[1]-1
    x = dataset[0::,0:columns]
    y = dataset[0::,columns:]
    
    x_training, x_test = x[:train_size,:], x[train_size:,:]
    
    y_training, y_test = y[:train_size, :], y[train_size:, :]
    
    return x_training, x_test, y_training, y_test

In [6]:
def eval_RandomForest(data, test_size=.8, folds=10, Num_trees=np.arange(1, 20), crit='gini'):
    
    columns = np.shape(data)[1]-1
    x = data[0::,0:columns]
    y = data[0::,columns]
    
    
    temp = int(len(data) * test_size)
    
    kf = KFold(temp, n_folds=folds)
    model = RFC(criterion=crit)
    
    
    score_info = []
    
    
    for trees in Num_trees:
        model.n_estimators = trees
        scores = [model.fit(x[train_indices], y[train_indices]).score(x[test_indices],y[test_indices]) for train_indices, test_indices in kf]
        score = np.mean(scores)
        stuff = (score, trees)
        score_info.append(stuff)
        
        
        
    final_index = np.argmax(score_info, axis=0)

    final_max = score_info[final_index[0]]

    return final_max

In [7]:
def test_RandomForest(dataset, train_fract=0.8, num_folds=10, recover_model=False,
                      T_range=np.arange(1, 5, 1), criterion='gini'):


    (train_acc, T_opt) = eval_RandomForest(dataset, test_size=train_fract, folds=num_folds,
                                           Num_trees=T_range, crit=criterion)

    train_x, test_x, train_y, test_y = split_data(dataset, train_fract) 
    
    rfc_model = RFC(n_estimators=T_opt)
    
    rfc_model.fit(train_x, train_y)
    
    predicted = rfc_model.predict(test_x)
    
    num_correct = 0
    
    for i in range(0, len(predicted)):

        if predicted[i] == test_y[i]:

            num_correct += 1

    accuracy = num_correct/len(predicted)

    if recover_model:
        return accuracy, test_x, test_y, predicted, T_opt, knn_model
    
    else:
        return accuracy, test_x, test_y, predicted, T_opt

In [8]:
data = pd.read_csv('letter-recognition.data.txt', delimiter=',', header=0)

data.columns = ['Letter', 'X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
                       'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y', 'Y_edge', 'Corr_Y-edge_X']
                   
new_cols = ['X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
            'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y',
            'Y_edge', 'Corr_Y-edge_X', 'Letter']
                   
train_letter = data.reindex(columns=new_cols)

train_letter.head()

Unnamed: 0,X_pos,Y_pos,width,height,pixels,X_mu,Y_mu,X_sig,Y_sig,XY_corr,X*X*Y,X*Y*Y,X-edge,Corr_X-edge_Y,Y_edge,Corr_Y-edge_X,Letter
0,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10,I
1,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9,D
2,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8,N
3,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10,G
4,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7,S


In [9]:
data_features = data[['X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
            'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y',
            'Y_edge', 'Corr_Y-edge_X']]

data_classes = data[['Letter']]

features = data_features.astype(float)

#norm = normalize(features)

In [10]:
stuff = data['Letter'].values

#np.unique(stuff)

data_classes.size

19999

In [22]:
labels = np.unique(stuff)

labels

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'], dtype=object)

In [81]:
results = []

data_classes = data[['Letter']]

#n = data_classes.size

n2 = len(labels)
#
bar = pyprind.ProgBar(n, monitor=True, title='OVA_swap')

bar2 = pyprind.ProgBar(n2, monitor=True, title='OVA_pred')

for j in labels:
    
    data_classes = data[['Letter']]
    
    i_ova = data_classes

    n = data_classes.size

    for i in range(0, n):
        if data_classes['Letter'][i] == j:
            i_ova['Letter'][i] = 1
        else:
            i_ova['Letter'][i] = -1
        
        if ((i_ova['Letter'][i] != -1) & (i_ova['Letter'][i] != 1)):
            i_ova['Letter'][i] = -1
        
        #bar.update()

    classes = i_ova.values


    dataset = np.hstack((features, classes))
    
    dataset2 = dataset.astype(float)
    
    rfcp_accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(dataset2, train_fract=0.8, recover_model=False,
                                                             num_folds=10, T_range=np.arange(1, 50, 1))
    
    results.append((j, rfcp_accuracy, T_opt))
    
    bar2.update()
    


OVA_swap
0%                          100%
[                              ]OVA_pred
0%                      100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:14:19
[##########################] | ETA: 00:00:00
Total time elapsed: 05:58:25


In [12]:
results = [('A', 0.997, 49),
 ('B', 0.995, 47),
 ('C', 0.9945, 47),
 ('D', 0.9915, 37),
 ('E', 0.99375, 25),
 ('F', 0.9925, 43),
 ('G', 0.98875, 15),
 ('H', 0.99, 19),
 ('I', 0.99625, 21),
 ('J', 0.99525, 41),
 ('K', 0.9935, 31),
 ('L', 0.997, 29),
 ('M', 0.997, 30),
 ('N', 0.995, 37),
 ('O', 0.99125, 35),
 ('P', 0.99325, 44),
 ('Q', 0.995, 33),
 ('R', 0.99075, 45),
 ('S', 0.99425, 21),
 ('T', 0.9945, 27),
 ('U', 0.99525, 21),
 ('V', 0.996, 35),
 ('W', 0.99825, 45),
 ('X', 0.9935, 33),
 ('Y', 0.995, 45),
 ('Z', 0.99575, 45)]

In [13]:
results

[('A', 0.997, 49),
 ('B', 0.995, 47),
 ('C', 0.9945, 47),
 ('D', 0.9915, 37),
 ('E', 0.99375, 25),
 ('F', 0.9925, 43),
 ('G', 0.98875, 15),
 ('H', 0.99, 19),
 ('I', 0.99625, 21),
 ('J', 0.99525, 41),
 ('K', 0.9935, 31),
 ('L', 0.997, 29),
 ('M', 0.997, 30),
 ('N', 0.995, 37),
 ('O', 0.99125, 35),
 ('P', 0.99325, 44),
 ('Q', 0.995, 33),
 ('R', 0.99075, 45),
 ('S', 0.99425, 21),
 ('T', 0.9945, 27),
 ('U', 0.99525, 21),
 ('V', 0.996, 35),
 ('W', 0.99825, 45),
 ('X', 0.9935, 33),
 ('Y', 0.995, 45),
 ('Z', 0.99575, 45)]

In [84]:
#print(bar)

Title: OVA_swap
  Started: 04/10/2016 01:33:21
  Finished: 04/10/2016 01:47:41
  Total time elapsed: 00:14:19
  CPU %: 98.10
  Memory %: 1.41


In [14]:
results

[('A', 0.997, 49),
 ('B', 0.995, 47),
 ('C', 0.9945, 47),
 ('D', 0.9915, 37),
 ('E', 0.99375, 25),
 ('F', 0.9925, 43),
 ('G', 0.98875, 15),
 ('H', 0.99, 19),
 ('I', 0.99625, 21),
 ('J', 0.99525, 41),
 ('K', 0.9935, 31),
 ('L', 0.997, 29),
 ('M', 0.997, 30),
 ('N', 0.995, 37),
 ('O', 0.99125, 35),
 ('P', 0.99325, 44),
 ('Q', 0.995, 33),
 ('R', 0.99075, 45),
 ('S', 0.99425, 21),
 ('T', 0.9945, 27),
 ('U', 0.99525, 21),
 ('V', 0.996, 35),
 ('W', 0.99825, 45),
 ('X', 0.9935, 33),
 ('Y', 0.995, 45),
 ('Z', 0.99575, 45)]

In [15]:
letter_train = pd.read_csv('letter-recognition.data.txt', delimiter=',')

letter_train.columns = ['Letter', 'X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
                       'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y', 'Y_edge', 'Corr_Y-edge_X']

letter_classes = {'Letter': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10,
                             'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19,
                            'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25}}

letter_train.replace(letter_classes, inplace=True)

cols = letter_train.columns.tolist()

new_cols = ['X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
            'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y',
            'Y_edge', 'Corr_Y-edge_X', 'Letter']

train_letter = letter_train.reindex(columns=new_cols)

train_letter.head()

Unnamed: 0,X_pos,Y_pos,width,height,pixels,X_mu,Y_mu,X_sig,Y_sig,XY_corr,X*X*Y,X*Y*Y,X-edge,Corr_X-edge_Y,Y_edge,Corr_Y-edge_X,Letter
0,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10,8
1,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9,3
2,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8,13
3,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10,6
4,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7,18


In [18]:
dataset = train_letter.values.astype(float)

rfcp_accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(dataset, train_fract=0.8, recover_model=False,
                                                             num_folds=10, T_range=np.arange(1, 50, 1))

print(rfcp_accuracy)

0.96175


In [31]:
T_opt

45

In [16]:
results[1][2]

47

In [19]:
train_x, test_x, train_y, test_y = split_data(dataset)

len(train_x)

15999

In [25]:
stuff = data['Letter'].values

labels = np.unique(stuff)

to_save = []

data_classes = data[['Letter']]

#n = data_classes.size

n2 = len(labels)

bar = pyprind.ProgBar(n2, monitor=True, title='OVA_pred')

for j in range(0, n2):
    
    T_opt = results[j][2]
    
    data_classes = data[['Letter']]
    
    i_ova = data_classes

    n = data_classes.size

    for i in range(0, n):
        if data_classes['Letter'][i] == labels[j]:
            i_ova['Letter'][i] = 1
        else:
            i_ova['Letter'][i] = -1
        
        if ((i_ova['Letter'][i] != -1) & (i_ova['Letter'][i] != 1)):
            i_ova['Letter'][i] = -1
        
        

    classes = i_ova.values

    dataset = np.hstack((features, classes))
    
    dataset2 = dataset.astype(float)
    
    train_x, test_x, train_y, test_y = split_data(dataset2)
    
    new_rfc = RFC(n_estimators=T_opt, criterion='entropy')

    new_rfc.fit(train_x, train_y)

    rfc_test_predicted = new_rfc.predict(test_x)

    num_correct = 0

    for l in range(0, len(rfc_test_predicted)):
        
        if rfc_test_predicted[l] == test_y[l]:
            
                num_correct += 1
    
    final_accuracy = num_correct/len(rfc_test_predicted)
    
    to_save.append((labels[j], T_opt, final_accuracy))
    
    bar.update()

OVA_pred
0%                      100%
[##########################] | ETA: 00:00:00
Total time elapsed: 06:57:27


In [26]:
to_save

[('A', 49, 0.99925),
 ('B', 47, 0.99425),
 ('C', 47, 0.99575),
 ('D', 37, 0.993),
 ('E', 25, 0.99475),
 ('F', 43, 0.992),
 ('G', 15, 0.99325),
 ('H', 19, 0.9915),
 ('I', 21, 0.99675),
 ('J', 41, 0.9945),
 ('K', 31, 0.99325),
 ('L', 29, 0.99675),
 ('M', 30, 0.996),
 ('N', 37, 0.99675),
 ('O', 35, 0.99325),
 ('P', 44, 0.99425),
 ('Q', 33, 0.99525),
 ('R', 45, 0.99125),
 ('S', 21, 0.995),
 ('T', 27, 0.99575),
 ('U', 21, 0.997),
 ('V', 35, 0.9965),
 ('W', 45, 0.99775),
 ('X', 33, 0.99725),
 ('Y', 45, 0.99625),
 ('Z', 45, 0.99725)]

In [30]:
acc = len(to_save)

samples = len(test_y)

err = 0

for i in range(0, acc):
    
    err += (samples - (to_save[i][2] * samples))
    
err

502.0

In [54]:
to_save

[('A', 49, 0.99925),
 ('B', 47, 0.99425),
 ('C', 47, 0.99575),
 ('D', 37, 0.993),
 ('E', 25, 0.99475),
 ('F', 43, 0.992),
 ('G', 15, 0.99325),
 ('H', 19, 0.9915),
 ('I', 21, 0.99675),
 ('J', 41, 0.9945),
 ('K', 31, 0.99325),
 ('L', 29, 0.99675),
 ('M', 30, 0.996),
 ('N', 37, 0.99675),
 ('O', 35, 0.99325),
 ('P', 44, 0.99425),
 ('Q', 33, 0.99525),
 ('R', 45, 0.99125),
 ('S', 21, 0.995),
 ('T', 27, 0.99575),
 ('U', 21, 0.997),
 ('V', 35, 0.9965),
 ('W', 45, 0.99775),
 ('X', 33, 0.99725),
 ('Y', 45, 0.99625),
 ('Z', 45, 0.99725)]

In [51]:
toSave_T = np.transpose(to_save)

tree_num = toSave_T[1].astype(float)

np.mean(tree_num)

34.615384615384613

In [56]:
scores = toSave_T[2].astype(float)

np.max(scores)

0.99924999999999997