In [68]:
%matplotlib inline

import pyprind

from IPython.display import HTML


import numpy as np
from numpy import genfromtxt

import pandas as pd
from pandas import DataFrame

import warnings

import sklearn as skl
from sklearn.preprocessing import normalize
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import scipy.io as sio
from scipy.spatial import distance

from scipy.misc import imread, imsave, imresize
from scipy.io import savemat, loadmat
import matplotlib.pyplot as plt

In [69]:
# Silence every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation / data-conversion
# warnings raised by numpy and sklearn in the cells below — consider narrowing it.
warnings.filterwarnings('ignore')

In [70]:
def split_data(dataset, train_fraction=0.8):
    """Randomly split a 2-D dataset into training/test features and labels.

    The last column of ``dataset`` is treated as the label; every column
    before it is a feature. Rows are shuffled (using NumPy's global random
    state) before the split so that the partition is random. The caller's
    array is NOT modified: a shuffled copy is sliced instead.

    Parameters
    ----------
    dataset : array-like, shape (n_rows, n_features + 1)
        Feature columns followed by one label column.
    train_fraction : float, default 0.8
        Fraction of rows assigned to the training partition. The row count
        is truncated to an integer, i.e. ``int(train_fraction * n_rows)``.

    Returns
    -------
    x_training, x_test : ndarray
        Feature rows of the training and test partitions.
    y_training, y_test : ndarray, shape (n, 1)
        Matching labels, kept as a single-column 2-D array.
    """
    dataset = np.asarray(dataset)
    n_rows = dataset.shape[0]

    # Slice bounds must be integers; a float bound raises TypeError on
    # current NumPy versions.
    train_size = int(train_fraction * n_rows)

    # Index with a random permutation rather than shuffling in place, so the
    # array owned by the caller keeps its original row order.
    shuffled = dataset[np.random.permutation(n_rows)]

    columns = shuffled.shape[1] - 1
    x = shuffled[:, :columns]
    y = shuffled[:, columns:]

    x_training, x_test = x[:train_size, :], x[train_size:, :]
    y_training, y_test = y[:train_size, :], y[train_size:, :]

    return x_training, x_test, y_training, y_test

In [71]:
def eval_RandomForest(data, test_size=.8, folds=10, Num_trees=np.arange(1, 20), crit='gini'):
    """Choose the forest size with the best mean k-fold accuracy.

    The last column of ``data`` is used as the label and the remaining
    columns as features. Folds are drawn over the first
    ``int(len(data) * test_size)`` rows only.

    Parameters
    ----------
    data : ndarray, 2-D
        Feature columns followed by one label column.
    test_size : float
        Fraction of leading rows that take part in cross-validation.
    folds : int
        Number of folds handed to ``KFold``.
    Num_trees : iterable of int
        Candidate ``n_estimators`` values to evaluate.
    crit : str
        Split criterion passed to the random forest.

    Returns
    -------
    tuple
        ``(mean_score, n_trees)`` for the candidate with the highest mean
        fold score.
    """
    # NOTE(review): KFold(n, n_folds=...) is the call style of the
    # sklearn.cross_validation import used by this file — confirm the
    # installed sklearn still provides it.
    label_col = np.shape(data)[1] - 1
    features = data[:, :label_col]
    labels = data[:, label_col]

    n_cv_rows = int(len(data) * test_size)
    fold_iter = KFold(n_cv_rows, n_folds=folds)
    forest = RFC(criterion=crit)

    results = []
    for n_trees in Num_trees:
        # Reuse one estimator object and only change its size per candidate.
        forest.n_estimators = n_trees

        fold_scores = []
        for fit_idx, held_idx in fold_iter:
            forest.fit(features[fit_idx], labels[fit_idx])
            fold_scores.append(forest.score(features[held_idx], labels[held_idx]))

        results.append((np.mean(fold_scores), n_trees))
        print("CV in progress")

    # Column 0 of the argmax is the row holding the highest mean score.
    best = results[np.argmax(results, axis=0)[0]]

    print("Cross-Validation done")

    return best

In [72]:
def test_RandomForest(dataset, train_fract=0.8, num_folds=10, recover_model=False,
                      T_range=np.arange(1, 5, 1), criterion='gini'):
    """Tune, train and score a random forest on ``dataset``.

    The number of trees is chosen by ``eval_RandomForest`` (k-fold CV), the
    data is then split with ``split_data``, a forest of the chosen size is
    fitted on the training partition and its accuracy is measured on the
    test partition.

    Parameters
    ----------
    dataset : ndarray, 2-D
        Feature columns followed by one label column.
    train_fract : float
        Fraction of rows used for cross-validation and for training.
    num_folds : int
        Number of cross-validation folds.
    recover_model : bool
        When True the fitted model is appended to the returned tuple.
    T_range : iterable of int
        Candidate tree counts to cross-validate.
    criterion : str
        Split criterion used both during cross-validation and for the
        final model.

    Returns
    -------
    tuple
        ``(accuracy, test_x, test_y, predicted, T_opt)`` and, when
        ``recover_model`` is True, the fitted model as a sixth element.
        ``test_y`` keeps the (n, 1) column shape produced by ``split_data``.
    """
    # NOTE(review): cross-validation runs on the leading rows of `dataset`
    # and split_data shuffles afterwards, so rows that end up in the test
    # partition may already have influenced the choice of T_opt.
    (_cv_score, T_opt) = eval_RandomForest(dataset, test_size=train_fract, folds=num_folds,
                                           Num_trees=T_range, crit=criterion)

    train_x, test_x, train_y, test_y = split_data(dataset, train_fract)

    # Train the final model with the same criterion that was cross-validated;
    # previously `criterion` was silently dropped here.
    rfc_model = RFC(n_estimators=T_opt, criterion=criterion)

    # split_data yields labels as an (n, 1) column; flatten to the 1-D
    # target vector the classifier expects.
    rfc_model.fit(train_x, np.ravel(train_y))

    print("Training done")

    predicted = rfc_model.predict(test_x)

    # Flatten test_y before comparing so the comparison is element-wise
    # instead of broadcasting (n,) against (n, 1). float() keeps a true
    # fraction regardless of the interpreter's division semantics.
    accuracy = float(np.mean(predicted == np.ravel(test_y)))

    if recover_model:
        return accuracy, test_x, test_y, predicted, T_opt, rfc_model

    return accuracy, test_x, test_y, predicted, T_opt

In [73]:
# Read the sampled letter dataset and keep only its underlying NumPy matrix.
data = pd.read_csv('letter_sampled.csv').values

In [74]:
# Echo the loaded matrix as a quick sanity check of what was read.
data

array([[ 6371,  6372,   835, ...,     0,     0,    14],
       [20998,    -1,  2891, ...,     0,     0,    16],
       [ 4758,    -1,   586, ...,     0,     0,     7],
       ..., 
       [22822, 22823,  3166, ...,     0,     0,    14],
       [20843, 20844,  2840, ...,     0,     0,    21],
       [22954, 22955,  3190, ...,     0,     0,     5]], dtype=int64)

In [75]:
# Cross-validate forest sizes 1..199 with 10 folds, then train and score the best one.
# NOTE(review): 199 candidates x 10 folds is roughly 1990 forest fits — expect a long run.
accuracy, test_x, test_y, predicted, T_opt = test_RandomForest(data, train_fract=0.8, num_folds=10, recover_model=False,
                      T_range=np.arange(1, 200, 1), criterion='gini')

CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in progress
CV in prog

In [76]:
# Fraction of test-partition rows whose predicted label matched the true label.
accuracy

0.7957813998082455

In [77]:
# Labels the fitted forest predicted for the test partition.
predicted

array([15,  1, 18, ...,  5,  9,  1], dtype=int64)

In [78]:
# Number of trees selected by cross-validation.
T_opt

189