In [123]:
%matplotlib inline

In [124]:
import numpy as np
from numpy import genfromtxt

import pandas as pd

import warnings

import sklearn as skl
from sklearn.preprocessing import normalize
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier

import scipy.io as sio
from scipy.spatial import distance

from scipy.misc import imread, imsave, imresize
from scipy.io import savemat, loadmat
import matplotlib.pyplot as plt

In [125]:
warnings.filterwarnings('ignore')

In [126]:
data = pd.read_csv('letter-recognition.data.txt', delimiter=',', header=0)

data.columns = ['Letter', 'X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
                       'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y', 'Y_edge', 'Corr_Y-edge_X']
                   
new_cols = ['X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
            'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y',
            'Y_edge', 'Corr_Y-edge_X', 'Letter']
                   
train_letter = data.reindex(columns=new_cols)

data.head()

Unnamed: 0,Letter,X_pos,Y_pos,width,height,pixels,X_mu,Y_mu,X_sig,Y_sig,XY_corr,X*X*Y,X*Y*Y,X-edge,Corr_X-edge_Y,Y_edge,Corr_Y-edge_X
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7


In [127]:
data_features = data[['X_pos', 'Y_pos', 'width', 'height', 'pixels', 'X_mu', 'Y_mu',
            'X_sig', 'Y_sig', 'XY_corr', "X*X*Y", "X*Y*Y", 'X-edge', 'Corr_X-edge_Y',
            'Y_edge', 'Corr_Y-edge_X']]

data_classes = data[['Letter']]

features = data_features.astype(float)

norm = normalize(features)

In [128]:
a_ova = data_classes

for i in data_classes:
    if data_classes.loc(i) == "M":
        a_ova[i] = 1
    else:
        a_ova[i] = -1

classes = a_ova.values

#classes

dataset = np.hstack((features, classes))

dataset

array([[  5.,  12.,   3., ...,   4.,  10.,  -1.],
       [  4.,  11.,   6., ...,   3.,   9.,  -1.],
       [  7.,  11.,   6., ...,   2.,   8.,  -1.],
       ..., 
       [  6.,   9.,   6., ...,   2.,   4.,  -1.],
       [  2.,   3.,   4., ...,   5.,   8.,  -1.],
       [  4.,   9.,   6., ...,   2.,   8.,  -1.]])

In [129]:
def split_data(dataset, train_fraction=0.8):
    
    train_size = (train_fraction * np.shape(dataset)[0])
    
    np.random.shuffle(dataset)
    
    columns = np.shape(dataset)[1]-1
    x = dataset[0::,0:columns]
    y = dataset[0::,columns:]
    
    x_training, x_test = x[:train_size,:], x[train_size:,:]
    
    y_training, y_test = y[:train_size, :], y[train_size:, :]
    
    return x_training, x_test, y_training, y_test

In [130]:
x_train, x_test, y_train, y_test = split_data(dataset)

In [131]:
model = RFC(n_estimators=100)

model.fit(x_train, y_train)

predicted = model.predict(x_test)

num_correct = 0
    
for i in range(0, len(predicted)):
        
    if predicted[i] == y_test[i]:
            
        num_correct += 1
    
accuracy = num_correct/len(predicted)
    
results = accuracy
    
print(num_correct)

4000


In [132]:
stuff = data['Letter'].values

np.unique(stuff)

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'], dtype=object)

In [133]:
c = "z"

ord(c)

122

In [134]:
predicted

array([-1., -1., -1., ..., -1., -1., -1.])

In [135]:
i_ova = data_classes

for i in data_classes:
    if data_classes.iloc(i) == "I":
        i_ova[i] = 1
    else:
        i_ova[i] = -1

classes = i_ova.values


dataset = np.hstack((features, classes))

dataset

array([[  5.,  12.,   3., ...,   4.,  10.,  -1.],
       [  4.,  11.,   6., ...,   3.,   9.,  -1.],
       [  7.,  11.,   6., ...,   2.,   8.,  -1.],
       ..., 
       [  6.,   9.,   6., ...,   2.,   4.,  -1.],
       [  2.,   3.,   4., ...,   5.,   8.,  -1.],
       [  4.,   9.,   6., ...,   2.,   8.,  -1.]])

In [136]:
x_train, x_test, y_train, y_test = split_data(dataset)

model = RFC(n_estimators=100)

model.fit(x_train, y_train)

i_predicted = model.predict(x_test)

num_correct = 0
    
for i in range(0, len(predicted)):
        
    if i_predicted[i] == y_test[i]:
            
        num_correct += 1
    
accuracy = num_correct/len(predicted)
    
results = accuracy
    
print(accuracy)

1.0


In [138]:
i_ova

Unnamed: 0,Letter
0,-1
1,-1
2,-1
3,-1
4,-1
5,-1
6,-1
7,-1
8,-1
9,-1
