In [39]:
import pandas as pd
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
%matplotlib inline

In [40]:
data = pd.read_csv('letters.csv')
data.head(10)

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx,class
0,2,4,4,3,2,7,8,2,9,11,7,7,1,8,5,6,Z
1,4,7,5,5,5,5,9,6,4,8,7,9,2,9,7,10,P
2,7,10,8,7,4,8,8,5,10,11,2,8,2,5,5,10,S
3,4,9,5,7,4,7,7,13,1,7,6,8,3,8,0,8,H
4,6,7,8,5,4,7,6,3,7,10,7,9,3,8,3,7,H
5,4,7,5,5,3,4,12,2,5,13,7,5,1,10,1,7,F
6,6,10,8,8,4,7,8,2,5,10,7,8,5,8,1,8,N
7,1,0,2,0,1,6,10,7,2,7,5,8,2,7,4,9,R
8,5,9,7,6,7,7,7,2,4,9,8,9,7,6,2,8,M
9,1,0,2,1,1,5,7,8,6,7,6,6,2,8,3,8,D


In [41]:
data.dtypes

x-box     int64
y-box     int64
width     int64
high      int64
onpix     int64
x-bar     int64
y-bar     int64
x2bar     int64
y2bar     int64
xybar     int64
x2ybr     int64
xy2br     int64
x-ege     int64
xegvy     int64
y-ege     int64
yegvx     int64
class    object
dtype: object

In [42]:
X = data.drop('class', axis=1)
X[0:5]

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,2,4,4,3,2,7,8,2,9,11,7,7,1,8,5,6
1,4,7,5,5,5,5,9,6,4,8,7,9,2,9,7,10
2,7,10,8,7,4,8,8,5,10,11,2,8,2,5,5,10
3,4,9,5,7,4,7,7,13,1,7,6,8,3,8,0,8
4,6,7,8,5,4,7,6,3,7,10,7,9,3,8,3,7


In [43]:
y = data['class']
y[0:5]

0    Z
1    P
2    S
3    H
4    H
Name: class, dtype: object

In [44]:
y.value_counts()

class
U    813
D    805
P    803
T    796
M    792
A    789
X    787
Y    786
N    783
Q    783
F    775
G    773
E    768
B    766
V    764
L    761
R    758
I    755
O    753
W    752
S    748
J    747
K    739
C    736
H    734
Z    734
Name: count, dtype: int64

In [45]:
le_class = preprocessing.LabelEncoder()
le_class.fit(['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z'])
ynew = le_class.transform(y)
ynew

array([25, 15, 18, ..., 14, 11, 16])

In [46]:
X_train, X_test, y_train, y_test = train_test_split( X, ynew, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (16000, 16) (16000,)
Test set: (4000, 16) (4000,)


In [47]:
from sklearn import svm
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)

In [48]:
yhat = clf.predict(X_test)
yhat[0:5]

array([20, 11, 13,  4, 24])

In [49]:
from sklearn.metrics import classification_report, confusion_matrix
import itertools

In [50]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [52]:
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, yhat, labels=[2,4])
np.set_printoptions(precision=2)

print (classification_report(y_test, yhat))


              precision    recall  f1-score   support

           0       0.97      0.96      0.96       155
           1       0.82      0.95      0.88       147
           2       0.99      0.94      0.96       157
           3       0.86      0.94      0.90       159
           4       0.86      0.91      0.89       152
           5       0.90      0.95      0.92       182
           6       0.81      0.91      0.85       159
           7       0.90      0.80      0.85       141
           8       0.95      0.87      0.91       125
           9       0.93      0.96      0.94       143
          10       0.95      0.95      0.95       148
          11       0.98      0.93      0.96       183
          12       0.94      0.92      0.93       166
          13       0.97      0.91      0.94       163
          14       0.86      0.89      0.88       161
          15       0.99      0.89      0.94       154
          16       0.92      0.94      0.93       156
          17       0.86    

In [53]:
from sklearn.metrics import f1_score
f1_score(y_test, yhat, average='weighted') 

0.9275414147341398