## MNIST Data Preprocessing

In [1]:
import pickle
import gzip
import numpy as np
import matplotlib.pyplot as plt

path = 'mnist.pkl.gz'
f = gzip.open(path, 'rb')
training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
f.close()

x_train, y_train = training_data[0], training_data[1]
print (x_train.shape, y_train.shape)

x_test, y_test = test_data[0], test_data[1]
print (x_test.shape, y_test.shape)


(50000, 784) (50000,)
(10000, 784) (10000,)


## USPS Data Preprocessing

In [2]:
from PIL import Image
import os
import numpy as np

#USPS data preprocessing
USPSMat  = []
USPSTar  = []
curPath  = 'USPSdata/Numerals'
savedImg = []


for j in range(0,10):
    curFolderPath = curPath + '/' + str(j)
    imgs =  os.listdir(curFolderPath)
    for img in imgs:
        curImg = curFolderPath + '/' + img
        if curImg[-3:] == 'png':
            img = Image.open(curImg,'r')
            img = img.resize((28, 28))
            savedImg = img
            imgdata = (255-np.array(img.getdata()))/255
            USPSMat.append(imgdata)
            USPSTar.append(j)
usps_data = np.array(USPSMat) 
usps_lables = np.array(USPSTar)


## SVM Classifier

In [3]:

import matplotlib.pyplot as plt
from sklearn import svm, metrics, datasets

classifier = svm.SVC(kernel='rbf', C=2,gamma=0.001)
classifier.fit(x_train, y_train )

#testing using mnist
expected = y_test
predicted = classifier.predict(x_test)

np.savetxt("svm.csv", predicted, delimiter=",")
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


Classification report for classifier SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.96      0.99      0.97       980
          1       0.97      0.99      0.98      1135
          2       0.93      0.94      0.94      1032
          3       0.92      0.94      0.93      1010
          4       0.93      0.95      0.94       982
          5       0.93      0.91      0.92       892
          6       0.95      0.97      0.96       958
          7       0.96      0.93      0.94      1028
          8       0.95      0.92      0.93       974
          9       0.94      0.92      0.93      1009

avg / total       0.95      0.95      0.95     10000


Confusion matrix:
[[ 967    0    1    0    0    5    4    1    2    0]
 [   0 1122    2    3    0   

### Testing using USPS

In [4]:
print("testing using USPS dataset")
expected = usps_lables
predicted = classifier.predict(usps_data)

np.savetxt("svm_usps.csv", predicted, delimiter=",")
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


testing using USPS dataset
Classification report for classifier SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.44      0.29      0.35      2000
          1       0.46      0.21      0.29      2000
          2       0.34      0.71      0.46      1999
          3       0.48      0.56      0.52      2000
          4       0.53      0.57      0.55      2000
          5       0.29      0.67      0.40      2000
          6       0.66      0.37      0.48      2000
          7       0.24      0.23      0.23      2000
          8       0.35      0.12      0.18      2000
          9       0.29      0.11      0.16      2000

avg / total       0.41      0.39      0.36     19999


Confusion matrix:
[[ 580    2  424   22  265  253   68   52    6  328]
 [

## Random Forests Classifier

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier()
clf_rf.fit(x_train, y_train)
y_pred_rf = clf_rf.predict(x_test)
np.savetxt("rf.csv", y_pred_rf, delimiter=",")

acc_rf = accuracy_score(y_test, y_pred_rf)
print ("random forest accuracy: ",acc_rf)
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, y_pred_rf))

random forest accuracy:  0.9446
Confusion matrix:
[[ 966    0    1    0    0    5    3    1    3    1]
 [   0 1118    3    3    0    2    5    0    3    1]
 [  10    2  990   11    1    0    4    7    7    0]
 [   2    3   19  940    1   16    2    8   13    6]
 [   1    4    8    1  924    0    6    1    6   31]
 [   8    3    1   38    7  817    5    2    8    3]
 [  16    5    7    0    9    5  913    0    3    0]
 [   2    6   21    5    3    2    0  977    2   10]
 [   9    2   12   12   10   17    6    6  890   10]
 [  10    6    5   13   37    7    0   13    7  911]]


### Testing using USPS

In [6]:
print("testing using USPS dataset")
expected = usps_lables
predicted = clf_rf.predict(usps_data)
np.savetxt("rf_usps.csv", predicted, delimiter=",")

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


testing using USPS dataset
Classification report for classifier SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.33      0.30      0.32      2000
          1       0.30      0.32      0.31      2000
          2       0.27      0.47      0.34      1999
          3       0.39      0.48      0.43      2000
          4       0.41      0.39      0.40      2000
          5       0.27      0.46      0.34      2000
          6       0.50      0.27      0.35      2000
          7       0.20      0.28      0.23      2000
          8       0.31      0.06      0.09      2000
          9       0.18      0.05      0.08      2000

avg / total       0.32      0.31      0.29     19999


Confusion matrix:
[[608  36 273  94 331 170 119 153  11 205]
 [ 87 642 18