### KNN Model for 40x40 pixel Dataset

In [102]:
import numpy as np #To convert into numpy arrays
from sklearn.neighbors import KNeighborsClassifier #to build KNN
from sklearn.metrics import accuracy_score,classification_report #To calculate accuracy,precision and recall
from pandas_ml import ConfusionMatrix #To generate Confusion matrix
import os #To change root directory
import pandas as pd #To import the dataset
from sklearn.model_selection import cross_val_score #to calculate error

In [103]:
#Importing the dataset
os.chdir("C:/Users/Sanjeev Varma/Desktop/Capstone/Signature Recognition/Data")
label_train = pd.read_csv('train_numbers.csv')
y_train = label_train['Label']
label_test = pd.read_csv('test_numbers.csv')
y_test = label_test['Label']

In [23]:
X_train = label_train.iloc[:,1:1601]
X_test = label_test.iloc[:,1:1601]

In [24]:
#Creating KNN Classifier
clf = KNeighborsClassifier(n_neighbors=5,algorithm='auto',n_jobs=10)
clf.fit(X_train,y_train)

print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(X_test)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)


ConfusionMatrix(y_test, y_pred)



Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        420    17     0    17   14      4    4     33    0     4      513
Five           2   449     2     5   20      0    8     21    0     6      513
Four           3     0   458    23   17      0    2      0    2     8      513
Nine           1     4    23   439   16     25    0      1    2     2      513
One            0     0     1     1  511      0    0      0    0     0      513
Seven          1     0     1     6   22    482    0      0    1     0      513
Six            0     7     1     0    7      0  490      0    0     8      513
Three         10     9     1     0   14      2    0    473    3     1      513
Two           10     1     0     2   10     21    5     12  441    11      513
Zero           0     2     0     0    5      0    3      3    0   500      513
__all__      447   489   487   493  636    534  512 

In [61]:
#Calculate Precision,Recall
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.93      0.82      0.88       513
       Five       0.92      0.89      0.91       513
       Four       0.91      0.91      0.91       513
       Nine       0.89      0.84      0.86       513
        One       0.83      0.99      0.91       513
      Seven       0.90      0.94      0.92       513
        Six       0.96      0.96      0.96       513
      Three       0.87      0.90      0.89       513
        Two       0.98      0.86      0.92       513
       Zero       0.92      0.97      0.95       513

avg / total       0.91      0.91      0.91      5130



#### Calculating optimum k-value

In [25]:
#Finding the best K-Value
klist = list(range(2,10))
cv_scores = []
for k in klist:
    knn = KNeighborsClassifier(n_neighbors=k,algorithm='auto',n_jobs=10)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = klist[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

The optimal number of neighbors is 4


In [73]:
clf = KNeighborsClassifier(n_neighbors=optimal_k,algorithm='auto',n_jobs=10)
clf.fit(X_train,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(X_test)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)



Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        423    14     1    14   13      3    5     32    3     5      513
Five           4   458     2     5   15      1    5     16    0     7      513
Four           3     0   467    19   12      2    3      0    2     5      513
Nine           1     4    33   430   12     27    0      2    2     2      513
One            0     0     2     1  510      0    0      0    0     0      513
Seven          1     0     2     7   19    481    1      0    1     1      513
Six            1     7     1     0    6      0  491      0    0     7      513
Three         11    12     2     4   13      4    0    463    1     3      513
Two            9     1     1     2    8     19    5     15  442    11      513
Zero           0     2     0     0    5      0    3      3    0   500      513
__all__      453   498   511   482  613    537  513 

In [74]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.93      0.82      0.88       513
       Five       0.92      0.89      0.91       513
       Four       0.91      0.91      0.91       513
       Nine       0.89      0.84      0.86       513
        One       0.83      0.99      0.91       513
      Seven       0.90      0.94      0.92       513
        Six       0.96      0.96      0.96       513
      Three       0.87      0.90      0.89       513
        Two       0.98      0.86      0.92       513
       Zero       0.92      0.97      0.95       513

avg / total       0.91      0.91      0.91      5130



### Implementing PCA

In [27]:
from sklearn.decomposition import PCA

In [28]:
import numpy as np
from sklearn.decomposition import PCA

train = np.array(X_train)
n_components = 1600
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)
s = pca.explained_variance_ratio_
sum=0.0
comp=0

for _ in s:
    sum += _
    comp += 1
    if(sum>=0.99):
        break
print(comp)

260


In [75]:
train = np.array(X_train)
test = np.array(X_test)
n_components = comp

#fitting pca
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)

xtrain = pca.transform(train)
xtest = pca.transform(test)

clf = KNeighborsClassifier(n_neighbors=2,algorithm='auto',n_jobs=10)
clf.fit(xtrain,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(xtest)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)



Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        442    17     5    15    6      2    5     18    0     3      513
Five           9   467     3     2   11      0    5     14    0     2      513
Four           4     0   474    15   10      1    4      0    2     3      513
Nine           2     1    47   434   10     18    0      0    0     1      513
One            0     0     0     4  509      0    0      0    0     0      513
Seven          0     0     4    12   16    477    0      3    1     0      513
Six            1     9     2     0    6      0  491      0    0     4      513
Three         10    26     1     7    3      8    2    453    1     2      513
Two           18     2     1     1    5     27    5     22  428     4      513
Zero           0     1     0     1    3      0    5      5    2   496      513
__all__      486   523   537   491  579    533  517 

In [76]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.91      0.86      0.88       513
       Five       0.89      0.91      0.90       513
       Four       0.88      0.92      0.90       513
       Nine       0.88      0.85      0.86       513
        One       0.88      0.99      0.93       513
      Seven       0.89      0.93      0.91       513
        Six       0.95      0.96      0.95       513
      Three       0.88      0.88      0.88       513
        Two       0.99      0.83      0.90       513
       Zero       0.96      0.97      0.96       513

avg / total       0.91      0.91      0.91      5130



In [31]:
klist = list(range(2,10))
cv_scores = []
for k in klist:
    knn = KNeighborsClassifier(n_neighbors=k,algorithm='auto',n_jobs=10)
    scores = cross_val_score(knn, xtrain, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = klist[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

The optimal number of neighbors is 3


In [115]:
import numpy as np
from sklearn.decomposition import PCA

os.chdir("C:/Users/Sanjeev Varma/Desktop/Capstone/Signature Recognition/Data")
label_train = pd.read_csv('train_numbers.csv')
y_train = label_train['Label']
label_test = pd.read_csv('test_numbers.csv')
y_test = label_test['Label']

X_train = label_train.iloc[:,1:1601]
X_test = label_test.iloc[:,1:1601]

train = np.array(X_train)
test = np.array(X_test)
#Number of components of PCA
n_components = 1600
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)
s = pca.explained_variance_ratio_
sum=0.0
comp=0
#No.of Components required to explain 99% variance
for _ in s:
    sum += _
    comp += 1
    if(sum>=0.99):
        break
n_components = comp
#fitting pca
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)

xtrain = pca.transform(train)
xtest = pca.transform(test)

clf = KNeighborsClassifier(n_neighbors=optimal_k,algorithm='auto',n_jobs=10)
clf.fit(xtrain,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(xtest)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)



Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        423    14     1    14   12      3    4     32    3     7      513
Five           4   458     1     6   17      0    7     14    0     6      513
Four           3     0   468    18   11      2    3      1    2     5      513
Nine           1     4    33   434   10     24    0      2    2     3      513
One            0     0     1     2  510      0    0      0    0     0      513
Seven          1     0     1     9   18    480    1      0    1     2      513
Six            2     7     0     0    5      0  491      0    0     8      513
Three         11    13     0     3   11      4    0    468    1     2      513
Two           11     3     3     1    8     19    5     15  439     9      513
Zero           0     0     0     0    5      0    4      3    0   501      513
__all__      456   499   508   487  607    532  515 

In [116]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.93      0.82      0.87       513
       Five       0.92      0.89      0.91       513
       Four       0.92      0.91      0.92       513
       Nine       0.89      0.85      0.87       513
        One       0.84      0.99      0.91       513
      Seven       0.90      0.94      0.92       513
        Six       0.95      0.96      0.96       513
      Three       0.87      0.91      0.89       513
        Two       0.98      0.86      0.91       513
       Zero       0.92      0.98      0.95       513

avg / total       0.91      0.91      0.91      5130



### Creating for 100x100 dataset

In [33]:
os.chdir("C:/Users/Sanjeev Varma/Desktop/Capstone/Signature Recognition/Data")
label_train = pd.read_csv('train100.csv')
y_train = label_train['Label']
label_test = pd.read_csv('test100.csv')
y_test = label_test['Label']

In [34]:
X_train = label_train.iloc[:,1:10001]
X_test = label_test.iloc[:,1:10001]

In [79]:
clf = KNeighborsClassifier(n_neighbors=5,algorithm='auto',n_jobs=10)
clf.fit(X_train,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(X_test)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)



Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        420    17     0    17   14      4    4     33    0     4      513
Five           2   449     2     5   20      0    8     21    0     6      513
Four           3     0   458    23   17      0    2      0    2     8      513
Nine           1     4    23   439   16     25    0      1    2     2      513
One            0     0     1     1  511      0    0      0    0     0      513
Seven          1     0     1     6   22    482    0      0    1     0      513
Six            0     7     1     0    7      0  490      0    0     8      513
Three         10     9     1     0   14      2    0    473    3     1      513
Two           10     1     0     2   10     21    5     12  441    11      513
Zero           0     2     0     0    5      0    3      3    0   500      513
__all__      447   489   487   493  636    534  512 

In [80]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.94      0.82      0.88       513
       Five       0.92      0.88      0.90       513
       Four       0.94      0.89      0.92       513
       Nine       0.89      0.86      0.87       513
        One       0.80      1.00      0.89       513
      Seven       0.90      0.94      0.92       513
        Six       0.96      0.96      0.96       513
      Three       0.87      0.92      0.90       513
        Two       0.98      0.86      0.92       513
       Zero       0.93      0.97      0.95       513

avg / total       0.91      0.91      0.91      5130



In [36]:
klist = list(range(2,10))
cv_scores = []
for k in klist:
    knn = KNeighborsClassifier(n_neighbors=k,algorithm='auto',n_jobs=10)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = klist[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

The optimal number of neighbors is 3


In [109]:
os.chdir("C:/Users/Sanjeev Varma/Desktop/Capstone/Signature Recognition/Data")
label_train = pd.read_csv('train100.csv')
y_train = label_train['Label']
label_test = pd.read_csv('test100.csv')
y_test = label_test['Label']
X_train = label_train.iloc[:,1:10001]
X_test = label_test.iloc[:,1:10001]

clf = KNeighborsClassifier(n_neighbors=optimal_k,algorithm='auto',n_jobs=10)
clf.fit(X_train,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(X_test)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)


Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.89

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        402    14     2    19   21      7    5     35    3     5      513
Five           4   447     3     6   23      1    4     15    1     9      513
Four           2     0   447    19   32      2    1      0    2     8      513
Nine           0     4    38   419   20     27    0      3    1     1      513
One            0     0     2     1  510      0    0      0    0     0      513
Seven          1     0     1     7   29    473    0      0    2     0      513
Six            1     8     2     0   13      0  476      0    1    12      513
Three         11    14     4     3   20      8    0    451    0     2      513
Two            8     2     2     3   16     30    4     14  425     9      513
Zero           0     0     0     0    5      0    4      3    0   501      513
__all__      429   489   501   477  689    548  494 

In [110]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.94      0.78      0.85       513
       Five       0.91      0.87      0.89       513
       Four       0.89      0.87      0.88       513
       Nine       0.88      0.82      0.85       513
        One       0.74      0.99      0.85       513
      Seven       0.86      0.92      0.89       513
        Six       0.96      0.93      0.95       513
      Three       0.87      0.88      0.87       513
        Two       0.98      0.83      0.90       513
       Zero       0.92      0.98      0.95       513

avg / total       0.89      0.89      0.89      5130



### Implementing PCA

In [117]:
import numpy as np
from sklearn.decomposition import PCA

os.chdir("C:/Users/Sanjeev Varma/Desktop/Capstone/Signature Recognition/Data")
label_train = pd.read_csv('train100.csv')
y_train = label_train['Label']
label_test = pd.read_csv('test100.csv')
y_test = label_test['Label']
X_train = label_train.iloc[:,1:10001]
X_test = label_test.iloc[:,1:10001]

train = np.array(X_train)
test = np.array(X_test)

n_components = 1600
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)
s = pca.explained_variance_ratio_
sum=0.0
comp=0

for _ in s:
    sum += _
    comp += 1
    if(sum>=0.99):
        break
n_components = comp
#fitting pca
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)

xtrain = pca.transform(train)
xtest = pca.transform(test)

clf = KNeighborsClassifier(n_neighbors=2,algorithm='auto',n_jobs=10)
clf.fit(xtrain,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(xtest)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)


Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.89

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        430    15     6    15   12      5    8     18    1     3      513
Five           6   465     2     3   19      0    5     10    0     3      513
Four           3     0   469    12   18      1    5      0    2     3      513
Nine           2     1    62   419   11     17    0      0    0     1      513
One            0     0     2     1  510      0    0      0    0     0      513
Seven          1     0     4    10   27    469    0      1    1     0      513
Six            1     9     3     0   12      0  483      1    0     4      513
Three         10    28     3     7   13     10    1    439    1     1      513
Two           16     3     2     2   16     33    4     21  410     6      513
Zero           0     0     1     0    5      1    7      4    4   491      513
__all__      469   521   554   469  643    536  513 

In [118]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.92      0.84      0.88       513
       Five       0.89      0.91      0.90       513
       Four       0.85      0.91      0.88       513
       Nine       0.89      0.82      0.85       513
        One       0.79      0.99      0.88       513
      Seven       0.88      0.91      0.89       513
        Six       0.94      0.94      0.94       513
      Three       0.89      0.86      0.87       513
        Two       0.98      0.80      0.88       513
       Zero       0.96      0.96      0.96       513

avg / total       0.90      0.89      0.89      5130



In [85]:
klist = list(range(2,10))
cv_scores = []
for k in klist:
    knn = KNeighborsClassifier(n_neighbors=k,algorithm='auto',n_jobs=10)
    scores = cross_val_score(knn, xtrain, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = klist[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

The optimal number of neighbors is 4


In [119]:
import numpy as np
from sklearn.decomposition import PCA

os.chdir("C:/Users/Sanjeev Varma/Desktop/Capstone/Signature Recognition/Data")
label_train = pd.read_csv('train100.csv')
y_train = label_train['Label']
label_test = pd.read_csv('test100.csv')
y_test = label_test['Label']
X_train = label_train.iloc[:,1:10001]
X_test = label_test.iloc[:,1:10001]

train = np.array(X_train)
test = np.array(X_test)

n_components = 1600
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)
s = pca.explained_variance_ratio_
sum=0.0
comp=0

for _ in s:
    sum += _
    comp += 1
    if(sum>=0.99):
        break
n_components = comp
#fitting pca
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)

xtrain = pca.transform(train)
xtest = pca.transform(test)

clf = KNeighborsClassifier(n_neighbors=optimal_k,algorithm='auto',n_jobs=10)
clf.fit(xtrain,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(xtest)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)


Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.89

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        406    13     2    19   18      7    4     35    3     6      513
Five           4   449     2     6   23      1    4     14    1     9      513
Four           3     0   452    17   30      2    1      0    1     7      513
Nine           0     4    39   421   19     26    0      2    1     1      513
One            0     0     1     1  511      0    0      0    0     0      513
Seven          1     0     2     7   27    473    0      0    2     1      513
Six            1     7     2     0   10      0  479      0    1    13      513
Three         10    14     4     3   17      9    0    453    0     3      513
Two            8     2     1     3   15     31    3     14  426    10      513
Zero           0     0     0     0    5      0    4      3    0   501      513
__all__      433   489   505   477  675    549  495 

In [120]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.94      0.79      0.86       513
       Five       0.92      0.88      0.90       513
       Four       0.90      0.88      0.89       513
       Nine       0.88      0.82      0.85       513
        One       0.76      1.00      0.86       513
      Seven       0.86      0.92      0.89       513
        Six       0.97      0.93      0.95       513
      Three       0.87      0.88      0.88       513
        Two       0.98      0.83      0.90       513
       Zero       0.91      0.98      0.94       513

avg / total       0.90      0.89      0.89      5130



### Zero reversed

In [41]:
os.chdir("C:/Users/Sanjeev Varma/Desktop/Capstone/Signature Recognition/Data")
label_train = pd.read_csv('trainzero.csv')
y_train = label_train['Label']
label_test = pd.read_csv('testzero.csv')
y_test = label_test['Label']

In [42]:
X_train = label_train.iloc[:,1:1601]
X_test = label_test.iloc[:,1:1601]

In [89]:
clf = KNeighborsClassifier(n_neighbors=5,algorithm='auto',n_jobs=10)
clf.fit(X_train,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(X_test)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)



Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        420    17     0    17   14      4    4     33    0     4      513
Five           2   449     2     5   20      0    8     21    0     6      513
Four           3     0   458    23   17      0    2      0    2     8      513
Nine           1     4    23   439   16     25    0      1    2     2      513
One            0     0     1     1  511      0    0      0    0     0      513
Seven          1     0     1     6   22    482    0      0    1     0      513
Six            0     7     1     0    7      0  490      0    0     8      513
Three         10     9     1     0   14      2    0    473    3     1      513
Two           10     1     0     2   10     21    5     12  441    11      513
Zero           0     2     0     0    5      0    3      3    0   500      513
__all__      447   489   487   493  636    534  512 

In [90]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.94      0.82      0.88       513
       Five       0.92      0.88      0.90       513
       Four       0.94      0.89      0.92       513
       Nine       0.89      0.86      0.87       513
        One       0.80      1.00      0.89       513
      Seven       0.90      0.94      0.92       513
        Six       0.96      0.96      0.96       513
      Three       0.87      0.92      0.90       513
        Two       0.98      0.86      0.92       513
       Zero       0.93      0.97      0.95       513

avg / total       0.91      0.91      0.91      5130



In [44]:
klist = list(range(2,10))
cv_scores = []
for k in klist:
    knn = KNeighborsClassifier(n_neighbors=k,algorithm='auto',n_jobs=10)
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = klist[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

The optimal number of neighbors is 4


In [91]:
clf = KNeighborsClassifier(n_neighbors=optimal_k,algorithm='auto',n_jobs=10)
clf.fit(X_train,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(X_test)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)



Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        423    14     1    14   13      3    5     32    3     5      513
Five           4   458     2     5   15      1    5     16    0     7      513
Four           3     0   467    19   12      2    3      0    2     5      513
Nine           1     4    33   430   12     27    0      2    2     2      513
One            0     0     2     1  510      0    0      0    0     0      513
Seven          1     0     2     7   19    481    1      0    1     1      513
Six            1     7     1     0    6      0  491      0    0     7      513
Three         11    12     2     4   13      4    0    463    1     3      513
Two            9     1     1     2    8     19    5     15  442    11      513
Zero           0     2     0     0    5      0    3      3    0   500      513
__all__      453   498   511   482  613    537  513 

In [92]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.93      0.82      0.88       513
       Five       0.92      0.89      0.91       513
       Four       0.91      0.91      0.91       513
       Nine       0.89      0.84      0.86       513
        One       0.83      0.99      0.91       513
      Seven       0.90      0.94      0.92       513
        Six       0.96      0.96      0.96       513
      Three       0.87      0.90      0.89       513
        Two       0.98      0.86      0.92       513
       Zero       0.92      0.97      0.95       513

avg / total       0.91      0.91      0.91      5130



#### Implementing PCA

In [99]:
import numpy as np
from sklearn.decomposition import PCA



train = np.array(X_train)
test = np.array(X_test)
 
n_components = 1600
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)
s = pca.explained_variance_ratio_
sum=0.0
comp=0

for _ in s:
    sum += _
    comp += 1
    if(sum>=0.99):
        break
n_components = comp
#fitting pca
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)

xtrain = pca.fit_transform(train)
xtest = pca.fit_transform(test)

clf = KNeighborsClassifier(n_neighbors=optimal_k,algorithm='auto',n_jobs=10)
clf.fit(xtrain,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(xtest)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)


Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.34

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight         47    63    96    59   68     63   18     26   20    53      513
Five          74    79     7    51   41     54   12     67   17   111      513
Four          50    18   244    65   51     43   28      3    8     3      513
Nine          50    11    92   147   67    115    2     24    2     3      513
One           32     0     3    11  411     26    0      1   19    10      513
Seven         44     8     8   138  132     95    5     61    6    16      513
Six           28    12     4     2   29     13  339      9   52    25      513
Three         22   117     7    28   15    147    2    140    7    28      513
Two           33    59    21    12   89     55  160     21   43    20      513
Zero          31   126     1     6   22     34   48     56    8   181      513
__all__      411   493   483   519  925    645  614 

In [47]:
klist = list(range(2,10))
cv_scores = []
for k in klist:
    knn = KNeighborsClassifier(n_neighbors=k,algorithm='auto',n_jobs=10)
    scores = cross_val_score(knn, xtrain, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]

# determining best k
optimal_k = klist[MSE.index(min(MSE))]
print("The optimal number of neighbors is %d" % optimal_k)

The optimal number of neighbors is 4


In [95]:
import numpy as np
from sklearn.decomposition import PCA

train = np.array(X_train)
test = np.array(X_test)

n_components = 1600
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)
s = pca.explained_variance_ratio_
sum=0.0
comp=0

for _ in s:
    sum += _
    comp += 1
    if(sum>=0.99):
        break

#fitting pca
pca = PCA(n_components=n_components,svd_solver='randomized').fit(train)

xtrain = pca.transform(train)
xtest = pca.transform(test)

clf = KNeighborsClassifier(n_neighbors=optimal_k,algorithm='auto',n_jobs=10)
clf.fit(xtrain,y_train)
print('\nMaking Predictions on Validation Data...')
y_pred = clf.predict(xtest)

print('\nCalculating Accuracy of Predictions...')
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: %0.2f" % accuracy)

print('\nCreating Confusion Matrix...')
ConfusionMatrix(y_test, y_pred)


Making Predictions on Validation Data...

Calculating Accuracy of Predictions...
Accuracy score: 0.91

Creating Confusion Matrix...


Predicted  Eight  Five  Four  Nine  One  Seven  Six  Three  Two  Zero  __all__
Actual                                                                        
Eight        423    14     1    14   13      3    5     32    3     5      513
Five           4   458     2     5   15      1    5     16    0     7      513
Four           3     0   467    19   12      2    3      0    2     5      513
Nine           1     4    33   430   12     27    0      2    2     2      513
One            0     0     2     1  510      0    0      0    0     0      513
Seven          1     0     2     7   19    481    1      0    1     1      513
Six            1     7     1     0    6      0  491      0    0     7      513
Three         11    12     2     4   13      4    0    463    1     3      513
Two            9     1     1     2    8     19    5     15  442    11      513
Zero           0     2     0     0    5      0    3      3    0   500      513
__all__      453   498   511   482  613    537  513 

In [96]:
report = classification_report(y_test,y_pred)
print(report)

             precision    recall  f1-score   support

      Eight       0.93      0.82      0.88       513
       Five       0.92      0.89      0.91       513
       Four       0.91      0.91      0.91       513
       Nine       0.89      0.84      0.86       513
        One       0.83      0.99      0.91       513
      Seven       0.90      0.94      0.92       513
        Six       0.96      0.96      0.96       513
      Three       0.87      0.90      0.89       513
        Two       0.98      0.86      0.92       513
       Zero       0.92      0.97      0.95       513

avg / total       0.91      0.91      0.91      5130

