<a href="https://colab.research.google.com/github/ShenZheyi/ENSTA_ROB311/blob/master/TP4/ROB311_TP4_SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **ROB311-TP4-SVM**
## Implementation of an SVM Digit Recognition Algorithm
SHEN Zheyi & GUAN Zhaoyi

In [1]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler 
import numpy as np

Function used to read data:

In [2]:
def _as_column_list(cols):
  """Normalise a ``usecols``-style argument (one int or an iterable of ints) to a list."""
  try:
    return list(cols)
  except TypeError:
    # A bare integer (or other non-iterable index) selects a single column.
    return [cols]


def read_data(filename, feature_cols, label_cols):
  """Load feature and label columns from a comma-separated file with a header row.

  The file is parsed a single time (the selected feature and label columns
  are read together and split afterwards) instead of once per column group.

  Args:
    filename: Path of the CSV file; its first row is skipped.
    feature_cols: Column index, or iterable of column indices, of the features.
    label_cols: Column index, or iterable of column indices, of the labels.

  Returns:
    Tuple ``(feature, label)`` of integer arrays. Size-1 dimensions are
    squeezed away, so a single label column yields a 1-D array with one
    entry per data row. Both arrays are views on one parsed table.
  """
  print('Read', filename)
  feature_idx = _as_column_list(feature_cols)
  label_idx = _as_column_list(label_cols)
  # ndmin=2 keeps the table two-dimensional so the column split below is
  # well defined even for a one-row file or a single selected column.
  table = np.loadtxt(filename, delimiter=',', dtype=int,
                     usecols=feature_idx + label_idx, skiprows=1, ndmin=2)
  split = len(feature_idx)
  # np.squeeze reproduces the shapes a separate default (ndmin=0) loadtxt
  # call per column group would return.
  feature = np.squeeze(table[:, :split])
  label = np.squeeze(table[:, split:])
  return feature, label

Functions to calculate the accuracy and the confusion matrix:

In [3]:
def calculate_accuracy(test_label_true, test_label_pred):
  """Print and return the fraction of predictions that equal the true labels.

  Both inputs are converted to NumPy arrays first, so plain Python sequences
  are compared element by element rather than as whole objects.

  Args:
    test_label_true: Array-like of ground-truth labels.
    test_label_pred: Array-like of predicted labels, same shape as
      ``test_label_true``.

  Returns:
    The mean of the element-wise equality, as a NumPy float.

  Raises:
    ValueError: If the two inputs do not have the same shape.
  """
  true_arr = np.asarray(test_label_true)
  pred_arr = np.asarray(test_label_pred)
  # Reject mismatched shapes explicitly: e.g. (n,) against (n, 1) would
  # otherwise broadcast to an (n, n) comparison and give a meaningless mean.
  if true_arr.shape != pred_arr.shape:
    raise ValueError(
        'Label arrays must have the same shape, got %s and %s'
        % (true_arr.shape, pred_arr.shape))
  accuracy = np.mean(true_arr == pred_arr)
  print('The accuray of SVM model is: ', accuracy)
  return accuracy

def calculate_confusion_matrix(test_label_true, test_label_pred):
  """Build the confusion matrix for the predictions, print it and return it.

  The matrix comes from ``confusion_matrix``, called with the true labels
  first and the predicted labels second.
  """
  matrix = confusion_matrix(test_label_true, test_label_pred)
  # One print call with a newline separator: header line, then the matrix.
  print('The confusion Matrix :', matrix, sep='\n')
  return matrix

Load files

In [4]:
# Column layout used for both CSV files: column 0 is read as the label,
# columns 1 to 784 are read as the features.
label_col = 0
features_col = range(1, 785)

train_file = "mnist_train.csv"
test_file = "mnist_test.csv"

# read_data returns a (features, labels) pair for each file.
trainData, trainLabel = read_data(train_file, features_col, label_col)
testData, trueLabel = read_data(test_file, features_col, label_col)

Read mnist_train.csv
Read mnist_test.csv


## 1.   A Simple Implementation of SVM





In [5]:
def simple_SVM_train(train_label, train_data, pca_components=100):
  """Fit a PCA projection, then train a default ``SVC`` on the projected data.

  Note the argument order: labels first, data second.

  Args:
    train_label: Training labels, one per row of ``train_data``.
    train_data: Training samples; must expose ``.shape`` (it is printed).
    pca_components: Value passed to ``PCA`` as ``n_components``.

  Returns:
    Tuple ``(pca, clf)``: the fitted PCA and the fitted classifier. The same
    ``pca`` must be used to transform any data later given to ``clf``.
  """
  # Reduce dimensionality before training.
  pca = PCA(n_components=pca_components)
  pca.fit(train_data)
  new_train_data = pca.transform(train_data)
  print('Size of train data before PCA: ', train_data.shape)
  print('Size of train data after PCA: ', new_train_data.shape)
  # The classifier uses scikit-learn's default SVC settings. An unused
  # SVC(class_weight='balanced') instance that was created here has been
  # dropped; it never took part in training.
  clf = SVC()
  print('Begin training...')
  clf.fit(new_train_data, train_label)
  return pca, clf

In [6]:
def simple_SVM_test(test_data, pca, clf):
  """Project ``test_data`` with ``pca`` and return ``clf``'s predictions on it.

  Args:
    test_data: Samples to classify.
    pca: Object with a ``transform`` method, applied to ``test_data``.
    clf: Object with a ``predict`` method, applied to the transformed data.

  Returns:
    Whatever ``clf.predict`` returns for the transformed samples.
  """
  projected = pca.transform(test_data)
  print('Begin testing...')
  return clf.predict(projected)

In [7]:
if __name__ == '__main__':
  # Train on the training set, predict the test set, then report the metrics.
  pca, clf = simple_SVM_train(train_label=trainLabel, train_data=trainData,
                              pca_components=100)
  predictLabel = simple_SVM_test(test_data=testData, pca=pca, clf=clf)
  calculate_accuracy(test_label_true=trueLabel, test_label_pred=predictLabel)
  calculate_confusion_matrix(test_label_true=trueLabel,
                             test_label_pred=predictLabel)

Size of train data before PCA:  (60000, 784)
Size of train data after PCA:  (60000, 100)
Begin training...
Begin testing...
The accuray of SVM model is:  0.9841
The confusion Matrix :
[[ 974    0    1    0    0    2    0    1    2    0]
 [   0 1128    3    1    0    1    1    0    1    0]
 [   5    0 1014    0    1    0    1    7    4    0]
 [   0    0    2  996    0    3    0    5    3    1]
 [   0    0    3    0  963    0    4    0    1   11]
 [   2    0    0    7    0  878    2    1    1    1]
 [   5    2    0    0    2    3  945    0    1    0]
 [   0    5    9    1    0    0    0 1007    1    5]
 [   3    0    1    3    3    1    1    2  958    2]
 [   3    3    1    5    8    2    1    5    3  978]]


## 2.   Algorithm using *make_pipeline*





In [8]:
def SVM_train_pipeline(train_label, train_data, pca_components=100):
  """Train a PCA -> StandardScaler -> SVC pipeline and return it fitted.

  Note the argument order: labels first, data second.

  Args:
    train_label: Training labels.
    train_data: Training samples.
    pca_components: Value passed to ``PCA`` as ``n_components``.

  Returns:
    The pipeline after ``fit`` has been called on the training data.
  """
  # Assemble the three stages directly inside the pipeline constructor.
  pipeline = make_pipeline(
      PCA(n_components=pca_components),
      StandardScaler(),
      SVC(class_weight='balanced'),
  )
  print('Begin training...')
  pipeline.fit(train_data, train_label)
  return pipeline

def SVM_test_pipeline(test_data, clf):
  """Announce the test phase and return ``clf.predict(test_data)``.

  Args:
    test_data: Samples to classify, passed to ``clf`` unchanged.
    clf: Object with a ``predict`` method.
  """
  print('Begin testing...')
  return clf.predict(test_data)

if __name__ == '__main__':
  # Fit the pipeline on the training set, predict the test set, report metrics.
  clf = SVM_train_pipeline(train_label=trainLabel, train_data=trainData,
                           pca_components=100)
  predictLabel = SVM_test_pipeline(test_data=testData, clf=clf)
  calculate_accuracy(test_label_true=trueLabel, test_label_pred=predictLabel)
  calculate_confusion_matrix(test_label_true=trueLabel,
                             test_label_pred=predictLabel)

Begin training...
Begin testing...
The accuray of SVM model is:  0.9812
The confusion Matrix :
[[ 974    0    1    1    0    0    2    0    2    0]
 [   0 1126    2    1    1    1    2    1    1    0]
 [   2    2 1013    1    1    0    1    4    7    1]
 [   0    0    7  987    0    5    0    5    5    1]
 [   0    0    6    0  963    0    1    0    1   11]
 [   2    0    0    6    0  878    3    0    3    0]
 [   6    2    0    1    3    2  944    0    0    0]
 [   0    4   12    2    0    1    0 1003    0    6]
 [   3    0    1    4    2    3    1    3  955    2]
 [   4    2    2    7   11    4    0    8    2  969]]


## 3.   Use *GridSearchCV* to find best estimator



In [9]:
def SVM_train(train_label, train_data, pca_components=100,
              param_grid=None, cv=None, n_jobs=None):
  """Grid-search a PCA -> StandardScaler -> SVC pipeline; return the best one.

  Note the argument order: labels first, data second.

  Args:
    train_label: Training labels.
    train_data: Training samples.
    pca_components: Value passed to ``PCA`` as ``n_components``.
    param_grid: Parameter grid handed to ``GridSearchCV``. SVC parameters
      are addressed with the ``svc__`` prefix. When ``None``, the grid is
      C in {1, 5, 10} crossed with the kernels linear, poly, rbf, sigmoid.
    cv: Forwarded to ``GridSearchCV``; ``None`` keeps its default.
    n_jobs: Forwarded to ``GridSearchCV``; ``None`` keeps its default.

  Returns:
    ``best_estimator_`` of the fitted search. The best parameters and their
    score are printed.
  """
  if param_grid is None:
    param_grid = {'svc__C': [1, 5, 10],
                  'svc__kernel': ('linear', 'poly', 'rbf', 'sigmoid')}
  # Build the pipeline whose SVC step is tuned by the search.
  pca = PCA(n_components=pca_components)
  svc = SVC(class_weight='balanced')
  model = make_pipeline(pca, StandardScaler(), svc)
  clf = GridSearchCV(model, param_grid, cv=cv, n_jobs=n_jobs)
  print('Begin training...')
  clf.fit(train_data, train_label)
  print("The best parameters is : ", clf.best_params_)
  print("The best score associated is : ", clf.best_score_)
  return clf.best_estimator_

def SVM_test(test_data, estimator):
  """Announce the test phase and return ``estimator.predict(test_data)``.

  Args:
    test_data: Samples to classify, passed to ``estimator`` unchanged.
    estimator: Object with a ``predict`` method.
  """
  print('Begin test...')
  return estimator.predict(test_data)
  
if __name__ == '__main__':
  # Run the grid search on the training set (default number of PCA
  # components), then evaluate the selected estimator on the test set.
  best_estimator = SVM_train(train_label=trainLabel, train_data=trainData)
  predictLabel = SVM_test(test_data=testData, estimator=best_estimator)

  calculate_accuracy(test_label_true=trueLabel, test_label_pred=predictLabel)
  calculate_confusion_matrix(test_label_true=trueLabel,
                             test_label_pred=predictLabel)

Begin training...
The best parameters is :  {'svc__C': 10, 'svc__kernel': 'rbf'}
The best score associated is :  0.9797166666666668
Begin test...
The accuray of SVM model is:  0.9818
The confusion Matrix :
[[ 974    0    0    1    0    0    2    0    3    0]
 [   0 1127    2    1    1    2    1    0    1    0]
 [   2    1 1013    1    1    0    1    4    8    1]
 [   0    0    7  986    0    6    0    5    5    1]
 [   0    0    6    0  965    0    1    0    1    9]
 [   2    0    0    8    0  877    4    0    1    0]
 [   4    2    0    1    3    3  945    0    0    0]
 [   0    4    9    2    2    1    0 1005    0    5]
 [   3    1    1    2    1    3    1    4  955    3]
 [   4    2    2    5   10    4    0    8    3  971]]
