In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

%matplotlib inline

np.random.seed(2)

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools




sns.set(style='white', context='notebook', palette='deep')

# SVM

In [2]:
from sklearn.preprocessing import scale
from sklearn import svm
from sklearn import metrics

In [3]:
# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
Y_train = train["label"]
X_train = train.drop(columns = "label")
print(train.shape, test.shape)

(42000, 785) (28000, 784)


In [4]:
# Normalize the data
X_train = X_train / 255 # y不用normalize吗
test = test / 255

# Scale features
X_scaled = scale(X_train)

# Split train and test
random_seed = 2
X_train, X_val, Y_train, Y_val = train_test_split(X_scaled, Y_train, test_size = 0.1, random_state=random_seed)

In [5]:
# SVM with rbg kernel, c=1, default value of gamma

# model
svm_model = svm.SVC(kernel='rbf')

# fit
svm_model.fit(X_train, Y_train)

# predict
Y_pred = svm_model.predict(X_val)

In [6]:
# confusion matrix and accuracy

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=Y_val, y_pred=Y_pred), "\n")

# confusion matrix
print(metrics.confusion_matrix(y_true=Y_val, y_pred=Y_pred))

accuracy: 0.9614285714285714 

[[408   0   0   0   1   0   1   1   0   0]
 [  0 478   2   3   0   0   0   0   2   0]
 [  2   1 391   4   1   0   0   2   1   1]
 [  0   1   5 397   0   6   1   1   6   1]
 [  0   0   1   0 445   0   5   2   1   7]
 [  1   0   2   5   0 357   2   0   4   1]
 [  6   1   6   0   1   6 392   0   1   0]
 [  1   2   7   2   1   0   0 426   0   7]
 [  0   3   3   5   1   5   2   0 360   3]
 [  2   2   1   3   7   1   0   9   0 384]]


In [7]:
# Train the model again on a random sample with size = 3000

# Randomly select 3000 samples
train_sampled = train.sample(n = 3000, replace = False, random_state = random_seed)

# output the selected sample
train_sampled.to_csv("train_sampled.csv", index=False)

print(train.shape, train_sampled.shape, test.shape)

(42000, 785) (3000, 785) (28000, 784)


In [8]:
# Load the sampled data
train = pd.read_csv("train_sampled.csv")
test = pd.read_csv("test.csv")
Y_train = train["label"].to_numpy()
X_train = train.drop(columns = "label")
print(train.shape, X_train.shape, Y_train.shape)

(3000, 785) (3000, 784) (3000,)


In [9]:
# Normalize the data
X_train = X_train / 255
test = test / 255

# Scale features
X_scaled = scale(X_train)

# Split train and test
random_seed = 2
X_train, X_val, Y_train, Y_val = train_test_split(X_scaled, Y_train, test_size = 0.1, random_state=random_seed)

In [10]:
# SVM with rbg kernel, c=1, default value of gamma

# model
svm_model = svm.SVC(kernel='rbf')

# fit
svm_model.fit(X_train, Y_train)

# predict
Y_pred = svm_model.predict(X_val)

In [11]:
# confusion matrix and accuracy

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=Y_val, y_pred=Y_pred), "\n")

# confusion matrix
print(metrics.confusion_matrix(y_true=Y_val, y_pred=Y_pred))

accuracy: 0.91 

[[28  0  0  0  0  1  1  0  1  0]
 [ 0 37  0  0  0  0  0  0  0  0]
 [ 0  0 26  0  0  0  0  0  0  0]
 [ 0  1  1 21  0  0  0  1  0  0]
 [ 0  0  1  0 31  1  0  1  0  0]
 [ 0  0  1  0  1 25  1  0  0  0]
 [ 2  1  1  0  0  0 23  0  0  0]
 [ 1  0  2  0  1  0  0 31  0  4]
 [ 0  0  0  0  0  1  0  0 28  0]
 [ 0  1  0  1  0  0  0  0  0 23]]
