In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_openml

# 准备数据

In [2]:
mnist=fetch_openml('mnist_784',version=1,cache=True)

In [3]:
X=mnist.data
y=mnist.target.astype(np.uint8)

In [4]:
X_train=X[:60000]
y_train=y[:60000]
X_test=X[60000:]
y_test=y[60000:]

# 模型训练
使用OVR策略。

In [5]:
from sklearn.svm import LinearSVC
svm_clf=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)
svm_clf.fit(X_train,y_train)



LinearSVC(random_state=42)

In [6]:
from sklearn.metrics import accuracy_score
y_pred=svm_clf.predict(X_test)
accuracy_score(y_true=y_test,y_pred=y_pred)

0.8236

This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:

In [7]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled=scaler.transform(X_test.astype(np.float32))

In [8]:
lin_clf=LinearSVC(random_state=42)
lin_clf.fit(X_train_scaled,y_train)



LinearSVC(random_state=42)

In [10]:
y_pred=lin_clf.predict(X_test_scaled)
accuracy_score(y_test,y_pred)

0.9131

经过标准化之后，准确率提升。

接下来我们尝试用SVM和高斯核。**Note**: to be future-proof we set `gamma="scale"` since it will be the default value in Scikit-Learn 0.22.

In [12]:
from sklearn.svm import SVC
svm_clf=SVC(kernel='rbf',gamma='scale')
svm_clf.fit(X_train_scaled[:10000], y_train[:10000])

SVC()

In [13]:
y_pred=svm_clf.predict(X_test_scaled)
accuracy_score(y_test,y_pred)

0.9389

精确度又略有提升。

# 交叉验证/随机搜索

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal,uniform

In [16]:
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}

In [18]:
rnd_search_cv=RandomizedSearchCV(svm_clf,param_distributions,cv=3,verbose=2,n_iter=10,n_jobs=-1)
rnd_search_cv.fit(X_train_scaled[:1000],y_train[:1000])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BF4B862648>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BF4B862308>},
                   verbose=2)

In [20]:
rnd_search_cv.best_estimator_

SVC(C=4.379617296378995, gamma=0.0018266449280382)

In [21]:
rnd_search_cv.best_score_

0.8589937242631853

In [22]:
rnd_search_cv.best_params_

{'C': 4.379617296378995, 'gamma': 0.0018266449280382}

In [23]:
# 用最好的模型进行训练
rnd_search_cv.best_estimator_.fit(X_train_scaled,y_train)

SVC(C=4.379617296378995, gamma=0.0018266449280382)

In [24]:
y_pred=rnd_search_cv.best_estimator_.predict(X_test_scaled)

In [25]:
accuracy_score(y_test,y_pred)

0.9716

准确率达到非常高了！！