In [1]:
import numpy as np
import os

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

import joblib

In [2]:
# Relative path of the directory holding the decompressed MNIST IDX files.
DATA_PATH = '../data'
# Number of pixels in one flattened 28 x 28 image.
IMAGE_SIZE = 28 * 28

In [3]:
def _find_idx_file(path, kind, stem):
    """Return the path of '<kind>-<stem>' inside `path`, tolerating both spellings."""
    # The dotted spelling is the one this loader has always opened; the hyphenated
    # spelling matches the archive names listed in load_mnist's docstring.
    dotted = os.path.join(path, '%s-%s' % (kind, stem))
    hyphenated = os.path.join(path, '%s-%s' % (kind, stem.replace('.', '-')))
    if not os.path.exists(dotted) and os.path.exists(hyphenated):
        return hyphenated
    # Returning the dotted path even when it is missing lets open() raise the
    # usual FileNotFoundError for the original file name.
    return dotted


def _read_idx(file_path, expected_magic, n_dims):
    """Read one IDX file and return (dims, flat uint8 payload) after validating it."""
    with open(file_path, 'rb') as idx_file:
        raw = idx_file.read()

    # Header layout: one big-endian 32-bit magic number followed by one
    # big-endian 32-bit size per dimension.
    header_size = 4 * (n_dims + 1)
    if len(raw) < header_size:
        raise ValueError('%s is too short to be an IDX file' % file_path)

    header = np.frombuffer(raw, dtype='>u4', count=n_dims + 1)
    magic = int(header[0])
    if magic != expected_magic:
        raise ValueError(
            '%s is not a valid IDX file (magic number %d, expected %d); '
            'was it decompressed?' % (file_path, magic, expected_magic))

    dims = tuple(int(size) for size in header[1:])
    expected_size = 1
    for size in dims:
        expected_size *= size

    payload = np.frombuffer(raw, dtype=np.uint8, offset=header_size)
    if payload.size != expected_size:
        raise ValueError('%s holds %d bytes of data but its header announces %d'
                         % (file_path, payload.size, expected_size))
    return dims, payload


def load_mnist(path, kind='train'):
    """
    Load one split of the MNIST dataset from decompressed IDX files.

    The archives 'train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz',
    't10k-images-idx3-ubyte.gz' and 't10k-labels-idx1-ubyte.gz' must be
    downloaded into `path` and decompressed before calling this function.
    Both the dotted file names ('<kind>-images.idx3-ubyte') and the hyphenated
    ones ('<kind>-images-idx3-ubyte') are accepted; the dotted name is tried first.

    Parameters
    ----------
    path : str
        Directory that contains the decompressed files.
    kind : str
        File-name prefix of the split to load: 'train' or 't10k'.

    Returns
    -------
    images : numpy.ndarray
        uint8 array of shape (n_samples, rows * cols), one flattened image per
        row. It is a read-only view over the bytes read from disk.
    labels : numpy.ndarray
        uint8 array of shape (n_samples,), also read-only.

    Raises
    ------
    FileNotFoundError
        If a required file is missing under both spellings.
    ValueError
        If a file has the wrong magic number, is truncated, or the image and
        label files disagree on the number of samples.
    """
    # 2049 (0x0801) and 2051 (0x0803) are the IDX magic numbers for a
    # one-dimensional and a three-dimensional unsigned-byte array respectively.
    (n_labels,), labels = _read_idx(
        _find_idx_file(path, kind, 'labels.idx1-ubyte'), 2049, 1)
    (n_images, n_rows, n_cols), pixels = _read_idx(
        _find_idx_file(path, kind, 'images.idx3-ubyte'), 2051, 3)

    if n_images != n_labels:
        raise ValueError('found %d images but %d labels for kind=%r'
                         % (n_images, n_labels, kind))

    # The image size comes from the file header, so nothing here depends on a
    # notebook-level constant.
    images = pixels.reshape(n_images, n_rows * n_cols)
    return images, labels

In [4]:
# Load the MNIST training split (flattened images and their labels).
x_train, y_train = load_mnist(path=DATA_PATH, kind='train')
# The test split is not loaded here; the call is kept for reference:
# x_test, y_test = load_mnist(DATA_PATH, kind='t10k')

In [5]:
# Feature engineering: standardization.
# NOTE(review): x_train is overwritten with its scaled version, so re-running this
# cell without reloading the data would fit a fresh scaler on already-scaled values
# (and that scaler is the one saved to disk later).
transfer4 = StandardScaler()
x_train = transfer4.fit(x_train).transform(x_train)

In [6]:
# Optional: uncomment to keep only the first 1000 samples (smaller training set).
# x_train = x_train[:1000]
# y_train = y_train[:1000]

In [7]:
# SVM classifier: polynomial kernel, solver capped at 5000 iterations.
# probability=True enables probability estimates, which scikit-learn calibrates
# on internally shuffled data; random_state is fixed so repeated runs of the
# notebook produce the same fitted model.
svm_model4 = SVC(kernel='poly', max_iter=5000, gamma='scale', probability=True,
                 random_state=42)

In [8]:
# Grid search with cross-validation over the regularization parameter C.
param_dict = {
    'C': [18, 18.5, 19, 19.5, 20],
    # Other hyper-parameters that could be searched (currently disabled):
    # 'kernel': ['linear', 'rbf', 'poly'],
    # 'gamma': ['scale', 'auto']
}
# svm_model4 is rebound from the bare SVC to the search object below. If this cell
# is executed again, the name already refers to a GridSearchCV, and wrapping it a
# second time would make 'C' an invalid parameter; unwrap it first so the cell
# can be re-run safely.
base_svm = svm_model4.estimator if isinstance(svm_model4, GridSearchCV) else svm_model4
svm_model4 = GridSearchCV(base_svm, param_dict, n_jobs=-1, cv=2)

In [9]:
# Train the model: run the grid search on the training data.
svm_model4.fit(X=x_train, y=y_train)

In [None]:
# Results of hyper-parameter tuning on the training data:
# best parameters, best (validation) score, best estimator and the full
# cross-validation results of the fitted search object.
print("最佳参数: \n", svm_model4.best_params_)
print("最佳结果（在验证集中的结果）: \n", svm_model4.best_score_)
print("最佳估计器: \n", svm_model4.best_estimator_)
print("交叉验证结果: \n", svm_model4.cv_results_)

In [None]:
# Make sure the output directory exists first: joblib.dump does not create it,
# and a missing directory would only surface after the long training run.
os.makedirs('../models', exist_ok=True)
# Save the model (the fitted grid-search object).
joblib.dump(svm_model4, '../models/svm_model4.pkl')
# Save the StandardScaler so the same scaling can be applied at prediction time.
joblib.dump(transfer4, '../models/transfer4.pkl')