In [3]:
import numpy as np, humanfriendly as hf, random
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
def get_scores(model, xtrain, ytrain, xtest, ytest):
    ypred = model.predict(xtest)
    train = model.score(xtrain, ytrain)
    test = model.score(xtest, y_test)
    name = model.__class__.__name__
    return (name, train, test)

In [5]:
def prep_data(data, target):
    d = [data[i] for i, _ in enumerate(data)]
    t = [target[i] for i, _ in enumerate(target)]
    return list(zip(d, t))

In [6]:
def create_sample(d, n, replace='yes'):
    if replace == 'yes': s = random.sample(d, n)
    else: s = [random.choice(d) for i, _ in enumerate(d)
               if i < n]
    Xs = [row[0] for i, row in enumerate(s)]
    ys = [row[1] for i, row in enumerate(s)]
    return np.array(Xs), np.array(ys)

In [7]:
def see_time(note,start):
    end = time.perf_counter()
    elapsed = end - start
    print (note,hf.format_timespan(elapsed, detailed=True))

In [8]:
X_file = 'data/X_mnist'
y_file = 'data/y_mnist'
X = np.load('data/X_mnist.npy')
y = np.load('data/y_mnist.npy')

In [9]:
X = X.astype(np.float32)
data = prep_data(X, y)
sample_size = 7000
Xs, ys = create_sample(data, sample_size)

In [10]:
pca = PCA(n_components=0.95, random_state=0)
Xs = StandardScaler().fit_transform(Xs)
Xs_reduced = pca.fit_transform(Xs)
X_train, X_test, y_train, y_test = train_test_split(Xs_reduced, ys, random_state=0)

In [13]:
svm_model = SVC(gamma='scale', random_state=0)
start = time.perf_counter()
svm_model.fit(X_train, y_train)
see_time('time:',start)
svm_model_scores = get_scores(svm_model, X_train, y_train,X_test, y_test)
print (svm_model_scores[0] + 'train:',svm_model_scores[1],'test:',svm_model_scores[2])

time: 5 seconds, 635 milliseconds, 348 microseconds and 800 nanoseconds
SVCtrain: 0.9803809523809524 test: 0.9262857142857143


In [14]:
param_grid = {'C': [30, 35, 40], 'kernel': ['poly'],'gamma': ['scale'], 'degree': [3],'coef0': [0.1]}
start = time.perf_counter()
rand = RandomizedSearchCV(svm_model, param_grid, cv=3, n_jobs = -1,random_state=0, n_iter=3,verbose=2)
rand.fit(X_train, y_train)
see_time('RandomizedSearchCV total tuning time:',start)
bp = rand.best_params_
print (bp)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:   19.9s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   23.8s finished


RandomizedSearchCV total tuning time: 29 seconds, 872 milliseconds, 711 microseconds and 700 nanoseconds
{'kernel': 'poly', 'gamma': 'scale', 'degree': 3, 'coef0': 0.1, 'C': 30}


In [16]:
svm_bpmodel = SVC(**bp, random_state=0)
start = time.perf_counter()
svm_bpmodel.fit(X_train, y_train)
svm_scores = get_scores(svm_bpmodel, X_train, y_train,X_test, y_test)
print (svm_scores[0] + 'train:',svm_scores[1],'test:',svm_scores[2])
see_time('total time:',start)

SVCtrain: 1.0 test: 0.9508571428571428
total time: 12 seconds, 982 milliseconds, 448 microseconds and 300 nanoseconds
