### Excercise 8

In [8]:
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

% matplotlib inline

In [9]:
# Load MNIST dataset
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

# Downsample the image
X_high_reso = mnist.data.reshape(70000, 28, 28)
idx = np.arange(0, 28, 2)
X = np.zeros((70000, 14, 14))
for i in range(70000):
    for j in range(len(idx)):
        X[i,j,:] = X_high_reso[i, idx[j], idx]

X = X.reshape(70000, -1)
y = mnist.target
# y = y.reshape(y.shape[0], -1)

In [10]:
# Split into training, validation and testing set
np.random.seed(23)
datasize = y.shape[0]
shuffle_idx = np.random.permutation(datasize)
ratio = np.array([0.7, 0.2, 0.1])
train_idx = shuffle_idx[np.arange(0, int(datasize * 0.7))]
vali_idx = shuffle_idx[np.arange(int(datasize * 0.7), int(datasize * 0.9))]
test_idx= shuffle_idx[np.arange(int(datasize * 0.9),datasize)]
X_train = X[train_idx, :]
y_train = y[train_idx]
X_vali = X[vali_idx, :]
y_vali = y[vali_idx]
X_test = X[test_idx, :]
y_test = y[test_idx]

In [11]:
# Perform pre-precessing
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train.astype(np.float32))
X_vali_scale = scaler.transform(X_vali.astype(np.float32))

In [12]:
# Train with logistic regression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
SGD_clf = SGDClassifier(loss='log', random_state=23)
SGD_clf.fit(X_train_scale, y_train)
SGD_pred_vali= SGD_clf.predict(X_vali_scale)
SGD_accu = accuracy_score(SGD_pred_vali, y_vali)
print("logistic regression:{accu}".format(accu=SGD_accu))

logistic regression:0.8841428571428571


In [13]:
# Train with SVM
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

# Set the hardness parameter
C = 1
m = X_train_scale.shape[1]
SVC_cl = SVC(kernel='linear', C=C)       # Bad for large dataset
linear_svc_cl = LinearSVC(loss="hinge", C=C)     # Good for large dataset, no kernel option
SGD_svm_cl = SGDClassifier(alpha = 1/(m*C))    # Good for large dataset, no kernel option

# SVC_cl.fit(X_train_scale, y_train)
# linear_svc_cl.fit(X_train_scale, y_train)
SGD_svm_cl.fit(X_train_scale, y_train)
y_pred = SGD_svm_cl.predict(X_vali_scale)
svm_accu = accuracy_score(y_pred, y_vali)
print("SVM(C = 1): {accu}".format(accu=svm_accu))

SVM(C = 1): 0.8888571428571429


In [19]:
# Perform random search over hardness of the parameter
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

SGD_svm_cl = SGDClassifier(alpha = 1/(m*C))    # Good for large dataset, no kernel option
param_distributions = {"alpha": uniform(1/(m*10),1/m)}
rnd_search_cv = RandomizedSearchCV(SGD_svm_cl, param_distributions, n_iter=5, verbose=2)
rnd_search_cv.fit(X_train_scale, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] alpha=0.00258532754695 ..........................................
[CV] ........................... alpha=0.00258532754695, total=   0.8s
[CV] alpha=0.00258532754695 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ........................... alpha=0.00258532754695, total=   0.8s
[CV] alpha=0.00258532754695 ..........................................
[CV] ........................... alpha=0.00258532754695, total=   0.8s
[CV] alpha=0.00224661144025 ..........................................
[CV] ........................... alpha=0.00224661144025, total=   0.9s
[CV] alpha=0.00224661144025 ..........................................
[CV] ........................... alpha=0.00224661144025, total=   0.9s
[CV] alpha=0.00224661144025 ..........................................
[CV] ........................... alpha=0.00224661144025, total=   1.2s
[CV] alpha=0.00103161936164 ..........................................
[CV] ........................... alpha=0.00103161936164, total=   1.4s
[CV] alpha=0.00103161936164 ..........................................
[CV] ........................... alpha=0.00103161936164, total=   1.0s
[CV] alpha=0.00103161936164 ..........................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   15.1s finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SGDClassifier(alpha=0.00510204081632653, average=False, class_weight=None,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f215a7d8780>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=2)

In [20]:
alpha_best = rnd_search_cv.best_estimator_.alpha

In [21]:
SGD_svm_cl = SGDClassifier(alpha=alpha_best)
SGD_svm_cl.fit(X_train_scale, y_train)
y_pred = SGD_svm_cl.predict(X_vali_scale)
accuracy_score(y_pred, y_vali)

0.89907142857142852

In [26]:
# Train with random forest
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=2, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_vali)
accuracy_score(y_pred, y_vali)

0.78249999999999997

In [17]:
# Train with extra-forest
from sklearn.ensemble import ExtraTreesClassifier

extra_clf = ExtraTreesClassifier(n_estimators=500, n_jobs=-1)
extra_clf.fit(X_train, y_train)
y_pred = extra_clf.predict(X_vali)
accuracy_score(y_pred, y_vali)

0.9663571428571428

In [27]:
# Combine the above mentioned classfier into an essemble that out performs all of them in validation set
# Use voting classifier
from sklearn.ensemble import VotingClassifier

SGD_clf = SGDClassifier(loss='log', random_state=23)
SGD_svm_clf = SGDClassifier(alpha=alpha_best)
rnd_clf = RandomForestClassifier(n_estimators=2)
extra_clf = ExtraTreesClassifier(n_estimators=2, n_jobs=-1)

votting_clf = VotingClassifier([('lg_clf', SGD_clf), ('SVM', SGD_svm_clf), ('rf2', rnd_clf),  ('extra',extra_clf)], voting='hard')

votting_clf.fit(X_train_scale, y_train)
y_pred = votting_clf.predict(X_vali_scale)
accuracy_score(y_pred, y_vali)

0.90064285714285719

### Excercise 9

In [13]:
# Run individual classifier shown above to make predictions on validation sets 

In [14]:
# Create new training set with the result predictions

In [15]:
# Train the blender

In [16]:
# Check performance on test set