# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Fetch the MNIST digits from OpenML: https://www.openml.org/d/554
# return_X_y=True hands back the (features, labels) pair directly.
X, y = fetch_openml(name='mnist_784', version=1, return_X_y=True)


In [3]:
# inspect which container type fetch_openml returned for the features
type(X)

pandas.core.frame.DataFrame

In [4]:
# Convert X and y from pandas objects to NumPy arrays (fetch_openml returned a
# DataFrame here instead of plain arrays).
# np.asarray is used rather than .to_numpy() so that re-running this cell is a
# harmless no-op once X and y are already ndarrays (ndarray has no .to_numpy()).
X = np.asarray(X)
y = np.asarray(y)
type(X)

numpy.ndarray

In [5]:
# The full MNIST data set holds 70k samples of the digits 0-9; every 28*28
# gray-scale image is stored as one 784-dimensional row vector.
X.shape

(70000, 784)

In [6]:
# smallest pixel value in the data
X.min()

0.0

In [7]:
# largest pixel value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize a C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use an RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
# Hold out 10,000 samples for testing; random_state pins the shuffle so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10000, random_state=42
)

In [10]:
%%time
clf_SVM = make_pipeline(StandardScaler(),SVC(C = 1.0, gamma='auto', kernel='rbf', max_iter = 50))
clf_SVM.fit(X_train, y_train)

y_train_pred = clf_SVM.predict(X_train)
y_test_pred = clf_SVM.predict(X_test)

print(accuracy_score(y_test, y_test_pred))
print(accuracy_score(y_train, y_train_pred))



0.7263
0.729
Wall time: 6min 59s


In [13]:
%%time
from sklearn.model_selection import RandomizedSearchCV
from time import time
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)

clf_svm = SVC(max_iter = 30, random_state = 42)

# use a full grid over all parameters
param_grid = {'C': np.linspace(0, 1, num=10),
              'gamma': np.linspace(0.0005, 1.5, num=10)}

# run grid search
SVM_random_search = RandomizedSearchCV(estimator= clf_svm, param_distributions = param_grid, cv = 4, random_state=42, n_jobs=-1)
start = time()
SVM_random_search.fit(X_train, y_train)

print("RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(SVM_random_search.cv_results_['params'])))
#report(SVM_random_search.cv_results_)

y_pred_train = SVM_random_search.predict(X_train)
y_pred_test = SVM_random_search.predict(X_test)

print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))


 0.0987     0.23826667 0.2379            nan]


RandomizedSearchCV took 606.26 seconds for 10 candidate parameter settings.
0.24246666666666666
0.2459
Wall time: 10min 33s


### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into train and test sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling of the input (recall the *sigmoid* function), we need to scale all input values to [0,1].

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [49]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Same call and random_state as the split in the SVM section, so this reproduces
# the identical 10,000-sample hold-out partition for the MLP experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size = 10000, random_state=42)

### Different Number of Layers

In [66]:
%%time
max_iterations = 50
alpha = 0.001
layers = 30

import warnings
warnings.filterwarnings('ignore')

clf_MLP_1 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = layers, alpha = alpha, max_iter=max_iterations, random_state=42))
clf_MLP_1.fit(X_train, y_train)                       
clf_MLP_2 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42))
clf_MLP_2.fit(X_train, y_train)                          
clf_MLP_3 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42))
clf_MLP_3.fit(X_train, y_train)


#y_predprob_test_1 = clf_MLP_1.predict_proba(X_test)
y_pred_test_1 = clf_MLP_1.predict(X_test)
#y_predprob_train_1 = clf_MLP_1.predict_proba(X_train)
y_pred_train_1 = clf_MLP_1.predict(X_train)

#print(np.shape(y_test), np.shape(y_pred_test_1))

print('\n-----1 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_1))
print('Test score prediction : ',r2_score(y_test, y_pred_test_1))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_1))
print('Train score prediction : ',r2_score(y_train, y_pred_train_1))


#y_predprob_test_2 = clf_MLP_2.predict_proba(X_test)
y_pred_test_2 = clf_MLP_2.predict(X_test)
#y_predprob_train_2 = clf_MLP_2.predict_proba(X_train)
y_pred_train_2 = clf_MLP_2.predict(X_train)

print('\n-----2 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_2))
print('Test score prediction : ',r2_score(y_test, y_pred_test_2))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_2))
print('Train score prediction : ',r2_score(y_train, y_pred_train_2))


#y_predprob_test_3 = clf_MLP_3.predict_proba(X_test)
y_pred_test_3 = clf_MLP_3.predict(X_test)
#y_predprob_train_3 = clf_MLP_3.predict_proba(X_train)
y_pred_train_3 = clf_MLP_3.predict(X_train)

print('\n-----3 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_3))
print('Test score prediction : ',r2_score(y_test, y_pred_test_3))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_3))
print('Train score prediction : ',r2_score(y_train, y_pred_train_3))


-----1 Layer-----
Test score prediction :  0.8600650535386345
Train score prediction :  0.9031336397278868

-----2 Layer-----
Test score prediction :  0.8824105933518355
Train score prediction :  0.9244702651131514

-----3 Layer-----
Test score prediction :  0.880681685660936
Train score prediction :  0.93557634443117
Wall time: 3min 17s


### Different Alpha

In [67]:
%%time
max_iterations = 50
alpha = 0.1
layers = 30

import warnings
warnings.filterwarnings('ignore')

clf_MLP_1 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = layers, alpha = alpha, max_iter=max_iterations, random_state=42))
clf_MLP_1.fit(X_train, y_train)                       
clf_MLP_2 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42))
clf_MLP_2.fit(X_train, y_train)                          
clf_MLP_3 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42))
clf_MLP_3.fit(X_train, y_train)


#y_predprob_test_1 = clf_MLP_1.predict_proba(X_test)
y_pred_test_1 = clf_MLP_1.predict(X_test)
#y_predprob_train_1 = clf_MLP_1.predict_proba(X_train)
y_pred_train_1 = clf_MLP_1.predict(X_train)

#print(np.shape(y_test), np.shape(y_pred_test_1))

print('\n-----1 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_1))
print('Test score prediction : ',r2_score(y_test, y_pred_test_1))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_1))
print('Train score prediction : ',r2_score(y_train, y_pred_train_1))


#y_predprob_test_2 = clf_MLP_2.predict_proba(X_test)
y_pred_test_2 = clf_MLP_2.predict(X_test)
#y_predprob_train_2 = clf_MLP_2.predict_proba(X_train)
y_pred_train_2 = clf_MLP_2.predict(X_train)

print('\n-----2 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_2))
print('Test score prediction : ',r2_score(y_test, y_pred_test_2))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_2))
print('Train score prediction : ',r2_score(y_train, y_pred_train_2))


#y_predprob_test_3 = clf_MLP_3.predict_proba(X_test)
y_pred_test_3 = clf_MLP_3.predict(X_test)
#y_predprob_train_3 = clf_MLP_3.predict_proba(X_train)
y_pred_train_3 = clf_MLP_3.predict(X_train)

print('\n-----3 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_3))
print('Test score prediction : ',r2_score(y_test, y_pred_test_3))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_3))
print('Train score prediction : ',r2_score(y_train, y_pred_train_3))


-----1 Layer-----
Test score prediction :  0.8615808082265464
Train score prediction :  0.901488516747554

-----2 Layer-----
Test score prediction :  0.8836895113697611
Train score prediction :  0.9229470771289308

-----3 Layer-----
Test score prediction :  0.878858043302042
Train score prediction :  0.9341670957056115
Wall time: 4min 16s


### Different learning rate
#### invscaling

In [68]:
%%time
max_iterations = 50
alpha = 0.1
layers = 30
learning_rate = 'invscaling'

import warnings
warnings.filterwarnings('ignore')

clf_MLP_1 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = layers, alpha = alpha, max_iter=max_iterations, random_state=42, learning_rate=learning_rate))
clf_MLP_1.fit(X_train, y_train)                       
clf_MLP_2 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42,learning_rate=learning_rate))
clf_MLP_2.fit(X_train, y_train)                          
clf_MLP_3 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42,learning_rate=learning_rate))
clf_MLP_3.fit(X_train, y_train)


#y_predprob_test_1 = clf_MLP_1.predict_proba(X_test)
y_pred_test_1 = clf_MLP_1.predict(X_test)
#y_predprob_train_1 = clf_MLP_1.predict_proba(X_train)
y_pred_train_1 = clf_MLP_1.predict(X_train)

#print(np.shape(y_test), np.shape(y_pred_test_1))

print('\n-----1 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_1))
print('Test score prediction : ',r2_score(y_test, y_pred_test_1))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_1))
print('Train score prediction : ',r2_score(y_train, y_pred_train_1))


#y_predprob_test_2 = clf_MLP_2.predict_proba(X_test)
y_pred_test_2 = clf_MLP_2.predict(X_test)
#y_predprob_train_2 = clf_MLP_2.predict_proba(X_train)
y_pred_train_2 = clf_MLP_2.predict(X_train)

print('\n-----2 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_2))
print('Test score prediction : ',r2_score(y_test, y_pred_test_2))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_2))
print('Train score prediction : ',r2_score(y_train, y_pred_train_2))


#y_predprob_test_3 = clf_MLP_3.predict_proba(X_test)
y_pred_test_3 = clf_MLP_3.predict(X_test)
#y_predprob_train_3 = clf_MLP_3.predict_proba(X_train)
y_pred_train_3 = clf_MLP_3.predict(X_train)

print('\n-----3 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_3))
print('Test score prediction : ',r2_score(y_test, y_pred_test_3))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_3))
print('Train score prediction : ',r2_score(y_train, y_pred_train_3))


-----1 Layer-----
Test score prediction :  0.6829112243889324
Train score prediction :  0.6742596530910101

-----2 Layer-----
Test score prediction :  0.6038077766135307
Train score prediction :  0.6058381287971915

-----3 Layer-----
Test score prediction :  0.5585956563131589
Train score prediction :  0.5601005116489586
Wall time: 4min 10s


#### adaptive

In [72]:
%%time
max_iterations = 50
alpha = 0.1
layers = 30
learning_rate = 'adaptive'

import warnings
warnings.filterwarnings('ignore')

clf_MLP_1 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = layers, alpha = alpha, max_iter=max_iterations, random_state=42, learning_rate=learning_rate))
clf_MLP_1.fit(X_train, y_train)                       
clf_MLP_2 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42,learning_rate=learning_rate))
clf_MLP_2.fit(X_train, y_train)                          
clf_MLP_3 = make_pipeline(StandardScaler(), MLPClassifier(solver = 'sgd', activation = 'tanh', hidden_layer_sizes = (layers,layers,layers), alpha = alpha, max_iter=max_iterations, random_state=42,learning_rate=learning_rate))
clf_MLP_3.fit(X_train, y_train)


#y_predprob_test_1 = clf_MLP_1.predict_proba(X_test)
y_pred_test_1 = clf_MLP_1.predict(X_test)
#y_predprob_train_1 = clf_MLP_1.predict_proba(X_train)
y_pred_train_1 = clf_MLP_1.predict(X_train)

#print(np.shape(y_test), np.shape(y_pred_test_1))

print('\n-----1 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_1))
print('Test score prediction : ',r2_score(y_test, y_pred_test_1))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_1))
print('Train score prediction : ',r2_score(y_train, y_pred_train_1))


#y_predprob_test_2 = clf_MLP_2.predict_proba(X_test)
y_pred_test_2 = clf_MLP_2.predict(X_test)
#y_predprob_train_2 = clf_MLP_2.predict_proba(X_train)
y_pred_train_2 = clf_MLP_2.predict(X_train)

print('\n-----2 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_2))
print('Test score prediction : ',r2_score(y_test, y_pred_test_2))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_2))
print('Train score prediction : ',r2_score(y_train, y_pred_train_2))


#y_predprob_test_3 = clf_MLP_3.predict_proba(X_test)
y_pred_test_3 = clf_MLP_3.predict(X_test)
#y_predprob_train_3 = clf_MLP_3.predict_proba(X_train)
y_pred_train_3 = clf_MLP_3.predict(X_train)

print('\n-----3 Layer-----')
#print('Test score prediction probabilistic: ',r2_score(y_test, y_predprob_test_3))
print('Test score prediction : ',r2_score(y_test, y_pred_test_3))
#print('Test score prediction probabilistic: ',r2_score(y_train, y_predprob_train_3))
print('Train score prediction : ',r2_score(y_train, y_pred_train_3))


-----1 Layer-----
Test score prediction :  0.8615808082265464
Train score prediction :  0.901488516747554

-----2 Layer-----
Test score prediction :  0.8836895113697611
Train score prediction :  0.9229470771289308

-----3 Layer-----
Test score prediction :  0.878858043302042
Train score prediction :  0.9341670957056115
Wall time: 4min 24s
