In [86]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, plot_roc_curve,\
                             precision_recall_curve, plot_precision_recall_curve, f1_score, average_precision_score,\
                             hinge_loss, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, average_precision_score, f1_score,\
                            log_loss, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import label_binarize, LabelBinarizer, LabelEncoder, OneHotEncoder
from sklearn.datasets import fetch_openml
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import RMSprop
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical

# Data load

In [22]:
fmnist = fetch_openml("Fashion-MNIST", data_home="./fmnist", cache=True)
classes = [str(x) for x in range(0, 10)]
num_classes = len(classes)

def mk_dataset(total, fmnist=fmnist, classes=classes):
    samples = int(fmnist.data.shape[0]*total)
    return resample(fmnist.data, fmnist.target, n_samples=samples)

def plot_imgs(x, y, w=28, h=28):
    plt.figure(figsize=(10,10))
    for i in range(min(25, x.shape[0])):
        plt.subplot(5,5,i+1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        img = x[i]
        img = img.reshape((w, h))
        plt.imshow(img)
        plt.xlabel(y[i])
    plt.show()    

# Model evaluation

In [128]:
def binarized_scorer(metric, **kwargs):
    lb = LabelBinarizer()
    def score(y_test, y_pred, metric=metric, lb=lb, kwargs=kwargs):
        lb.fit(y_test)
        y_test = lb.transform(y_test)
        y_pred = lb.transform(y_pred)
        return metric(y_test, y_pred, **kwargs)
    return make_scorer(score)

def mk_test(clf, name, gpu=False):
    def run_test(X, Y, clf=clf, name=name):
        scoring = {
            "accuracy":     binarized_scorer(accuracy_score), 
            "f1_score":     binarized_scorer(f1_score, average='macro'), 
            "log_loss":     binarized_scorer(log_loss), 
            "precision":    binarized_scorer(precision_score, average='macro'), 
            "recall":       binarized_scorer(recall_score, average='macro'), 
            "roc_auc":      binarized_scorer(roc_auc_score, average='macro'),
            # to je pole pod Precision-Recall, albo jakaś średnia. nie wiem.
            "prec_rec_auc": binarized_scorer(average_precision_score, average='macro') 
        }
        n_jobs = None if gpu else 8
        scores = cross_validate(clf, X, Y, cv=5, n_jobs=n_jobs, scoring=scoring)
        del scores['fit_time']
        del scores['score_time']
        keys = list(scores.keys())
        for key in keys:
            scores[key.replace('test_', '')] = [np.mean(scores.pop(key))]
        df = pd.DataFrame.from_dict(scores)
        df.insert(loc=0, column='Name', value=name)
        return df
    return run_test

# Models

In [141]:
def mk_adaboost(depth=5, n=100, seed=1):
    return AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=depth),
        n_estimators=n,
        random_state=seed)

class MyLittleKerasClassifier(KerasClassifier):
    def predict(self, X):
        y_pred = KerasClassifier.predict(self, X)
        return to_categorical(y_pred, num_classes)

def mk_mlp(epochs=10):
    def build():
        model = Sequential()
        # starannie dobrane wartosci, wiem co robie
        model.add(Dense(256, activation='relu', input_shape=(28*28,)))
        model.add(Dropout(0.2))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(num_classes, activation='softmax'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    return MyLittleKerasClassifier(build_fn=build, epochs=epochs)

def mk_cnn(epochs=10):
    def build():
        model = Sequential()
        model.add(Conv2D(filters=32, kernel_size=(3, 3), input_shape=(28,28, 1), activation='relu'))
        model.add(BatchNormalization())
        model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
        model.add(BatchNormalization())
        model.add(Conv2D(32, kernel_size = 5, strides=2, padding='same', activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.4))
    
        model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
        model.add(BatchNormalization())
        model.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))
        model.add(BatchNormalization())
        model.add(Conv2D(64, kernel_size = 5, strides=2, padding='same', activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.4))
    
        model.add(Conv2D(128, kernel_size = 4, activation='relu'))
        model.add(BatchNormalization())
        model.add(Flatten())
        model.add(Dropout(0.4))
        model.add(Dense(10, activation='softmax'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    return MyLittleKerasClassifier(build_fn=build, epochs=epochs)

In [143]:
def test_cnn(data_sz, duration):
    name = 'CNN | {}% of data'.format(data_sz*100.0)
    x, y = mk_dataset(data_sz)
    y = to_categorical(y, num_classes)
    x /= 255.0
    x = x.reshape((x.shape[0], 28, 28, 1))
    return mk_test(mk_cnn(), name, gpu=True)(x, y)

def test_mlp(data_sz, duration):
    name = 'MLP | {}% of data'.format(data_sz*100.0)
    x, y = mk_dataset(data_sz)
    y = to_categorical(y, num_classes)
    x /= 255.0
    return mk_test(mk_mlp(), name, gpu=True)(x, y)

def test_adaboost(data_sz, duration):
    name = 'AdaBoost | {}% of data'.format(data_sz*100.0)
    return mk_test(mk_adaboost(), name)(*mk_dataset(data_sz))

In [144]:
tests = [method(data_sz, None) for method in [test_adaboost, test_mlp, test_cnn] for data_sz in [0.1, 0.5, 1.0]]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10


Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [147]:
df = pd.concat(tests)
df.to_pickle("3a.pkl")
df

Unnamed: 0,Name,accuracy,f1_score,log_loss,precision,recall,roc_auc,prec_rec_auc
0,AdaBoost | 10.0% of data,0.749,0.751889,8.669233,0.760712,0.749163,0.860646,0.619367
0,AdaBoost | 50.0% of data,0.742371,0.743587,8.898176,0.747366,0.742739,0.857054,0.615169
0,AdaBoost | 100.0% of data,0.721543,0.72052,9.617569,0.722992,0.721553,0.845308,0.586956
0,MLP | 10.0% of data,0.837286,0.834244,5.619952,0.841715,0.838094,0.910006,0.732207
0,MLP | 50.0% of data,0.874686,0.873188,4.328202,0.876692,0.875419,0.930747,0.787478
0,MLP | 100.0% of data,0.883429,0.881977,4.026235,0.886163,0.883997,0.93552,0.80215
0,CNN | 10.0% of data,0.867143,0.862843,4.588723,0.868083,0.864648,0.924962,0.773621
0,CNN | 50.0% of data,0.923371,0.923179,2.646657,0.924378,0.92317,0.957329,0.864555
0,CNN | 100.0% of data,0.9418,0.941464,2.010157,0.942539,0.941428,0.967485,0.895273
