In [2]:
from keras.datasets import mnist
import numpy as np
from sklearn.model_selection import train_test_split

(train_X, train_y), (test_X, test_y) = mnist.load_data()

vect_size = train_X.shape[1]*train_X.shape[2]

train_X = train_X.astype("float64")/255.0
train_X = train_X.reshape((train_X.shape[0], vect_size))

train_X, _, train_y, _ = train_test_split(train_X, train_y, train_size=5000, random_state=123, shuffle=True, stratify=train_y)

test_X = test_X.astype("float64")/255.0
test_X = test_X.reshape((test_X.shape[0], vect_size))

train_y.shape




(5000,)

In [3]:
from sklearn.neural_network import MLPClassifier
MLPClassifier().get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

param_grid = [                                  # parametros para MLP
    {"hidden_layer_sizes": [(300,), (800,)],    # capas ocultas
     "batch_size": [128, 512]}                  # tamaño lotes 
]

mlp = MLPClassifier(                # multi layer perceptron MLP
    random_state=123,               # controlamos random
    batch_size=512,                 # lote tam
    activation="relu",              # funcion de activacion
    solver="adam",                  # ...
    max_iter=50,
    early_stopping=True,
    verbose=True
)

cv = RepeatedStratifiedKFold(       # validacion cruzada 
    n_splits=5,                     # nº particiones
    n_repeats=3,                    # nº repeticiones
    random_state=123                # random seed
)                                 

search = GridSearchCV(
    estimator=mlp,                  # algoritmo
    param_grid=param_grid,          # parametros sobre los que experimentar
    scoring="accuracy",             # parametro -> precision
    cv=cv,                          # validacion cruzada
    n_jobs=10                       # nº processors
)       

search.fit(X=train_X, y=train_y)

Iteration 1, loss = 0.76822771
Validation score: 0.868000
Iteration 2, loss = 0.29523788
Validation score: 0.886000
Iteration 3, loss = 0.23792212
Validation score: 0.896000
Iteration 4, loss = 0.17508434
Validation score: 0.908000
Iteration 5, loss = 0.13764777
Validation score: 0.922000
Iteration 6, loss = 0.11052168
Validation score: 0.922000
Iteration 7, loss = 0.09101708
Validation score: 0.926000
Iteration 8, loss = 0.07027773
Validation score: 0.928000
Iteration 9, loss = 0.05248032
Validation score: 0.928000
Iteration 10, loss = 0.04241682
Validation score: 0.932000
Iteration 11, loss = 0.03124058
Validation score: 0.930000
Iteration 12, loss = 0.02536572
Validation score: 0.924000
Iteration 13, loss = 0.02175076
Validation score: 0.934000
Iteration 14, loss = 0.01817904
Validation score: 0.934000
Iteration 15, loss = 0.01433103
Validation score: 0.928000
Iteration 16, loss = 0.01209373
Validation score: 0.928000
Iteration 17, loss = 0.01046789
Validation score: 0.930000
Iterat

In [6]:
import pandas as pd

# resultados del gridsearch

results_df = pd.DataFrame(search.cv_results_)
results_df = results_df.sort_values(by=["rank_test_score"])
results_df = results_df.set_index(
    results_df["params"].apply(lambda x: "_".join(str(val) for val in x.values()))
).rename_axis("setting")
results_df[["params", "rank_test_score", "mean_test_score", "std_test_score"]]

Unnamed: 0_level_0,params,rank_test_score,mean_test_score,std_test_score
setting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"128_(800,)","{'batch_size': 128, 'hidden_layer_sizes': (800,)}",1,0.935067,0.009801
"128_(300,)","{'batch_size': 128, 'hidden_layer_sizes': (300,)}",2,0.929533,0.0075
"512_(800,)","{'batch_size': 512, 'hidden_layer_sizes': (800,)}",3,0.926533,0.007873
"512_(300,)","{'batch_size': 512, 'hidden_layer_sizes': (300,)}",4,0.919733,0.011084


In [7]:
from itertools import combinations
from math import factorial

import numpy as np
from scipy.stats import t


def corrected_std(differences, n_train, n_test):
    """Corrects standard deviation using Nadeau and Bengio's approach.

    Parameters
    ----------
    differences : ndarray of shape (n_samples,)
        Vector containing the differences in the score metrics of two models.
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    corrected_std : float
        Variance-corrected standard deviation of the set of differences.
    """
    # kr = k times r, r times repeated k-fold crossvalidation,
    # kr equals the number of times the model was evaluated
    kr = len(differences)
    corrected_var = np.var(differences, ddof=1) * (1 / kr + n_test / n_train)
    corrected_std = np.sqrt(corrected_var)
    return corrected_std


def compute_corrected_ttest(differences, df, n_train, n_test):
    """Computes right-tailed paired t-test with corrected variance.

    Parameters
    ----------
    differences : array-like of shape (n_samples,)
        Vector containing the differences in the score metrics of two models.
    df : int
        Degrees of freedom.
    n_train : int
        Number of samples in the training set.
    n_test : int
        Number of samples in the testing set.

    Returns
    -------
    t_stat : float
        Variance-corrected t-statistic.
    p_val : float
        Variance-corrected p-value.
    """
    mean = np.mean(differences)
    std = corrected_std(differences, n_train, n_test)
    t_stat = mean / std
    p_val = t.sf(np.abs(t_stat), df)  # right-tailed t-test
    return t_stat, p_val

model_scores = results_df.filter(regex=r"split\d*_test_score")

n = model_scores.shape[0]
df = n - 1
n_train = len(list(cv.split(train_X, train_y))[0][0])
n_test = len(list(cv.split(train_X, train_y))[0][1])



n_comparisons = factorial(len(model_scores)) / (
    factorial(2) * factorial(len(model_scores) - 2)
)
pairwise_t_test = []

for model_i, model_k in combinations(range(len(model_scores)), 2):
    model_i_scores = model_scores.iloc[model_i].values
    model_k_scores = model_scores.iloc[model_k].values
    differences = model_i_scores - model_k_scores
    t_stat, p_val = compute_corrected_ttest(differences, df, n_train, n_test)
    p_val *= n_comparisons  # implement Bonferroni correction
    # Bonferroni can output p-values higher than 1
    p_val = 1 if p_val > 1 else p_val
    pairwise_t_test.append(
        [model_scores.index[model_i], model_scores.index[model_k], t_stat, p_val]
    )

pairwise_comp_df = pd.DataFrame(
    pairwise_t_test, columns=["model_1", "model_2", "t_stat", "p_val"]
).round(3)

pairwise_comp_df

Unnamed: 0,model_1,model_2,t_stat,p_val
0,"128_(800,)","128_(300,)",1.558,0.651
1,"128_(800,)","512_(800,)",1.273,0.878
2,"128_(800,)","512_(300,)",2.466,0.271
3,"128_(300,)","512_(800,)",0.711,1.0
4,"128_(300,)","512_(300,)",1.789,0.515
5,"512_(800,)","512_(300,)",1.17,0.979


In [8]:
model_scores

Unnamed: 0_level_0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score
setting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
"128_(800,)",0.914,0.936,0.936,0.947,0.932,0.94,0.948,0.92,0.929,0.949,0.942,0.928,0.927,0.939,0.939
"128_(300,)",0.923,0.934,0.938,0.935,0.924,0.934,0.937,0.916,0.925,0.941,0.929,0.918,0.926,0.925,0.938
"512_(800,)",0.939,0.932,0.927,0.936,0.924,0.931,0.932,0.91,0.922,0.929,0.918,0.917,0.934,0.918,0.929
"512_(300,)",0.914,0.918,0.914,0.93,0.91,0.924,0.928,0.907,0.911,0.939,0.896,0.916,0.928,0.93,0.931


In [9]:
# initialize random variable
t_post = t(
    df, loc=np.mean(differences), scale=corrected_std(differences, n_train, n_test)
)

rope_interval = [-0.01, 0.01]

pairwise_bayesian = []

for model_i, model_k in combinations(range(len(model_scores)), 2):
    model_i_scores = model_scores.iloc[model_i].values
    model_k_scores = model_scores.iloc[model_k].values
    differences = model_i_scores - model_k_scores
    t_post = t(
        df, loc=np.mean(differences), scale=corrected_std(differences, n_train, n_test)
    )
    worse_prob = t_post.cdf(rope_interval[0])
    better_prob = 1 - t_post.cdf(rope_interval[1])
    rope_prob = t_post.cdf(rope_interval[1]) - t_post.cdf(rope_interval[0])

    pairwise_bayesian.append([worse_prob, better_prob, rope_prob])

pairwise_bayesian_df = pd.DataFrame(
    pairwise_bayesian, columns=["worse_prob", "better_prob", "rope_prob"]
).round(3)

pairwise_comp_df = pairwise_comp_df.join(pairwise_bayesian_df)
pairwise_comp_df

Unnamed: 0,model_1,model_2,t_stat,p_val,worse_prob,better_prob,rope_prob
0,"128_(800,)","128_(300,)",1.558,0.651,0.011,0.149,0.84
1,"128_(800,)","512_(800,)",1.273,0.878,0.035,0.42,0.545
2,"128_(800,)","512_(300,)",2.466,0.271,0.013,0.773,0.214
3,"128_(300,)","512_(800,)",0.711,1.0,0.027,0.098,0.875
4,"128_(300,)","512_(300,)",1.789,0.515,0.018,0.487,0.495
5,"512_(800,)","512_(300,)",1.17,0.979,0.031,0.31,0.658
