### Hypothesis test
- t-test comparing the mean of each batch $i \in \{1, \dots, \text{epochs}\}$ against the initial batch (batch 0) to verify improvement
- Bootstrapping
- Permutation test 

In [None]:
from bayesian_optimization import BayesianOptimization
import pandas as pd 
from botorch.test_functions import Ackley
from botorch.test_functions import Levy 
from botorch.test_functions import Beale
from botorch.test_functions import DixonPrice
from botorch.test_functions import DropWave
from botorch.test_functions import Hartmann
from botorch.test_functions import StyblinskiTang
# Experiment settings; together they determine the size of the dataset.
DIM = 10          # passed as `dim` to both the test function and the optimizer
LOWER = -5        # passed as `lower_bound`
UPPER = 5         # passed as `upper_bound`
N_INITIAL = 50    # passed as `n_init`
BATCH_SIZE = 50   # passed as `batch_size`
EPOCHS = 3        # passed as `epochs`

# Objective: the Styblinski-Tang test function constructed with negate=True.
# The seed and the acquisition-function type are fixed for this run.
optimizer = BayesianOptimization(
    fun=StyblinskiTang(dim=DIM, negate=True),
    batch_size=BATCH_SIZE,
    dim=DIM,
    epochs=EPOCHS,
    n_init=N_INITIAL,
    lower_bound=LOWER,
    upper_bound=UPPER,
    seed=5,
    acqf_type='qUCB',
)

# Run the optimizer, fetch its data and format it into the `full` frame that
# the statistical tests below read (they use its 'Batch' and 'y' columns).
x_max, y_max = optimizer.run()
data = optimizer.get_data()
full = optimizer.format(
    data,
    dim=DIM,
    n_init=N_INITIAL,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)
display(full)

### Permutation test

In [None]:
import numpy as np
import pandas as pd


def permutation_test_max(id2, id1, n_permutations=100000, df=None):
    """Permutation test for the difference between two batches' maxima.

    The ``y`` values of batch ``id2`` (group X) and batch ``id1`` (group Y)
    are pooled, repeatedly shuffled and re-split into groups of the original
    sizes; ``max(X*) - max(Y*)`` is recorded for every shuffle.

    Args:
        id2: Value of the ``Batch`` column selecting group X.
        id1: Value of the ``Batch`` column selecting group Y.
        n_permutations: Number of random shuffles to draw.
        df: DataFrame with ``Batch`` and ``y`` columns. When ``None``, the
            notebook-level ``full`` frame is looked up at call time.

    Returns:
        Tuple ``(perm_diffs, obs_diff, p_value)``: the array of permuted
        differences, the observed ``max(X) - max(Y)``, and the fraction of
        permuted differences that are ``>=`` the observed one (one-sided).

    Raises:
        ValueError: If either batch has no rows in ``df``.
    """
    if df is None:
        # Resolved at call time, so a regenerated `full` is picked up and the
        # function can be defined before `full` exists.
        df = full

    X = df[df['Batch'] == id2][['y']].to_numpy()
    Y = df[df['Batch'] == id1][['y']].to_numpy()
    if X.size == 0 or Y.size == 0:
        raise ValueError(
            f"Both batches must contain data: batch {id2} has {len(X)} rows, "
            f"batch {id1} has {len(Y)} rows"
        )

    # Private generator seeded with 0: same stream as np.random.seed(0), but
    # the caller's global NumPy random state is left untouched.
    rng = np.random.RandomState(0)
    combined = np.concatenate([X, Y])
    n = len(X)
    obs_diff = np.max(X) - np.max(Y)

    perm_diffs = []
    for _ in range(n_permutations):
        rng.shuffle(combined)  # in place, along the first axis
        perm_diffs.append(np.max(combined[:n]) - np.max(combined[n:]))

    # Convert before comparing so the element-wise `>=` is explicit rather
    # than relying on the reflected operator of a list-vs-scalar comparison.
    perm_diffs = np.array(perm_diffs)
    p_value = np.mean(perm_diffs >= obs_diff)
    return perm_diffs, obs_diff, p_value

In [None]:
# Permutation test of batch 3 (id2) against batch 1 (id1) on `full`,
# using 10,000 shuffles.
perm_diffs, obs_diff_perm, p_value = permutation_test_max(
    id2=3,
    id1=1,
    n_permutations=10000,
    df=full,
)

# Report the observed statistic and its p-value, one per line.
print(
    f"Permutation Observed difference of maxima (normalized): {obs_diff_perm}",
    f"Permutation p-value: {p_value}",
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the permuted differences, with the observed difference drawn
# as a dashed red vertical line for comparison.
plt.hist(perm_diffs, bins=10, alpha=0.7, edgecolor='k')
plt.axvline(
    obs_diff_perm,
    color='r',
    linestyle='--',
    linewidth=2,
    label='Observed difference',
)

# Axis labels and title, then the legend (it picks up the axvline label).
plt.xlabel('Difference of Maxima')
plt.ylabel('Frequency')
plt.title('Distribution of Permuted Differences of Maxima')
plt.legend()
plt.show()

### T-test

In [None]:
import pandas as pd
from scipy.stats import ttest_ind

# One-sided two-sample t-tests: for every batch other than batch 0, test
# whether its mean `y` is greater than the mean `y` of batch 0.
batches = full['Batch'].unique()
batch_0 = full.loc[full['Batch'] == 0, 'y']
ttest_results = []

for t in batches:
    if t == 0:
        continue  # batch 0 is the reference group, not compared with itself

    batch_t = full.loc[full['Batch'] == t, 'y']
    t_stat, p_value = ttest_ind(batch_t, batch_0, alternative='greater')
    ttest_results.append(
        {
            'Batch': f'Batch {t} vs Batch 0',
            't-statistic': round(t_stat, 2),
            'p-value': round(p_value, 4),
        }
    )

# Collect the per-batch results into a table and print it.
results_df = pd.DataFrame(ttest_results)

print(results_df)

### Bootstrapping

In [None]:
import numpy as np
import pandas as pd



def get_batches(id1, id2, data):
    """Extract the ``y`` values of two batches as column arrays.

    Args:
        id1: Value of the ``Batch`` column selecting the first batch.
        id2: Value of the ``Batch`` column selecting the second batch.
        data: DataFrame containing at least ``Batch`` and ``y`` columns.

    Returns:
        Tuple ``(y_id1, y_id2)`` of 2-D NumPy arrays with one column each,
        holding the ``y`` values of batch ``id1`` and batch ``id2``.
    """
    subset = data[['Batch', 'y']]

    def _batch_y(batch_id):
        # Rows of the requested batch; the double brackets keep `y` as a
        # one-column frame, so the result is 2-D.
        return subset[subset['Batch'] == batch_id][['y']].to_numpy()

    return _batch_y(id1), _batch_y(id2)


def bootstrap_diff_max(X, Y, n_bootstrap=1000):
    """Bootstrap the difference of maxima ``max(X) - max(Y)`` and print it.

    Each sample is resampled with replacement at its own size
    ``n_bootstrap`` times; the 2.5th and 97.5th percentiles of the resampled
    differences are reported as a 95% percentile interval.

    Args:
        X: Array-like of observations for the first group; flattened to 1-D.
        Y: Array-like of observations for the second group; flattened to 1-D.
        n_bootstrap: Number of bootstrap resamples.

    Returns:
        None. The observed difference and the interval are printed.

    Raises:
        ValueError: If ``X`` or ``Y`` contains no values.
    """
    # reshape(-1) rather than squeeze(): squeeze turns a single-observation
    # (1, 1) array into a 0-d array, on which len() raises TypeError.
    X = np.asarray(X).reshape(-1)
    Y = np.asarray(Y).reshape(-1)
    if X.size == 0 or Y.size == 0:
        raise ValueError("X and Y must each contain at least one value")

    # Private generator seeded with 0: same stream as np.random.seed(0), but
    # the caller's global NumPy random state is left untouched.
    rng = np.random.RandomState(0)
    max_diffs = []

    for _ in range(n_bootstrap):
        X_resample = rng.choice(X, size=len(X), replace=True)
        Y_resample = rng.choice(Y, size=len(Y), replace=True)
        max_diffs.append(np.max(X_resample) - np.max(Y_resample))

    # Calculating the confidence interval
    max_diffs_bootstrap = np.array(max_diffs)
    obs_diff = np.max(X) - np.max(Y)
    lower_bound_bootstrap = np.percentile(max_diffs_bootstrap, 2.5)
    upper_bound_bootstrap = np.percentile(max_diffs_bootstrap, 97.5)
    print(f"Bootstrap Observed difference of maxima: {obs_diff}")
    print(f"Bootstrap 95% Confidence interval: [{lower_bound_bootstrap}, {upper_bound_bootstrap}]")

In [None]:
# y1 holds the `y` values of batch 1 and y2 those of batch 0, so the bootstrap
# below reports max(batch 1) - max(batch 0).
# NOTE(review): this passes `data`, while the other tests read `full`;
# confirm that `data` carries the 'Batch' and 'y' columns get_batches needs.
y1, y2 = get_batches(id1=1, id2=0, data=data)
print(y1, y2)

bootstrap_diff_max(X=y1, Y=y2)