# Assignment 3 - Classification of Image Data

## Task 1. Data Preprocessing

### 1.1 Importing Modules and Dataset

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import math

# Mount Google Drive at /content/gdrive; the dataset CSVs are read from there below.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the Sign-MNIST CSVs; change this one
# constant to relocate the data.
DATA_DIR = '/content/gdrive/MyDrive/asg3_comp551/archive'
test_df = pd.read_csv(f'{DATA_DIR}/sign_mnist_test.csv')
train_df = pd.read_csv(f'{DATA_DIR}/sign_mnist_train.csv')
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line after each summary.
test_df.info()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7172 entries, 0 to 7171
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 43.0 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27455 entries, 0 to 27454
Columns: 785 entries, label to pixel784
dtypes: int64(785)
memory usage: 164.4 MB
None


In [None]:
# Preview the first rows of each split: training first, then test.
for split_df in (train_df, test_df):
    print(split_df.head())

   label  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0      3     107     118     127     134     139     143     146     150   
1      6     155     157     156     156     156     157     156     158   
2      2     187     188     188     187     187     186     187     188   
3      2     211     211     212     212     211     210     211     210   
4     13     164     167     170     172     176     179     180     184   

   pixel9  ...  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  \
0     153  ...       207       207       207       207       206       206   
1     158  ...        69       149       128        87        94       163   
2     187  ...       202       201       200       199       198       199   
3     210  ...       235       234       233       231       230       226   
4     185  ...        92       105       105       108       133       163   

   pixel781  pixel782  pixel783  pixel784  
0       206       204       20

### 1.2 Data Processing - Separation of Features and Labels with Vectorization

In [None]:
# Separate the label column from the pixel (feature) columns of each split.
train_label, test_label = train_df['label'], test_df['label']
train_feature = train_df.drop(columns=['label'])
test_feature = test_df.drop(columns=['label'])

In [None]:
# Shape verification of the label vectors (one entry per sample).
for label_column in (train_label, test_label):
    print(label_column.shape)

(27455,)
(7172,)


In [None]:
# Convert the feature frames to (N x D) arrays, D = 784, and verify the shapes.
X_train, X_test = train_feature.values, test_feature.values
for feature_matrix in (X_train, X_test):
    print(feature_matrix.shape)

(27455, 784)
(7172, 784)


In [None]:
from sklearn.preprocessing import LabelBinarizer

# Learn the class -> one-hot column mapping from the training labels only and
# reuse that same mapping for the test labels. Calling fit_transform on the test
# labels re-derived the columns from whichever classes occur in the test split,
# so train and test columns were aligned only if both splits happened to contain
# exactly the same set of classes.
lb = LabelBinarizer()
y_train = lb.fit_transform(train_label)
y_test = lb.transform(test_label)

In [None]:
# Check that the one-hot encoding succeeded by comparing the printed matrices
# with the offered data-processing guidance.
for one_hot in (y_train, y_test):
    print(one_hot)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]]


In [None]:
# Each one-hot matrix should be (number of samples, number of classes).
for one_hot in (y_train, y_test):
    print(one_hot.shape)

(27455, 24)
(7172, 24)


### 1.3 Normalization

Now we normalize the data to prevent scaling bias. This can be done easily with NumPy's broadcasting mechanism.

In [None]:
# Work on a float64 copy so the arithmetic below is done in floating point.
X_train = X_train.astype(np.float64)

In [None]:
# centralization
# Compute the per-pixel mean from the untouched feature frame instead of from
# X_train itself. The previous in-place `X_train -= central_mean` made this cell
# non-idempotent: running it a second time recomputed a ~0 mean from the already
# centred data, and that wrong mean was later subtracted from X_test.
central_mean = np.mean(train_feature.values, axis=0)
X_train = train_feature.values.astype('float64') - central_mean

Regarding normalization: according to the provided reference (CS231n), min-max normalization is only worthwhile when the features have very different scales but are equally important. Here we observe that (1) all pixels already share approximately the same range (0 to 255), and (2) the pixels are not equally important, since corner pixels usually carry little information. We therefore apply only the standard-deviation version of normalization.

In [None]:
# Normalization - std version
# Standard deviation does not change when a constant is subtracted, so take it
# from the raw training features; unlike the in-place `X_train /= std`, this cell
# then yields the same statistics however many times it is run.
normalizing_std = np.std(train_feature.values, axis=0)
# A pixel that never varies in the training set has std 0; dividing by it would
# produce NaN/inf, so such columns are left unscaled.
normalizing_std[normalizing_std == 0] = 1.0
X_train = (train_feature.values - central_mean) / normalizing_std

We further normalize the testing set.

In [None]:
# Standardize the test features with the statistics computed on the training
# set. Starting from test_feature (rather than updating X_test in place) keeps
# the cell idempotent: re-running it no longer normalizes X_test twice.
X_test = (test_feature.values.astype('float64') - central_mean) / normalizing_std

### Variables

From now on, you may use the following:
1. `X_train` - training feature matrix
2. `X_test` - testing feature matrix
3. `y_train` - training labels (one-hot encoded)
4. `y_test` - testing labels (one-hot encoded)
5. `central_mean` - centralization mean
6. `normalizing_std` - normalizing std

## Task 2

### 2.1 MLP

Most classes and functions are borrowed from NumpyDeepMLP.ipynb. We mainly modified the MLP class so that it accepts a more flexible and concise input.

In [None]:
# define the parent class of layers used in NN
class NeuralNetLayer:
    """Common interface shared by every layer of the network.

    Both attributes start as ``None``; subclasses fill in ``gradient`` (used for
    gradient calculation) and ``parameters`` (the layer's own parameters).
    """

    def __init__(self):
        self.gradient, self.parameters = None, None

    def forward(self, x):
        """Compute the layer output for ``x``; must be overridden."""
        raise NotImplementedError

    def backward(self, gradient):
        """Propagate ``gradient`` backwards through the layer; must be overridden."""
        raise NotImplementedError

# parameter layer
class LinearLayer(NeuralNetLayer):
    """Fully connected layer computing ``x @ w.T + b`` for a batch ``x``.

    Args:
        input_size: number of input features (columns of ``w``).
        output_size: number of output units (rows of ``w``).

    After ``backward`` the attribute ``gradient`` holds ``[dw, db]``, already
    averaged over the batch and shaped exactly like ``w`` and ``b``, so an update
    of the form ``p -= lr * g`` can be applied directly.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.ni = input_size  # the shape of the W matrix
        self.no = output_size
        # Gaussian initialisation scaled by 1/sqrt(fan_in).
        self.w = np.random.randn(output_size, input_size) / np.sqrt(input_size)
        self.b = np.zeros(output_size)  # bias starts at zero
        self.cur_input = None  # input of the latest forward pass, needed by backward
        # The optimizer updates these arrays in place, so they must stay the very
        # same objects as self.w / self.b.
        self.parameters = [self.w, self.b]

    def forward(self, x):
        """Return the affine map of ``x`` (batch_size x input_size) as batch_size x output_size."""
        self.cur_input = x
        # A plain matrix product keeps the result 2-D. The former
        # `(w[None] @ x[:, :, None]).squeeze()` also dropped the batch axis when
        # batch_size == 1 and the feature axis when output_size == 1.
        return x @ self.w.T + self.b

    def backward(self, gradient):
        """Store the parameter gradients and return the gradient w.r.t. the input.

        Args:
            gradient: batch_size x output_size gradient of the loss w.r.t. this
                layer's output.
        """
        assert self.cur_input is not None, "Must call forward before backward"
        batch_size = gradient.shape[0]
        # Batch mean of the per-sample outer products, computed as one matrix
        # product instead of materialising a batch x out x in array.
        dw = gradient.T @ self.cur_input / batch_size
        # Averaged like dw (it used to be summed, giving the bias a different scale).
        db = gradient.mean(axis=0)
        self.gradient = [dw, db]
        return gradient.dot(self.w)

# activation layer - hidden
class LeakyReLULayer(NeuralNetLayer):
    """Leaky ReLU activation with a fixed slope of 0.08 for non-positive inputs."""

    def forward(self, x):
        """Cache the element-wise derivative for ``x`` and return the activated values."""
        is_positive = x > 0
        self.gradient = np.where(is_positive, 1.0, 0.08)
        damped = 0.08 * x
        return np.maximum(damped, x)

    def backward(self, gradient):
        """Scale the incoming ``gradient`` by the derivative cached in ``forward``."""
        local_derivative = self.gradient
        assert local_derivative is not None, "Must call forward before backward"
        # backprop - gradient passed on to the previous layer
        return gradient * local_derivative

class ReLULayer(NeuralNetLayer):
    """ReLU activation: negative inputs become 0, positive inputs pass through."""

    def forward(self, x):
        """Cache the 0/1 derivative mask for ``x`` and return ``max(0, x)``."""
        active = x > 0
        self.gradient = np.where(active, 1.0, 0.0)
        return np.maximum(0, x)

    def backward(self, gradient):
        """Multiply the incoming ``gradient`` by the mask cached in ``forward``."""
        mask = self.gradient
        assert mask is not None, "Must call forward before backward"
        return gradient * mask

# activation layer - output
class SoftmaxOutputLayer(NeuralNetLayer):
    """Softmax output layer whose backward pass takes the one-hot targets."""

    def __init__(self):
        super().__init__()
        self.cur_probs = None  # probabilities from the latest forward pass

    def forward(self, x):
        """Return row-wise softmax probabilities for the logits ``x`` (batch x classes)."""
        # Subtracting each row's maximum leaves the softmax unchanged but keeps
        # exp() from overflowing to inf (and the division from yielding NaN).
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=-1, keepdims=True)
        self.cur_probs = probs
        return probs

    def backward(self, target):
        """Return ``probs - target``, the gradient passed to the preceding layer."""
        assert self.cur_probs is not None, "Must call forward before backward"
        return self.cur_probs - target

# define the parent class of layers used in NN
class NeuralNetLayer:
    """Base class for network layers; defines the forward/backward contract."""

    def __init__(self):
        # `gradient` is for gradient calculation, `parameters` stores the
        # corresponding parameters; both are unset until a subclass fills them.
        self.parameters = self.gradient = None

    def forward(self, x):
        """Calculate the layer output; concrete layers override this."""
        raise NotImplementedError

    def backward(self, gradient):
        """Back-propagate ``gradient``; concrete layers override this."""
        raise NotImplementedError

# parameter layer
class LinearLayer(NeuralNetLayer):
    """Fully connected layer computing ``x @ w.T + b`` for a batch ``x``.

    Args:
        input_size: number of input features (columns of ``w``).
        output_size: number of output units (rows of ``w``).

    After ``backward`` the attribute ``gradient`` holds ``[dw, db]``, already
    averaged over the batch and shaped exactly like ``w`` and ``b``, so an update
    of the form ``p -= lr * g`` can be applied directly.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.ni = input_size  # the shape of the W matrix
        self.no = output_size
        # Glorot/Xavier normal initialisation: variance 2 / (fan_in + fan_out).
        # np.random.normal takes a standard deviation, so the square root is
        # required; passing the variance itself made the weights far too small.
        glorot_std = np.sqrt(2.0 / (input_size + output_size))
        self.w = np.random.normal(0, glorot_std, size=(output_size, input_size))
        self.b = np.zeros(output_size)  # bias starts at zero
        self.cur_input = None  # input of the latest forward pass, needed by backward
        # The optimizer updates these arrays in place, so they must stay the very
        # same objects as self.w / self.b.
        self.parameters = [self.w, self.b]

    def forward(self, x):
        """Return the affine map of ``x`` (batch_size x input_size) as batch_size x output_size."""
        self.cur_input = x
        # The bias is added again: it was commented out here while backward still
        # produced a gradient for it, so the bias was trained but never used.
        # A plain matrix product also keeps the result 2-D, whereas `.squeeze()`
        # dropped the batch axis for batch_size == 1 (and the feature axis for
        # output_size == 1).
        return x @ self.w.T + self.b

    def backward(self, gradient):
        """Store the parameter gradients and return the gradient w.r.t. the input.

        Args:
            gradient: batch_size x output_size gradient of the loss w.r.t. this
                layer's output.
        """
        assert self.cur_input is not None, "Must call forward before backward"
        batch_size = gradient.shape[0]
        # Batch mean of the per-sample outer products, computed as one matrix
        # product instead of materialising a batch x out x in array.
        dw = gradient.T @ self.cur_input / batch_size
        # Averaged like dw (it used to be summed, giving the bias a different scale).
        db = gradient.mean(axis=0)
        self.gradient = [dw, db]
        return gradient.dot(self.w)

# activation layer - hidden
class LeakyReLULayer(NeuralNetLayer):
    """Leaky ReLU activation.

    Args:
        negative_slope: factor applied to non-positive inputs. Defaults to 0.08,
            the value that used to be hard-coded, so ``LeakyReLULayer()`` behaves
            as before.
    """

    def __init__(self, negative_slope=0.08):
        super().__init__()
        self.negative_slope = negative_slope

    def forward(self, x):
        """Cache the element-wise derivative for ``x`` and return the activated values."""
        positive = x > 0
        self.gradient = np.where(positive, 1.0, self.negative_slope)
        # Selecting by sign (rather than max(slope * x, x)) stays correct for any
        # slope, including values above 1.
        return np.where(positive, x, self.negative_slope * x)

    def backward(self, gradient):
        """Scale the incoming ``gradient`` by the derivative cached in ``forward``."""
        assert self.gradient is not None, "Must call forward before backward"
        return gradient * self.gradient  # backprop - gradient passed to the previous layer

class ReLULayer(NeuralNetLayer):
    """Rectified linear unit: keeps positive inputs, zeroes the rest."""

    def forward(self, x):
        """Store the derivative (1 where ``x > 0``, else 0) and return the rectified input."""
        derivative = np.where(x > 0, 1.0, 0.0)
        self.gradient = derivative
        rectified = np.maximum(0, x)
        return rectified

    def backward(self, gradient):
        """Return the incoming ``gradient`` gated by the stored derivative."""
        stored = self.gradient
        assert stored is not None, "Must call forward before backward"
        return gradient * stored

# activation layer - output
class SoftmaxOutputLayer(NeuralNetLayer):
    """Softmax output layer whose backward pass takes the one-hot targets."""

    def __init__(self):
        super().__init__()
        self.cur_probs = None  # probabilities from the latest forward pass

    def forward(self, x):
        """Return row-wise softmax probabilities for the logits ``x`` (batch x classes)."""
        # Subtracting each row's maximum leaves the softmax unchanged but keeps
        # exp() from overflowing to inf (and the division from yielding NaN).
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        probs = exps / np.sum(exps, axis=-1, keepdims=True)
        self.cur_probs = probs
        return probs

    def backward(self, target):
        """Return ``probs - target``, the gradient passed to the preceding layer.

        The two debug prints that dumped the whole probability matrix on every
        backward pass were removed.
        """
        assert self.cur_probs is not None, "Must call forward before backward"
        return self.cur_probs - target

In [None]:
class MLP:
    """Multi-layer perceptron: linear layers, hidden activations, softmax output.

    Args:
        input_size: feature shape (flattened).
        layer_sizes: sequence with the number of nodes of each hidden layer;
            empty for a model without hidden layers.
        output_size: output shape (flattened), i.e. the number of classes.
        activation_function: layer class instantiated (without arguments) after
            every linear layer except the last one.
        optimizer: object exposing ``step()``; when omitted a
            ``GradientDescentOptimizer(self, 0.001)`` is created.
    """

    def __init__(self, input_size, layer_sizes, output_size, activation_function=ReLULayer,optimizer = None):
        self.layers = []
        # list(...) lets callers pass a tuple such as (32, 32) as well as a list.
        sizes = [input_size] + list(layer_sizes) + [output_size]
        n_linear = len(sizes) - 1
        for i in range(n_linear):
            self.layers.append(LinearLayer(sizes[i], sizes[i + 1]))
            if i < n_linear - 1:  # no hidden activation after the last linear layer
                self.layers.append(activation_function())
        self.layers.append(SoftmaxOutputLayer())  # multi-class output activation
        self.optimizer = optimizer if optimizer is not None else GradientDescentOptimizer(self, 0.001)

    def forward(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, gradient):
        """Back-propagate through the layers in reverse order.

        ``gradient`` is handed to the last layer first; for the softmax output
        layer this is the one-hot target matrix.
        """
        for layer in self.layers[::-1]:
            gradient = layer.backward(gradient)
        return gradient

    def fit(self, X, y, lr, max_iter, verbose=False):
        """Train on the full batch for ``max_iter`` iterations.

        The class previously defined ``fit`` twice; the second definition
        silently replaced the first and dropped its loss tracking. This single
        version keeps the loss history and makes the per-iteration print opt-in.

        Args:
            X: feature matrix.
            y: one-hot target matrix.
            lr: learning rate, assigned to the optimizer if it has an ``lr`` attribute.
            max_iter: number of forward/backward/update iterations.
            verbose: print the loss of every iteration when true.

        Returns:
            List with the loss measured before each update.
        """
        if hasattr(self.optimizer, 'lr'):
            self.optimizer.lr = lr
        losses = []
        for _ in range(max_iter):
            predictions = self.forward(X)
            # Reuse the predictions just computed rather than running a second
            # forward pass through compute_loss.
            loss = self._cross_entropy(predictions, y)
            losses.append(loss)
            self.backward(y)
            self.optimizer.step()
            if verbose:
                print(f"Loss: {loss}")
        return losses

    def predict(self, X):
        """Return the index of the most probable class for every row of ``X``."""
        predictions = self.forward(X)
        return np.argmax(predictions, axis=1)

    def compute_loss(self, X, y):
        """Return the mean cross-entropy of the one-hot targets ``y`` on ``X``."""
        return self._cross_entropy(self.forward(X), y)

    @staticmethod
    def _cross_entropy(probs, y):
        """Mean cross-entropy between probabilities ``probs`` and one-hot ``y``."""
        # Clip away exact zeros so log() cannot return -inf (and 0 * -inf = NaN).
        safe_probs = np.clip(probs, 1e-12, 1.0)
        return -(y * np.log(safe_probs)).sum(axis=-1).mean()

In [None]:
class Optimizer:
    """Base optimizer: walks the network's layers and delegates to ``update``.

    Args:
        net: the network to optimize; it must expose a ``layers`` sequence whose
            items have ``parameters`` and ``gradient`` attributes.
    """

    # The annotation is a string so that defining this class does not require
    # the name MLP to exist yet (a plain `net: MLP` is evaluated when the method
    # is defined and fails with NameError if the MLP cell has not been run).
    def __init__(self, net: "MLP"):
        self.net = net  # optimizer should attach to the NN

    def step(self):
        """Call ``update`` for every layer that owns parameters, last layer first.

        Raises:
            RuntimeError: if such a layer has no gradient yet, i.e. ``step`` was
                called before a backward pass.
        """
        for layer in self.net.layers[::-1]:
            if layer.parameters is None:
                continue  # activation layers have nothing to update
            if layer.gradient is None:
                # Previously this surfaced as an opaque "NoneType is not
                # iterable" TypeError inside update().
                raise RuntimeError("Must call backward before step")
            self.update(layer.parameters, layer.gradient)

    def update(self, params, gradient):
        """Apply one update to ``params`` given ``gradient``; subclasses override."""
        raise NotImplementedError

class GradientDescentOptimizer(Optimizer):
    """Plain gradient descent with a fixed learning rate.

    Args:
        net: network whose layers are updated.
        lr: learning rate.
    """

    def __init__(self, net: "MLP", lr: float):
        super().__init__(net)  # NN initialization
        self.lr = lr  # learning rate

    def update(self, params, gradient):
        """Update every parameter array in place.

        Args:
            params: parameter arrays of one layer.
            gradient: matching gradients, either shaped like the parameter or
                carrying one extra leading per-sample (batch) axis.
        """
        # p - weight , g - gradient
        for (p, g) in zip(params, gradient):
            g = np.asarray(g)
            # Average over the batch only when there is a batch axis. Averaging
            # unconditionally reduced a bias gradient of shape (out,) to a single
            # scalar, so every bias received the same update.
            if g.ndim == p.ndim + 1:
                g = g.mean(axis=0)
            p -= self.lr * g


class GradientDescentOptimizer(Optimizer):
    """Plain gradient descent that walks the layers itself in ``step``.

    Args:
        net: network whose layers are updated.
        lr: learning rate.
    """

    def __init__(self, net, lr):
        self.net = net
        self.lr = lr

    def step(self):
        """Apply one gradient-descent update to every layer that owns parameters."""
        for layer in self.net.layers:
            if getattr(layer, 'parameters', None) is None:
                continue
            for param, grad in zip(layer.parameters, layer.gradient):
                grad = np.asarray(grad)
                # Collapse a leading per-sample axis, if present, to its batch mean.
                if grad.ndim == param.ndim + 1:
                    grad = grad.mean(axis=0)
                # Update in place. The earlier version built new arrays and
                # rebound `layer.parameters` to them, which left the arrays the
                # layer computes with (e.g. LinearLayer.w / .b) untouched, so
                # training had no effect on the model.
                param -= self.lr * grad


class GradientDescentOptimizer(Optimizer):
    """AdaGrad-style optimizer (the class name is kept for compatibility).

    Each parameter's step is scaled by ``lr / (sqrt(sum of squared gradients) + epsilon)``.

    Args:
        net: network whose layers are updated.
        lr: base learning rate.
        epsilon: small constant added to the denominator.
    """

    def __init__(self, net: "MLP", lr: float, epsilon=1e-8):
        super().__init__(net)
        self.lr = lr
        self.epsilon = epsilon
        self.acc_sq_grads = {}  # slot index -> accumulated squared gradient
        # id(parameter array) -> slot index. Valid because parameters are only
        # ever modified in place, never replaced by new arrays.
        self._slot_by_id = {}
        for layer in self.net.layers:
            if getattr(layer, 'parameters', None) is None:
                continue
            for param in layer.parameters:
                self._register(param)

    def _register(self, param):
        """Create a zeroed accumulator for ``param`` and return its slot index."""
        slot = len(self.acc_sq_grads)
        self.acc_sq_grads[slot] = np.zeros_like(param, dtype=float)
        self._slot_by_id[id(param)] = slot
        return slot

    def update(self, params_flat, gradients_flat):
        """Apply one AdaGrad update to the parameters of a single layer.

        ``Optimizer.step`` calls this once per layer with that layer's
        parameters and gradients. The earlier body ignored ``params_flat``,
        looped over every layer of the network on each call and indexed the
        per-layer gradient list with a network-wide counter, which runs out of
        range as soon as the network has more than one parameterised layer.

        Args:
            params_flat: parameter arrays of one layer.
            gradients_flat: matching gradients, either shaped like the parameter
                or carrying one extra leading per-sample (batch) axis.
        """
        for param, grad in zip(params_flat, gradients_flat):
            slot = self._slot_by_id.get(id(param))
            if slot is None:  # parameter added after construction
                slot = self._register(param)
            grad = np.asarray(grad)
            if grad.ndim == param.ndim + 1:
                grad = grad.mean(axis=0)
            # Calculate the accumulated square gradients
            self.acc_sq_grads[slot] += np.square(grad)
            # Compute the updated parameter with AdaGrad adjustment
            adjusted_lr = self.lr / (np.sqrt(self.acc_sq_grads[slot]) + self.epsilon)
            param -= adjusted_lr * grad

class GradientDescentOptimizer(Optimizer):
    """Plain gradient descent with a fixed learning rate.

    Args:
        net: network whose layers are updated.
        lr: learning rate.
    """

    def __init__(self, net: "MLP", lr: float):
        super().__init__(net)
        self.lr = lr

    def update(self, params, gradient):
        """Update every parameter array of one layer in place.

        Args:
            params: parameter arrays of one layer.
            gradient: matching gradients, either shaped like the parameter or
                carrying one extra leading per-sample (batch) axis.
        """
        for (p, g) in zip(params, gradient):
            g = np.asarray(g)
            # A gradient with a leading batch axis (batch x out x in for a weight
            # of shape out x in) cannot be subtracted in place from the
            # parameter: NumPy refuses to broadcast into the smaller output
            # array. Reduce it to its batch mean first; gradients that already
            # have the parameter's shape are used unchanged.
            if g.ndim == p.ndim + 1:
                g = g.mean(axis=0)
            p -= self.lr * g


In [None]:
def evaluate_acc(y_true, y_pred):
    """Return the fraction of samples whose predicted class equals the true class.

    Args:
        y_true: true classes, either a 1-D sequence of class indices or a 2-D
            one-hot / score matrix (reduced with argmax over axis 1).
        y_pred: predicted classes in either of the same two forms.

    Returns:
        Accuracy in [0, 1].

    Raises:
        ValueError: if the two inputs describe a different number of samples.
            Without this check a shape mismatch was silently broadcast by
            ``==`` and produced a meaningless number.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim == 2:
        y_true = y_true.argmax(axis=1)
    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(axis=1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    # The two debug prints of the full label arrays were removed.
    return np.mean(y_true == y_pred)

### 2.2 Verify Gradients

from scipy.optimize import check_grad
import numpy as np
from functools import partial

def flatten_parameters(mlp):
    """Concatenate every parameter array of ``mlp`` into one 1-D vector.

    Layers are visited in order; layers without a ``parameters`` attribute, or
    whose ``parameters`` is ``None``, are skipped.
    """
    pieces = [
        array.flatten()
        for layer in mlp.layers
        if getattr(layer, 'parameters', None) is not None
        for array in layer.parameters
    ]
    return np.concatenate(pieces)

def unflatten_parameters(mlp, params):
    """Copy the flat vector ``params`` back into the parameter arrays of ``mlp``.

    The vector is consumed in the same layer/parameter order that
    ``flatten_parameters`` produces. Values are written *into* the existing
    arrays: replacing the entries of ``layer.parameters`` with new arrays (as
    before) left any other reference to the old arrays, such as the ``w`` / ``b``
    attributes a layer computes with, unchanged, so the network's output never
    reflected the new parameters.
    """
    start = 0
    for layer in mlp.layers:
        if getattr(layer, 'parameters', None) is None:
            continue
        for param in layer.parameters:
            end = start + param.size
            param[...] = np.asarray(params[start:end]).reshape(param.shape)
            start = end

def compute_loss_flat(params_flat, mlp, X, y):
    """Load ``params_flat`` into ``mlp`` and return its loss on ``(X, y)``.

    ``mlp.compute_loss`` is the only evaluation performed; the extra
    ``mlp.forward(X)`` call whose result was never used has been removed.
    """
    unflatten_parameters(mlp, params_flat)
    return mlp.compute_loss(X, y)

def compute_grad_flat(params_flat, mlp, X, y):
    """Load ``params_flat`` into ``mlp`` and return the flattened analytic gradient.

    The previous version returned ``flatten_parameters(mlp)``, i.e. the
    parameter values themselves, so the gradient check compared the numerical
    gradient against the weights rather than against the back-propagated
    gradient.

    Returns:
        1-D array laid out in the same layer/parameter order as the flattened
        parameters.
    """
    unflatten_parameters(mlp, params_flat)
    mlp.forward(X)
    mlp.backward(y)
    pieces = []
    for layer in mlp.layers:
        if getattr(layer, 'parameters', None) is None:
            continue
        for param, grad in zip(layer.parameters, layer.gradient):
            grad = np.asarray(grad)
            # A gradient with one extra leading axis is per-sample; the loss is
            # a batch mean, so average it to get the parameter's shape.
            if grad.ndim == param.ndim + 1:
                grad = grad.mean(axis=0)
            pieces.append(grad.reshape(-1))
    return np.concatenate(pieces)

# Define loss and gradient functions for scipy.optimize.check_grad
def loss_func(params_flat):
    """Loss of the global ``mlp`` on ``(X_subset, y_subset)`` for ``params_flat``.

    ``compute_loss_flat`` already writes the parameters into the network, so the
    separate ``unflatten_parameters`` call that did the same work a second time
    was dropped.
    """
    return compute_loss_flat(params_flat, mlp, X_subset, y_subset)

def grad_func(params_flat):
    """Gradient of the global ``mlp`` on ``(X_subset, y_subset)`` for ``params_flat``.

    ``compute_grad_flat`` already writes the parameters into the network, so the
    separate ``unflatten_parameters`` call that did the same work a second time
    was dropped.
    """
    return compute_grad_flat(params_flat, mlp, X_subset, y_subset)

# Seed NumPy so the randomly initialised network, and therefore the reported
# difference, is the same on every run.
np.random.seed(0)

# Take the layer sizes from the data instead of hard-coding 784 / 24, so the
# network always matches the feature matrix and the one-hot label matrix.
mlp = MLP(input_size=X_train.shape[1], layer_sizes=[64, 64], output_size=y_train.shape[1])

# A small subset keeps the finite-difference evaluations cheap.
X_subset = X_train[:10]
y_subset = y_train[:10]
params_flat_initial = flatten_parameters(mlp)

# Verify the gradient
difference = check_grad(func=loss_func, grad=grad_func, x0=params_flat_initial)

print(f"Gradient check difference: {difference}")


# These helpers used to sit inside a triple-quoted string, so they were never
# defined and the estimate_gradient(...) call below failed with NameError.
def cross_entropy_loss(predictions, targets):
    """Mean cross-entropy of one-hot ``targets`` under ``predictions`` (batch x classes)."""
    predictions = np.clip(predictions, 1e-7, 1 - 1e-7) # prevent log(0)
    log_preds = np.log(predictions)
    loss = -np.sum(log_preds * targets) / predictions.shape[0]
    return loss

def compute_cost(mlp, X, y):
    """Run ``mlp`` forward on ``X`` and return its cross-entropy against ``y``."""
    predictions = mlp.forward(X)
    return cross_entropy_loss(predictions, y)

def estimate_gradient(mlp, X, y, layer_index, param_type, param_index, epsilon=1e-4):
    """Central-difference estimate of the loss gradient for one parameter entry.

    Args:
        mlp: network to probe.
        X, y: data on which the loss is evaluated.
        layer_index: index into ``mlp.layers``.
        param_type: name of the layer attribute holding the array, e.g. ``'w'``.
        param_index: index of the entry inside that array, e.g. ``(0, 0)``.
        epsilon: perturbation size.

    Returns:
        ``(loss(p + epsilon) - loss(p - epsilon)) / (2 * epsilon)``.
    """
    param = getattr(mlp.layers[layer_index], param_type)
    # Copy, so the saved value cannot alias the array entry being perturbed.
    original_value = np.copy(param[param_index])
    try:
        # positive
        param[param_index] = original_value + epsilon
        loss_positive = compute_cost(mlp, X, y)

        # negative
        param[param_index] = original_value - epsilon
        loss_negative = compute_cost(mlp, X, y)
    finally:
        # Reset to original value, even if a forward pass raised.
        param[param_index] = original_value

    # numerical estimate gradient
    return (loss_positive - loss_negative) / (2 * epsilon)


#init mlp
mlp = MLP(input_size=784, layer_sizes=[64, 64], output_size=y_train.shape[1]) # y_train is one-hot encoded

# Example usage for a single weight in the first LinearLayer
grad_estimated = estimate_gradient(mlp, X_train, y_train, layer_index=0, param_type='w', param_index=(0, 0))
print(f"Estimated gradient: {grad_estimated}")

### 2.3 Hyper-parameter tuning

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

#==================================================================== Sklearn ======================================
# Collapse the one-hot matrices to 1-D class indices. Passing the 2-D indicator
# matrix straight through (as the former plain rename did) makes scikit-learn
# handle the task as multi-label classification and makes accuracy_score an
# exact-match score over whole rows, not the single-label problem solved here.
y_train_flat = np.argmax(y_train, axis=1) if y_train.ndim > 1 else y_train
y_test_flat = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test

# Define hyperparameters
param_grid = {
    'hidden_layer_sizes': [(32, 32), (64, 64), (128, 128), (256,256)],
    'learning_rate_init': [0.001, 0.005, 0.007, 0.01, 0.03],
}

# Init Classifier (fixed random_state so repeated runs give the same result)
mlp = MLPClassifier(max_iter=200, random_state=42)

print("Initializing GridSearchCV...")

# Setup GridSearchCV
# Set n_jobs=-1 to use all available CPUs for parallel computation
grid_search = GridSearchCV(mlp, param_grid, scoring=make_scorer(accuracy_score), cv=3, n_jobs=-1, verbose=2)

print("Starting GridSearchCV hyperparameter tuning...")

# Fit the model
grid_search.fit(X_train, y_train_flat)

print("GridSearchCV tuning completed.")

# Best parameters and best accuracy
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy achieved: {grid_search.best_score_}")

# Predict on the test set with the best found parameters
preds = grid_search.predict(X_test)
test_acc = accuracy_score(y_test_flat, preds)

print(f"Final test accuracy with best parameters: {test_acc}")


In [None]:
from itertools import product

def evaluate_accuracy(predictions, labels):
    """Return the share of rows whose predicted class matches the one-hot label.

    ``predictions`` holds class indices; ``labels`` is a one-hot matrix that is
    reduced to class indices with argmax over axis 1.
    """
    true_classes = labels.argmax(axis=1)
    hits = np.sum(predictions == true_classes)
    n_samples = labels.shape[0]
    return hits / n_samples

# Define hyperparameters
hidden_layer_sizes = [(32, 32)]
learning_rate_inits = [0.001]

# Hold out 20% of the training data for model selection. Choosing the
# hyper-parameters by test accuracy (as before) leaks test information into the
# choice and makes the reported test score optimistic.
split_rng = np.random.default_rng(0)
shuffled_idx = split_rng.permutation(X_train.shape[0])
n_val = X_train.shape[0] // 5
val_idx, fit_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]

best_acc = 0
best_params = {}
best_mlp = None

for layer_sizes, lr in product(hidden_layer_sizes, learning_rate_inits):
    print(f"Training with layer sizes {layer_sizes} and learning rate {lr}")

    # Initialize the custom MLP model
    mlp = MLP(input_size=784, layer_sizes=list(layer_sizes), output_size=y_train.shape[1])

    mlp.fit(X_train[fit_idx], y_train[fit_idx], lr=lr, max_iter=10000)

    # Score on the held-out validation rows
    val_acc = evaluate_accuracy(mlp.predict(X_train[val_idx]), y_train[val_idx])
    print(f"Validation accuracy: {val_acc}")

    # Update best parameters if the current model is better
    if best_mlp is None or val_acc > best_acc:
        best_acc = val_acc
        best_params = {'layer_sizes': layer_sizes, 'learning_rate': lr}
        best_mlp = mlp

print(f"Best validation accuracy: {best_acc} with parameters {best_params}")

# The test set is touched exactly once, with the selected model.
preds = best_mlp.predict(X_test)
test_acc = evaluate_accuracy(preds, y_test)
print(f"Test accuracy: {test_acc}")


Training with layer sizes (32, 32) and learning rate 0.001
Loss: 3.2772489275992207
Loss: 3.227674568812961


KeyboardInterrupt: 

## Task 3

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import numpy as np


#==================================================================== Sklearn ======================================
# One-hot label matrices are collapsed to 1-D class indices; 1-D labels pass through.
y_train_flat = y_train if not y_train.ndim > 1 else np.argmax(y_train, axis=1)
y_test_flat = y_test if not y_test.ndim > 1 else np.argmax(y_test, axis=1)

# Architectures to compare: no hidden layer, one hidden layer, two equal hidden layers.
hidden_units = [32, 64, 128, 256]
models_config = {"No hidden layer": [()]}
models_config["Single hidden layer"] = [(width,) for width in hidden_units]
models_config["Two hidden layers"] = [(width, width) for width in hidden_units]

best_scores, best_models = {}, {}

for model_name in models_config:
    configs = models_config[model_name]
    best_acc, best_config = 0, None
    for config in configs:
        print(f"Training {model_name} with hidden units configuration: {config}")
        mlp = MLPClassifier(hidden_layer_sizes=config, max_iter=200, random_state=42)
        mlp.fit(X_train, y_train_flat)
        preds = mlp.predict(X_test)
        acc = accuracy_score(y_test_flat, preds)
        print(f"Accuracy: {acc}")

        # NOTE(review): the best configuration is picked by test-set accuracy.
        if not acc > best_acc:
            continue
        best_acc, best_config = acc, config
        best_models[model_name] = mlp

    best_scores[model_name] = (best_config, best_acc)
    print(f"Best configuration for {model_name}: {best_config} with accuracy: {best_acc}\n")

print("Summary of best configurations and accuracies:")
for model_name, (config, acc) in best_scores.items():
    print(f"{model_name}: Configuration: {config}, Accuracy: {acc}")


In [None]:
from sklearn.metrics import accuracy_score
import numpy as np

# Architectures to compare with the custom MLP (a single width keeps the run short).
hidden_units = [32]
models_config = {"No hidden layer": [()]}
models_config["Single hidden layer"] = [(width,) for width in hidden_units]
models_config["Two hidden layers"] = [(width, width) for width in hidden_units]

best_scores, best_models = {}, {}

for model_name in models_config:
    configs = models_config[model_name]
    best_acc, best_config = 0, None
    for config in configs:
        print(f"Training {model_name} with hidden units configuration: {config}")
        # list(config) is [] for the empty tuple, i.e. a network without hidden layers.
        mlp = MLP(
            input_size=784,
            layer_sizes=list(config),
            output_size=y_train.shape[1],
            activation_function=LeakyReLULayer,
        )

        # The custom fit takes the one-hot encoded labels directly.
        mlp.fit(X_train, y_train, lr=0.001, max_iter=100)

        # Predict on the test set
        preds = mlp.predict(X_test)

        # Convert y_test from one-hot to class labels
        y_test_labels = np.argmax(y_test, axis=1)

        # Calculate accuracy
        acc = accuracy_score(y_test_labels, preds)
        print(f"Accuracy: {acc}")

        # NOTE(review): the best configuration is picked by test-set accuracy.
        if not acc > best_acc:
            continue
        best_acc, best_config = acc, config
        best_models[model_name] = mlp

    best_scores[model_name] = (best_config, best_acc)
    print(f"Best configuration for {model_name}: {best_config} with accuracy: {best_acc}\n")

print("Summary of best configurations and accuracies:")
for model_name, (config, acc) in best_scores.items():
    print(f"{model_name}: Configuration: {config}, Accuracy: {acc}")
