In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from keras import models
from keras.models import Sequential
from keras.optimizers import Adam
import tensorflow as tf
import os
from tensorflow.keras.layers import Input
import copy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten

In [2]:
# Function to load datasets
def get_dataset(obj_name):
    filename_labels = 'train_labels.npy'
    filename_origin = 'origin_cifar10.npy'
    filename_small_noise = 'small_noise_cifar10.npy'
    filename_medium_noise = 'medium_noise_cifar10.npy'
    filename_large_noise = 'large_noise_cifar10.npy'
    
    train_set = None
    match obj_name:
        case 'origin':
            train_set = np.load(filename_origin)
        case 'small_noise':
            train_set = np.load(filename_small_noise)
        case 'medium_noise':
            train_set = np.load(filename_medium_noise)
        case 'large_noise':
            train_set = np.load(filename_large_noise)
        case _:
            raise ValueError(f"Unknown obj_name: {obj_name}")
    label_set = np.load(filename_labels) if train_set.size > 0 else None

    if label_set is None or train_set is None:
        print("Run Noise_process.ipynb before running this file")
    return train_set, label_set

In [3]:
# Function to build MLP model
def build_mlp_model(input_shape, num_classes, learning_rate=0.0005, num_units=128, dropout_rate=0):
    model = Sequential([
        Input(shape=input_shape),
        Flatten(),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [4]:
# Function to process dataset
def get_processed_dataset(model_name):
    train_set, label_set = get_dataset(model_name)
    X_combined_reshaped = train_set.reshape(-1, 32, 32, 3)
    X_combined_reshaped = X_combined_reshaped.astype('float32') / 255.0
    y_combined_categorical = to_categorical(label_set, num_classes=10)
    return X_combined_reshaped, y_combined_categorical

In [5]:
# Set constants
ORIGIN = 'origin'
SMALL = 'small_noise'
MEDIUM = 'medium_noise'
LARGE = 'large_noise'
all_dataset = [ORIGIN, SMALL, MEDIUM, LARGE]
unit_sizes = [32, 64, 128, 256, 512]
dropout_rate = 0  # Dropout rate can be adjusted as needed
batch_size = 128
num_epochs = 5
learning_rate = 0.0005

In [6]:
# Outer cross-validation setup
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
best_params = {}
performance_results = []

# Assuming you have your dataset ready from the previous function
x_combined, y_combined = get_processed_dataset(MEDIUM)

In [7]:
#Outer cross-validation loop
for outer_train_idx, outer_val_idx in outer_cv.split(x_combined):
    X_outer_train, X_outer_val = x_combined[outer_train_idx], x_combined[outer_val_idx]
    y_outer_train, y_outer_val = y_combined[outer_train_idx], y_combined[outer_val_idx]

    best_score = -np.inf
    best_num_units = None

    # Inner cross-validation to find the best unit size
    for num_units in unit_sizes:
        inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)
        scores = []

        for inner_train_idx, inner_val_idx in inner_cv.split(X_outer_train):
            X_inner_train, X_inner_val = X_outer_train[inner_train_idx], X_outer_train[inner_val_idx]
            y_inner_train, y_inner_val = y_outer_train[inner_train_idx], y_outer_train[inner_val_idx]

            # Build and train the MLP model
            model = build_mlp_model(input_shape=(32, 32, 3), num_classes=10, learning_rate=learning_rate, num_units=num_units, dropout_rate=dropout_rate)
            history = model.fit(
                x=X_inner_train,
                y=y_inner_train,
                batch_size=batch_size,
                epochs=num_epochs,
                validation_data=(X_inner_val, y_inner_val),
                verbose=0
            )
            #print(history.history)

            # Evaluate on validation set
            score = model.evaluate(X_inner_val, y_inner_val, verbose=0)[1]
            scores.append(score)

        # Get the mean score from the inner cross-validation
        mean_score = np.mean(scores)

        if mean_score > best_score:
            best_score = mean_score
            best_num_units = num_units

    # Store the best parameters for the outer fold
    best_params[outer_val_idx[0]] = best_num_units

    # Build final model using the best number of units
    final_model = build_mlp_model(input_shape=(32, 32, 3), num_classes=10, learning_rate=learning_rate, num_units=best_num_units, dropout_rate=dropout_rate)
    final_model.fit(X_outer_train, y_outer_train, epochs=num_epochs, batch_size=batch_size, verbose=0)
    final_performance = final_model.evaluate(X_outer_val, y_outer_val)[1]
    print("Final performance: ", final_performance)
    performance_results.append(final_performance)

# Calculate and display the average performance across all outer folds
average_performance = np.mean(performance_results)
print(f'Average Performance across all outer folds: {average_performance}')
print(f'Best Parameters for each fold: {best_params}')

[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3262 - loss: 1.8992
Final performance:  0.33916667103767395
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3501 - loss: 1.8173 
Final performance:  0.3499999940395355
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3727 - loss: 1.7489  
Final performance:  0.38499999046325684
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3362 - loss: 1.8732  
Final performance:  0.34333333373069763
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 892us/step - accuracy: 0.3422 - loss: 1.8399
Final performance:  0.33916667103767395
Average Performance across all outer folds: 0.3513333320617676
Best Parameters for each fold: {8: 256, 12: 256, 0: 512, 1: 512, 3: 256}
