In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms
from torchvision.datasets import MNIST
import os
import zipfile
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from mnist_skeptic_v9 import skeptic_v9

In [2]:
# Cell 1: Ensemble Model Creation and Prediction
class EnsembleModel(nn.Module):
    def __init__(self, model_paths):
        super(EnsembleModel, self).__init__()
        self.models = nn.ModuleList([skeptic_v9() for _ in range(len(model_paths))])
        for model, path in zip(self.models, model_paths):
            model.load_state_dict(torch.load(path))
            model.eval()

    def forward(self, x):
        outputs = [model(x) for model in self.models]
        return torch.stack(outputs).mean(dim=0)

def create_ensemble(model_dir='best_boi_models'):
    model_paths = [os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith('.pth')]
    return EnsembleModel(model_paths)

ensemble_model = create_ensemble()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ensemble_model.to(device)

  model.load_state_dict(torch.load(path))


EnsembleModel(
  (models): ModuleList(
    (0-19): 20 x skeptic_v9(
      (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (batchnorm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (batchnorm2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc1): Linear(in_features=512, out_features=128, bias=True)
      (fc2): Linear(in_features=128, out_features=10, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
  )
)

In [3]:
# Cell 2: Data Loading Functions

def load_mnist_test_data():
    transform = transforms.Compose([
        transforms.Resize((16, 16)),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    test_dataset = MNIST(root='./data', train=False, download=True, transform=transform)
    return DataLoader(test_dataset, batch_size=64, shuffle=False)

def load_all_experimental_data(test_digits_folder):
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []
    participant_data = {}

    transform = transforms.Compose([
        transforms.Resize((16, 16)),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)) #PLEASE CHANGE THIS TO THE TRAINING NORMALIZATION VALUES (0.1307,), (0.3081,) IF YOU WANT TO USE THE TRAINING NORMALIZATION VALUES - ELSE (0.5,), (0.5,) WILL BE USED
    ])

    for filename in os.listdir(test_digits_folder):
        if filename.endswith('.zip') and filename.startswith('experiment_results_participant'):
            participant_number = int(filename.split('participant')[1].split('.')[0])
            zip_filepath = os.path.join(test_digits_folder, filename)

            participant_train_images = []
            participant_train_labels = []
            participant_test_images = []
            participant_test_labels = []

            with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
                for img_filename in zip_ref.namelist():
                    if img_filename.endswith('.png'):
                        with zip_ref.open(img_filename) as file:
                            img = Image.open(file).convert('L')  # Convert to grayscale
                            img_tensor = transform(img)
                            
                            digit = int(img_filename.split('_')[0])
                            
                            if 'composite' in img_filename:
                                test_images.append(img_tensor)
                                test_labels.append(digit)
                                participant_test_images.append(img_tensor)
                                participant_test_labels.append(digit)
                            else:
                                train_images.append(img_tensor)
                                train_labels.append(digit)
                                participant_train_images.append(img_tensor)
                                participant_train_labels.append(digit)

            participant_data[participant_number] = {
                'train': (torch.stack(participant_train_images), torch.tensor(participant_train_labels)),
                'test': (torch.stack(participant_test_images), torch.tensor(participant_test_labels))
            }

    print(f"Total training images: {len(train_images)}")
    print(f"Total test images: {len(test_images)}")
    
    for participant, data in participant_data.items():
        print(f"Participant {participant}:")
        print(f"  Training images: {len(data['train'][0])}")
        print(f"  Test images: {len(data['test'][0])}")

    return (torch.stack(train_images), torch.tensor(train_labels), 
            torch.stack(test_images), torch.tensor(test_labels),
            participant_data)

In [4]:
# Cell 3: Evaluation Functions

def evaluate_model(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in data_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return np.array(all_preds), np.array(all_labels)

def plot_confusion_matrix(true_labels, pred_labels, title):
    cm = confusion_matrix(true_labels, pred_labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'{title.lower().replace(" ", "_")}.png')
    plt.close()

def analyze_confusion(cm):
    n_classes = cm.shape[0]
    
    # Most confusable pairs
    confusable_pairs = []
    for i in range(n_classes):
        for j in range(i+1, n_classes):
            if i != j:
                confusion_score = cm[i, j] + cm[j, i]
                confusable_pairs.append((i, j, confusion_score))
    
    confusable_pairs.sort(key=lambda x: x[2], reverse=True)
    
    # Most discriminable digits
    discriminability = np.diag(cm) / cm.sum(axis=1)
    most_discriminable = np.argsort(discriminability)[::-1]
    
    # Digits the model always guesses (overfitting)
    guess_bias = cm.sum(axis=0) / cm.sum()
    most_guessed = np.argsort(guess_bias)[::-1]
    
    return confusable_pairs[:5], most_discriminable[:5], most_guessed[:5]

In [5]:
# Cell 4: Main Evaluation Script

def main_evaluation():
    # Evaluate on MNIST
    mnist_loader = load_mnist_test_data()
    mnist_preds, mnist_labels = evaluate_model(ensemble_model, mnist_loader)
    plot_confusion_matrix(mnist_labels, mnist_preds, "MNIST Confusion Matrix")
    
    # Analyze MNIST results
    mnist_cm = confusion_matrix(mnist_labels, mnist_preds)
    mnist_confusable, mnist_discriminable, mnist_guessed = analyze_confusion(mnist_cm)
    
    print("MNIST Analysis:")
    print("Most confusable pairs:", mnist_confusable)
    print("Most discriminable digits:", mnist_discriminable)
    print("Most frequently guessed digits:", mnist_guessed)
    
    # Evaluate on Experimental Data
    exp_data = load_all_experimental_data('path_to_your_experimental_data_folder')
    exp_train_images, exp_train_labels, exp_test_images, exp_test_labels, participant_data = exp_data
    
    exp_dataset = ExperimentalDataset(exp_test_images, exp_test_labels)
    exp_loader = DataLoader(exp_dataset, batch_size=64, shuffle=False)
    
    exp_preds, exp_labels = evaluate_model(ensemble_model, exp_loader)
    plot_confusion_matrix(exp_labels, exp_preds, "Experimental Data Confusion Matrix")
    
    # Analyze Experimental results
    exp_cm = confusion_matrix(exp_labels, exp_preds)
    exp_confusable, exp_discriminable, exp_guessed = analyze_confusion(exp_cm)
    
    print("\nExperimental Data Analysis:")
    print("Most confusable pairs:", exp_confusable)
    print("Most discriminable digits:", exp_discriminable)
    print("Most frequently guessed digits:", exp_guessed)
    
    # Evaluate by participant
    participant_accuracies = evaluate_by_participant(ensemble_model, participant_data, device)
    print("\nParticipant Accuracies:", participant_accuracies)


In [6]:
# Run the main evaluation
main_evaluation()

MNIST Analysis:
Most confusable pairs: [(4, 9, 70), (7, 9, 51), (8, 9, 40), (1, 4, 39), (3, 5, 35)]
Most discriminable digits: [0 6 5 1 4]
Most frequently guessed digits: [1 9 0 4 7]


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'path_to_your_experimental_data_folder'