Given the information you've shared, I suggest we focus on the test_digits dataset first, as it's already prepared and more manageable in size. Here's a proposed approach:

1. Use the test_digits dataset:
   - 31,000 judgements for training
   - 140 composites for initial testing

2. Data preparation:
   a. Create more composites to balance the classes. Aim for about 200-300 composites per digit.
   b. Split the data:
      - 80% of judgements for training
      - 10% of judgements for validation
      - 10% of judgements + all composites for testing

3. Transfer learning approach:
   Instead of fine-tuning the entire model, which might lead to overfitting given the relatively small dataset, I suggest using transfer learning:
   
   a. Freeze the earlier layers of the ensemble model.
   b. Replace the final classification layer with a new one.
   c. Train only the new layer(s) on the test_digits training set.

4. Training process:
   a. Use the validation set to monitor performance and prevent overfitting.
   b. Implement early stopping.
   c. Use data augmentation to increase the effective size of the training set.

5. Evaluation:
   a. Use the held-out test set (10% of judgements + composites) for final evaluation.
   b. Compare performance on judgements vs. composites.

6. Iterative improvement:
   If the results are still not satisfactory, we can gradually unfreeze more layers of the original model and continue training.

This approach allows us to leverage the pre-trained model's knowledge while adapting it to the specific characteristics of the test_digits dataset. It's also computationally efficient compared to training from scratch or using the larger JSON dataset.

Regarding the JSON dataset, we can keep it as a backup option. If the transfer learning approach with the test_digits dataset doesn't yield satisfactory results, we can explore using the JSON data to create a larger, more diverse training set. However, this would require more preprocessing and computational resources.

To implement this plan, we'll need to modify your existing code to:
1. Split the test_digits data as described.
2. Implement the transfer learning approach (freezing layers, replacing the classification layer).
3. Add data augmentation.
4. Implement early stopping and validation checks during training.
5. Create a new evaluation function for the final test set.

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision.transforms as transforms
from PIL import Image
import os
import zipfile
import numpy as np
from sklearn.model_selection import train_test_split
from mnist_skeptic_v9 import skeptic_v9
import time

In [2]:
# Cell 1: Ensemble Model Creation and Prediction
class EnsembleModel(nn.Module):
    def __init__(self, model_paths):
        super(EnsembleModel, self).__init__()
        self.models = nn.ModuleList([skeptic_v9() for _ in range(len(model_paths))])
        for model, path in zip(self.models, model_paths):
            model.load_state_dict(torch.load(path))
            model.eval()

    def forward(self, x):
        outputs = [model(x) for model in self.models]
        return torch.stack(outputs).mean(dim=0)

def create_ensemble(model_dir='best_boi_models'):
    model_paths = [os.path.join(model_dir, f) for f in os.listdir(model_dir) if f.endswith('.pth')]
    return EnsembleModel(model_paths)

ensemble_model = create_ensemble()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ensemble_model.to(device)

  model.load_state_dict(torch.load(path))


EnsembleModel(
  (models): ModuleList(
    (0-19): 20 x skeptic_v9(
      (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (batchnorm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (batchnorm2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc1): Linear(in_features=512, out_features=128, bias=True)
      (fc2): Linear(in_features=128, out_features=10, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
  )
)

In [9]:
class TransferEnsembleModel(nn.Module):
    def __init__(self, base_model):
        super(TransferEnsembleModel, self).__init__()
        self.base_model = base_model
        
        # Freeze the base model parameters
        for param in self.base_model.parameters():
            param.requires_grad = False
        
        # Inspect the base model structure
        last_layer = list(self.base_model.models[0].children())[-1]
        if isinstance(last_layer, nn.Linear):
            num_features = last_layer.in_features
        else:
            # If the last layer is not Linear, we need to flatten the output
            # and determine the number of features
            dummy_input = torch.randn(1, 1, 16, 16)  # Assuming 16x16 input size
            with torch.no_grad():
                dummy_output = self.base_model.models[0](dummy_input)
            num_features = dummy_output.numel()
        
        # Replace the final layer
        self.new_fc = nn.Linear(num_features, 10)

    def forward(self, x):
        with torch.no_grad():
            features = self.base_model(x)
        # If features is not already flattened, flatten it
        if features.dim() > 2:
            features = features.view(features.size(0), -1)
        return self.new_fc(features)

def load_and_prepare_data(test_digits_folder):
    train_images = []
    train_labels = []
    test_images = []
    test_labels = []

    transform = transforms.Compose([
        transforms.Resize((16, 16)),
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    for filename in os.listdir(test_digits_folder):
        if filename.endswith('.zip') and filename.startswith('experiment_results_participant'):
            zip_filepath = os.path.join(test_digits_folder, filename)

            with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
                for img_filename in zip_ref.namelist():
                    if img_filename.endswith('.png'):
                        with zip_ref.open(img_filename) as file:
                            img = Image.open(file).convert('L')  # Convert to grayscale
                            img_tensor = transform(img)
                            
                            digit = int(img_filename.split('_')[0])
                            
                            if 'composite' in img_filename:
                                test_images.append(img_tensor)
                                test_labels.append(digit)
                            else:
                                train_images.append(img_tensor)
                                train_labels.append(digit)

    train_images = torch.stack(train_images)
    train_labels = torch.tensor(train_labels)
    test_images = torch.stack(test_images)
    test_labels = torch.tensor(test_labels)

    # Split train data into train and validation
    train_images, val_images, train_labels, val_labels = train_test_split(
        train_images, train_labels, test_size=0.1, stratify=train_labels, random_state=42
    )

    return (train_images, train_labels), (val_images, val_labels), (test_images, test_labels)

In [4]:
class DigitDataset(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        
        if self.transform:
            image = self.transform(image)
        
        return image, label

In [6]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, save_dir):
    best_val_acc = 0.0
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0
        start_time = time.time()

        for batch_idx, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            if (batch_idx + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

        train_loss = train_loss / len(train_loader.dataset)
        train_acc = train_correct / train_total
        epoch_time = time.time() - start_time

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_loss = val_loss / len(val_loader.dataset)
        val_acc = val_correct / val_total

        print(f'Epoch [{epoch+1}/{num_epochs}], Time: {epoch_time:.2f}s')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), os.path.join(save_dir, f'best_transfer_model_epoch_{epoch+1}.pth'))
            print(f'New best model saved at epoch {epoch+1}')

    print(f'Best validation accuracy: {best_val_acc:.4f}')

In [11]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Load and prepare data
    (train_images, train_labels), (val_images, val_labels), (test_images, test_labels) = load_and_prepare_data('test_digits')

    # Create datasets and dataloaders
    train_transform = transforms.Compose([
        transforms.RandomRotation(10),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    ])

    train_dataset = DigitDataset(train_images, train_labels, transform=train_transform)
    val_dataset = DigitDataset(val_images, val_labels)
    test_dataset = DigitDataset(test_images, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

    # Create and prepare the model
    base_ensemble = create_ensemble()
    print("Base ensemble structure:")
    print(base_ensemble)
    
    transfer_model = TransferEnsembleModel(base_ensemble).to(device)
    print("Transfer model structure:")
    print(transfer_model)

    # Training setup
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(transfer_model.new_fc.parameters(), lr=0.001)

    # Create directory for saving models
    save_dir = 'best_boi_models_finetuned_standard'
    os.makedirs(save_dir, exist_ok=True)

    # Train the model
    train_model(transfer_model, train_loader, val_loader, criterion, optimizer, num_epochs=50, device=device, save_dir=save_dir)

    # Evaluate the model
    best_model_path = os.path.join(save_dir, 'best_transfer_model_epoch_*.pth')
    best_model_file = max(glob.glob(best_model_path), key=os.path.getctime)
    transfer_model.load_state_dict(torch.load(best_model_file))
    transfer_model.eval()

    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = transfer_model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    test_acc = test_correct / test_total
    print(f'Test Accuracy: {test_acc:.4f}')

if __name__ == "__main__":
    main()

Using device: cuda


  model.load_state_dict(torch.load(path))


Base ensemble structure:
EnsembleModel(
  (models): ModuleList(
    (0-19): 20 x skeptic_v9(
      (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (batchnorm1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (batchnorm2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (fc1): Linear(in_features=512, out_features=128, bias=True)
      (fc2): Linear(in_features=128, out_features=10, bias=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
  )
)
Transfer model structure:
TransferEnsembleModel(
  (base_model): EnsembleModel(
    (models): ModuleList(
      (0-19): 20 x skeptic_v9(
        (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (conv2): Conv2d(16, 32, kernel_size=(3, 

NameError: name 'glob' is not defined