# Imports

In [1]:
import os
import math
import torch
import torch.nn as nn
import numpy as np
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import math

# Dataset

In [2]:

from sklearn.discriminant_analysis import StandardScaler

class CustomDataset(Dataset):
    def __init__(self, csv_path="data/dataset_train_2024.csv"):
        
        data = pd.read_csv(csv_path)
        
        self.sequences_1 = data.iloc[:, 1:129].values * 100  # Columns 1-128 (1-based indexing)
        self.sequences_2 = data.iloc[:, 129:257].values * 100  # Columns 129-256
        self.extra_feature = data.iloc[:, 257].values.reshape(-1, 1)  # Column 257

        all_features = np.hstack([self.sequences_1, self.sequences_2, self.extra_feature])
        
        self.scaler = StandardScaler()
        self.normalized_features = self.scaler.fit_transform(all_features)
        self.features = torch.tensor(self.normalized_features, dtype=torch.float32)

        self.label_encoder = LabelEncoder()
        self.labels = torch.tensor(self.label_encoder.fit_transform(data.iloc[:, -1]), dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
    
    def inverseTransform(self, array):
        return self.label_encoder.inverse_transform(array)

# Classifier

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, seq_len: int = 128):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(seq_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

def reformat_tensor(tensor):
  batch_size = tensor.shape[0]
  seq1 = tensor[:, :128]
  seq2 = tensor[:, 128:256]
  noise = tensor[:, -1].unsqueeze(1).expand(batch_size, 128)
  return torch.stack([seq1, seq2, noise], dim=2)

#Encoder
class TransformerClassifier(nn.Module):
    def __init__(self, seq_len, input_dim, d_model, nhead, dim_feedforward, num_layers, num_classes):
        super(TransformerClassifier, self).__init__()
        self.input_fc = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=0.1
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.fc = nn.Linear(d_model, num_classes)

        mlp_hidden_dim = 64
        self.mlp = nn.Sequential(
            nn.Linear(d_model, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, num_classes)
        )

    def forward(self, x):
        x = reformat_tensor(x)
        x = x.permute(1, 0, 2)
        x = self.input_fc(x)  
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=0)
        x = self.mlp(x)
        return x


# Initialization

In [4]:
# Parameters

batch_size = 32
epochs = 50

learning_rate = 0.001
momentum = 0.001
weight_decay=0.001

seq_len = 128
input_dim = 3

num_layers = 1
nhead = 4
num_classes = 5
d_model = 128
dim_feedforward = 4 * d_model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

dataset = CustomDataset()
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

model = TransformerClassifier(
    seq_len=seq_len,
    input_dim=input_dim,
    d_model=d_model,
    nhead=nhead,
    num_layers=num_layers,
    dim_feedforward=dim_feedforward,
    num_classes=num_classes,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)


Using device: cuda




# Training

In [5]:
print("Training the model...")
for epoch in range(epochs):
    model.train()
    total_loss = 0
    start_time = time.time()

    for features, labels in train_loader:

        
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    end_time = time.time()
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, Time: {epoch_time:.2f} seconds")


Training the model...
Epoch 1/50, Loss: 0.9841, Time: 3.09 seconds
Epoch 2/50, Loss: 0.4503, Time: 2.45 seconds
Epoch 3/50, Loss: 0.3698, Time: 2.34 seconds
Epoch 4/50, Loss: 0.3281, Time: 2.33 seconds
Epoch 5/50, Loss: 0.2983, Time: 2.78 seconds
Epoch 6/50, Loss: 0.2561, Time: 3.28 seconds
Epoch 7/50, Loss: 0.2519, Time: 3.06 seconds
Epoch 8/50, Loss: 0.2144, Time: 3.14 seconds
Epoch 9/50, Loss: 0.1925, Time: 3.35 seconds
Epoch 10/50, Loss: 0.1734, Time: 2.89 seconds
Epoch 11/50, Loss: 0.1579, Time: 2.97 seconds
Epoch 12/50, Loss: 0.1628, Time: 2.88 seconds
Epoch 13/50, Loss: 0.1487, Time: 3.22 seconds
Epoch 14/50, Loss: 0.1392, Time: 3.03 seconds
Epoch 15/50, Loss: 0.1371, Time: 3.10 seconds
Epoch 16/50, Loss: 0.1365, Time: 3.06 seconds
Epoch 17/50, Loss: 0.1279, Time: 2.74 seconds
Epoch 18/50, Loss: 0.1365, Time: 3.42 seconds
Epoch 19/50, Loss: 0.1225, Time: 3.15 seconds
Epoch 20/50, Loss: 0.1012, Time: 3.44 seconds
Epoch 21/50, Loss: 0.1097, Time: 3.05 seconds
Epoch 22/50, Loss: 0.

# Testing

In [6]:
print("Testing the model...")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, preds = torch.max(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
from sklearn.metrics import f1_score
f1 = f1_score(all_labels, all_preds, average='weighted')
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")


Testing the model...
F1 Score: 0.9561
Accuracy: 0.9567


# Prepare for Kaggle


In [7]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader

class UnlabeledDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.data = dataframe

        self.scaler = StandardScaler()
        normalized_values = self.scaler.fit_transform(self.data.values)
        self.normalized_data = pd.DataFrame(
            normalized_values, columns=self.data.columns, index=self.data.index
        )
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        inputs = self.normalized_data.iloc[idx].values.astype('float32')
        if self.transform:
            inputs = self.transform(inputs)
        return inputs

csv_path = "data/dataset_test_no_label_2024.csv"
unlabeled_df = pd.read_csv(csv_path)
unlabeled_df = unlabeled_df.drop(unlabeled_df.columns[0], axis=1)
unlabeled_dataset = UnlabeledDataset(unlabeled_df)
unlabeled_dataloader = DataLoader(unlabeled_dataset, batch_size=64, shuffle=False)

model.eval()

predictions = []
indices = []

with torch.no_grad():
    for idx, inputs in enumerate(unlabeled_dataloader):
        inputs = inputs.to(device)
        
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)  # Get predicted class
        
        start_idx = idx * unlabeled_dataloader.batch_size
        batch_indices = list(range(start_idx, start_idx + len(inputs)))  # Adjusting the index properly
        indices.extend(batch_indices)
        predictions.extend(preds.cpu().numpy())

output_df = pd.DataFrame({"ID": indices, "MODULATION": dataset.inverseTransform(predictions)})

# Save to a CSV file
output_df.to_csv("predictions_with_indices.csv", index=False)

print("Predictions saved to 'predictions_with_indices.csv'")

Predictions saved to 'predictions_with_indices.csv'


In [None]:
from itertools import product

# Define hyperparameter grid
hyperparameter_grid = {
    'learning_rate': [0.001, 0.0001, 0.0005],
    'weight_decay': [0.001, 0.0001],
    'nhead': [2, 4, 8],
    'd_model': [32, 64, 128, 256],
    'dim_feedforward': [128, 256, 512, 1024, 2048],
    'num_layers': [1],
    'batch_size': [16, 32, 64, 128,]
}

param_combinations = list(product(*hyperparameter_grid.values()))
param_names = list(hyperparameter_grid.keys())

best_accuracy = 0
best_params = None

for param_set in param_combinations:
    params = dict(zip(param_names, param_set))
    print(f"Testing params: {params}")

    model = TransformerClassifier(
        seq_len=seq_len,
        input_dim=input_dim,
        d_model=params['d_model'],
        nhead=params['nhead'],
        num_layers=params['num_layers'],
        dim_feedforward=params['dim_feedforward'],
        num_classes=num_classes,
    ).to(device)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=params['learning_rate'],
        weight_decay=params['weight_decay']
    )

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Validation Accuracy: {accuracy:.4f}")

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print(f"Best Accuracy: {best_accuracy:.4f} with params: {best_params}")


Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9258
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.8696
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.8938
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9221
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9113
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.8842
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.8892
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9204
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9300
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.8925
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.8446
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9392
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9392
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9533
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9396
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.8621
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9600
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9233
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9254
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9175
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9558
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9383
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9563
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9675
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9421
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9637
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9521
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9471
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9604
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9629
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9525
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9717
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9537
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9425
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9663
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9650
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9758
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9712
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9558
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9750
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9387
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9667
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9496
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9650
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9567
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9663
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9483
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9708
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9625
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9704
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9550
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9829
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9738
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9621
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9700
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9725
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9625
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9587
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9313
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9529
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9287
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9229
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9625
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9521
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9421
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9367
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9392
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9404
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9404
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9558
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9358
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9646
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9471
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9458
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.8921
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9617
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9192
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9283
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9296
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9567
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9229
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.8821
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9200
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9304
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9400
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9408
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.8387
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9463
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9079
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.8846
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9204
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9400
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9483
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9375
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9458
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.8667
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9421
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9408
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9350
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9517
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9442
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9587
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9613
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9762
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9646
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9450
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9496
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9637
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9712
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9571
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9575
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9579
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9308
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9696
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9579
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9517
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9458
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9550
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9571
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9633
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9558
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9679
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9521
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9579
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9654
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9500
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9579
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9554
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9667
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9708
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9513
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9663
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9571
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9375
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9642
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9383
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9508
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9604
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9471
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9546
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9400
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9688
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9517
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9471
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9608
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9675
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9571
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9667
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9592
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9483
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9517
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9613
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9675
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9383
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9321
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9587
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 4, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9517
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9329
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9508
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9546
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9563
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9633
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9554
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9483
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9321
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9496
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9067
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9467
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9442
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9554
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9454
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9617
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9487
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9350
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9267
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9525
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9479
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9592
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9642
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9583
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9621
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9654
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9708
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9583
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9625
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9712
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9517
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9504
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9721
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9712
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9637
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9712
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9642
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9812
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9429
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9500
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9692
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9613
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9646
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9671
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9633
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9729
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9721
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9717
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9679
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9729
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9754
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9613
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9688
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9712
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9613
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9675
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9692
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9683
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 128, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9571
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9625
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9546
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9563
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9379
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9629
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9487
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9454
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9571
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9433
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9733
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9379
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9446
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9537
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9654
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9596
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9458
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9529
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 8, 'd_model': 256, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9696
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9175
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.8838
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9058
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9129
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.8900
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9221
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9258
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9563
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9246
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9367
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.8554
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9508
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9533
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9104
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9379
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9521
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9071
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9454
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9517
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 32, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9558
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9558
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9567
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9658
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9537
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9592
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9758
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9679
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9487
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9546
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9304
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9637
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 512, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9579
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9579
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9608
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9525
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9558
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9650
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9600
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9771
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 64, 'dim_feedforward': 2048, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9663
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 16}




Validation Accuracy: 0.9354
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 32}




Validation Accuracy: 0.9608
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 64}




Validation Accuracy: 0.9650
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 128, 'num_layers': 1, 'batch_size': 128}




Validation Accuracy: 0.9625
Testing params: {'learning_rate': 0.001, 'weight_decay': 0.0001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 256, 'num_layers': 1, 'batch_size': 16}




Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 32}

Validation Accuracy: 0.9829

Testing params: {'learning_rate': 0.001, 'weight_decay': 0.001, 'nhead': 2, 'd_model': 128, 'dim_feedforward': 1024, 'num_layers': 1, 'batch_size': 64}

Validation Accuracy: 0.9738