# Build MNIST-1D

In [1]:
# Run this if you're in a Colab
!git clone https://github.com/greydanus/mnist1d

fatal: destination path 'mnist1d' already exists and is not an empty directory.


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import time, copy
import torch
import torch.nn.functional as F
import random
from scipy.ndimage.filters import gaussian_filter
from scipy.interpolate import interp1d
import tensorflow as tf
import sys ; sys.path.append('..')  # useful if you're running locally
import mnist1d
from numpy.random.mtrand import rand

from numpy.random.mtrand import rand
import random
from datetime import datetime

from mnist1d.data import get_templates, get_dataset_args, get_dataset
from mnist1d.train import get_model_args, train_model
from mnist1d.models import ConvBase, GRUBase, MLPBase, LinearBase
from mnist1d.utils import set_seed, plot_signals, ObjectView, from_pickle

# tqdm - package used to shoe a progress bar when loops executing
# tqdm - "progress" in arabic and obriviation for  "Te Quiero DeMaciado" 
# in Spanish (I love you so much)
from tqdm import tqdm 

from google.colab import drive
drive.mount('/content/gdrive')
project_dir = "/content/gdrive/My Drive/Research/metalearn_afunc/"

PROJECT_DIR = './'

class ObjectView(object):
    def __init__(self, d): self.__dict__ = d

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Attaching GPU if any

In [3]:
# Try attaching to GPU
DEVICE = str(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
print('Using:', DEVICE) 

Using: cpu


## Download the MNIST-1D dataset

In [4]:
args = mnist1d.get_dataset_args()
data = mnist1d.get_dataset(args, path='./mnist1d_data.pkl', download=True) # This is the default setting

print("Examples in training set: {}".format(len(data['y'])))
print("Examples in test set: {}".format(len(data['y_test'])))

print("Length of each input: {}".format(data['x'].shape[-1]))

print("Number of classes: {}".format(len(data['templates']['y'])))

train_data_size = len(data['x'])
test_data_size = len(data['x_test'])

print(train_data_size)
print(test_data_size)

File already exists. Skipping download.
Successfully loaded data from ./mnist1d_data.pkl
Examples in training set: 4000
Examples in test set: 1000
Length of each input: 40
Number of classes: 10
4000
1000


In [5]:
print(list(data))

['x', 'x_test', 'y', 'y_test', 't', 'templates']


## Initialize Variables

In [6]:
train_data_size = len(data['x'])
test_data_size = len(data['x_test'])

## Initialize the hyperparameters

In [7]:
t = 100
m = int( 0.7 * train_data_size)

## Sample t random subsets of [n] of size m: I_1, I_2, I_3...

In [8]:
# generate a random subset of indices for the training and test data
random_num_generator = np.random.RandomState(15)

# Generate subset of random indices of size m from (0,train_data_size) without replacement.
random_indices = np.random.choice(train_data_size, size = m, replace = False)

train_images = data["x"][random_indices]

In [9]:
def create_subsets(dataset, t_iterations, m_ratio):
  '''Creates a subset of a dataset with given ratio. Parameter t_iterations - number of iterations / subsets. m_ratio - size of the subset.'''

  random.seed(50)
  # Create a list of subsets
  list_of_subsets = []


  possible_choices = [item for item in range(0, train_data_size)]

  for i in range(t_iterations):
    # Create a subset structure with the same testing data
    subset = {'x':None,'y':None, 'x_test':dataset['x_test'],'y_test':dataset['y_test'],'indices':None}
    
    # Generate subset of random indices of size m from (0,train_data_size) without replacement.
    # random_indices = random_num_generator.choice(train_data_size, size = m, replace = False)
    random_indices = random.choices(possible_choices, k = m)
    
    # Save the random indices in the subset structure
    subset['indices'] = random_indices
    
    # Create a subset of training images and then of testing images / labels
    subset['x'] = dataset["x"][random_indices]
    subset['y'] = dataset["y"][random_indices]

    # Append the subset to the lists of subsets
    list_of_subsets.append(subset)

  return list_of_subsets


In [10]:
get_model_args(as_dict=True)

{'batch_size': 100,
 'checkpoint_every': 1000,
 'device': 'cpu',
 'eval_every': 250,
 'hidden_size': 256,
 'input_size': 40,
 'learning_rate': 0.01,
 'output_size': 10,
 'print_every': 1000,
 'seed': 42,
 'total_steps': 8000,
 'weight_decay': 0}

## Create all necessary subsets

In [11]:
subsets = create_subsets(data,t,m) # this should be t instead of 3


## Algorithm A

In [12]:
# get the model info
args = get_model_args()

# list to keep all the models
list_of_mlp_models = []
list_of_ConvBase_models = []

# list ot keep all the training results
trained_mlp_model_results = []
trained_ConvBase_model_results = []

def train_MLPBase_models(t,subsets,args):
  '''Creates t MLPBase models. Parameter t - number of trials.'''
  # Create and traing t models
  for k in tqdm(range(t)):
    # set the seed
    set_seed(k)

    # create a model
    model = MLPBase(args.input_size, args.output_size)
    
    # append model to the list of models
    list_of_mlp_models.append(model)

    # define the subset of data you want to use
    data_subset = subsets[k]

    # train the model
    mlp_training_results = train_model(data_subset, model, args)

    # append the results of the model
    trained_mlp_model_results.append(mlp_training_results)

  return list_of_mlp_models, trained_mlp_model_results

def train_ConvBase_models(t,subsets,args):
  '''Creates t ConvBase models. Parameter t - number of trials.'''
  # Create and traing t models
  for k in tqdm(range(t)):
    # set the seed
    #set_seed(args.seed)
    set_seed(k)

    # create a model
    model = ConvBase(output_size=args.output_size)
    
    # append model to the list of models
    list_of_ConvBase_models.append(model)

    # define the subset of data you want to use
    data_subset = subsets[k]

    # train the model
    ConvBase_training_results = train_model(data_subset, model, args)

    # append the results of the model
    trained_ConvBase_model_results.append(ConvBase_training_results)

  return list_of_ConvBase_models, trained_ConvBase_model_results



## Train the models on the subsets

In [13]:
# MLP base
list_of_mlp_models, trained_mlp_model_results = train_MLPBase_models(t,subsets,args)

# ConvBase
list_of_ConvBase_models, trained_ConvBase_model_results = train_ConvBase_models(t,subsets,args)

  0%|          | 0/100 [00:00<?, ?it/s]

Initialized MLPBase model with 15210 parameters
step 1000, dt 1.42s, train_loss 1.610e-01, test_loss 3.271e+00, train_acc 99.2, test_acc 54.1
step 2000, dt 1.42s, train_loss 2.064e-04, test_loss 4.099e+00, train_acc 100.0, test_acc 54.6
step 3000, dt 1.43s, train_loss 7.325e-05, test_loss 4.202e+00, train_acc 100.0, test_acc 53.8
step 4000, dt 1.42s, train_loss 4.667e-05, test_loss 4.295e+00, train_acc 100.0, test_acc 54.0
step 5000, dt 1.43s, train_loss 3.233e-05, test_loss 4.390e+00, train_acc 100.0, test_acc 53.9
step 6000, dt 1.45s, train_loss 1.811e-05, test_loss 4.486e+00, train_acc 100.0, test_acc 54.1
step 7000, dt 1.48s, train_loss 7.438e-06, test_loss 4.587e+00, train_acc 100.0, test_acc 54.3


  1%|          | 1/100 [00:11<19:00, 11.52s/it]

step 8000, dt 1.46s, train_loss 8.724e-06, test_loss 4.694e+00, train_acc 100.0, test_acc 54.3
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 1.235e-02, test_loss 3.211e+00, train_acc 99.8, test_acc 52.8
step 2000, dt 1.45s, train_loss 3.911e-04, test_loss 3.664e+00, train_acc 100.0, test_acc 55.5
step 3000, dt 1.38s, train_loss 1.488e-04, test_loss 3.830e+00, train_acc 100.0, test_acc 55.4
step 4000, dt 1.39s, train_loss 5.494e-05, test_loss 3.962e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.40s, train_loss 5.239e-05, test_loss 4.086e+00, train_acc 100.0, test_acc 55.4
step 6000, dt 1.46s, train_loss 2.099e-05, test_loss 4.217e+00, train_acc 100.0, test_acc 55.1
step 7000, dt 1.43s, train_loss 1.304e-05, test_loss 4.347e+00, train_acc 100.0, test_acc 55.5


  2%|▏         | 2/100 [00:22<18:34, 11.37s/it]

step 8000, dt 1.44s, train_loss 1.137e-05, test_loss 4.479e+00, train_acc 100.0, test_acc 55.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 3.875e-02, test_loss 3.683e+00, train_acc 97.9, test_acc 53.1
step 2000, dt 1.42s, train_loss 1.409e-02, test_loss 5.086e+00, train_acc 99.2, test_acc 55.6
step 3000, dt 1.44s, train_loss 1.149e-04, test_loss 5.497e+00, train_acc 100.0, test_acc 56.6
step 4000, dt 1.49s, train_loss 2.782e-05, test_loss 5.531e+00, train_acc 100.0, test_acc 56.6
step 5000, dt 1.50s, train_loss 2.454e-05, test_loss 5.568e+00, train_acc 100.0, test_acc 57.0
step 6000, dt 1.48s, train_loss 1.068e-05, test_loss 5.610e+00, train_acc 100.0, test_acc 57.0
step 7000, dt 1.48s, train_loss 5.604e-06, test_loss 5.659e+00, train_acc 100.0, test_acc 56.9


  3%|▎         | 3/100 [00:34<18:35, 11.50s/it]

step 8000, dt 1.47s, train_loss 5.014e-06, test_loss 5.714e+00, train_acc 100.0, test_acc 56.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 9.929e-02, test_loss 3.602e+00, train_acc 97.3, test_acc 52.1
step 2000, dt 1.42s, train_loss 1.595e-04, test_loss 3.996e+00, train_acc 100.0, test_acc 54.4
step 3000, dt 1.46s, train_loss 1.048e-04, test_loss 4.120e+00, train_acc 100.0, test_acc 54.5
step 4000, dt 1.44s, train_loss 5.593e-05, test_loss 4.236e+00, train_acc 100.0, test_acc 54.6
step 5000, dt 1.46s, train_loss 3.055e-05, test_loss 4.357e+00, train_acc 100.0, test_acc 54.8
step 6000, dt 1.44s, train_loss 1.821e-05, test_loss 4.480e+00, train_acc 100.0, test_acc 54.7
step 7000, dt 1.43s, train_loss 1.167e-05, test_loss 4.605e+00, train_acc 100.0, test_acc 54.5


  4%|▍         | 4/100 [00:45<18:22, 11.49s/it]

step 8000, dt 1.44s, train_loss 4.655e-06, test_loss 4.731e+00, train_acc 100.0, test_acc 54.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 2.950e-01, test_loss 3.627e+00, train_acc 97.9, test_acc 54.6
step 2000, dt 1.42s, train_loss 3.004e-04, test_loss 4.500e+00, train_acc 100.0, test_acc 56.5
step 3000, dt 1.45s, train_loss 6.298e-05, test_loss 4.607e+00, train_acc 100.0, test_acc 56.5
step 4000, dt 1.43s, train_loss 4.231e-05, test_loss 4.711e+00, train_acc 100.0, test_acc 56.6
step 5000, dt 1.41s, train_loss 1.544e-05, test_loss 4.816e+00, train_acc 100.0, test_acc 56.8
step 6000, dt 1.43s, train_loss 1.426e-05, test_loss 4.923e+00, train_acc 100.0, test_acc 56.8
step 7000, dt 1.47s, train_loss 7.839e-06, test_loss 5.036e+00, train_acc 100.0, test_acc 56.9


  5%|▌         | 5/100 [00:57<18:08, 11.46s/it]

step 8000, dt 1.44s, train_loss 5.449e-06, test_loss 5.153e+00, train_acc 100.0, test_acc 56.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 8.403e-02, test_loss 3.671e+00, train_acc 97.6, test_acc 53.9
step 2000, dt 1.42s, train_loss 3.153e-04, test_loss 3.996e+00, train_acc 100.0, test_acc 56.6
step 3000, dt 1.44s, train_loss 1.156e-04, test_loss 4.115e+00, train_acc 100.0, test_acc 56.9
step 4000, dt 1.44s, train_loss 4.073e-05, test_loss 4.224e+00, train_acc 100.0, test_acc 56.9
step 5000, dt 1.44s, train_loss 1.977e-05, test_loss 4.330e+00, train_acc 100.0, test_acc 56.7
step 6000, dt 2.23s, train_loss 1.423e-05, test_loss 4.434e+00, train_acc 100.0, test_acc 56.7
step 7000, dt 1.84s, train_loss 8.976e-06, test_loss 4.538e+00, train_acc 100.0, test_acc 56.2


  6%|▌         | 6/100 [01:09<18:34, 11.86s/it]

step 8000, dt 1.45s, train_loss 6.471e-06, test_loss 4.646e+00, train_acc 100.0, test_acc 56.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.31s, train_loss 1.239e-01, test_loss 3.687e+00, train_acc 96.2, test_acc 49.7
step 2000, dt 1.39s, train_loss 7.859e-02, test_loss 5.111e+00, train_acc 97.4, test_acc 52.0
step 3000, dt 1.42s, train_loss 8.696e-05, test_loss 4.851e+00, train_acc 100.0, test_acc 54.1
step 4000, dt 1.41s, train_loss 4.219e-05, test_loss 4.928e+00, train_acc 100.0, test_acc 54.1
step 5000, dt 1.42s, train_loss 2.443e-05, test_loss 4.996e+00, train_acc 100.0, test_acc 54.1
step 6000, dt 1.45s, train_loss 1.351e-05, test_loss 5.065e+00, train_acc 100.0, test_acc 54.2
step 7000, dt 1.48s, train_loss 8.815e-06, test_loss 5.138e+00, train_acc 100.0, test_acc 54.3


  7%|▋         | 7/100 [01:21<18:07, 11.69s/it]

step 8000, dt 1.45s, train_loss 4.253e-06, test_loss 5.217e+00, train_acc 100.0, test_acc 54.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 7.787e-02, test_loss 3.867e+00, train_acc 96.6, test_acc 50.0
step 2000, dt 1.42s, train_loss 2.342e-04, test_loss 4.464e+00, train_acc 100.0, test_acc 53.8
step 3000, dt 1.41s, train_loss 8.852e-05, test_loss 4.590e+00, train_acc 100.0, test_acc 54.3
step 4000, dt 1.42s, train_loss 5.753e-05, test_loss 4.704e+00, train_acc 100.0, test_acc 54.4
step 5000, dt 1.45s, train_loss 3.169e-05, test_loss 4.824e+00, train_acc 100.0, test_acc 53.8
step 6000, dt 1.45s, train_loss 2.527e-05, test_loss 4.949e+00, train_acc 100.0, test_acc 53.6
step 7000, dt 1.43s, train_loss 1.088e-05, test_loss 5.077e+00, train_acc 100.0, test_acc 53.7


  8%|▊         | 8/100 [01:32<17:46, 11.59s/it]

step 8000, dt 1.43s, train_loss 4.979e-06, test_loss 5.209e+00, train_acc 100.0, test_acc 54.0
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.033e-01, test_loss 3.056e+00, train_acc 98.1, test_acc 54.9
step 2000, dt 1.40s, train_loss 2.143e-04, test_loss 3.631e+00, train_acc 100.0, test_acc 56.4
step 3000, dt 1.41s, train_loss 1.161e-04, test_loss 3.686e+00, train_acc 100.0, test_acc 57.0
step 4000, dt 1.42s, train_loss 5.275e-05, test_loss 3.754e+00, train_acc 100.0, test_acc 56.9
step 5000, dt 1.43s, train_loss 2.974e-05, test_loss 3.828e+00, train_acc 100.0, test_acc 57.1
step 6000, dt 1.46s, train_loss 1.620e-05, test_loss 3.908e+00, train_acc 100.0, test_acc 57.4
step 7000, dt 1.41s, train_loss 7.841e-06, test_loss 3.994e+00, train_acc 100.0, test_acc 57.6


  9%|▉         | 9/100 [01:43<17:27, 11.51s/it]

step 8000, dt 1.43s, train_loss 8.011e-06, test_loss 4.089e+00, train_acc 100.0, test_acc 57.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 1.361e-03, test_loss 2.871e+00, train_acc 100.0, test_acc 55.9
step 2000, dt 1.35s, train_loss 3.638e-04, test_loss 3.311e+00, train_acc 100.0, test_acc 55.4
step 3000, dt 1.38s, train_loss 2.131e-04, test_loss 3.614e+00, train_acc 100.0, test_acc 55.8
step 4000, dt 1.42s, train_loss 8.854e-05, test_loss 3.861e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.41s, train_loss 3.435e-05, test_loss 4.083e+00, train_acc 100.0, test_acc 55.5
step 6000, dt 1.40s, train_loss 3.027e-05, test_loss 4.295e+00, train_acc 100.0, test_acc 55.3
step 7000, dt 1.44s, train_loss 1.282e-05, test_loss 4.500e+00, train_acc 100.0, test_acc 55.5


 10%|█         | 10/100 [01:55<17:04, 11.39s/it]

step 8000, dt 1.39s, train_loss 6.139e-06, test_loss 4.699e+00, train_acc 100.0, test_acc 55.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 4.052e-02, test_loss 3.650e+00, train_acc 98.1, test_acc 52.9
step 2000, dt 1.42s, train_loss 1.142e-01, test_loss 6.102e+00, train_acc 92.3, test_acc 48.4
step 3000, dt 1.46s, train_loss 4.195e-05, test_loss 5.238e+00, train_acc 100.0, test_acc 54.1
step 4000, dt 1.49s, train_loss 4.985e-05, test_loss 5.302e+00, train_acc 100.0, test_acc 53.7
step 5000, dt 1.44s, train_loss 2.477e-05, test_loss 5.366e+00, train_acc 100.0, test_acc 53.6
step 6000, dt 1.48s, train_loss 1.707e-05, test_loss 5.433e+00, train_acc 100.0, test_acc 53.6
step 7000, dt 1.47s, train_loss 1.081e-05, test_loss 5.507e+00, train_acc 100.0, test_acc 54.0


 11%|█         | 11/100 [02:06<17:00, 11.46s/it]

step 8000, dt 1.49s, train_loss 5.302e-06, test_loss 5.587e+00, train_acc 100.0, test_acc 54.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.606e-02, test_loss 3.435e+00, train_acc 99.4, test_acc 52.3
step 2000, dt 1.36s, train_loss 1.582e-04, test_loss 3.496e+00, train_acc 100.0, test_acc 54.6
step 3000, dt 1.40s, train_loss 7.934e-05, test_loss 3.649e+00, train_acc 100.0, test_acc 54.3
step 4000, dt 1.40s, train_loss 5.848e-05, test_loss 3.788e+00, train_acc 100.0, test_acc 54.6
step 5000, dt 1.38s, train_loss 2.919e-05, test_loss 3.918e+00, train_acc 100.0, test_acc 54.2
step 6000, dt 1.40s, train_loss 1.404e-05, test_loss 4.049e+00, train_acc 100.0, test_acc 54.9
step 7000, dt 1.38s, train_loss 6.339e-06, test_loss 4.184e+00, train_acc 100.0, test_acc 54.8


 12%|█▏        | 12/100 [02:17<16:37, 11.34s/it]

step 8000, dt 1.38s, train_loss 4.634e-06, test_loss 4.322e+00, train_acc 100.0, test_acc 54.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 2.050e-03, test_loss 3.143e+00, train_acc 100.0, test_acc 54.9
step 2000, dt 1.39s, train_loss 6.368e-04, test_loss 3.586e+00, train_acc 100.0, test_acc 54.6
step 3000, dt 1.41s, train_loss 1.455e-04, test_loss 3.896e+00, train_acc 100.0, test_acc 55.0
step 4000, dt 1.41s, train_loss 1.239e-04, test_loss 4.159e+00, train_acc 100.0, test_acc 55.0
step 5000, dt 1.42s, train_loss 3.145e-05, test_loss 4.400e+00, train_acc 100.0, test_acc 55.1
step 6000, dt 1.45s, train_loss 3.298e-05, test_loss 4.624e+00, train_acc 100.0, test_acc 55.2
step 7000, dt 1.44s, train_loss 1.278e-05, test_loss 4.840e+00, train_acc 100.0, test_acc 55.3


 13%|█▎        | 13/100 [02:29<16:25, 11.33s/it]

step 8000, dt 1.43s, train_loss 8.577e-06, test_loss 5.054e+00, train_acc 100.0, test_acc 55.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 1.212e-02, test_loss 3.517e+00, train_acc 99.6, test_acc 54.2
step 2000, dt 1.39s, train_loss 3.122e-04, test_loss 3.970e+00, train_acc 100.0, test_acc 55.7
step 3000, dt 1.45s, train_loss 9.615e-05, test_loss 4.123e+00, train_acc 100.0, test_acc 56.5
step 4000, dt 1.42s, train_loss 3.854e-05, test_loss 4.256e+00, train_acc 100.0, test_acc 56.4
step 5000, dt 1.40s, train_loss 2.961e-05, test_loss 4.388e+00, train_acc 100.0, test_acc 56.9
step 6000, dt 1.42s, train_loss 1.562e-05, test_loss 4.513e+00, train_acc 100.0, test_acc 56.4
step 7000, dt 1.42s, train_loss 8.293e-06, test_loss 4.639e+00, train_acc 100.0, test_acc 56.5


 14%|█▍        | 14/100 [02:40<16:13, 11.32s/it]

step 8000, dt 1.46s, train_loss 6.587e-06, test_loss 4.770e+00, train_acc 100.0, test_acc 56.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.38s, train_loss 1.485e-01, test_loss 3.963e+00, train_acc 95.9, test_acc 51.6
step 2000, dt 1.42s, train_loss 2.518e-04, test_loss 4.293e+00, train_acc 100.0, test_acc 54.2
step 3000, dt 1.40s, train_loss 1.129e-04, test_loss 4.449e+00, train_acc 100.0, test_acc 54.6
step 4000, dt 1.43s, train_loss 9.039e-05, test_loss 4.580e+00, train_acc 100.0, test_acc 54.6
step 5000, dt 1.42s, train_loss 4.737e-05, test_loss 4.709e+00, train_acc 100.0, test_acc 54.8
step 6000, dt 1.43s, train_loss 1.256e-05, test_loss 4.840e+00, train_acc 100.0, test_acc 55.1
step 7000, dt 1.48s, train_loss 1.128e-05, test_loss 4.977e+00, train_acc 100.0, test_acc 55.1


 15%|█▌        | 15/100 [02:51<16:04, 11.35s/it]

step 8000, dt 1.43s, train_loss 8.398e-06, test_loss 5.112e+00, train_acc 100.0, test_acc 54.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 6.470e-02, test_loss 3.377e+00, train_acc 97.6, test_acc 54.2
step 2000, dt 1.43s, train_loss 2.227e-04, test_loss 4.589e+00, train_acc 100.0, test_acc 55.5
step 3000, dt 1.45s, train_loss 5.419e-05, test_loss 4.724e+00, train_acc 100.0, test_acc 55.5
step 4000, dt 1.46s, train_loss 5.441e-05, test_loss 4.834e+00, train_acc 100.0, test_acc 55.4
step 5000, dt 1.45s, train_loss 2.746e-05, test_loss 4.938e+00, train_acc 100.0, test_acc 55.4
step 6000, dt 1.43s, train_loss 2.379e-05, test_loss 5.044e+00, train_acc 100.0, test_acc 55.3
step 7000, dt 1.47s, train_loss 1.122e-05, test_loss 5.157e+00, train_acc 100.0, test_acc 55.8


 16%|█▌        | 16/100 [03:03<15:57, 11.40s/it]

step 8000, dt 1.47s, train_loss 8.662e-06, test_loss 5.279e+00, train_acc 100.0, test_acc 55.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 4.164e-03, test_loss 3.466e+00, train_acc 99.1, test_acc 54.3
step 2000, dt 1.39s, train_loss 2.197e-04, test_loss 4.274e+00, train_acc 100.0, test_acc 56.8
step 3000, dt 1.39s, train_loss 5.751e-05, test_loss 4.326e+00, train_acc 100.0, test_acc 57.1
step 4000, dt 1.44s, train_loss 2.520e-05, test_loss 4.389e+00, train_acc 100.0, test_acc 57.1
step 5000, dt 1.46s, train_loss 2.383e-05, test_loss 4.460e+00, train_acc 100.0, test_acc 56.9
step 6000, dt 1.40s, train_loss 1.265e-05, test_loss 4.540e+00, train_acc 100.0, test_acc 56.8
step 7000, dt 1.40s, train_loss 5.963e-06, test_loss 4.623e+00, train_acc 100.0, test_acc 56.8


 17%|█▋        | 17/100 [03:14<15:43, 11.37s/it]

step 8000, dt 1.47s, train_loss 5.667e-06, test_loss 4.713e+00, train_acc 100.0, test_acc 56.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 6.927e-02, test_loss 3.733e+00, train_acc 98.7, test_acc 55.0
step 2000, dt 1.38s, train_loss 2.094e-04, test_loss 4.760e+00, train_acc 100.0, test_acc 56.7
step 3000, dt 1.42s, train_loss 8.630e-05, test_loss 4.856e+00, train_acc 100.0, test_acc 56.6
step 4000, dt 1.44s, train_loss 2.993e-05, test_loss 4.942e+00, train_acc 100.0, test_acc 57.2
step 5000, dt 1.48s, train_loss 2.483e-05, test_loss 5.027e+00, train_acc 100.0, test_acc 57.0
step 6000, dt 1.47s, train_loss 1.437e-05, test_loss 5.116e+00, train_acc 100.0, test_acc 56.9
step 7000, dt 1.48s, train_loss 8.697e-06, test_loss 5.206e+00, train_acc 100.0, test_acc 56.5


 18%|█▊        | 18/100 [03:26<15:34, 11.40s/it]

step 8000, dt 1.47s, train_loss 4.498e-06, test_loss 5.300e+00, train_acc 100.0, test_acc 57.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 1.339e-01, test_loss 3.752e+00, train_acc 96.2, test_acc 51.5
step 2000, dt 1.43s, train_loss 1.793e-04, test_loss 3.760e+00, train_acc 100.0, test_acc 55.4
step 3000, dt 1.42s, train_loss 1.054e-04, test_loss 3.904e+00, train_acc 100.0, test_acc 55.3
step 4000, dt 1.40s, train_loss 8.375e-05, test_loss 4.040e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.40s, train_loss 3.683e-05, test_loss 4.176e+00, train_acc 100.0, test_acc 55.7
step 6000, dt 1.45s, train_loss 1.607e-05, test_loss 4.310e+00, train_acc 100.0, test_acc 56.3
step 7000, dt 1.46s, train_loss 1.286e-05, test_loss 4.445e+00, train_acc 100.0, test_acc 56.1


 19%|█▉        | 19/100 [03:37<15:21, 11.38s/it]

step 8000, dt 1.43s, train_loss 4.574e-06, test_loss 4.584e+00, train_acc 100.0, test_acc 56.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.376e-02, test_loss 3.475e+00, train_acc 98.1, test_acc 53.7
step 2000, dt 1.43s, train_loss 3.850e-04, test_loss 4.040e+00, train_acc 100.0, test_acc 55.6
step 3000, dt 1.48s, train_loss 1.277e-04, test_loss 4.142e+00, train_acc 100.0, test_acc 56.0
step 4000, dt 1.46s, train_loss 3.058e-05, test_loss 4.239e+00, train_acc 100.0, test_acc 56.0
step 5000, dt 1.48s, train_loss 2.073e-05, test_loss 4.333e+00, train_acc 100.0, test_acc 56.2
step 6000, dt 1.45s, train_loss 1.066e-05, test_loss 4.429e+00, train_acc 100.0, test_acc 56.2
step 7000, dt 1.44s, train_loss 1.145e-05, test_loss 4.533e+00, train_acc 100.0, test_acc 56.2


 20%|██        | 20/100 [03:49<15:15, 11.44s/it]

step 8000, dt 1.46s, train_loss 6.423e-06, test_loss 4.638e+00, train_acc 100.0, test_acc 56.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 1.600e-01, test_loss 4.024e+00, train_acc 97.0, test_acc 50.5
step 2000, dt 1.42s, train_loss 2.867e-04, test_loss 5.300e+00, train_acc 100.0, test_acc 52.3
step 3000, dt 1.45s, train_loss 6.217e-05, test_loss 5.367e+00, train_acc 100.0, test_acc 52.9
step 4000, dt 1.48s, train_loss 2.574e-05, test_loss 5.435e+00, train_acc 100.0, test_acc 53.4
step 5000, dt 1.46s, train_loss 1.862e-05, test_loss 5.511e+00, train_acc 100.0, test_acc 53.1
step 6000, dt 1.43s, train_loss 1.330e-05, test_loss 5.592e+00, train_acc 100.0, test_acc 52.7
step 7000, dt 1.46s, train_loss 8.944e-06, test_loss 5.682e+00, train_acc 100.0, test_acc 52.8


 21%|██        | 21/100 [04:00<15:04, 11.45s/it]

step 8000, dt 1.44s, train_loss 3.657e-06, test_loss 5.781e+00, train_acc 100.0, test_acc 52.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 4.427e-02, test_loss 3.660e+00, train_acc 96.4, test_acc 52.8
step 2000, dt 1.39s, train_loss 2.099e-02, test_loss 5.203e+00, train_acc 97.5, test_acc 52.7
step 3000, dt 1.44s, train_loss 5.739e-05, test_loss 5.107e+00, train_acc 100.0, test_acc 54.7
step 4000, dt 1.43s, train_loss 5.108e-05, test_loss 5.154e+00, train_acc 100.0, test_acc 55.2
step 5000, dt 1.44s, train_loss 1.507e-05, test_loss 5.205e+00, train_acc 100.0, test_acc 55.4
step 6000, dt 1.42s, train_loss 6.391e-06, test_loss 5.259e+00, train_acc 100.0, test_acc 55.3
step 7000, dt 1.44s, train_loss 9.486e-06, test_loss 5.322e+00, train_acc 100.0, test_acc 55.4


 22%|██▏       | 22/100 [04:11<14:50, 11.42s/it]

step 8000, dt 1.45s, train_loss 5.177e-06, test_loss 5.392e+00, train_acc 100.0, test_acc 55.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.31s, train_loss 2.600e-01, test_loss 3.267e+00, train_acc 93.1, test_acc 51.1
step 2000, dt 1.41s, train_loss 3.644e-04, test_loss 3.457e+00, train_acc 100.0, test_acc 56.4
step 3000, dt 1.39s, train_loss 1.453e-04, test_loss 3.671e+00, train_acc 100.0, test_acc 56.8
step 4000, dt 1.42s, train_loss 9.943e-05, test_loss 3.846e+00, train_acc 100.0, test_acc 56.6
step 5000, dt 1.41s, train_loss 4.284e-05, test_loss 4.007e+00, train_acc 100.0, test_acc 56.6
step 6000, dt 1.37s, train_loss 2.088e-05, test_loss 4.165e+00, train_acc 100.0, test_acc 56.6
step 7000, dt 1.40s, train_loss 1.171e-05, test_loss 4.325e+00, train_acc 100.0, test_acc 56.7


 23%|██▎       | 23/100 [04:22<14:32, 11.33s/it]

step 8000, dt 1.42s, train_loss 9.136e-06, test_loss 4.485e+00, train_acc 100.0, test_acc 56.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.38s, train_loss 1.948e-03, test_loss 3.014e+00, train_acc 100.0, test_acc 56.6
step 2000, dt 1.38s, train_loss 3.521e-04, test_loss 3.354e+00, train_acc 100.0, test_acc 57.0
step 3000, dt 1.40s, train_loss 1.597e-04, test_loss 3.597e+00, train_acc 100.0, test_acc 56.6
step 4000, dt 1.39s, train_loss 8.899e-05, test_loss 3.813e+00, train_acc 100.0, test_acc 56.4
step 5000, dt 1.41s, train_loss 4.307e-05, test_loss 4.017e+00, train_acc 100.0, test_acc 56.1
step 6000, dt 1.42s, train_loss 2.654e-05, test_loss 4.209e+00, train_acc 100.0, test_acc 56.2
step 7000, dt 1.44s, train_loss 8.942e-06, test_loss 4.401e+00, train_acc 100.0, test_acc 56.1


 24%|██▍       | 24/100 [04:34<14:19, 11.30s/it]

step 8000, dt 1.41s, train_loss 5.784e-06, test_loss 4.589e+00, train_acc 100.0, test_acc 56.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 1.377e-01, test_loss 3.248e+00, train_acc 94.9, test_acc 53.6
step 2000, dt 1.38s, train_loss 2.724e-04, test_loss 3.791e+00, train_acc 100.0, test_acc 56.6
step 3000, dt 1.43s, train_loss 5.769e-05, test_loss 3.906e+00, train_acc 100.0, test_acc 56.9
step 4000, dt 1.41s, train_loss 6.013e-05, test_loss 4.008e+00, train_acc 100.0, test_acc 56.8
step 5000, dt 1.43s, train_loss 2.599e-05, test_loss 4.115e+00, train_acc 100.0, test_acc 56.6
step 6000, dt 1.46s, train_loss 2.594e-05, test_loss 4.231e+00, train_acc 100.0, test_acc 56.6
step 7000, dt 1.46s, train_loss 9.223e-06, test_loss 4.353e+00, train_acc 100.0, test_acc 56.6


 25%|██▌       | 25/100 [04:45<14:09, 11.33s/it]

step 8000, dt 1.45s, train_loss 5.990e-06, test_loss 4.478e+00, train_acc 100.0, test_acc 56.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 5.313e-02, test_loss 3.599e+00, train_acc 98.6, test_acc 51.4
step 2000, dt 1.40s, train_loss 1.897e-04, test_loss 3.817e+00, train_acc 100.0, test_acc 55.3
step 3000, dt 1.39s, train_loss 9.331e-05, test_loss 3.984e+00, train_acc 100.0, test_acc 55.3
step 4000, dt 1.41s, train_loss 3.921e-05, test_loss 4.129e+00, train_acc 100.0, test_acc 55.2
step 5000, dt 1.43s, train_loss 3.318e-05, test_loss 4.270e+00, train_acc 100.0, test_acc 54.8
step 6000, dt 1.39s, train_loss 1.240e-05, test_loss 4.415e+00, train_acc 100.0, test_acc 54.4
step 7000, dt 1.42s, train_loss 8.548e-06, test_loss 4.565e+00, train_acc 100.0, test_acc 54.7


 26%|██▌       | 26/100 [04:56<13:55, 11.30s/it]

step 8000, dt 1.41s, train_loss 7.006e-06, test_loss 4.718e+00, train_acc 100.0, test_acc 54.3
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.31s, train_loss 3.388e-02, test_loss 3.471e+00, train_acc 98.3, test_acc 52.1
step 2000, dt 1.39s, train_loss 2.693e-04, test_loss 4.285e+00, train_acc 100.0, test_acc 54.1
step 3000, dt 1.46s, train_loss 8.497e-05, test_loss 4.434e+00, train_acc 100.0, test_acc 54.8
step 4000, dt 1.42s, train_loss 5.624e-05, test_loss 4.570e+00, train_acc 100.0, test_acc 54.8
step 5000, dt 1.45s, train_loss 3.258e-05, test_loss 4.700e+00, train_acc 100.0, test_acc 54.7
step 6000, dt 1.45s, train_loss 1.843e-05, test_loss 4.832e+00, train_acc 100.0, test_acc 54.5
step 7000, dt 1.45s, train_loss 1.239e-05, test_loss 4.968e+00, train_acc 100.0, test_acc 54.6


 27%|██▋       | 27/100 [05:08<13:46, 11.32s/it]

step 8000, dt 1.45s, train_loss 7.653e-06, test_loss 5.110e+00, train_acc 100.0, test_acc 54.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 3.898e-02, test_loss 3.617e+00, train_acc 97.3, test_acc 54.8
step 2000, dt 1.39s, train_loss 2.347e-04, test_loss 4.154e+00, train_acc 100.0, test_acc 55.0
step 3000, dt 1.42s, train_loss 9.042e-05, test_loss 4.262e+00, train_acc 100.0, test_acc 54.7
step 4000, dt 1.42s, train_loss 8.896e-05, test_loss 4.367e+00, train_acc 100.0, test_acc 54.6
step 5000, dt 1.46s, train_loss 3.496e-05, test_loss 4.474e+00, train_acc 100.0, test_acc 54.7
step 6000, dt 1.42s, train_loss 1.562e-05, test_loss 4.582e+00, train_acc 100.0, test_acc 54.4
step 7000, dt 1.42s, train_loss 7.597e-06, test_loss 4.693e+00, train_acc 100.0, test_acc 54.5


 28%|██▊       | 28/100 [05:19<13:35, 11.32s/it]

step 8000, dt 1.44s, train_loss 6.974e-06, test_loss 4.808e+00, train_acc 100.0, test_acc 54.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 1.278e-01, test_loss 3.433e+00, train_acc 96.9, test_acc 53.6
step 2000, dt 1.38s, train_loss 1.851e-04, test_loss 3.525e+00, train_acc 100.0, test_acc 58.5
step 3000, dt 1.38s, train_loss 9.604e-05, test_loss 3.673e+00, train_acc 100.0, test_acc 58.5
step 4000, dt 1.40s, train_loss 4.222e-05, test_loss 3.806e+00, train_acc 100.0, test_acc 58.9
step 5000, dt 1.40s, train_loss 2.442e-05, test_loss 3.939e+00, train_acc 100.0, test_acc 58.7
step 6000, dt 1.41s, train_loss 2.121e-05, test_loss 4.069e+00, train_acc 100.0, test_acc 59.0
step 7000, dt 1.43s, train_loss 8.306e-06, test_loss 4.200e+00, train_acc 100.0, test_acc 58.8


 29%|██▉       | 29/100 [05:30<13:20, 11.27s/it]

step 8000, dt 1.41s, train_loss 9.009e-06, test_loss 4.328e+00, train_acc 100.0, test_acc 58.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.31s, train_loss 1.268e-03, test_loss 2.817e+00, train_acc 100.0, test_acc 57.1
step 2000, dt 1.36s, train_loss 2.704e-04, test_loss 3.140e+00, train_acc 100.0, test_acc 57.1
step 3000, dt 1.41s, train_loss 1.376e-04, test_loss 3.366e+00, train_acc 100.0, test_acc 57.6
step 4000, dt 1.37s, train_loss 5.186e-05, test_loss 3.565e+00, train_acc 100.0, test_acc 57.7
step 5000, dt 1.35s, train_loss 3.595e-05, test_loss 3.749e+00, train_acc 100.0, test_acc 57.7
step 6000, dt 1.36s, train_loss 2.424e-05, test_loss 3.925e+00, train_acc 100.0, test_acc 57.5
step 7000, dt 1.37s, train_loss 1.219e-05, test_loss 4.095e+00, train_acc 100.0, test_acc 57.6


 30%|███       | 30/100 [05:41<13:02, 11.17s/it]

step 8000, dt 1.41s, train_loss 6.952e-06, test_loss 4.261e+00, train_acc 100.0, test_acc 57.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 4.073e-02, test_loss 3.696e+00, train_acc 98.0, test_acc 52.3
step 2000, dt 1.39s, train_loss 2.103e-04, test_loss 3.944e+00, train_acc 100.0, test_acc 54.9
step 3000, dt 1.42s, train_loss 8.636e-05, test_loss 4.102e+00, train_acc 100.0, test_acc 55.0
step 4000, dt 1.42s, train_loss 5.627e-05, test_loss 4.246e+00, train_acc 100.0, test_acc 54.9
step 5000, dt 1.43s, train_loss 2.981e-05, test_loss 4.383e+00, train_acc 100.0, test_acc 55.1
step 6000, dt 1.43s, train_loss 1.468e-05, test_loss 4.523e+00, train_acc 100.0, test_acc 55.4
step 7000, dt 1.45s, train_loss 1.618e-05, test_loss 4.666e+00, train_acc 100.0, test_acc 55.2


 31%|███       | 31/100 [05:52<12:53, 11.21s/it]

step 8000, dt 1.41s, train_loss 7.354e-06, test_loss 4.812e+00, train_acc 100.0, test_acc 55.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 1.691e-01, test_loss 3.596e+00, train_acc 94.3, test_acc 53.3
step 2000, dt 1.40s, train_loss 3.078e-04, test_loss 3.750e+00, train_acc 100.0, test_acc 55.1
step 3000, dt 1.41s, train_loss 1.221e-04, test_loss 3.923e+00, train_acc 100.0, test_acc 54.9
step 4000, dt 1.38s, train_loss 6.447e-05, test_loss 4.081e+00, train_acc 100.0, test_acc 54.9
step 5000, dt 1.41s, train_loss 4.289e-05, test_loss 4.237e+00, train_acc 100.0, test_acc 54.6
step 6000, dt 1.41s, train_loss 2.239e-05, test_loss 4.388e+00, train_acc 100.0, test_acc 54.5
step 7000, dt 1.40s, train_loss 1.349e-05, test_loss 4.536e+00, train_acc 100.0, test_acc 54.9


 32%|███▏      | 32/100 [06:04<12:41, 11.20s/it]

step 8000, dt 1.42s, train_loss 7.460e-06, test_loss 4.686e+00, train_acc 100.0, test_acc 55.0
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.982e-03, test_loss 2.780e+00, train_acc 100.0, test_acc 54.8
step 2000, dt 1.39s, train_loss 3.392e-04, test_loss 3.191e+00, train_acc 100.0, test_acc 53.8
step 3000, dt 1.35s, train_loss 1.749e-04, test_loss 3.475e+00, train_acc 100.0, test_acc 54.0
step 4000, dt 1.36s, train_loss 7.352e-05, test_loss 3.711e+00, train_acc 100.0, test_acc 54.3
step 5000, dt 1.36s, train_loss 3.974e-05, test_loss 3.925e+00, train_acc 100.0, test_acc 54.4
step 6000, dt 1.40s, train_loss 2.539e-05, test_loss 4.129e+00, train_acc 100.0, test_acc 54.3
step 7000, dt 1.41s, train_loss 1.465e-05, test_loss 4.331e+00, train_acc 100.0, test_acc 54.3


 33%|███▎      | 33/100 [06:15<12:26, 11.15s/it]

step 8000, dt 1.40s, train_loss 9.012e-06, test_loss 4.527e+00, train_acc 100.0, test_acc 54.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 3.019e-02, test_loss 3.639e+00, train_acc 99.0, test_acc 52.9
step 2000, dt 1.40s, train_loss 2.656e-04, test_loss 3.886e+00, train_acc 100.0, test_acc 54.4
step 3000, dt 1.37s, train_loss 8.737e-05, test_loss 4.054e+00, train_acc 100.0, test_acc 54.5
step 4000, dt 1.41s, train_loss 5.515e-05, test_loss 4.202e+00, train_acc 100.0, test_acc 54.8
step 5000, dt 1.43s, train_loss 4.952e-05, test_loss 4.342e+00, train_acc 100.0, test_acc 55.1
step 6000, dt 1.42s, train_loss 2.682e-05, test_loss 4.486e+00, train_acc 100.0, test_acc 55.3
step 7000, dt 1.42s, train_loss 9.990e-06, test_loss 4.633e+00, train_acc 100.0, test_acc 55.0


 34%|███▍      | 34/100 [06:26<12:18, 11.18s/it]

step 8000, dt 1.45s, train_loss 6.208e-06, test_loss 4.778e+00, train_acc 100.0, test_acc 54.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 4.559e-01, test_loss 3.473e+00, train_acc 94.2, test_acc 51.2
step 2000, dt 1.39s, train_loss 3.273e-04, test_loss 3.605e+00, train_acc 100.0, test_acc 56.0
step 3000, dt 1.40s, train_loss 1.319e-04, test_loss 3.775e+00, train_acc 100.0, test_acc 56.2
step 4000, dt 1.39s, train_loss 6.270e-05, test_loss 3.924e+00, train_acc 100.0, test_acc 56.4
step 5000, dt 1.42s, train_loss 2.940e-05, test_loss 4.061e+00, train_acc 100.0, test_acc 56.3
step 6000, dt 1.42s, train_loss 1.956e-05, test_loss 4.192e+00, train_acc 100.0, test_acc 56.6
step 7000, dt 1.45s, train_loss 1.102e-05, test_loss 4.322e+00, train_acc 100.0, test_acc 56.4


 35%|███▌      | 35/100 [06:37<12:09, 11.22s/it]

step 8000, dt 1.47s, train_loss 5.259e-06, test_loss 4.457e+00, train_acc 100.0, test_acc 56.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 7.727e-02, test_loss 3.318e+00, train_acc 98.1, test_acc 52.8
step 2000, dt 1.42s, train_loss 2.591e-01, test_loss 5.304e+00, train_acc 96.7, test_acc 53.0
step 3000, dt 1.48s, train_loss 5.334e-05, test_loss 4.923e+00, train_acc 100.0, test_acc 57.0
step 4000, dt 1.47s, train_loss 3.640e-05, test_loss 4.969e+00, train_acc 100.0, test_acc 56.8
step 5000, dt 1.46s, train_loss 1.750e-05, test_loss 5.020e+00, train_acc 100.0, test_acc 56.9
step 6000, dt 1.56s, train_loss 9.262e-06, test_loss 5.076e+00, train_acc 100.0, test_acc 56.8
step 7000, dt 1.46s, train_loss 7.503e-06, test_loss 5.140e+00, train_acc 100.0, test_acc 57.3


 36%|███▌      | 36/100 [06:49<12:07, 11.37s/it]

step 8000, dt 1.46s, train_loss 4.364e-06, test_loss 5.213e+00, train_acc 100.0, test_acc 57.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 1.363e-01, test_loss 3.554e+00, train_acc 98.6, test_acc 56.5
step 2000, dt 1.42s, train_loss 1.395e-03, test_loss 5.091e+00, train_acc 99.9, test_acc 56.4
step 3000, dt 1.44s, train_loss 3.663e-05, test_loss 5.045e+00, train_acc 100.0, test_acc 56.8
step 4000, dt 1.44s, train_loss 1.834e-05, test_loss 5.107e+00, train_acc 100.0, test_acc 57.4
step 5000, dt 1.51s, train_loss 1.826e-05, test_loss 5.172e+00, train_acc 100.0, test_acc 57.8
step 6000, dt 1.48s, train_loss 1.322e-05, test_loss 5.241e+00, train_acc 100.0, test_acc 57.9
step 7000, dt 1.50s, train_loss 4.889e-06, test_loss 5.315e+00, train_acc 100.0, test_acc 57.6


 37%|███▋      | 37/100 [07:01<12:01, 11.45s/it]

step 8000, dt 1.46s, train_loss 3.769e-06, test_loss 5.397e+00, train_acc 100.0, test_acc 57.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 2.258e-03, test_loss 2.982e+00, train_acc 100.0, test_acc 55.9
step 2000, dt 1.40s, train_loss 2.765e-04, test_loss 3.388e+00, train_acc 100.0, test_acc 55.8
step 3000, dt 1.40s, train_loss 1.493e-04, test_loss 3.656e+00, train_acc 100.0, test_acc 56.2
step 4000, dt 1.42s, train_loss 6.683e-05, test_loss 3.885e+00, train_acc 100.0, test_acc 56.3
step 5000, dt 1.37s, train_loss 3.546e-05, test_loss 4.094e+00, train_acc 100.0, test_acc 56.3
step 6000, dt 1.40s, train_loss 1.951e-05, test_loss 4.292e+00, train_acc 100.0, test_acc 56.9
step 7000, dt 1.39s, train_loss 1.024e-05, test_loss 4.483e+00, train_acc 100.0, test_acc 57.0


 38%|███▊      | 38/100 [07:12<11:44, 11.37s/it]

step 8000, dt 1.43s, train_loss 8.529e-06, test_loss 4.669e+00, train_acc 100.0, test_acc 56.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.483e-02, test_loss 3.472e+00, train_acc 99.8, test_acc 54.5
step 2000, dt 1.41s, train_loss 1.642e-04, test_loss 4.153e+00, train_acc 100.0, test_acc 52.4
step 3000, dt 1.39s, train_loss 1.010e-04, test_loss 4.288e+00, train_acc 100.0, test_acc 52.8
step 4000, dt 1.37s, train_loss 4.817e-05, test_loss 4.408e+00, train_acc 100.0, test_acc 53.0
step 5000, dt 1.40s, train_loss 3.526e-05, test_loss 4.527e+00, train_acc 100.0, test_acc 53.0
step 6000, dt 1.40s, train_loss 2.105e-05, test_loss 4.647e+00, train_acc 100.0, test_acc 53.1
step 7000, dt 1.44s, train_loss 9.674e-06, test_loss 4.771e+00, train_acc 100.0, test_acc 53.2


 39%|███▉      | 39/100 [07:23<11:30, 11.31s/it]

step 8000, dt 1.43s, train_loss 8.663e-06, test_loss 4.899e+00, train_acc 100.0, test_acc 53.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.237e-01, test_loss 3.617e+00, train_acc 97.2, test_acc 52.9
step 2000, dt 1.42s, train_loss 2.188e-04, test_loss 3.983e+00, train_acc 100.0, test_acc 55.9
step 3000, dt 1.45s, train_loss 7.936e-05, test_loss 4.129e+00, train_acc 100.0, test_acc 55.8
step 4000, dt 1.43s, train_loss 4.970e-05, test_loss 4.261e+00, train_acc 100.0, test_acc 55.9
step 5000, dt 1.43s, train_loss 2.921e-05, test_loss 4.391e+00, train_acc 100.0, test_acc 56.0
step 6000, dt 1.43s, train_loss 1.954e-05, test_loss 4.521e+00, train_acc 100.0, test_acc 55.7
step 7000, dt 1.43s, train_loss 1.062e-05, test_loss 4.652e+00, train_acc 100.0, test_acc 55.8


 40%|████      | 40/100 [07:35<11:23, 11.40s/it]

step 8000, dt 1.64s, train_loss 7.711e-06, test_loss 4.786e+00, train_acc 100.0, test_acc 55.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.957e-01, test_loss 3.211e+00, train_acc 94.9, test_acc 52.8
step 2000, dt 1.36s, train_loss 9.937e-05, test_loss 3.957e+00, train_acc 100.0, test_acc 54.9
step 3000, dt 1.40s, train_loss 7.542e-05, test_loss 4.093e+00, train_acc 100.0, test_acc 55.9
step 4000, dt 1.43s, train_loss 3.758e-05, test_loss 4.221e+00, train_acc 100.0, test_acc 56.1
step 5000, dt 1.43s, train_loss 2.142e-05, test_loss 4.343e+00, train_acc 100.0, test_acc 56.4
step 6000, dt 1.43s, train_loss 1.723e-05, test_loss 4.469e+00, train_acc 100.0, test_acc 56.3
step 7000, dt 1.45s, train_loss 8.049e-06, test_loss 4.595e+00, train_acc 100.0, test_acc 56.1


 41%|████      | 41/100 [07:46<11:10, 11.37s/it]

step 8000, dt 1.42s, train_loss 5.760e-06, test_loss 4.724e+00, train_acc 100.0, test_acc 56.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.39s, train_loss 1.818e-01, test_loss 3.292e+00, train_acc 98.2, test_acc 52.0
step 2000, dt 1.42s, train_loss 2.008e-02, test_loss 4.156e+00, train_acc 99.3, test_acc 57.2
step 3000, dt 1.48s, train_loss 1.321e-04, test_loss 4.406e+00, train_acc 100.0, test_acc 56.5
step 4000, dt 1.44s, train_loss 7.477e-05, test_loss 4.494e+00, train_acc 100.0, test_acc 57.1
step 5000, dt 1.45s, train_loss 2.227e-05, test_loss 4.582e+00, train_acc 100.0, test_acc 57.3
step 6000, dt 1.46s, train_loss 1.608e-05, test_loss 4.668e+00, train_acc 100.0, test_acc 57.3
step 7000, dt 1.46s, train_loss 1.144e-05, test_loss 4.755e+00, train_acc 100.0, test_acc 56.7


 42%|████▏     | 42/100 [07:57<11:03, 11.44s/it]

step 8000, dt 1.49s, train_loss 8.628e-06, test_loss 4.843e+00, train_acc 100.0, test_acc 56.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 1.313e-01, test_loss 3.559e+00, train_acc 95.9, test_acc 51.5
step 2000, dt 1.39s, train_loss 2.051e-04, test_loss 3.881e+00, train_acc 100.0, test_acc 55.5
step 3000, dt 1.44s, train_loss 1.180e-04, test_loss 4.037e+00, train_acc 100.0, test_acc 55.2
step 4000, dt 1.42s, train_loss 2.943e-05, test_loss 4.180e+00, train_acc 100.0, test_acc 55.3
step 5000, dt 1.44s, train_loss 4.995e-05, test_loss 4.315e+00, train_acc 100.0, test_acc 55.4
step 6000, dt 1.40s, train_loss 2.377e-05, test_loss 4.444e+00, train_acc 100.0, test_acc 55.2
step 7000, dt 1.44s, train_loss 9.970e-06, test_loss 4.573e+00, train_acc 100.0, test_acc 55.3


 43%|████▎     | 43/100 [08:09<10:50, 11.41s/it]

step 8000, dt 1.45s, train_loss 5.144e-06, test_loss 4.705e+00, train_acc 100.0, test_acc 55.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.966e-01, test_loss 3.764e+00, train_acc 97.3, test_acc 51.4
step 2000, dt 1.41s, train_loss 6.812e-02, test_loss 5.051e+00, train_acc 98.0, test_acc 53.3
step 3000, dt 1.44s, train_loss 6.141e-05, test_loss 5.243e+00, train_acc 100.0, test_acc 55.5
step 4000, dt 1.46s, train_loss 3.885e-05, test_loss 5.282e+00, train_acc 100.0, test_acc 55.1
step 5000, dt 1.48s, train_loss 1.556e-05, test_loss 5.325e+00, train_acc 100.0, test_acc 54.9
step 6000, dt 1.52s, train_loss 1.657e-05, test_loss 5.373e+00, train_acc 100.0, test_acc 54.7
step 7000, dt 1.46s, train_loss 7.392e-06, test_loss 5.429e+00, train_acc 100.0, test_acc 54.6


 44%|████▍     | 44/100 [08:20<10:42, 11.47s/it]

step 8000, dt 1.46s, train_loss 6.279e-06, test_loss 5.488e+00, train_acc 100.0, test_acc 54.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.088e-01, test_loss 3.783e+00, train_acc 96.4, test_acc 52.7
step 2000, dt 1.43s, train_loss 2.877e-04, test_loss 4.578e+00, train_acc 100.0, test_acc 55.3
step 3000, dt 1.41s, train_loss 8.894e-05, test_loss 4.640e+00, train_acc 100.0, test_acc 54.9
step 4000, dt 1.42s, train_loss 3.551e-05, test_loss 4.718e+00, train_acc 100.0, test_acc 55.9
step 5000, dt 1.45s, train_loss 1.985e-05, test_loss 4.801e+00, train_acc 100.0, test_acc 56.7
step 6000, dt 1.43s, train_loss 1.089e-05, test_loss 4.885e+00, train_acc 100.0, test_acc 56.7
step 7000, dt 1.43s, train_loss 5.920e-06, test_loss 4.973e+00, train_acc 100.0, test_acc 56.8


 45%|████▌     | 45/100 [08:32<10:28, 11.43s/it]

step 8000, dt 1.41s, train_loss 5.419e-06, test_loss 5.063e+00, train_acc 100.0, test_acc 56.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 1.383e-01, test_loss 3.442e+00, train_acc 95.4, test_acc 52.8
step 2000, dt 1.42s, train_loss 3.477e-02, test_loss 5.244e+00, train_acc 99.3, test_acc 53.7
step 3000, dt 1.44s, train_loss 3.889e-05, test_loss 5.064e+00, train_acc 100.0, test_acc 55.3
step 4000, dt 1.47s, train_loss 3.901e-05, test_loss 5.107e+00, train_acc 100.0, test_acc 55.0
step 5000, dt 1.45s, train_loss 1.615e-05, test_loss 5.155e+00, train_acc 100.0, test_acc 55.2
step 6000, dt 1.48s, train_loss 1.070e-05, test_loss 5.206e+00, train_acc 100.0, test_acc 55.4
step 7000, dt 1.46s, train_loss 6.971e-06, test_loss 5.261e+00, train_acc 100.0, test_acc 55.9


 46%|████▌     | 46/100 [08:43<10:18, 11.45s/it]

step 8000, dt 1.44s, train_loss 5.063e-06, test_loss 5.322e+00, train_acc 100.0, test_acc 55.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 4.853e-02, test_loss 2.976e+00, train_acc 97.6, test_acc 55.1
step 2000, dt 1.41s, train_loss 1.632e-04, test_loss 3.689e+00, train_acc 100.0, test_acc 58.1
step 3000, dt 1.46s, train_loss 6.308e-05, test_loss 3.776e+00, train_acc 100.0, test_acc 58.5
step 4000, dt 1.45s, train_loss 2.931e-05, test_loss 3.857e+00, train_acc 100.0, test_acc 59.0
step 5000, dt 1.47s, train_loss 3.007e-05, test_loss 3.944e+00, train_acc 100.0, test_acc 58.9
step 6000, dt 1.47s, train_loss 1.861e-05, test_loss 4.033e+00, train_acc 100.0, test_acc 58.8
step 7000, dt 1.47s, train_loss 1.079e-05, test_loss 4.128e+00, train_acc 100.0, test_acc 58.8


 47%|████▋     | 47/100 [08:55<10:08, 11.48s/it]

step 8000, dt 1.47s, train_loss 6.194e-06, test_loss 4.232e+00, train_acc 100.0, test_acc 58.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 1.706e-03, test_loss 2.675e+00, train_acc 100.0, test_acc 58.0
step 2000, dt 1.37s, train_loss 3.993e-04, test_loss 3.055e+00, train_acc 100.0, test_acc 57.9
step 3000, dt 1.35s, train_loss 1.344e-04, test_loss 3.321e+00, train_acc 100.0, test_acc 57.5
step 4000, dt 1.37s, train_loss 6.809e-05, test_loss 3.545e+00, train_acc 100.0, test_acc 57.7
step 5000, dt 1.37s, train_loss 4.530e-05, test_loss 3.745e+00, train_acc 100.0, test_acc 57.6
step 6000, dt 1.42s, train_loss 2.406e-05, test_loss 3.934e+00, train_acc 100.0, test_acc 57.7
step 7000, dt 1.40s, train_loss 1.317e-05, test_loss 4.114e+00, train_acc 100.0, test_acc 57.7


 48%|████▊     | 48/100 [09:06<09:50, 11.35s/it]

step 8000, dt 1.40s, train_loss 6.583e-06, test_loss 4.291e+00, train_acc 100.0, test_acc 57.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.38s, train_loss 1.520e-01, test_loss 3.587e+00, train_acc 94.4, test_acc 50.4
step 2000, dt 1.37s, train_loss 1.998e-04, test_loss 3.672e+00, train_acc 100.0, test_acc 55.4
step 3000, dt 1.42s, train_loss 7.383e-05, test_loss 3.839e+00, train_acc 100.0, test_acc 55.8
step 4000, dt 1.40s, train_loss 4.480e-05, test_loss 3.974e+00, train_acc 100.0, test_acc 56.2
step 5000, dt 1.38s, train_loss 2.999e-05, test_loss 4.104e+00, train_acc 100.0, test_acc 56.3
step 6000, dt 1.42s, train_loss 1.787e-05, test_loss 4.228e+00, train_acc 100.0, test_acc 56.4
step 7000, dt 1.43s, train_loss 1.027e-05, test_loss 4.355e+00, train_acc 100.0, test_acc 56.4


 49%|████▉     | 49/100 [09:17<09:37, 11.33s/it]

step 8000, dt 1.47s, train_loss 5.555e-06, test_loss 4.489e+00, train_acc 100.0, test_acc 56.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 6.641e-02, test_loss 3.383e+00, train_acc 95.6, test_acc 54.0
step 2000, dt 1.39s, train_loss 2.722e-04, test_loss 3.735e+00, train_acc 100.0, test_acc 55.4
step 3000, dt 1.45s, train_loss 1.302e-04, test_loss 3.930e+00, train_acc 100.0, test_acc 55.5
step 4000, dt 1.44s, train_loss 5.815e-05, test_loss 4.089e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.41s, train_loss 2.580e-05, test_loss 4.241e+00, train_acc 100.0, test_acc 55.7
step 6000, dt 1.42s, train_loss 1.752e-05, test_loss 4.391e+00, train_acc 100.0, test_acc 55.9
step 7000, dt 1.44s, train_loss 1.408e-05, test_loss 4.542e+00, train_acc 100.0, test_acc 55.9


 50%|█████     | 50/100 [09:28<09:27, 11.35s/it]

step 8000, dt 1.46s, train_loss 7.130e-06, test_loss 4.696e+00, train_acc 100.0, test_acc 55.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 6.868e-02, test_loss 3.671e+00, train_acc 97.4, test_acc 52.4
step 2000, dt 1.42s, train_loss 1.898e-04, test_loss 4.343e+00, train_acc 100.0, test_acc 54.2
step 3000, dt 1.42s, train_loss 7.945e-05, test_loss 4.465e+00, train_acc 100.0, test_acc 54.4
step 4000, dt 1.47s, train_loss 3.581e-05, test_loss 4.575e+00, train_acc 100.0, test_acc 54.6
step 5000, dt 1.48s, train_loss 4.503e-05, test_loss 4.681e+00, train_acc 100.0, test_acc 55.1
step 6000, dt 1.46s, train_loss 1.738e-05, test_loss 4.790e+00, train_acc 100.0, test_acc 54.8
step 7000, dt 1.44s, train_loss 8.565e-06, test_loss 4.907e+00, train_acc 100.0, test_acc 55.5


 51%|█████     | 51/100 [09:40<09:18, 11.40s/it]

step 8000, dt 1.48s, train_loss 8.019e-06, test_loss 5.029e+00, train_acc 100.0, test_acc 55.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 3.825e-01, test_loss 3.074e+00, train_acc 92.1, test_acc 57.2
step 2000, dt 1.42s, train_loss 2.428e-04, test_loss 3.749e+00, train_acc 100.0, test_acc 57.8
step 3000, dt 1.42s, train_loss 8.185e-05, test_loss 3.858e+00, train_acc 100.0, test_acc 58.1
step 4000, dt 1.44s, train_loss 3.586e-05, test_loss 3.951e+00, train_acc 100.0, test_acc 58.1
step 5000, dt 1.42s, train_loss 2.158e-05, test_loss 4.045e+00, train_acc 100.0, test_acc 58.1
step 6000, dt 1.44s, train_loss 1.579e-05, test_loss 4.132e+00, train_acc 100.0, test_acc 58.4
step 7000, dt 1.44s, train_loss 4.860e-06, test_loss 4.218e+00, train_acc 100.0, test_acc 58.7


 52%|█████▏    | 52/100 [09:51<09:06, 11.38s/it]

step 8000, dt 1.44s, train_loss 5.664e-06, test_loss 4.311e+00, train_acc 100.0, test_acc 58.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 4.027e-02, test_loss 3.534e+00, train_acc 99.2, test_acc 55.8
step 2000, dt 1.45s, train_loss 2.940e-04, test_loss 4.482e+00, train_acc 100.0, test_acc 56.3
step 3000, dt 1.48s, train_loss 8.959e-05, test_loss 4.632e+00, train_acc 100.0, test_acc 56.9
step 4000, dt 1.50s, train_loss 4.024e-05, test_loss 4.755e+00, train_acc 100.0, test_acc 57.6
step 5000, dt 1.49s, train_loss 2.363e-05, test_loss 4.869e+00, train_acc 100.0, test_acc 57.7
step 6000, dt 1.46s, train_loss 2.089e-05, test_loss 4.987e+00, train_acc 100.0, test_acc 58.0
step 7000, dt 1.51s, train_loss 8.255e-06, test_loss 5.107e+00, train_acc 100.0, test_acc 58.1


 53%|█████▎    | 53/100 [10:03<08:59, 11.48s/it]

step 8000, dt 1.45s, train_loss 5.087e-06, test_loss 5.230e+00, train_acc 100.0, test_acc 57.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.714e-03, test_loss 2.855e+00, train_acc 100.0, test_acc 54.5
step 2000, dt 1.35s, train_loss 4.781e-04, test_loss 3.242e+00, train_acc 100.0, test_acc 54.8
step 3000, dt 1.40s, train_loss 2.038e-04, test_loss 3.512e+00, train_acc 100.0, test_acc 54.9
step 4000, dt 1.47s, train_loss 9.881e-05, test_loss 3.744e+00, train_acc 100.0, test_acc 54.6
step 5000, dt 1.39s, train_loss 4.388e-05, test_loss 3.957e+00, train_acc 100.0, test_acc 55.2
step 6000, dt 1.44s, train_loss 2.628e-05, test_loss 4.157e+00, train_acc 100.0, test_acc 55.2
step 7000, dt 1.42s, train_loss 1.555e-05, test_loss 4.351e+00, train_acc 100.0, test_acc 55.3


 54%|█████▍    | 54/100 [10:14<08:45, 11.42s/it]

step 8000, dt 1.46s, train_loss 7.131e-06, test_loss 4.536e+00, train_acc 100.0, test_acc 55.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.546e-03, test_loss 3.013e+00, train_acc 100.0, test_acc 55.3
step 2000, dt 1.34s, train_loss 2.984e-04, test_loss 3.370e+00, train_acc 100.0, test_acc 55.1
step 3000, dt 1.37s, train_loss 1.589e-04, test_loss 3.617e+00, train_acc 100.0, test_acc 55.3
step 4000, dt 1.44s, train_loss 8.646e-05, test_loss 3.832e+00, train_acc 100.0, test_acc 55.1
step 5000, dt 1.40s, train_loss 3.328e-05, test_loss 4.033e+00, train_acc 100.0, test_acc 55.4
step 6000, dt 1.36s, train_loss 2.071e-05, test_loss 4.227e+00, train_acc 100.0, test_acc 55.4
step 7000, dt 1.43s, train_loss 1.070e-05, test_loss 4.417e+00, train_acc 100.0, test_acc 55.7


 55%|█████▌    | 55/100 [10:25<08:29, 11.33s/it]

step 8000, dt 1.38s, train_loss 7.448e-06, test_loss 4.605e+00, train_acc 100.0, test_acc 55.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 3.897e-02, test_loss 3.537e+00, train_acc 98.5, test_acc 54.0
step 2000, dt 1.45s, train_loss 2.713e-02, test_loss 5.003e+00, train_acc 99.6, test_acc 56.5
step 3000, dt 1.47s, train_loss 6.135e-05, test_loss 4.951e+00, train_acc 100.0, test_acc 55.1
step 4000, dt 1.48s, train_loss 3.441e-05, test_loss 4.994e+00, train_acc 100.0, test_acc 55.3
step 5000, dt 1.48s, train_loss 1.858e-05, test_loss 5.044e+00, train_acc 100.0, test_acc 55.4
step 6000, dt 1.46s, train_loss 8.087e-06, test_loss 5.098e+00, train_acc 100.0, test_acc 55.6
step 7000, dt 1.46s, train_loss 7.426e-06, test_loss 5.159e+00, train_acc 100.0, test_acc 55.7


 56%|█████▌    | 56/100 [10:37<08:22, 11.42s/it]

step 8000, dt 1.49s, train_loss 5.428e-06, test_loss 5.228e+00, train_acc 100.0, test_acc 55.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 2.381e-02, test_loss 3.226e+00, train_acc 98.6, test_acc 52.1
step 2000, dt 1.37s, train_loss 2.633e-04, test_loss 3.597e+00, train_acc 100.0, test_acc 55.4
step 3000, dt 1.43s, train_loss 1.312e-04, test_loss 3.759e+00, train_acc 100.0, test_acc 55.5
step 4000, dt 1.42s, train_loss 4.235e-05, test_loss 3.892e+00, train_acc 100.0, test_acc 55.7
step 5000, dt 1.40s, train_loss 3.282e-05, test_loss 4.021e+00, train_acc 100.0, test_acc 56.3
step 6000, dt 1.45s, train_loss 1.833e-05, test_loss 4.148e+00, train_acc 100.0, test_acc 56.1
step 7000, dt 1.44s, train_loss 8.485e-06, test_loss 4.272e+00, train_acc 100.0, test_acc 55.9


 57%|█████▋    | 57/100 [10:48<08:09, 11.39s/it]

step 8000, dt 1.46s, train_loss 4.529e-06, test_loss 4.400e+00, train_acc 100.0, test_acc 56.0
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 6.415e-02, test_loss 3.724e+00, train_acc 98.4, test_acc 55.6
step 2000, dt 1.41s, train_loss 3.134e-04, test_loss 3.804e+00, train_acc 100.0, test_acc 55.6
step 3000, dt 1.42s, train_loss 1.248e-04, test_loss 3.974e+00, train_acc 100.0, test_acc 55.7
step 4000, dt 1.41s, train_loss 7.773e-05, test_loss 4.123e+00, train_acc 100.0, test_acc 55.6
step 5000, dt 1.40s, train_loss 3.593e-05, test_loss 4.267e+00, train_acc 100.0, test_acc 55.9
step 6000, dt 1.40s, train_loss 2.641e-05, test_loss 4.408e+00, train_acc 100.0, test_acc 55.6
step 7000, dt 1.44s, train_loss 1.005e-05, test_loss 4.549e+00, train_acc 100.0, test_acc 55.7


 58%|█████▊    | 58/100 [11:00<07:56, 11.36s/it]

step 8000, dt 1.45s, train_loss 9.564e-06, test_loss 4.691e+00, train_acc 100.0, test_acc 55.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.095e-01, test_loss 3.526e+00, train_acc 96.8, test_acc 52.8
step 2000, dt 1.40s, train_loss 1.425e-04, test_loss 3.902e+00, train_acc 100.0, test_acc 57.2
step 3000, dt 1.42s, train_loss 5.987e-05, test_loss 4.014e+00, train_acc 100.0, test_acc 56.9
step 4000, dt 1.40s, train_loss 3.517e-05, test_loss 4.110e+00, train_acc 100.0, test_acc 57.4
step 5000, dt 1.40s, train_loss 2.635e-05, test_loss 4.207e+00, train_acc 100.0, test_acc 57.0
step 6000, dt 1.44s, train_loss 1.569e-05, test_loss 4.307e+00, train_acc 100.0, test_acc 57.5
step 7000, dt 1.40s, train_loss 9.920e-06, test_loss 4.413e+00, train_acc 100.0, test_acc 57.3


 59%|█████▉    | 59/100 [11:11<07:44, 11.33s/it]

step 8000, dt 1.44s, train_loss 5.506e-06, test_loss 4.525e+00, train_acc 100.0, test_acc 57.3
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.219e-01, test_loss 3.365e+00, train_acc 98.0, test_acc 55.2
step 2000, dt 1.39s, train_loss 2.365e-04, test_loss 3.718e+00, train_acc 100.0, test_acc 56.7
step 3000, dt 1.43s, train_loss 1.176e-04, test_loss 3.856e+00, train_acc 100.0, test_acc 57.2
step 4000, dt 1.39s, train_loss 7.035e-05, test_loss 3.971e+00, train_acc 100.0, test_acc 57.0
step 5000, dt 1.41s, train_loss 2.460e-05, test_loss 4.079e+00, train_acc 100.0, test_acc 56.8
step 6000, dt 1.46s, train_loss 1.637e-05, test_loss 4.187e+00, train_acc 100.0, test_acc 56.4
step 7000, dt 1.41s, train_loss 1.089e-05, test_loss 4.302e+00, train_acc 100.0, test_acc 56.3


 60%|██████    | 60/100 [11:22<07:32, 11.32s/it]

step 8000, dt 1.44s, train_loss 7.276e-06, test_loss 4.421e+00, train_acc 100.0, test_acc 56.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.31s, train_loss 1.561e-03, test_loss 2.700e+00, train_acc 100.0, test_acc 56.9
step 2000, dt 1.33s, train_loss 3.373e-04, test_loss 3.125e+00, train_acc 100.0, test_acc 56.1
step 3000, dt 1.38s, train_loss 1.634e-04, test_loss 3.420e+00, train_acc 100.0, test_acc 55.9
step 4000, dt 1.39s, train_loss 7.601e-05, test_loss 3.660e+00, train_acc 100.0, test_acc 56.2
step 5000, dt 1.39s, train_loss 4.060e-05, test_loss 3.873e+00, train_acc 100.0, test_acc 56.2
step 6000, dt 1.37s, train_loss 2.099e-05, test_loss 4.074e+00, train_acc 100.0, test_acc 56.1
step 7000, dt 1.38s, train_loss 9.565e-06, test_loss 4.267e+00, train_acc 100.0, test_acc 56.1


 61%|██████    | 61/100 [11:33<07:17, 11.21s/it]

step 8000, dt 1.39s, train_loss 6.183e-06, test_loss 4.455e+00, train_acc 100.0, test_acc 56.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 5.657e-02, test_loss 3.763e+00, train_acc 95.1, test_acc 49.9
step 2000, dt 1.41s, train_loss 1.874e-04, test_loss 4.167e+00, train_acc 100.0, test_acc 55.3
step 3000, dt 1.43s, train_loss 3.963e-05, test_loss 4.250e+00, train_acc 100.0, test_acc 55.1
step 4000, dt 1.46s, train_loss 3.829e-05, test_loss 4.330e+00, train_acc 100.0, test_acc 56.1
step 5000, dt 1.44s, train_loss 1.724e-05, test_loss 4.411e+00, train_acc 100.0, test_acc 56.6
step 6000, dt 1.45s, train_loss 1.183e-05, test_loss 4.494e+00, train_acc 100.0, test_acc 56.5
step 7000, dt 1.45s, train_loss 6.156e-06, test_loss 4.579e+00, train_acc 100.0, test_acc 56.8


 62%|██████▏   | 62/100 [11:45<07:08, 11.28s/it]

step 8000, dt 1.45s, train_loss 3.570e-06, test_loss 4.669e+00, train_acc 100.0, test_acc 56.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.39s, train_loss 3.004e-03, test_loss 3.099e+00, train_acc 100.0, test_acc 54.5
step 2000, dt 1.39s, train_loss 2.364e-04, test_loss 3.503e+00, train_acc 100.0, test_acc 56.1
step 3000, dt 1.43s, train_loss 1.307e-04, test_loss 3.748e+00, train_acc 100.0, test_acc 55.7
step 4000, dt 1.40s, train_loss 8.769e-05, test_loss 3.959e+00, train_acc 100.0, test_acc 54.9
step 5000, dt 1.42s, train_loss 4.571e-05, test_loss 4.154e+00, train_acc 100.0, test_acc 54.9
step 6000, dt 1.44s, train_loss 2.336e-05, test_loss 4.341e+00, train_acc 100.0, test_acc 54.6
step 7000, dt 1.43s, train_loss 1.182e-05, test_loss 4.525e+00, train_acc 100.0, test_acc 55.0


 63%|██████▎   | 63/100 [11:56<06:57, 11.29s/it]

step 8000, dt 1.44s, train_loss 7.768e-06, test_loss 4.706e+00, train_acc 100.0, test_acc 54.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 1.259e-01, test_loss 3.333e+00, train_acc 98.5, test_acc 54.2
step 2000, dt 1.39s, train_loss 1.650e-04, test_loss 3.762e+00, train_acc 100.0, test_acc 57.3
step 3000, dt 1.42s, train_loss 8.087e-05, test_loss 3.889e+00, train_acc 100.0, test_acc 56.9
step 4000, dt 1.42s, train_loss 6.047e-05, test_loss 4.002e+00, train_acc 100.0, test_acc 56.9
step 5000, dt 1.43s, train_loss 2.421e-05, test_loss 4.115e+00, train_acc 100.0, test_acc 57.2
step 6000, dt 1.42s, train_loss 1.707e-05, test_loss 4.232e+00, train_acc 100.0, test_acc 57.4
step 7000, dt 1.41s, train_loss 7.535e-06, test_loss 4.354e+00, train_acc 100.0, test_acc 57.5


 64%|██████▍   | 64/100 [12:07<06:46, 11.28s/it]

step 8000, dt 1.45s, train_loss 6.093e-06, test_loss 4.481e+00, train_acc 100.0, test_acc 57.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 2.008e-03, test_loss 2.835e+00, train_acc 100.0, test_acc 54.5
step 2000, dt 1.39s, train_loss 4.732e-04, test_loss 3.219e+00, train_acc 100.0, test_acc 55.3
step 3000, dt 1.39s, train_loss 2.139e-04, test_loss 3.481e+00, train_acc 100.0, test_acc 55.3
step 4000, dt 1.38s, train_loss 8.560e-05, test_loss 3.709e+00, train_acc 100.0, test_acc 55.1
step 5000, dt 1.39s, train_loss 4.539e-05, test_loss 3.916e+00, train_acc 100.0, test_acc 55.4
step 6000, dt 1.44s, train_loss 2.498e-05, test_loss 4.112e+00, train_acc 100.0, test_acc 55.2
step 7000, dt 1.42s, train_loss 1.242e-05, test_loss 4.305e+00, train_acc 100.0, test_acc 55.2


 65%|██████▌   | 65/100 [12:18<06:34, 11.27s/it]

step 8000, dt 1.43s, train_loss 1.025e-05, test_loss 4.491e+00, train_acc 100.0, test_acc 55.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 3.656e-01, test_loss 3.500e+00, train_acc 95.8, test_acc 53.0
step 2000, dt 1.42s, train_loss 3.067e-04, test_loss 3.886e+00, train_acc 100.0, test_acc 58.0
step 3000, dt 1.45s, train_loss 8.690e-05, test_loss 4.007e+00, train_acc 100.0, test_acc 58.0
step 4000, dt 1.44s, train_loss 4.811e-05, test_loss 4.116e+00, train_acc 100.0, test_acc 58.2
step 5000, dt 1.40s, train_loss 3.332e-05, test_loss 4.222e+00, train_acc 100.0, test_acc 58.6
step 6000, dt 1.42s, train_loss 1.978e-05, test_loss 4.330e+00, train_acc 100.0, test_acc 58.4
step 7000, dt 1.45s, train_loss 8.812e-06, test_loss 4.440e+00, train_acc 100.0, test_acc 58.6


 66%|██████▌   | 66/100 [12:30<06:24, 11.32s/it]

step 8000, dt 1.47s, train_loss 5.507e-06, test_loss 4.553e+00, train_acc 100.0, test_acc 58.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 9.154e-02, test_loss 3.619e+00, train_acc 94.0, test_acc 51.1
step 2000, dt 1.41s, train_loss 2.225e-04, test_loss 3.775e+00, train_acc 100.0, test_acc 56.7
step 3000, dt 1.45s, train_loss 5.517e-05, test_loss 3.881e+00, train_acc 100.0, test_acc 56.0
step 4000, dt 1.42s, train_loss 2.938e-05, test_loss 3.985e+00, train_acc 100.0, test_acc 56.2
step 5000, dt 1.43s, train_loss 3.318e-05, test_loss 4.094e+00, train_acc 100.0, test_acc 56.0
step 6000, dt 1.43s, train_loss 2.270e-05, test_loss 4.205e+00, train_acc 100.0, test_acc 56.2
step 7000, dt 1.48s, train_loss 1.076e-05, test_loss 4.318e+00, train_acc 100.0, test_acc 56.2


 67%|██████▋   | 67/100 [12:41<06:14, 11.34s/it]

step 8000, dt 1.45s, train_loss 6.832e-06, test_loss 4.433e+00, train_acc 100.0, test_acc 56.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.839e-02, test_loss 3.177e+00, train_acc 99.2, test_acc 54.0
step 2000, dt 1.40s, train_loss 2.611e-04, test_loss 3.447e+00, train_acc 100.0, test_acc 55.7
step 3000, dt 1.39s, train_loss 1.256e-04, test_loss 3.614e+00, train_acc 100.0, test_acc 55.7
step 4000, dt 1.40s, train_loss 5.970e-05, test_loss 3.763e+00, train_acc 100.0, test_acc 55.7
step 5000, dt 1.42s, train_loss 3.166e-05, test_loss 3.901e+00, train_acc 100.0, test_acc 56.1
step 6000, dt 1.42s, train_loss 1.599e-05, test_loss 4.039e+00, train_acc 100.0, test_acc 56.5
step 7000, dt 1.42s, train_loss 1.232e-05, test_loss 4.178e+00, train_acc 100.0, test_acc 56.6


 68%|██████▊   | 68/100 [12:53<06:02, 11.33s/it]

step 8000, dt 1.46s, train_loss 6.957e-06, test_loss 4.320e+00, train_acc 100.0, test_acc 56.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.38s, train_loss 2.265e-01, test_loss 3.963e+00, train_acc 89.1, test_acc 47.4
step 2000, dt 1.39s, train_loss 3.044e-04, test_loss 3.679e+00, train_acc 100.0, test_acc 55.3
step 3000, dt 1.39s, train_loss 1.009e-04, test_loss 3.885e+00, train_acc 100.0, test_acc 55.9
step 4000, dt 1.42s, train_loss 6.971e-05, test_loss 4.050e+00, train_acc 100.0, test_acc 56.3
step 5000, dt 1.42s, train_loss 4.679e-05, test_loss 4.199e+00, train_acc 100.0, test_acc 56.0
step 6000, dt 1.41s, train_loss 2.256e-05, test_loss 4.344e+00, train_acc 100.0, test_acc 56.6
step 7000, dt 1.44s, train_loss 7.027e-06, test_loss 4.490e+00, train_acc 100.0, test_acc 56.9


 69%|██████▉   | 69/100 [13:04<05:50, 11.32s/it]

step 8000, dt 1.44s, train_loss 7.583e-06, test_loss 4.636e+00, train_acc 100.0, test_acc 57.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 2.089e-01, test_loss 2.942e+00, train_acc 96.9, test_acc 55.7
step 2000, dt 1.37s, train_loss 2.419e-04, test_loss 3.184e+00, train_acc 100.0, test_acc 60.4
step 3000, dt 1.39s, train_loss 1.080e-04, test_loss 3.316e+00, train_acc 100.0, test_acc 59.7
step 4000, dt 1.41s, train_loss 4.094e-05, test_loss 3.432e+00, train_acc 100.0, test_acc 59.7
step 5000, dt 1.42s, train_loss 3.543e-05, test_loss 3.543e+00, train_acc 100.0, test_acc 59.6
step 6000, dt 1.39s, train_loss 2.051e-05, test_loss 3.652e+00, train_acc 100.0, test_acc 59.7
step 7000, dt 1.43s, train_loss 1.514e-05, test_loss 3.762e+00, train_acc 100.0, test_acc 59.2


 70%|███████   | 70/100 [13:15<05:38, 11.30s/it]

step 8000, dt 1.46s, train_loss 5.655e-06, test_loss 3.874e+00, train_acc 100.0, test_acc 59.0
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 6.454e-02, test_loss 3.850e+00, train_acc 97.1, test_acc 52.3
step 2000, dt 1.42s, train_loss 1.513e-04, test_loss 3.933e+00, train_acc 100.0, test_acc 56.0
step 3000, dt 1.42s, train_loss 8.102e-05, test_loss 4.066e+00, train_acc 100.0, test_acc 55.8
step 4000, dt 1.44s, train_loss 7.267e-05, test_loss 4.193e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.44s, train_loss 1.781e-05, test_loss 4.319e+00, train_acc 100.0, test_acc 55.3
step 6000, dt 1.42s, train_loss 2.448e-05, test_loss 4.446e+00, train_acc 100.0, test_acc 56.1
step 7000, dt 1.46s, train_loss 9.155e-06, test_loss 4.575e+00, train_acc 100.0, test_acc 56.3


 71%|███████   | 71/100 [13:26<05:28, 11.33s/it]

step 8000, dt 1.43s, train_loss 8.404e-06, test_loss 4.706e+00, train_acc 100.0, test_acc 56.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 6.375e-02, test_loss 3.415e+00, train_acc 93.8, test_acc 55.5
step 2000, dt 1.39s, train_loss 2.066e-04, test_loss 3.601e+00, train_acc 100.0, test_acc 56.2
step 3000, dt 1.41s, train_loss 7.641e-05, test_loss 3.770e+00, train_acc 100.0, test_acc 55.6
step 4000, dt 1.41s, train_loss 4.668e-05, test_loss 3.912e+00, train_acc 100.0, test_acc 55.7
step 5000, dt 1.40s, train_loss 2.999e-05, test_loss 4.049e+00, train_acc 100.0, test_acc 55.3
step 6000, dt 1.41s, train_loss 2.463e-05, test_loss 4.177e+00, train_acc 100.0, test_acc 56.1
step 7000, dt 1.41s, train_loss 1.045e-05, test_loss 4.308e+00, train_acc 100.0, test_acc 56.2


 72%|███████▏  | 72/100 [13:38<05:16, 11.29s/it]

step 8000, dt 1.42s, train_loss 6.837e-06, test_loss 4.441e+00, train_acc 100.0, test_acc 56.3
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.764e-01, test_loss 3.882e+00, train_acc 92.3, test_acc 50.8
step 2000, dt 1.45s, train_loss 4.089e-04, test_loss 3.732e+00, train_acc 100.0, test_acc 57.1
step 3000, dt 1.45s, train_loss 1.109e-04, test_loss 3.921e+00, train_acc 100.0, test_acc 56.7
step 4000, dt 1.45s, train_loss 8.555e-05, test_loss 4.073e+00, train_acc 100.0, test_acc 56.5
step 5000, dt 1.44s, train_loss 4.011e-05, test_loss 4.219e+00, train_acc 100.0, test_acc 56.4
step 6000, dt 1.42s, train_loss 2.620e-05, test_loss 4.369e+00, train_acc 100.0, test_acc 56.2
step 7000, dt 1.46s, train_loss 1.723e-05, test_loss 4.521e+00, train_acc 100.0, test_acc 56.3


 73%|███████▎  | 73/100 [13:49<05:06, 11.34s/it]

step 8000, dt 1.42s, train_loss 9.358e-06, test_loss 4.673e+00, train_acc 100.0, test_acc 56.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.39s, train_loss 4.992e-01, test_loss 3.481e+00, train_acc 94.3, test_acc 51.5
step 2000, dt 1.39s, train_loss 3.658e-03, test_loss 4.947e+00, train_acc 99.7, test_acc 53.8
step 3000, dt 1.42s, train_loss 6.452e-05, test_loss 4.937e+00, train_acc 100.0, test_acc 55.5
step 4000, dt 1.44s, train_loss 4.250e-05, test_loss 4.997e+00, train_acc 100.0, test_acc 55.6
step 5000, dt 1.42s, train_loss 1.910e-05, test_loss 5.062e+00, train_acc 100.0, test_acc 55.8
step 6000, dt 1.45s, train_loss 1.024e-05, test_loss 5.133e+00, train_acc 100.0, test_acc 56.0
step 7000, dt 1.45s, train_loss 4.023e-06, test_loss 5.209e+00, train_acc 100.0, test_acc 55.8


 74%|███████▍  | 74/100 [14:01<04:55, 11.36s/it]

step 8000, dt 1.47s, train_loss 4.752e-06, test_loss 5.292e+00, train_acc 100.0, test_acc 55.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 2.744e-01, test_loss 3.122e+00, train_acc 96.8, test_acc 56.6
step 2000, dt 1.40s, train_loss 2.707e-04, test_loss 3.455e+00, train_acc 100.0, test_acc 58.2
step 3000, dt 1.39s, train_loss 1.376e-04, test_loss 3.672e+00, train_acc 100.0, test_acc 58.3
step 4000, dt 1.44s, train_loss 7.348e-05, test_loss 3.851e+00, train_acc 100.0, test_acc 58.0
step 5000, dt 1.43s, train_loss 3.605e-05, test_loss 4.023e+00, train_acc 100.0, test_acc 57.7
step 6000, dt 1.41s, train_loss 3.162e-05, test_loss 4.186e+00, train_acc 100.0, test_acc 57.7
step 7000, dt 1.46s, train_loss 1.502e-05, test_loss 4.347e+00, train_acc 100.0, test_acc 57.5


 75%|███████▌  | 75/100 [14:12<04:43, 11.34s/it]

step 8000, dt 1.42s, train_loss 1.087e-05, test_loss 4.505e+00, train_acc 100.0, test_acc 57.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 6.782e-02, test_loss 3.491e+00, train_acc 97.9, test_acc 54.6
step 2000, dt 1.40s, train_loss 4.632e-04, test_loss 4.510e+00, train_acc 100.0, test_acc 57.4
step 3000, dt 1.42s, train_loss 6.529e-05, test_loss 4.639e+00, train_acc 100.0, test_acc 57.3
step 4000, dt 1.46s, train_loss 5.730e-05, test_loss 4.740e+00, train_acc 100.0, test_acc 57.2
step 5000, dt 1.44s, train_loss 1.614e-05, test_loss 4.838e+00, train_acc 100.0, test_acc 57.6
step 6000, dt 1.49s, train_loss 2.166e-05, test_loss 4.937e+00, train_acc 100.0, test_acc 57.8
step 7000, dt 1.45s, train_loss 7.862e-06, test_loss 5.038e+00, train_acc 100.0, test_acc 57.6


 76%|███████▌  | 76/100 [14:23<04:33, 11.38s/it]

step 8000, dt 1.46s, train_loss 4.609e-06, test_loss 5.142e+00, train_acc 100.0, test_acc 57.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 4.493e-02, test_loss 3.390e+00, train_acc 97.7, test_acc 53.7
step 2000, dt 1.37s, train_loss 2.220e-04, test_loss 4.755e+00, train_acc 100.0, test_acc 55.1
step 3000, dt 1.47s, train_loss 6.839e-05, test_loss 4.808e+00, train_acc 100.0, test_acc 55.4
step 4000, dt 1.43s, train_loss 3.254e-05, test_loss 4.882e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.45s, train_loss 1.155e-05, test_loss 4.954e+00, train_acc 100.0, test_acc 56.0
step 6000, dt 1.44s, train_loss 1.625e-05, test_loss 5.025e+00, train_acc 100.0, test_acc 55.9
step 7000, dt 1.46s, train_loss 1.023e-05, test_loss 5.097e+00, train_acc 100.0, test_acc 56.1


 77%|███████▋  | 77/100 [14:35<04:21, 11.38s/it]

step 8000, dt 1.44s, train_loss 5.563e-06, test_loss 5.173e+00, train_acc 100.0, test_acc 56.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 3.672e-01, test_loss 3.638e+00, train_acc 93.8, test_acc 50.2
step 2000, dt 1.41s, train_loss 1.979e-04, test_loss 4.230e+00, train_acc 100.0, test_acc 55.4
step 3000, dt 1.45s, train_loss 8.782e-05, test_loss 4.358e+00, train_acc 100.0, test_acc 55.8
step 4000, dt 1.47s, train_loss 3.551e-05, test_loss 4.468e+00, train_acc 100.0, test_acc 55.7
step 5000, dt 1.41s, train_loss 1.646e-05, test_loss 4.574e+00, train_acc 100.0, test_acc 55.9
step 6000, dt 1.44s, train_loss 1.270e-05, test_loss 4.686e+00, train_acc 100.0, test_acc 55.9
step 7000, dt 1.42s, train_loss 1.386e-05, test_loss 4.801e+00, train_acc 100.0, test_acc 55.8


 78%|███████▊  | 78/100 [14:46<04:10, 11.39s/it]

step 8000, dt 1.47s, train_loss 7.905e-06, test_loss 4.921e+00, train_acc 100.0, test_acc 55.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 7.424e-02, test_loss 3.601e+00, train_acc 98.5, test_acc 53.7
step 2000, dt 1.36s, train_loss 1.824e-04, test_loss 4.015e+00, train_acc 100.0, test_acc 54.8
step 3000, dt 1.42s, train_loss 1.119e-04, test_loss 4.189e+00, train_acc 100.0, test_acc 54.7
step 4000, dt 1.39s, train_loss 7.408e-05, test_loss 4.352e+00, train_acc 100.0, test_acc 55.0
step 5000, dt 1.39s, train_loss 3.274e-05, test_loss 4.512e+00, train_acc 100.0, test_acc 54.9
step 6000, dt 1.40s, train_loss 1.655e-05, test_loss 4.669e+00, train_acc 100.0, test_acc 54.8
step 7000, dt 1.39s, train_loss 1.286e-05, test_loss 4.829e+00, train_acc 100.0, test_acc 55.3


 79%|███████▉  | 79/100 [14:57<03:57, 11.32s/it]

step 8000, dt 1.44s, train_loss 1.126e-05, test_loss 4.999e+00, train_acc 100.0, test_acc 55.3
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 1.597e-03, test_loss 2.906e+00, train_acc 100.0, test_acc 57.4
step 2000, dt 1.37s, train_loss 3.699e-04, test_loss 3.262e+00, train_acc 100.0, test_acc 56.9
step 3000, dt 1.44s, train_loss 1.104e-04, test_loss 3.520e+00, train_acc 100.0, test_acc 57.1
step 4000, dt 1.39s, train_loss 7.702e-05, test_loss 3.741e+00, train_acc 100.0, test_acc 57.1
step 5000, dt 1.42s, train_loss 3.372e-05, test_loss 3.946e+00, train_acc 100.0, test_acc 57.5
step 6000, dt 1.44s, train_loss 1.797e-05, test_loss 4.136e+00, train_acc 100.0, test_acc 57.5
step 7000, dt 1.43s, train_loss 9.831e-06, test_loss 4.318e+00, train_acc 100.0, test_acc 57.5


 80%|████████  | 80/100 [15:09<03:46, 11.31s/it]

step 8000, dt 1.44s, train_loss 7.445e-06, test_loss 4.498e+00, train_acc 100.0, test_acc 57.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.38s, train_loss 9.540e-02, test_loss 3.431e+00, train_acc 97.8, test_acc 52.0
step 2000, dt 1.42s, train_loss 2.216e-04, test_loss 3.744e+00, train_acc 100.0, test_acc 53.8
step 3000, dt 1.39s, train_loss 1.160e-04, test_loss 3.909e+00, train_acc 100.0, test_acc 54.3
step 4000, dt 1.42s, train_loss 6.535e-05, test_loss 4.050e+00, train_acc 100.0, test_acc 54.7
step 5000, dt 1.43s, train_loss 3.515e-05, test_loss 4.182e+00, train_acc 100.0, test_acc 54.7
step 6000, dt 1.41s, train_loss 1.698e-05, test_loss 4.308e+00, train_acc 100.0, test_acc 54.6
step 7000, dt 1.43s, train_loss 9.943e-06, test_loss 4.438e+00, train_acc 100.0, test_acc 54.9


 81%|████████  | 81/100 [15:20<03:34, 11.31s/it]

step 8000, dt 1.42s, train_loss 5.925e-06, test_loss 4.576e+00, train_acc 100.0, test_acc 54.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.153e-03, test_loss 3.283e+00, train_acc 100.0, test_acc 54.5
step 2000, dt 1.39s, train_loss 3.433e-04, test_loss 3.528e+00, train_acc 100.0, test_acc 55.4
step 3000, dt 1.42s, train_loss 1.167e-04, test_loss 3.744e+00, train_acc 100.0, test_acc 55.7
step 4000, dt 1.38s, train_loss 9.204e-05, test_loss 3.933e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.40s, train_loss 3.135e-05, test_loss 4.113e+00, train_acc 100.0, test_acc 55.6
step 6000, dt 1.42s, train_loss 1.864e-05, test_loss 4.282e+00, train_acc 100.0, test_acc 55.5
step 7000, dt 1.42s, train_loss 1.302e-05, test_loss 4.450e+00, train_acc 100.0, test_acc 55.5


 82%|████████▏ | 82/100 [15:31<03:23, 11.29s/it]

step 8000, dt 1.47s, train_loss 6.181e-06, test_loss 4.617e+00, train_acc 100.0, test_acc 55.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 1.329e-03, test_loss 2.772e+00, train_acc 100.0, test_acc 57.7
step 2000, dt 1.38s, train_loss 4.194e-04, test_loss 3.096e+00, train_acc 100.0, test_acc 58.0
step 3000, dt 1.38s, train_loss 1.304e-04, test_loss 3.332e+00, train_acc 100.0, test_acc 57.8
step 4000, dt 1.36s, train_loss 1.025e-04, test_loss 3.540e+00, train_acc 100.0, test_acc 58.1
step 5000, dt 1.36s, train_loss 4.103e-05, test_loss 3.734e+00, train_acc 100.0, test_acc 57.9
step 6000, dt 1.41s, train_loss 1.977e-05, test_loss 3.917e+00, train_acc 100.0, test_acc 58.2
step 7000, dt 1.42s, train_loss 1.477e-05, test_loss 4.097e+00, train_acc 100.0, test_acc 57.9


 83%|████████▎ | 83/100 [15:42<03:10, 11.23s/it]

step 8000, dt 1.42s, train_loss 7.718e-06, test_loss 4.276e+00, train_acc 100.0, test_acc 57.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 1.114e-01, test_loss 3.333e+00, train_acc 99.1, test_acc 55.1
step 2000, dt 1.39s, train_loss 1.159e-04, test_loss 3.594e+00, train_acc 100.0, test_acc 57.2
step 3000, dt 1.40s, train_loss 8.738e-05, test_loss 3.746e+00, train_acc 100.0, test_acc 56.7
step 4000, dt 1.43s, train_loss 4.541e-05, test_loss 3.883e+00, train_acc 100.0, test_acc 56.8
step 5000, dt 1.41s, train_loss 2.332e-05, test_loss 4.013e+00, train_acc 100.0, test_acc 56.9
step 6000, dt 1.41s, train_loss 1.317e-05, test_loss 4.144e+00, train_acc 100.0, test_acc 57.3
step 7000, dt 1.44s, train_loss 1.017e-05, test_loss 4.278e+00, train_acc 100.0, test_acc 57.3


 84%|████████▍ | 84/100 [15:53<02:59, 11.25s/it]

step 8000, dt 1.41s, train_loss 5.538e-06, test_loss 4.416e+00, train_acc 100.0, test_acc 56.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 5.375e-02, test_loss 3.361e+00, train_acc 99.5, test_acc 55.2
step 2000, dt 1.39s, train_loss 2.961e-04, test_loss 4.701e+00, train_acc 100.0, test_acc 52.4
step 3000, dt 1.41s, train_loss 4.724e-05, test_loss 4.790e+00, train_acc 100.0, test_acc 53.2
step 4000, dt 1.41s, train_loss 3.075e-05, test_loss 4.879e+00, train_acc 100.0, test_acc 53.5
step 5000, dt 1.47s, train_loss 1.961e-05, test_loss 4.965e+00, train_acc 100.0, test_acc 53.5
step 6000, dt 1.46s, train_loss 1.563e-05, test_loss 5.052e+00, train_acc 100.0, test_acc 53.6
step 7000, dt 1.48s, train_loss 1.132e-05, test_loss 5.144e+00, train_acc 100.0, test_acc 53.8


 85%|████████▌ | 85/100 [16:05<02:49, 11.30s/it]

step 8000, dt 1.48s, train_loss 3.620e-06, test_loss 5.239e+00, train_acc 100.0, test_acc 53.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 1.428e-01, test_loss 3.643e+00, train_acc 96.9, test_acc 54.4
step 2000, dt 1.42s, train_loss 1.914e-04, test_loss 4.052e+00, train_acc 100.0, test_acc 57.3
step 3000, dt 1.41s, train_loss 8.249e-05, test_loss 4.144e+00, train_acc 100.0, test_acc 57.2
step 4000, dt 1.42s, train_loss 4.856e-05, test_loss 4.232e+00, train_acc 100.0, test_acc 57.2
step 5000, dt 1.47s, train_loss 2.236e-05, test_loss 4.325e+00, train_acc 100.0, test_acc 57.2
step 6000, dt 1.43s, train_loss 1.592e-05, test_loss 4.425e+00, train_acc 100.0, test_acc 57.2
step 7000, dt 1.45s, train_loss 1.051e-05, test_loss 4.532e+00, train_acc 100.0, test_acc 57.1


 86%|████████▌ | 86/100 [16:16<02:38, 11.33s/it]

step 8000, dt 1.43s, train_loss 8.702e-06, test_loss 4.642e+00, train_acc 100.0, test_acc 56.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 1.557e-01, test_loss 3.136e+00, train_acc 94.8, test_acc 53.2
step 2000, dt 1.39s, train_loss 3.060e-04, test_loss 3.497e+00, train_acc 100.0, test_acc 57.2
step 3000, dt 1.40s, train_loss 1.210e-04, test_loss 3.638e+00, train_acc 100.0, test_acc 57.0
step 4000, dt 1.44s, train_loss 4.866e-05, test_loss 3.772e+00, train_acc 100.0, test_acc 56.9
step 5000, dt 1.45s, train_loss 4.348e-05, test_loss 3.900e+00, train_acc 100.0, test_acc 57.2
step 6000, dt 1.46s, train_loss 1.998e-05, test_loss 4.029e+00, train_acc 100.0, test_acc 56.7
step 7000, dt 1.47s, train_loss 1.226e-05, test_loss 4.162e+00, train_acc 100.0, test_acc 57.0


 87%|████████▋ | 87/100 [16:28<02:27, 11.34s/it]

step 8000, dt 1.46s, train_loss 8.390e-06, test_loss 4.300e+00, train_acc 100.0, test_acc 57.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 7.617e-02, test_loss 3.421e+00, train_acc 98.3, test_acc 54.0
step 2000, dt 1.40s, train_loss 5.990e-04, test_loss 4.396e+00, train_acc 100.0, test_acc 56.0
step 3000, dt 1.47s, train_loss 9.111e-05, test_loss 4.491e+00, train_acc 100.0, test_acc 56.3
step 4000, dt 1.44s, train_loss 2.609e-05, test_loss 4.569e+00, train_acc 100.0, test_acc 55.8
step 5000, dt 1.45s, train_loss 3.241e-05, test_loss 4.651e+00, train_acc 100.0, test_acc 55.6
step 6000, dt 1.45s, train_loss 1.419e-05, test_loss 4.737e+00, train_acc 100.0, test_acc 55.8
step 7000, dt 1.44s, train_loss 9.936e-06, test_loss 4.828e+00, train_acc 100.0, test_acc 55.7


 88%|████████▊ | 88/100 [16:39<02:16, 11.38s/it]

step 8000, dt 1.46s, train_loss 6.784e-06, test_loss 4.927e+00, train_acc 100.0, test_acc 55.5
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 5.469e-02, test_loss 3.483e+00, train_acc 95.9, test_acc 54.6
step 2000, dt 1.38s, train_loss 5.606e-04, test_loss 4.163e+00, train_acc 99.5, test_acc 56.9
step 3000, dt 1.40s, train_loss 7.782e-05, test_loss 3.948e+00, train_acc 100.0, test_acc 58.2
step 4000, dt 1.42s, train_loss 5.417e-05, test_loss 4.028e+00, train_acc 100.0, test_acc 58.3
step 5000, dt 1.42s, train_loss 2.741e-05, test_loss 4.108e+00, train_acc 100.0, test_acc 58.3
step 6000, dt 1.43s, train_loss 1.343e-05, test_loss 4.192e+00, train_acc 100.0, test_acc 58.6
step 7000, dt 1.45s, train_loss 9.378e-06, test_loss 4.280e+00, train_acc 100.0, test_acc 58.5


 89%|████████▉ | 89/100 [16:50<02:04, 11.36s/it]

step 8000, dt 1.46s, train_loss 6.122e-06, test_loss 4.375e+00, train_acc 100.0, test_acc 58.1
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.39s, train_loss 1.749e-03, test_loss 3.019e+00, train_acc 100.0, test_acc 55.1
step 2000, dt 1.37s, train_loss 4.082e-04, test_loss 3.357e+00, train_acc 100.0, test_acc 54.7
step 3000, dt 1.38s, train_loss 1.402e-04, test_loss 3.595e+00, train_acc 100.0, test_acc 55.1
step 4000, dt 1.37s, train_loss 8.764e-05, test_loss 3.809e+00, train_acc 100.0, test_acc 55.1
step 5000, dt 1.40s, train_loss 3.926e-05, test_loss 4.003e+00, train_acc 100.0, test_acc 54.8
step 6000, dt 1.41s, train_loss 2.050e-05, test_loss 4.191e+00, train_acc 100.0, test_acc 54.8
step 7000, dt 1.42s, train_loss 1.311e-05, test_loss 4.377e+00, train_acc 100.0, test_acc 54.5


 90%|█████████ | 90/100 [17:02<01:53, 11.31s/it]

step 8000, dt 1.43s, train_loss 1.017e-05, test_loss 4.557e+00, train_acc 100.0, test_acc 54.4
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 2.258e-01, test_loss 3.419e+00, train_acc 96.2, test_acc 54.6
step 2000, dt 1.39s, train_loss 1.904e-04, test_loss 3.612e+00, train_acc 100.0, test_acc 56.7
step 3000, dt 1.40s, train_loss 1.344e-04, test_loss 3.758e+00, train_acc 100.0, test_acc 56.9
step 4000, dt 1.41s, train_loss 4.291e-05, test_loss 3.887e+00, train_acc 100.0, test_acc 57.0
step 5000, dt 1.42s, train_loss 2.905e-05, test_loss 4.017e+00, train_acc 100.0, test_acc 57.1
step 6000, dt 1.41s, train_loss 1.727e-05, test_loss 4.149e+00, train_acc 100.0, test_acc 57.2
step 7000, dt 1.41s, train_loss 1.033e-05, test_loss 4.282e+00, train_acc 100.0, test_acc 57.4


 91%|█████████ | 91/100 [17:13<01:41, 11.29s/it]

step 8000, dt 1.46s, train_loss 8.112e-06, test_loss 4.416e+00, train_acc 100.0, test_acc 57.3
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.38s, train_loss 1.460e-01, test_loss 3.303e+00, train_acc 97.6, test_acc 52.8
step 2000, dt 1.44s, train_loss 4.694e-04, test_loss 4.220e+00, train_acc 100.0, test_acc 55.9
step 3000, dt 1.44s, train_loss 5.502e-05, test_loss 4.266e+00, train_acc 100.0, test_acc 56.5
step 4000, dt 1.40s, train_loss 2.489e-05, test_loss 4.319e+00, train_acc 100.0, test_acc 57.1
step 5000, dt 1.40s, train_loss 1.395e-05, test_loss 4.374e+00, train_acc 100.0, test_acc 57.0
step 6000, dt 1.44s, train_loss 1.087e-05, test_loss 4.437e+00, train_acc 100.0, test_acc 57.2
step 7000, dt 1.44s, train_loss 1.014e-05, test_loss 4.506e+00, train_acc 100.0, test_acc 57.2


 92%|█████████▏| 92/100 [17:24<01:30, 11.33s/it]

step 8000, dt 1.44s, train_loss 4.717e-06, test_loss 4.579e+00, train_acc 100.0, test_acc 57.2
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.34s, train_loss 1.115e-01, test_loss 2.998e+00, train_acc 97.2, test_acc 56.6
step 2000, dt 1.36s, train_loss 2.506e-04, test_loss 3.602e+00, train_acc 100.0, test_acc 57.3
step 3000, dt 1.39s, train_loss 8.045e-05, test_loss 3.761e+00, train_acc 100.0, test_acc 57.5
step 4000, dt 1.41s, train_loss 3.507e-05, test_loss 3.887e+00, train_acc 100.0, test_acc 57.8
step 5000, dt 1.37s, train_loss 2.810e-05, test_loss 4.005e+00, train_acc 100.0, test_acc 57.6
step 6000, dt 1.40s, train_loss 2.151e-05, test_loss 4.122e+00, train_acc 100.0, test_acc 57.9
step 7000, dt 1.43s, train_loss 1.042e-05, test_loss 4.243e+00, train_acc 100.0, test_acc 57.8


 93%|█████████▎| 93/100 [17:35<01:18, 11.26s/it]

step 8000, dt 1.40s, train_loss 6.380e-06, test_loss 4.367e+00, train_acc 100.0, test_acc 57.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.37s, train_loss 9.763e-02, test_loss 3.442e+00, train_acc 97.7, test_acc 52.4
step 2000, dt 1.40s, train_loss 3.418e-04, test_loss 4.119e+00, train_acc 100.0, test_acc 55.3
step 3000, dt 1.44s, train_loss 1.616e-04, test_loss 4.297e+00, train_acc 100.0, test_acc 55.1
step 4000, dt 1.48s, train_loss 6.103e-05, test_loss 4.443e+00, train_acc 100.0, test_acc 55.4
step 5000, dt 1.45s, train_loss 2.929e-05, test_loss 4.585e+00, train_acc 100.0, test_acc 55.2
step 6000, dt 1.45s, train_loss 2.246e-05, test_loss 4.733e+00, train_acc 100.0, test_acc 54.6
step 7000, dt 1.46s, train_loss 1.250e-05, test_loss 4.881e+00, train_acc 100.0, test_acc 54.5


 94%|█████████▍| 94/100 [17:47<01:07, 11.33s/it]

step 8000, dt 1.44s, train_loss 7.654e-06, test_loss 5.031e+00, train_acc 100.0, test_acc 54.6
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.35s, train_loss 9.206e-03, test_loss 3.264e+00, train_acc 99.9, test_acc 53.7
step 2000, dt 1.38s, train_loss 2.618e-04, test_loss 3.522e+00, train_acc 100.0, test_acc 54.2
step 3000, dt 1.41s, train_loss 9.817e-05, test_loss 3.747e+00, train_acc 100.0, test_acc 54.7
step 4000, dt 1.44s, train_loss 8.216e-05, test_loss 3.952e+00, train_acc 100.0, test_acc 54.6
step 5000, dt 1.41s, train_loss 4.330e-05, test_loss 4.142e+00, train_acc 100.0, test_acc 54.5
step 6000, dt 1.41s, train_loss 1.692e-05, test_loss 4.331e+00, train_acc 100.0, test_acc 54.5
step 7000, dt 1.45s, train_loss 1.530e-05, test_loss 4.516e+00, train_acc 100.0, test_acc 54.5


 95%|█████████▌| 95/100 [17:58<00:56, 11.32s/it]

step 8000, dt 1.44s, train_loss 7.199e-06, test_loss 4.698e+00, train_acc 100.0, test_acc 54.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.33s, train_loss 6.519e-02, test_loss 3.462e+00, train_acc 98.5, test_acc 52.2
step 2000, dt 1.35s, train_loss 5.234e-04, test_loss 3.974e+00, train_acc 100.0, test_acc 54.7
step 3000, dt 1.38s, train_loss 1.769e-04, test_loss 4.186e+00, train_acc 100.0, test_acc 54.4
step 4000, dt 1.43s, train_loss 9.494e-05, test_loss 4.357e+00, train_acc 100.0, test_acc 54.5
step 5000, dt 1.38s, train_loss 5.362e-05, test_loss 4.522e+00, train_acc 100.0, test_acc 53.9
step 6000, dt 1.40s, train_loss 2.899e-05, test_loss 4.684e+00, train_acc 100.0, test_acc 53.8
step 7000, dt 1.41s, train_loss 1.310e-05, test_loss 4.848e+00, train_acc 100.0, test_acc 53.9


 96%|█████████▌| 96/100 [18:09<00:45, 11.26s/it]

step 8000, dt 1.42s, train_loss 9.270e-06, test_loss 5.018e+00, train_acc 100.0, test_acc 53.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 1.432e-01, test_loss 3.466e+00, train_acc 95.8, test_acc 52.5
step 2000, dt 1.42s, train_loss 3.257e-04, test_loss 3.684e+00, train_acc 100.0, test_acc 55.3
step 3000, dt 1.41s, train_loss 1.242e-04, test_loss 3.867e+00, train_acc 100.0, test_acc 54.8
step 4000, dt 1.42s, train_loss 7.563e-05, test_loss 4.028e+00, train_acc 100.0, test_acc 54.9
step 5000, dt 1.42s, train_loss 3.744e-05, test_loss 4.181e+00, train_acc 100.0, test_acc 55.6
step 6000, dt 1.44s, train_loss 3.019e-05, test_loss 4.331e+00, train_acc 100.0, test_acc 55.8
step 7000, dt 1.41s, train_loss 9.589e-06, test_loss 4.474e+00, train_acc 100.0, test_acc 55.9


 97%|█████████▋| 97/100 [18:21<00:33, 11.27s/it]

step 8000, dt 1.40s, train_loss 7.393e-06, test_loss 4.620e+00, train_acc 100.0, test_acc 55.8
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 3.794e-02, test_loss 3.341e+00, train_acc 99.2, test_acc 55.1
step 2000, dt 1.41s, train_loss 3.258e-03, test_loss 4.800e+00, train_acc 100.0, test_acc 56.3
step 3000, dt 1.41s, train_loss 1.126e-04, test_loss 4.664e+00, train_acc 100.0, test_acc 57.7
step 4000, dt 1.42s, train_loss 2.649e-05, test_loss 4.729e+00, train_acc 100.0, test_acc 57.6
step 5000, dt 1.44s, train_loss 2.853e-05, test_loss 4.793e+00, train_acc 100.0, test_acc 57.5
step 6000, dt 1.42s, train_loss 1.853e-05, test_loss 4.862e+00, train_acc 100.0, test_acc 57.9
step 7000, dt 1.48s, train_loss 5.286e-06, test_loss 4.936e+00, train_acc 100.0, test_acc 58.1


 98%|█████████▊| 98/100 [18:32<00:22, 11.31s/it]

step 8000, dt 1.45s, train_loss 4.884e-06, test_loss 5.016e+00, train_acc 100.0, test_acc 57.9
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.36s, train_loss 8.228e-03, test_loss 2.853e+00, train_acc 99.9, test_acc 56.7
step 2000, dt 1.42s, train_loss 3.404e-04, test_loss 3.557e+00, train_acc 100.0, test_acc 57.2
step 3000, dt 1.42s, train_loss 9.400e-05, test_loss 3.738e+00, train_acc 100.0, test_acc 57.5
step 4000, dt 1.40s, train_loss 6.441e-05, test_loss 3.880e+00, train_acc 100.0, test_acc 58.0
step 5000, dt 1.42s, train_loss 3.050e-05, test_loss 4.015e+00, train_acc 100.0, test_acc 57.8
step 6000, dt 1.44s, train_loss 1.717e-05, test_loss 4.146e+00, train_acc 100.0, test_acc 57.8
step 7000, dt 1.42s, train_loss 9.442e-06, test_loss 4.280e+00, train_acc 100.0, test_acc 57.9


 99%|█████████▉| 99/100 [18:43<00:11, 11.32s/it]

step 8000, dt 1.46s, train_loss 9.921e-06, test_loss 4.418e+00, train_acc 100.0, test_acc 57.7
Initialized MLPBase model with 15210 parameters
step 1000, dt 1.32s, train_loss 1.023e-01, test_loss 3.654e+00, train_acc 98.2, test_acc 52.4
step 2000, dt 1.38s, train_loss 2.141e-03, test_loss 5.332e+00, train_acc 99.9, test_acc 56.1
step 3000, dt 1.43s, train_loss 8.870e-05, test_loss 5.360e+00, train_acc 100.0, test_acc 55.8
step 4000, dt 1.42s, train_loss 2.048e-05, test_loss 5.433e+00, train_acc 100.0, test_acc 55.5
step 5000, dt 1.44s, train_loss 8.764e-06, test_loss 5.505e+00, train_acc 100.0, test_acc 55.6
step 6000, dt 1.49s, train_loss 1.489e-05, test_loss 5.578e+00, train_acc 100.0, test_acc 55.8
step 7000, dt 1.47s, train_loss 4.225e-06, test_loss 5.657e+00, train_acc 100.0, test_acc 56.1


100%|██████████| 100/100 [18:55<00:00, 11.35s/it]


step 8000, dt 1.47s, train_loss 3.678e-06, test_loss 5.741e+00, train_acc 100.0, test_acc 56.2


  0%|          | 0/100 [00:00<?, ?it/s]

Initialized ConvBase model with 5210 parameters
step 1000, dt 4.31s, train_loss 6.210e-03, test_loss 8.322e-01, train_acc 99.9, test_acc 83.8
step 2000, dt 4.19s, train_loss 2.506e-04, test_loss 9.707e-01, train_acc 100.0, test_acc 84.7
step 3000, dt 4.23s, train_loss 1.343e-04, test_loss 1.078e+00, train_acc 100.0, test_acc 84.9
step 4000, dt 4.27s, train_loss 5.414e-05, test_loss 1.160e+00, train_acc 100.0, test_acc 84.3
step 5000, dt 4.35s, train_loss 3.441e-05, test_loss 1.237e+00, train_acc 100.0, test_acc 84.2
step 6000, dt 4.29s, train_loss 2.506e-05, test_loss 1.310e+00, train_acc 100.0, test_acc 84.2
step 7000, dt 4.36s, train_loss 7.534e-06, test_loss 1.379e+00, train_acc 100.0, test_acc 84.2


  1%|          | 1/100 [00:34<56:43, 34.38s/it]

step 8000, dt 4.37s, train_loss 4.210e-06, test_loss 1.446e+00, train_acc 100.0, test_acc 84.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 2.102e-02, test_loss 9.908e-01, train_acc 99.6, test_acc 79.4
step 2000, dt 4.28s, train_loss 1.712e-04, test_loss 1.207e+00, train_acc 100.0, test_acc 80.8
step 3000, dt 4.28s, train_loss 1.243e-04, test_loss 1.325e+00, train_acc 100.0, test_acc 80.4
step 4000, dt 4.31s, train_loss 5.322e-05, test_loss 1.425e+00, train_acc 100.0, test_acc 80.6
step 5000, dt 4.31s, train_loss 4.572e-05, test_loss 1.520e+00, train_acc 100.0, test_acc 80.9
step 6000, dt 4.36s, train_loss 1.588e-05, test_loss 1.613e+00, train_acc 100.0, test_acc 80.8
step 7000, dt 4.36s, train_loss 8.881e-06, test_loss 1.705e+00, train_acc 100.0, test_acc 80.9


  2%|▏         | 2/100 [01:08<56:14, 34.44s/it]

step 8000, dt 4.38s, train_loss 7.336e-06, test_loss 1.797e+00, train_acc 100.0, test_acc 80.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.18s, train_loss 1.896e-02, test_loss 1.011e+00, train_acc 96.4, test_acc 81.8
step 2000, dt 4.20s, train_loss 1.986e-04, test_loss 8.479e-01, train_acc 100.0, test_acc 86.2
step 3000, dt 4.26s, train_loss 1.704e-04, test_loss 9.329e-01, train_acc 100.0, test_acc 86.1
step 4000, dt 4.24s, train_loss 6.131e-05, test_loss 9.977e-01, train_acc 100.0, test_acc 86.0
step 5000, dt 4.37s, train_loss 4.372e-05, test_loss 1.055e+00, train_acc 100.0, test_acc 86.0
step 6000, dt 4.32s, train_loss 1.844e-05, test_loss 1.112e+00, train_acc 100.0, test_acc 86.0
step 7000, dt 4.36s, train_loss 9.755e-06, test_loss 1.165e+00, train_acc 100.0, test_acc 86.0


  3%|▎         | 3/100 [01:43<55:37, 34.41s/it]

step 8000, dt 4.45s, train_loss 5.669e-06, test_loss 1.216e+00, train_acc 100.0, test_acc 86.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 8.622e-02, test_loss 9.371e-01, train_acc 96.9, test_acc 78.8
step 2000, dt 4.28s, train_loss 2.367e-02, test_loss 1.217e+00, train_acc 97.4, test_acc 81.6
step 3000, dt 4.28s, train_loss 1.239e-04, test_loss 1.213e+00, train_acc 100.0, test_acc 85.1
step 4000, dt 4.34s, train_loss 7.059e-05, test_loss 1.321e+00, train_acc 100.0, test_acc 84.8
step 5000, dt 4.30s, train_loss 2.898e-05, test_loss 1.388e+00, train_acc 100.0, test_acc 84.3
step 6000, dt 4.33s, train_loss 1.264e-05, test_loss 1.442e+00, train_acc 100.0, test_acc 84.3
step 7000, dt 4.44s, train_loss 9.586e-06, test_loss 1.494e+00, train_acc 100.0, test_acc 84.3


  4%|▍         | 4/100 [02:17<55:11, 34.49s/it]

step 8000, dt 4.48s, train_loss 4.735e-06, test_loss 1.553e+00, train_acc 100.0, test_acc 83.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.24s, train_loss 1.368e-02, test_loss 8.051e-01, train_acc 98.1, test_acc 82.1
step 2000, dt 4.20s, train_loss 1.037e-03, test_loss 7.957e-01, train_acc 100.0, test_acc 86.4
step 3000, dt 4.33s, train_loss 1.557e-04, test_loss 9.343e-01, train_acc 100.0, test_acc 86.9
step 4000, dt 4.31s, train_loss 7.063e-05, test_loss 1.005e+00, train_acc 100.0, test_acc 86.5
step 5000, dt 4.32s, train_loss 3.123e-05, test_loss 1.065e+00, train_acc 100.0, test_acc 86.7
step 6000, dt 4.39s, train_loss 1.747e-05, test_loss 1.122e+00, train_acc 100.0, test_acc 86.7
step 7000, dt 4.37s, train_loss 1.310e-05, test_loss 1.171e+00, train_acc 100.0, test_acc 86.4


  5%|▌         | 5/100 [02:52<54:40, 34.53s/it]

step 8000, dt 4.44s, train_loss 5.973e-06, test_loss 1.221e+00, train_acc 100.0, test_acc 86.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 3.488e-02, test_loss 9.108e-01, train_acc 96.9, test_acc 81.8
step 2000, dt 4.24s, train_loss 5.478e-04, test_loss 9.423e-01, train_acc 100.0, test_acc 83.3
step 3000, dt 4.22s, train_loss 3.164e-04, test_loss 1.051e+00, train_acc 100.0, test_acc 83.4
step 4000, dt 4.37s, train_loss 1.040e-04, test_loss 1.150e+00, train_acc 100.0, test_acc 83.3
step 5000, dt 4.46s, train_loss 6.779e-05, test_loss 1.243e+00, train_acc 100.0, test_acc 82.9
step 6000, dt 4.49s, train_loss 2.753e-05, test_loss 1.330e+00, train_acc 100.0, test_acc 82.7
step 7000, dt 4.55s, train_loss 1.507e-05, test_loss 1.413e+00, train_acc 100.0, test_acc 82.8


  6%|▌         | 6/100 [03:27<54:24, 34.73s/it]

step 8000, dt 4.55s, train_loss 1.178e-05, test_loss 1.494e+00, train_acc 100.0, test_acc 82.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.18s, train_loss 1.732e-02, test_loss 8.195e-01, train_acc 99.4, test_acc 84.0
step 2000, dt 4.21s, train_loss 2.365e-04, test_loss 8.442e-01, train_acc 100.0, test_acc 87.4
step 3000, dt 4.23s, train_loss 8.140e-05, test_loss 9.314e-01, train_acc 100.0, test_acc 87.1
step 4000, dt 4.24s, train_loss 8.168e-05, test_loss 9.986e-01, train_acc 100.0, test_acc 86.9
step 5000, dt 4.26s, train_loss 2.814e-05, test_loss 1.056e+00, train_acc 100.0, test_acc 87.0
step 6000, dt 4.29s, train_loss 2.844e-05, test_loss 1.112e+00, train_acc 100.0, test_acc 86.6
step 7000, dt 4.29s, train_loss 1.219e-05, test_loss 1.165e+00, train_acc 100.0, test_acc 86.6


  7%|▋         | 7/100 [04:01<53:29, 34.51s/it]

step 8000, dt 4.38s, train_loss 6.574e-06, test_loss 1.218e+00, train_acc 100.0, test_acc 86.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 1.556e-02, test_loss 6.975e-01, train_acc 99.4, test_acc 83.8
step 2000, dt 4.22s, train_loss 2.576e-04, test_loss 7.714e-01, train_acc 100.0, test_acc 86.0
step 3000, dt 4.24s, train_loss 9.285e-05, test_loss 8.456e-01, train_acc 100.0, test_acc 86.1
step 4000, dt 4.22s, train_loss 1.168e-04, test_loss 9.037e-01, train_acc 100.0, test_acc 85.8
step 5000, dt 4.25s, train_loss 2.343e-05, test_loss 9.561e-01, train_acc 100.0, test_acc 85.8
step 6000, dt 4.21s, train_loss 2.392e-05, test_loss 1.008e+00, train_acc 100.0, test_acc 85.8
step 7000, dt 4.35s, train_loss 6.374e-06, test_loss 1.059e+00, train_acc 100.0, test_acc 85.4


  8%|▊         | 8/100 [04:35<52:41, 34.37s/it]

step 8000, dt 4.34s, train_loss 4.842e-06, test_loss 1.108e+00, train_acc 100.0, test_acc 85.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 4.755e-02, test_loss 1.114e+00, train_acc 97.6, test_acc 79.1
step 2000, dt 4.24s, train_loss 2.525e-04, test_loss 1.090e+00, train_acc 100.0, test_acc 83.3
step 3000, dt 4.34s, train_loss 3.776e-04, test_loss 1.224e+00, train_acc 100.0, test_acc 83.3
step 4000, dt 4.38s, train_loss 5.160e-05, test_loss 1.333e+00, train_acc 100.0, test_acc 82.5
step 5000, dt 4.36s, train_loss 4.018e-05, test_loss 1.434e+00, train_acc 100.0, test_acc 82.6
step 6000, dt 4.58s, train_loss 2.202e-05, test_loss 1.527e+00, train_acc 100.0, test_acc 82.4
step 7000, dt 4.47s, train_loss 1.034e-05, test_loss 1.615e+00, train_acc 100.0, test_acc 82.4


  9%|▉         | 9/100 [05:10<52:26, 34.58s/it]

step 8000, dt 4.52s, train_loss 6.138e-06, test_loss 1.699e+00, train_acc 100.0, test_acc 82.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 1.857e-02, test_loss 8.182e-01, train_acc 99.3, test_acc 83.8
step 2000, dt 4.17s, train_loss 3.335e-04, test_loss 1.079e+00, train_acc 100.0, test_acc 82.8
step 3000, dt 4.21s, train_loss 1.804e-04, test_loss 1.210e+00, train_acc 100.0, test_acc 83.2
step 4000, dt 4.21s, train_loss 4.408e-05, test_loss 1.317e+00, train_acc 100.0, test_acc 83.2
step 5000, dt 4.26s, train_loss 2.894e-05, test_loss 1.408e+00, train_acc 100.0, test_acc 83.1
step 6000, dt 4.28s, train_loss 2.039e-05, test_loss 1.493e+00, train_acc 100.0, test_acc 83.5
step 7000, dt 4.27s, train_loss 1.634e-05, test_loss 1.575e+00, train_acc 100.0, test_acc 83.5


 10%|█         | 10/100 [05:44<51:34, 34.39s/it]

step 8000, dt 4.37s, train_loss 7.936e-06, test_loss 1.655e+00, train_acc 100.0, test_acc 83.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 8.108e-02, test_loss 7.865e-01, train_acc 98.8, test_acc 81.6
step 2000, dt 4.33s, train_loss 8.600e-02, test_loss 1.076e+00, train_acc 96.0, test_acc 81.3
step 3000, dt 4.36s, train_loss 1.651e-04, test_loss 1.095e+00, train_acc 100.0, test_acc 83.9
step 4000, dt 4.55s, train_loss 1.153e-04, test_loss 1.157e+00, train_acc 100.0, test_acc 84.2
step 5000, dt 4.48s, train_loss 5.238e-05, test_loss 1.210e+00, train_acc 100.0, test_acc 84.2
step 6000, dt 4.52s, train_loss 1.497e-05, test_loss 1.258e+00, train_acc 100.0, test_acc 84.5
step 7000, dt 4.65s, train_loss 1.581e-05, test_loss 1.301e+00, train_acc 100.0, test_acc 84.7


 11%|█         | 11/100 [06:20<51:40, 34.83s/it]

step 8000, dt 4.73s, train_loss 1.411e-05, test_loss 1.347e+00, train_acc 100.0, test_acc 84.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.25s, train_loss 8.735e-03, test_loss 8.367e-01, train_acc 98.8, test_acc 82.9
step 2000, dt 4.29s, train_loss 3.105e-04, test_loss 1.028e+00, train_acc 100.0, test_acc 84.5
step 3000, dt 4.44s, train_loss 7.758e-05, test_loss 1.118e+00, train_acc 100.0, test_acc 84.0
step 4000, dt 4.36s, train_loss 7.083e-05, test_loss 1.185e+00, train_acc 100.0, test_acc 83.9
step 5000, dt 4.42s, train_loss 2.987e-05, test_loss 1.252e+00, train_acc 100.0, test_acc 83.6
step 6000, dt 4.43s, train_loss 2.213e-05, test_loss 1.321e+00, train_acc 100.0, test_acc 83.4
step 7000, dt 4.62s, train_loss 7.825e-06, test_loss 1.387e+00, train_acc 100.0, test_acc 83.4


 12%|█▏        | 12/100 [06:56<51:23, 35.04s/it]

step 8000, dt 4.69s, train_loss 4.494e-06, test_loss 1.451e+00, train_acc 100.0, test_acc 83.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 3.395e-03, test_loss 6.485e-01, train_acc 100.0, test_acc 87.3
step 2000, dt 4.21s, train_loss 3.665e-04, test_loss 8.087e-01, train_acc 100.0, test_acc 87.0
step 3000, dt 4.22s, train_loss 1.060e-04, test_loss 9.052e-01, train_acc 100.0, test_acc 87.0
step 4000, dt 4.32s, train_loss 5.726e-05, test_loss 9.842e-01, train_acc 100.0, test_acc 86.7
step 5000, dt 4.22s, train_loss 1.697e-05, test_loss 1.055e+00, train_acc 100.0, test_acc 86.7
step 6000, dt 4.24s, train_loss 2.258e-05, test_loss 1.118e+00, train_acc 100.0, test_acc 86.7
step 7000, dt 4.25s, train_loss 7.342e-06, test_loss 1.182e+00, train_acc 100.0, test_acc 86.8


 13%|█▎        | 13/100 [07:30<50:20, 34.71s/it]

step 8000, dt 4.30s, train_loss 4.554e-06, test_loss 1.246e+00, train_acc 100.0, test_acc 86.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.18s, train_loss 3.275e-02, test_loss 9.052e-01, train_acc 98.9, test_acc 80.6
step 2000, dt 4.18s, train_loss 2.209e-04, test_loss 9.940e-01, train_acc 100.0, test_acc 83.9
step 3000, dt 4.25s, train_loss 1.369e-04, test_loss 1.099e+00, train_acc 100.0, test_acc 83.9
step 4000, dt 4.32s, train_loss 4.779e-05, test_loss 1.166e+00, train_acc 100.0, test_acc 83.4
step 5000, dt 4.38s, train_loss 3.469e-05, test_loss 1.228e+00, train_acc 100.0, test_acc 83.3
step 6000, dt 4.32s, train_loss 1.345e-05, test_loss 1.287e+00, train_acc 100.0, test_acc 83.2
step 7000, dt 4.35s, train_loss 1.109e-05, test_loss 1.345e+00, train_acc 100.0, test_acc 83.1


 14%|█▍        | 14/100 [08:04<49:35, 34.60s/it]

step 8000, dt 4.35s, train_loss 7.398e-06, test_loss 1.402e+00, train_acc 100.0, test_acc 83.2
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.13s, train_loss 1.787e-02, test_loss 9.037e-01, train_acc 99.4, test_acc 83.2
step 2000, dt 4.20s, train_loss 3.653e-04, test_loss 1.034e+00, train_acc 100.0, test_acc 83.5
step 3000, dt 4.27s, train_loss 9.882e-05, test_loss 1.156e+00, train_acc 100.0, test_acc 83.5
step 4000, dt 4.24s, train_loss 6.709e-05, test_loss 1.249e+00, train_acc 100.0, test_acc 83.6
step 5000, dt 4.28s, train_loss 3.960e-05, test_loss 1.332e+00, train_acc 100.0, test_acc 83.6
step 6000, dt 4.36s, train_loss 1.620e-05, test_loss 1.409e+00, train_acc 100.0, test_acc 83.6
step 7000, dt 4.43s, train_loss 1.387e-05, test_loss 1.484e+00, train_acc 100.0, test_acc 83.6


 15%|█▌        | 15/100 [08:38<48:55, 34.53s/it]

step 8000, dt 4.45s, train_loss 7.249e-06, test_loss 1.559e+00, train_acc 100.0, test_acc 83.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 3.815e-03, test_loss 1.021e+00, train_acc 99.8, test_acc 82.2
step 2000, dt 4.26s, train_loss 2.822e-04, test_loss 1.110e+00, train_acc 100.0, test_acc 83.3
step 3000, dt 4.29s, train_loss 8.683e-05, test_loss 1.210e+00, train_acc 100.0, test_acc 83.1
step 4000, dt 4.33s, train_loss 8.604e-05, test_loss 1.299e+00, train_acc 100.0, test_acc 82.9
step 5000, dt 4.35s, train_loss 2.496e-05, test_loss 1.382e+00, train_acc 100.0, test_acc 83.1
step 6000, dt 4.39s, train_loss 2.201e-05, test_loss 1.458e+00, train_acc 100.0, test_acc 83.1
step 7000, dt 4.36s, train_loss 8.215e-06, test_loss 1.532e+00, train_acc 100.0, test_acc 82.7


 16%|█▌        | 16/100 [09:13<48:23, 34.56s/it]

step 8000, dt 4.48s, train_loss 5.012e-06, test_loss 1.608e+00, train_acc 100.0, test_acc 82.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 1.388e-01, test_loss 1.197e+00, train_acc 96.0, test_acc 75.8
step 2000, dt 4.32s, train_loss 3.660e-04, test_loss 1.244e+00, train_acc 100.0, test_acc 82.0
step 3000, dt 4.32s, train_loss 1.570e-04, test_loss 1.359e+00, train_acc 100.0, test_acc 82.0
step 4000, dt 4.35s, train_loss 6.867e-05, test_loss 1.443e+00, train_acc 100.0, test_acc 82.3
step 5000, dt 4.35s, train_loss 4.216e-05, test_loss 1.526e+00, train_acc 100.0, test_acc 82.2
step 6000, dt 4.41s, train_loss 3.082e-05, test_loss 1.603e+00, train_acc 100.0, test_acc 82.1
step 7000, dt 4.41s, train_loss 1.510e-05, test_loss 1.677e+00, train_acc 100.0, test_acc 82.1


 17%|█▋        | 17/100 [09:48<47:55, 34.65s/it]

step 8000, dt 4.48s, train_loss 1.263e-05, test_loss 1.755e+00, train_acc 100.0, test_acc 81.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 2.517e-02, test_loss 8.601e-01, train_acc 98.9, test_acc 83.9
step 2000, dt 4.26s, train_loss 1.728e-04, test_loss 8.615e-01, train_acc 100.0, test_acc 86.1
step 3000, dt 4.27s, train_loss 1.312e-04, test_loss 9.410e-01, train_acc 100.0, test_acc 86.1
step 4000, dt 4.29s, train_loss 5.463e-05, test_loss 1.011e+00, train_acc 100.0, test_acc 85.6
step 5000, dt 4.33s, train_loss 3.253e-05, test_loss 1.073e+00, train_acc 100.0, test_acc 85.8
step 6000, dt 4.41s, train_loss 2.123e-05, test_loss 1.135e+00, train_acc 100.0, test_acc 85.8
step 7000, dt 4.43s, train_loss 9.662e-06, test_loss 1.195e+00, train_acc 100.0, test_acc 85.7


 18%|█▊        | 18/100 [10:22<47:21, 34.65s/it]

step 8000, dt 4.47s, train_loss 5.540e-06, test_loss 1.255e+00, train_acc 100.0, test_acc 85.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 4.205e-02, test_loss 9.293e-01, train_acc 97.5, test_acc 80.9
step 2000, dt 4.35s, train_loss 3.183e-04, test_loss 9.763e-01, train_acc 100.0, test_acc 85.5
step 3000, dt 4.51s, train_loss 1.019e-04, test_loss 1.051e+00, train_acc 100.0, test_acc 85.3
step 4000, dt 4.47s, train_loss 6.953e-05, test_loss 1.116e+00, train_acc 100.0, test_acc 85.3
step 5000, dt 4.51s, train_loss 2.786e-05, test_loss 1.175e+00, train_acc 100.0, test_acc 85.0
step 6000, dt 4.65s, train_loss 1.118e-05, test_loss 1.226e+00, train_acc 100.0, test_acc 84.8
step 7000, dt 4.63s, train_loss 9.896e-06, test_loss 1.274e+00, train_acc 100.0, test_acc 84.8


 19%|█▉        | 19/100 [10:58<47:20, 35.07s/it]

step 8000, dt 4.76s, train_loss 2.441e-06, test_loss 1.320e+00, train_acc 100.0, test_acc 84.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 6.938e-02, test_loss 7.619e-01, train_acc 98.8, test_acc 84.2
step 2000, dt 4.31s, train_loss 2.801e-04, test_loss 8.885e-01, train_acc 100.0, test_acc 85.6
step 3000, dt 4.23s, train_loss 9.578e-05, test_loss 9.771e-01, train_acc 100.0, test_acc 86.1
step 4000, dt 4.28s, train_loss 5.905e-05, test_loss 1.043e+00, train_acc 100.0, test_acc 85.9
step 5000, dt 4.30s, train_loss 3.950e-05, test_loss 1.097e+00, train_acc 100.0, test_acc 85.9
step 6000, dt 4.30s, train_loss 9.155e-06, test_loss 1.146e+00, train_acc 100.0, test_acc 85.6
step 7000, dt 4.36s, train_loss 1.034e-05, test_loss 1.191e+00, train_acc 100.0, test_acc 85.7


 20%|██        | 20/100 [11:33<46:30, 34.89s/it]

step 8000, dt 4.48s, train_loss 7.300e-06, test_loss 1.235e+00, train_acc 100.0, test_acc 85.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 1.422e-01, test_loss 9.290e-01, train_acc 97.0, test_acc 80.9
step 2000, dt 4.23s, train_loss 2.173e-04, test_loss 8.764e-01, train_acc 100.0, test_acc 85.6
step 3000, dt 4.29s, train_loss 1.501e-04, test_loss 9.749e-01, train_acc 100.0, test_acc 85.8
step 4000, dt 4.27s, train_loss 6.330e-05, test_loss 1.044e+00, train_acc 100.0, test_acc 85.7
step 5000, dt 4.28s, train_loss 3.487e-05, test_loss 1.109e+00, train_acc 100.0, test_acc 85.8
step 6000, dt 4.28s, train_loss 2.133e-05, test_loss 1.170e+00, train_acc 100.0, test_acc 86.0
step 7000, dt 4.26s, train_loss 5.721e-06, test_loss 1.228e+00, train_acc 100.0, test_acc 85.7


 21%|██        | 21/100 [12:07<45:38, 34.67s/it]

step 8000, dt 4.34s, train_loss 9.468e-06, test_loss 1.284e+00, train_acc 100.0, test_acc 85.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 1.264e-01, test_loss 8.980e-01, train_acc 98.6, test_acc 83.5
step 2000, dt 4.30s, train_loss 4.221e-03, test_loss 1.099e+00, train_acc 99.6, test_acc 84.0
step 3000, dt 4.43s, train_loss 7.550e-05, test_loss 1.021e+00, train_acc 100.0, test_acc 86.6
step 4000, dt 4.48s, train_loss 3.248e-05, test_loss 1.059e+00, train_acc 100.0, test_acc 86.6
step 5000, dt 4.49s, train_loss 8.816e-06, test_loss 1.094e+00, train_acc 100.0, test_acc 86.4
step 6000, dt 4.57s, train_loss 6.774e-06, test_loss 1.128e+00, train_acc 100.0, test_acc 86.5
step 7000, dt 4.62s, train_loss 7.335e-06, test_loss 1.164e+00, train_acc 100.0, test_acc 86.6


 22%|██▏       | 22/100 [12:43<45:28, 34.98s/it]

step 8000, dt 4.66s, train_loss 4.606e-06, test_loss 1.202e+00, train_acc 100.0, test_acc 86.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 2.203e-03, test_loss 7.106e-01, train_acc 100.0, test_acc 84.0
step 2000, dt 4.18s, train_loss 2.439e-04, test_loss 7.901e-01, train_acc 100.0, test_acc 85.0
step 3000, dt 4.26s, train_loss 1.155e-04, test_loss 8.644e-01, train_acc 100.0, test_acc 85.1
step 4000, dt 4.32s, train_loss 7.969e-05, test_loss 9.319e-01, train_acc 100.0, test_acc 84.9
step 5000, dt 4.28s, train_loss 2.045e-05, test_loss 9.909e-01, train_acc 100.0, test_acc 84.7
step 6000, dt 4.40s, train_loss 1.371e-05, test_loss 1.046e+00, train_acc 100.0, test_acc 84.5
step 7000, dt 4.34s, train_loss 6.958e-06, test_loss 1.102e+00, train_acc 100.0, test_acc 84.9


 23%|██▎       | 23/100 [13:17<44:40, 34.81s/it]

step 8000, dt 4.46s, train_loss 3.738e-06, test_loss 1.157e+00, train_acc 100.0, test_acc 84.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.13s, train_loss 2.639e-02, test_loss 7.433e-01, train_acc 99.0, test_acc 82.2
step 2000, dt 4.15s, train_loss 9.432e-04, test_loss 9.513e-01, train_acc 100.0, test_acc 83.4
step 3000, dt 4.26s, train_loss 2.221e-04, test_loss 1.061e+00, train_acc 100.0, test_acc 83.7
step 4000, dt 4.23s, train_loss 5.978e-05, test_loss 1.155e+00, train_acc 100.0, test_acc 83.6
step 5000, dt 4.30s, train_loss 4.438e-05, test_loss 1.237e+00, train_acc 100.0, test_acc 83.5
step 6000, dt 4.26s, train_loss 2.638e-05, test_loss 1.316e+00, train_acc 100.0, test_acc 83.6
step 7000, dt 4.34s, train_loss 1.583e-05, test_loss 1.391e+00, train_acc 100.0, test_acc 83.4


 24%|██▍       | 24/100 [13:51<43:48, 34.59s/it]

step 8000, dt 4.38s, train_loss 6.462e-06, test_loss 1.466e+00, train_acc 100.0, test_acc 83.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.14s, train_loss 4.915e-02, test_loss 9.039e-01, train_acc 97.8, test_acc 81.8
step 2000, dt 4.23s, train_loss 4.548e-04, test_loss 9.368e-01, train_acc 100.0, test_acc 85.3
step 3000, dt 4.30s, train_loss 8.482e-05, test_loss 9.912e-01, train_acc 100.0, test_acc 85.6
step 4000, dt 4.43s, train_loss 8.368e-05, test_loss 1.033e+00, train_acc 100.0, test_acc 85.4
step 5000, dt 4.38s, train_loss 2.501e-05, test_loss 1.073e+00, train_acc 100.0, test_acc 85.3
step 6000, dt 4.35s, train_loss 1.705e-05, test_loss 1.115e+00, train_acc 100.0, test_acc 85.3
step 7000, dt 4.41s, train_loss 6.189e-06, test_loss 1.158e+00, train_acc 100.0, test_acc 85.3


 25%|██▌       | 25/100 [14:26<43:14, 34.60s/it]

step 8000, dt 4.38s, train_loss 6.219e-06, test_loss 1.199e+00, train_acc 100.0, test_acc 85.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 9.052e-02, test_loss 1.116e+00, train_acc 96.4, test_acc 77.8
step 2000, dt 4.20s, train_loss 3.330e-04, test_loss 1.086e+00, train_acc 100.0, test_acc 83.7
step 3000, dt 4.31s, train_loss 4.913e-05, test_loss 1.197e+00, train_acc 100.0, test_acc 83.6
step 4000, dt 4.27s, train_loss 7.387e-05, test_loss 1.291e+00, train_acc 100.0, test_acc 83.4
step 5000, dt 4.35s, train_loss 3.906e-05, test_loss 1.376e+00, train_acc 100.0, test_acc 83.4
step 6000, dt 4.34s, train_loss 2.906e-05, test_loss 1.459e+00, train_acc 100.0, test_acc 83.2
step 7000, dt 4.32s, train_loss 1.089e-05, test_loss 1.543e+00, train_acc 100.0, test_acc 83.3


 26%|██▌       | 26/100 [15:00<42:34, 34.52s/it]

step 8000, dt 4.39s, train_loss 9.792e-06, test_loss 1.626e+00, train_acc 100.0, test_acc 83.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.14s, train_loss 1.156e-02, test_loss 7.177e-01, train_acc 99.7, test_acc 85.4
step 2000, dt 4.23s, train_loss 2.856e-04, test_loss 8.015e-01, train_acc 100.0, test_acc 86.4
step 3000, dt 4.26s, train_loss 7.695e-05, test_loss 8.940e-01, train_acc 100.0, test_acc 85.7
step 4000, dt 4.22s, train_loss 4.615e-05, test_loss 9.599e-01, train_acc 100.0, test_acc 85.9
step 5000, dt 4.28s, train_loss 2.881e-05, test_loss 1.015e+00, train_acc 100.0, test_acc 85.8
step 6000, dt 4.27s, train_loss 1.469e-05, test_loss 1.070e+00, train_acc 100.0, test_acc 85.7
step 7000, dt 4.35s, train_loss 7.180e-06, test_loss 1.124e+00, train_acc 100.0, test_acc 85.6


 27%|██▋       | 27/100 [15:34<41:51, 34.41s/it]

step 8000, dt 4.38s, train_loss 3.777e-06, test_loss 1.178e+00, train_acc 100.0, test_acc 85.4
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.12s, train_loss 7.964e-02, test_loss 9.562e-01, train_acc 97.3, test_acc 81.1
step 2000, dt 4.21s, train_loss 3.118e-04, test_loss 1.102e+00, train_acc 100.0, test_acc 84.2
step 3000, dt 4.27s, train_loss 5.187e-05, test_loss 1.217e+00, train_acc 100.0, test_acc 84.4
step 4000, dt 4.32s, train_loss 6.143e-05, test_loss 1.312e+00, train_acc 100.0, test_acc 84.2
step 5000, dt 4.31s, train_loss 5.124e-05, test_loss 1.395e+00, train_acc 100.0, test_acc 83.6
step 6000, dt 4.35s, train_loss 1.981e-05, test_loss 1.474e+00, train_acc 100.0, test_acc 83.5
step 7000, dt 4.36s, train_loss 1.363e-05, test_loss 1.552e+00, train_acc 100.0, test_acc 83.4


 28%|██▊       | 28/100 [16:09<41:17, 34.42s/it]

step 8000, dt 4.47s, train_loss 5.705e-06, test_loss 1.631e+00, train_acc 100.0, test_acc 83.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 3.738e-02, test_loss 8.864e-01, train_acc 99.6, test_acc 82.8
step 2000, dt 4.23s, train_loss 2.169e-04, test_loss 8.785e-01, train_acc 100.0, test_acc 84.7
step 3000, dt 4.26s, train_loss 1.004e-04, test_loss 9.891e-01, train_acc 100.0, test_acc 84.9
step 4000, dt 4.27s, train_loss 6.748e-05, test_loss 1.070e+00, train_acc 100.0, test_acc 84.9
step 5000, dt 4.39s, train_loss 4.055e-05, test_loss 1.141e+00, train_acc 100.0, test_acc 84.3
step 6000, dt 4.45s, train_loss 2.059e-05, test_loss 1.211e+00, train_acc 100.0, test_acc 84.2
step 7000, dt 4.51s, train_loss 1.289e-05, test_loss 1.275e+00, train_acc 100.0, test_acc 84.2


 29%|██▉       | 29/100 [16:44<40:55, 34.58s/it]

step 8000, dt 4.68s, train_loss 7.280e-06, test_loss 1.337e+00, train_acc 100.0, test_acc 84.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.13s, train_loss 2.053e-02, test_loss 6.991e-01, train_acc 99.5, test_acc 84.2
step 2000, dt 4.24s, train_loss 1.022e-03, test_loss 9.068e-01, train_acc 100.0, test_acc 86.0
step 3000, dt 4.23s, train_loss 1.431e-04, test_loss 7.083e-01, train_acc 100.0, test_acc 89.6
step 4000, dt 4.37s, train_loss 6.892e-05, test_loss 7.769e-01, train_acc 100.0, test_acc 89.4
step 5000, dt 4.33s, train_loss 3.645e-05, test_loss 8.306e-01, train_acc 100.0, test_acc 88.7
step 6000, dt 4.48s, train_loss 1.387e-05, test_loss 8.757e-01, train_acc 100.0, test_acc 88.5
step 7000, dt 4.46s, train_loss 1.486e-05, test_loss 9.160e-01, train_acc 100.0, test_acc 88.3


 30%|███       | 30/100 [17:18<40:24, 34.63s/it]

step 8000, dt 4.52s, train_loss 6.289e-06, test_loss 9.548e-01, train_acc 100.0, test_acc 88.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 8.360e-03, test_loss 8.953e-01, train_acc 99.7, test_acc 81.0
step 2000, dt 4.16s, train_loss 5.298e-04, test_loss 1.029e+00, train_acc 100.0, test_acc 81.9
step 3000, dt 4.30s, train_loss 1.800e-04, test_loss 1.147e+00, train_acc 100.0, test_acc 81.0
step 4000, dt 4.30s, train_loss 6.217e-05, test_loss 1.248e+00, train_acc 100.0, test_acc 80.7
step 5000, dt 4.30s, train_loss 2.947e-05, test_loss 1.337e+00, train_acc 100.0, test_acc 80.8
step 6000, dt 4.34s, train_loss 1.427e-05, test_loss 1.421e+00, train_acc 100.0, test_acc 80.2
step 7000, dt 4.35s, train_loss 1.226e-05, test_loss 1.509e+00, train_acc 100.0, test_acc 80.0


 31%|███       | 31/100 [17:53<39:43, 34.55s/it]

step 8000, dt 4.37s, train_loss 4.210e-06, test_loss 1.595e+00, train_acc 100.0, test_acc 80.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.13s, train_loss 3.506e-02, test_loss 9.722e-01, train_acc 97.1, test_acc 79.9
step 2000, dt 4.27s, train_loss 5.716e-04, test_loss 1.013e+00, train_acc 100.0, test_acc 83.2
step 3000, dt 4.24s, train_loss 2.103e-04, test_loss 1.123e+00, train_acc 100.0, test_acc 83.4
step 4000, dt 4.31s, train_loss 6.172e-05, test_loss 1.209e+00, train_acc 100.0, test_acc 83.6
step 5000, dt 4.43s, train_loss 5.583e-05, test_loss 1.286e+00, train_acc 100.0, test_acc 83.2
step 6000, dt 4.42s, train_loss 2.438e-05, test_loss 1.362e+00, train_acc 100.0, test_acc 83.4
step 7000, dt 4.55s, train_loss 7.282e-06, test_loss 1.435e+00, train_acc 100.0, test_acc 83.5


 32%|███▏      | 32/100 [18:28<39:16, 34.65s/it]

step 8000, dt 4.53s, train_loss 6.155e-06, test_loss 1.507e+00, train_acc 100.0, test_acc 83.4
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 9.133e-02, test_loss 1.079e+00, train_acc 95.8, test_acc 78.7
step 2000, dt 4.21s, train_loss 5.457e-04, test_loss 9.140e-01, train_acc 100.0, test_acc 86.3
step 3000, dt 4.26s, train_loss 2.542e-04, test_loss 1.010e+00, train_acc 100.0, test_acc 86.3
step 4000, dt 4.31s, train_loss 7.981e-05, test_loss 1.087e+00, train_acc 100.0, test_acc 86.3
step 5000, dt 4.32s, train_loss 3.036e-05, test_loss 1.153e+00, train_acc 100.0, test_acc 86.0
step 6000, dt 4.33s, train_loss 2.095e-05, test_loss 1.217e+00, train_acc 100.0, test_acc 86.1
step 7000, dt 4.38s, train_loss 1.151e-05, test_loss 1.277e+00, train_acc 100.0, test_acc 85.9


 33%|███▎      | 33/100 [19:02<38:37, 34.60s/it]

step 8000, dt 4.44s, train_loss 9.736e-06, test_loss 1.335e+00, train_acc 100.0, test_acc 86.2
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.14s, train_loss 3.684e-02, test_loss 7.154e-01, train_acc 99.4, test_acc 84.1
step 2000, dt 4.21s, train_loss 4.607e-04, test_loss 8.383e-01, train_acc 100.0, test_acc 86.0
step 3000, dt 4.24s, train_loss 1.619e-04, test_loss 9.458e-01, train_acc 100.0, test_acc 85.6
step 4000, dt 4.25s, train_loss 6.914e-05, test_loss 1.020e+00, train_acc 100.0, test_acc 85.4
step 5000, dt 4.29s, train_loss 2.922e-05, test_loss 1.082e+00, train_acc 100.0, test_acc 85.2
step 6000, dt 4.30s, train_loss 1.253e-05, test_loss 1.138e+00, train_acc 100.0, test_acc 85.2
step 7000, dt 4.37s, train_loss 1.057e-05, test_loss 1.188e+00, train_acc 100.0, test_acc 85.4


 34%|███▍      | 34/100 [19:36<37:55, 34.48s/it]

step 8000, dt 4.39s, train_loss 5.814e-06, test_loss 1.234e+00, train_acc 100.0, test_acc 85.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 8.598e-03, test_loss 7.745e-01, train_acc 100.0, test_acc 83.3
step 2000, dt 4.20s, train_loss 4.030e-04, test_loss 9.741e-01, train_acc 100.0, test_acc 84.2
step 3000, dt 4.19s, train_loss 1.593e-04, test_loss 1.081e+00, train_acc 100.0, test_acc 84.1
step 4000, dt 4.24s, train_loss 8.866e-05, test_loss 1.169e+00, train_acc 100.0, test_acc 84.1
step 5000, dt 4.22s, train_loss 4.406e-05, test_loss 1.243e+00, train_acc 100.0, test_acc 84.0
step 6000, dt 4.33s, train_loss 2.161e-05, test_loss 1.310e+00, train_acc 100.0, test_acc 84.2
step 7000, dt 4.35s, train_loss 1.530e-05, test_loss 1.379e+00, train_acc 100.0, test_acc 83.9


 35%|███▌      | 35/100 [20:10<37:13, 34.37s/it]

step 8000, dt 4.40s, train_loss 5.572e-06, test_loss 1.447e+00, train_acc 100.0, test_acc 84.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.14s, train_loss 6.478e-02, test_loss 6.311e-01, train_acc 98.4, test_acc 85.5
step 2000, dt 4.18s, train_loss 2.233e-04, test_loss 7.986e-01, train_acc 100.0, test_acc 87.7
step 3000, dt 4.27s, train_loss 1.877e-04, test_loss 8.719e-01, train_acc 100.0, test_acc 87.3
step 4000, dt 4.24s, train_loss 1.148e-04, test_loss 9.330e-01, train_acc 100.0, test_acc 86.9
step 5000, dt 4.31s, train_loss 4.093e-05, test_loss 9.925e-01, train_acc 100.0, test_acc 86.5
step 6000, dt 4.26s, train_loss 1.200e-05, test_loss 1.049e+00, train_acc 100.0, test_acc 86.3
step 7000, dt 4.31s, train_loss 1.230e-05, test_loss 1.104e+00, train_acc 100.0, test_acc 86.2


 36%|███▌      | 36/100 [20:45<36:33, 34.28s/it]

step 8000, dt 4.36s, train_loss 7.071e-06, test_loss 1.161e+00, train_acc 100.0, test_acc 86.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 2.724e-02, test_loss 1.002e+00, train_acc 98.1, test_acc 81.4
step 2000, dt 4.25s, train_loss 2.249e-04, test_loss 8.907e-01, train_acc 100.0, test_acc 86.3
step 3000, dt 4.21s, train_loss 9.751e-05, test_loss 9.771e-01, train_acc 100.0, test_acc 86.5
step 4000, dt 4.28s, train_loss 3.408e-05, test_loss 1.046e+00, train_acc 100.0, test_acc 86.4
step 5000, dt 4.30s, train_loss 2.664e-05, test_loss 1.112e+00, train_acc 100.0, test_acc 86.1
step 6000, dt 4.28s, train_loss 1.677e-05, test_loss 1.174e+00, train_acc 100.0, test_acc 85.7
step 7000, dt 4.37s, train_loss 6.804e-06, test_loss 1.236e+00, train_acc 100.0, test_acc 85.7


 37%|███▋      | 37/100 [21:19<35:58, 34.26s/it]

step 8000, dt 4.37s, train_loss 3.860e-06, test_loss 1.298e+00, train_acc 100.0, test_acc 85.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 2.216e-02, test_loss 9.545e-01, train_acc 98.9, test_acc 82.1
step 2000, dt 4.18s, train_loss 1.956e-04, test_loss 1.009e+00, train_acc 100.0, test_acc 85.4
step 3000, dt 4.39s, train_loss 3.914e-05, test_loss 1.077e+00, train_acc 100.0, test_acc 85.7
step 4000, dt 4.35s, train_loss 5.366e-05, test_loss 1.135e+00, train_acc 100.0, test_acc 85.6
step 5000, dt 4.42s, train_loss 3.085e-05, test_loss 1.190e+00, train_acc 100.0, test_acc 85.9
step 6000, dt 4.40s, train_loss 2.384e-05, test_loss 1.245e+00, train_acc 100.0, test_acc 85.9
step 7000, dt 4.43s, train_loss 1.356e-05, test_loss 1.299e+00, train_acc 100.0, test_acc 86.0


 38%|███▊      | 38/100 [21:54<35:35, 34.44s/it]

step 8000, dt 4.47s, train_loss 5.740e-06, test_loss 1.351e+00, train_acc 100.0, test_acc 85.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.18s, train_loss 2.035e-02, test_loss 1.159e+00, train_acc 99.4, test_acc 77.8
step 2000, dt 4.35s, train_loss 1.004e-04, test_loss 1.124e+00, train_acc 100.0, test_acc 84.5
step 3000, dt 4.47s, train_loss 8.121e-05, test_loss 1.214e+00, train_acc 100.0, test_acc 84.1
step 4000, dt 4.59s, train_loss 3.709e-05, test_loss 1.286e+00, train_acc 100.0, test_acc 83.6
step 5000, dt 4.61s, train_loss 2.379e-05, test_loss 1.352e+00, train_acc 100.0, test_acc 83.9
step 6000, dt 4.67s, train_loss 1.603e-05, test_loss 1.414e+00, train_acc 100.0, test_acc 83.9
step 7000, dt 4.64s, train_loss 5.470e-06, test_loss 1.475e+00, train_acc 100.0, test_acc 83.6


 39%|███▉      | 39/100 [22:30<35:32, 34.96s/it]

step 8000, dt 4.67s, train_loss 6.186e-06, test_loss 1.540e+00, train_acc 100.0, test_acc 83.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 1.456e-01, test_loss 9.219e-01, train_acc 98.3, test_acc 81.8
step 2000, dt 4.18s, train_loss 1.166e-04, test_loss 1.013e+00, train_acc 100.0, test_acc 84.5
step 3000, dt 4.23s, train_loss 6.900e-05, test_loss 1.102e+00, train_acc 100.0, test_acc 84.8
step 4000, dt 4.25s, train_loss 5.830e-05, test_loss 1.173e+00, train_acc 100.0, test_acc 84.6
step 5000, dt 4.29s, train_loss 2.788e-05, test_loss 1.245e+00, train_acc 100.0, test_acc 84.8
step 6000, dt 4.34s, train_loss 1.844e-05, test_loss 1.312e+00, train_acc 100.0, test_acc 84.8
step 7000, dt 4.35s, train_loss 1.336e-05, test_loss 1.376e+00, train_acc 100.0, test_acc 84.7


 40%|████      | 40/100 [23:04<34:45, 34.76s/it]

step 8000, dt 4.41s, train_loss 6.797e-06, test_loss 1.438e+00, train_acc 100.0, test_acc 84.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 6.072e-02, test_loss 8.137e-01, train_acc 98.8, test_acc 83.3
step 2000, dt 4.28s, train_loss 3.842e-04, test_loss 6.809e-01, train_acc 100.0, test_acc 87.9
step 3000, dt 4.32s, train_loss 1.403e-04, test_loss 7.410e-01, train_acc 100.0, test_acc 87.7
step 4000, dt 4.43s, train_loss 6.201e-05, test_loss 7.847e-01, train_acc 100.0, test_acc 87.7
step 5000, dt 4.45s, train_loss 3.862e-05, test_loss 8.238e-01, train_acc 100.0, test_acc 87.5
step 6000, dt 4.52s, train_loss 2.541e-05, test_loss 8.619e-01, train_acc 100.0, test_acc 87.5
step 7000, dt 4.54s, train_loss 7.490e-06, test_loss 8.992e-01, train_acc 100.0, test_acc 87.6


 41%|████      | 41/100 [23:39<34:17, 34.88s/it]

step 8000, dt 4.48s, train_loss 7.958e-06, test_loss 9.394e-01, train_acc 100.0, test_acc 87.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 1.448e-01, test_loss 8.344e-01, train_acc 97.1, test_acc 80.9
step 2000, dt 4.25s, train_loss 5.793e-04, test_loss 8.792e-01, train_acc 100.0, test_acc 84.8
step 3000, dt 4.34s, train_loss 4.486e-04, test_loss 9.764e-01, train_acc 100.0, test_acc 84.6
step 4000, dt 4.33s, train_loss 1.124e-04, test_loss 1.047e+00, train_acc 100.0, test_acc 84.8
step 5000, dt 4.40s, train_loss 7.460e-05, test_loss 1.105e+00, train_acc 100.0, test_acc 85.0
step 6000, dt 4.43s, train_loss 3.427e-05, test_loss 1.162e+00, train_acc 100.0, test_acc 85.1
step 7000, dt 4.47s, train_loss 1.483e-05, test_loss 1.221e+00, train_acc 100.0, test_acc 85.2


 42%|████▏     | 42/100 [24:14<33:43, 34.89s/it]

step 8000, dt 4.48s, train_loss 1.232e-05, test_loss 1.276e+00, train_acc 100.0, test_acc 85.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 5.739e-03, test_loss 9.665e-01, train_acc 99.9, test_acc 80.9
step 2000, dt 4.23s, train_loss 5.617e-04, test_loss 9.918e-01, train_acc 100.0, test_acc 84.7
step 3000, dt 4.34s, train_loss 1.201e-04, test_loss 1.121e+00, train_acc 100.0, test_acc 84.6
step 4000, dt 4.41s, train_loss 1.087e-04, test_loss 1.209e+00, train_acc 100.0, test_acc 84.8
step 5000, dt 4.46s, train_loss 8.514e-05, test_loss 1.284e+00, train_acc 100.0, test_acc 84.8
step 6000, dt 4.48s, train_loss 2.575e-05, test_loss 1.356e+00, train_acc 100.0, test_acc 84.8
step 7000, dt 4.52s, train_loss 1.098e-05, test_loss 1.426e+00, train_acc 100.0, test_acc 84.7


 43%|████▎     | 43/100 [24:49<33:13, 34.97s/it]

step 8000, dt 4.56s, train_loss 4.975e-06, test_loss 1.491e+00, train_acc 100.0, test_acc 84.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 1.826e-02, test_loss 7.898e-01, train_acc 100.0, test_acc 84.1
step 2000, dt 4.20s, train_loss 5.384e-04, test_loss 1.083e+00, train_acc 100.0, test_acc 84.0
step 3000, dt 4.22s, train_loss 2.085e-04, test_loss 1.212e+00, train_acc 100.0, test_acc 83.3
step 4000, dt 4.30s, train_loss 4.859e-05, test_loss 1.321e+00, train_acc 100.0, test_acc 83.3
step 5000, dt 4.26s, train_loss 3.828e-05, test_loss 1.412e+00, train_acc 100.0, test_acc 83.3
step 6000, dt 4.34s, train_loss 2.023e-05, test_loss 1.489e+00, train_acc 100.0, test_acc 83.5
step 7000, dt 4.36s, train_loss 9.703e-06, test_loss 1.564e+00, train_acc 100.0, test_acc 83.7


 44%|████▍     | 44/100 [25:24<32:28, 34.80s/it]

step 8000, dt 4.51s, train_loss 9.675e-06, test_loss 1.643e+00, train_acc 100.0, test_acc 83.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.13s, train_loss 1.279e-02, test_loss 7.960e-01, train_acc 99.6, test_acc 83.0
step 2000, dt 4.21s, train_loss 3.919e-04, test_loss 9.784e-01, train_acc 100.0, test_acc 84.1
step 3000, dt 4.22s, train_loss 1.143e-04, test_loss 1.052e+00, train_acc 100.0, test_acc 83.9
step 4000, dt 4.22s, train_loss 5.396e-05, test_loss 1.119e+00, train_acc 100.0, test_acc 84.0
step 5000, dt 4.28s, train_loss 3.230e-05, test_loss 1.180e+00, train_acc 100.0, test_acc 84.0
step 6000, dt 4.33s, train_loss 1.263e-05, test_loss 1.242e+00, train_acc 100.0, test_acc 83.9
step 7000, dt 4.36s, train_loss 1.143e-05, test_loss 1.304e+00, train_acc 100.0, test_acc 83.7


 45%|████▌     | 45/100 [25:58<31:43, 34.60s/it]

step 8000, dt 4.37s, train_loss 6.059e-06, test_loss 1.368e+00, train_acc 100.0, test_acc 83.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 9.443e-03, test_loss 7.740e-01, train_acc 99.2, test_acc 85.0
step 2000, dt 4.20s, train_loss 2.417e-04, test_loss 9.432e-01, train_acc 100.0, test_acc 86.0
step 3000, dt 4.21s, train_loss 5.122e-05, test_loss 1.028e+00, train_acc 100.0, test_acc 85.8
step 4000, dt 4.21s, train_loss 4.958e-05, test_loss 1.095e+00, train_acc 100.0, test_acc 85.5
step 5000, dt 4.20s, train_loss 2.572e-05, test_loss 1.154e+00, train_acc 100.0, test_acc 85.5
step 6000, dt 4.24s, train_loss 1.712e-05, test_loss 1.209e+00, train_acc 100.0, test_acc 85.5
step 7000, dt 4.29s, train_loss 7.442e-06, test_loss 1.262e+00, train_acc 100.0, test_acc 85.8


 46%|████▌     | 46/100 [26:32<30:58, 34.41s/it]

step 8000, dt 4.43s, train_loss 4.482e-06, test_loss 1.314e+00, train_acc 100.0, test_acc 85.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.23s, train_loss 2.571e-03, test_loss 8.536e-01, train_acc 99.9, test_acc 82.5
step 2000, dt 4.20s, train_loss 3.890e-04, test_loss 1.024e+00, train_acc 100.0, test_acc 84.3
step 3000, dt 4.28s, train_loss 2.037e-04, test_loss 1.127e+00, train_acc 100.0, test_acc 83.7
step 4000, dt 4.25s, train_loss 7.975e-05, test_loss 1.214e+00, train_acc 100.0, test_acc 84.1
step 5000, dt 4.34s, train_loss 3.234e-05, test_loss 1.298e+00, train_acc 100.0, test_acc 84.0
step 6000, dt 4.43s, train_loss 2.183e-05, test_loss 1.373e+00, train_acc 100.0, test_acc 83.9
step 7000, dt 4.38s, train_loss 1.077e-05, test_loss 1.446e+00, train_acc 100.0, test_acc 84.0


 47%|████▋     | 47/100 [27:06<30:26, 34.45s/it]

step 8000, dt 4.42s, train_loss 2.943e-06, test_loss 1.518e+00, train_acc 100.0, test_acc 83.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 2.233e-02, test_loss 7.201e-01, train_acc 99.6, test_acc 84.6
step 2000, dt 4.43s, train_loss 3.454e-04, test_loss 8.016e-01, train_acc 100.0, test_acc 88.0
step 3000, dt 4.46s, train_loss 1.052e-04, test_loss 8.947e-01, train_acc 100.0, test_acc 86.9
step 4000, dt 4.57s, train_loss 5.095e-05, test_loss 9.622e-01, train_acc 100.0, test_acc 86.8
step 5000, dt 4.56s, train_loss 2.419e-05, test_loss 1.024e+00, train_acc 100.0, test_acc 86.6
step 6000, dt 4.68s, train_loss 1.351e-05, test_loss 1.082e+00, train_acc 100.0, test_acc 86.4
step 7000, dt 4.73s, train_loss 1.682e-05, test_loss 1.140e+00, train_acc 100.0, test_acc 86.1


 48%|████▊     | 48/100 [27:43<30:22, 35.05s/it]

step 8000, dt 4.80s, train_loss 6.709e-06, test_loss 1.198e+00, train_acc 100.0, test_acc 85.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 2.029e-02, test_loss 7.874e-01, train_acc 99.9, test_acc 82.5
step 2000, dt 4.16s, train_loss 4.045e-04, test_loss 1.098e+00, train_acc 100.0, test_acc 82.0
step 3000, dt 4.28s, train_loss 1.311e-04, test_loss 1.216e+00, train_acc 100.0, test_acc 81.6
step 4000, dt 4.36s, train_loss 5.840e-05, test_loss 1.306e+00, train_acc 100.0, test_acc 81.7
step 5000, dt 4.45s, train_loss 3.860e-05, test_loss 1.390e+00, train_acc 100.0, test_acc 81.5
step 6000, dt 4.51s, train_loss 2.300e-05, test_loss 1.470e+00, train_acc 100.0, test_acc 81.0
step 7000, dt 4.53s, train_loss 9.538e-06, test_loss 1.548e+00, train_acc 100.0, test_acc 81.1


 49%|████▉     | 49/100 [28:18<29:48, 35.07s/it]

step 8000, dt 4.67s, train_loss 8.567e-06, test_loss 1.625e+00, train_acc 100.0, test_acc 81.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 7.218e-02, test_loss 8.326e-01, train_acc 96.9, test_acc 83.8
step 2000, dt 4.22s, train_loss 1.876e-04, test_loss 7.693e-01, train_acc 100.0, test_acc 87.6
step 3000, dt 4.24s, train_loss 9.071e-05, test_loss 8.448e-01, train_acc 100.0, test_acc 87.3
step 4000, dt 4.20s, train_loss 6.096e-05, test_loss 9.081e-01, train_acc 100.0, test_acc 87.0
step 5000, dt 4.25s, train_loss 3.283e-05, test_loss 9.674e-01, train_acc 100.0, test_acc 87.0
step 6000, dt 4.26s, train_loss 1.267e-05, test_loss 1.021e+00, train_acc 100.0, test_acc 86.9
step 7000, dt 4.28s, train_loss 1.259e-05, test_loss 1.070e+00, train_acc 100.0, test_acc 86.6


 50%|█████     | 50/100 [28:52<28:57, 34.74s/it]

step 8000, dt 4.32s, train_loss 4.042e-06, test_loss 1.118e+00, train_acc 100.0, test_acc 86.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.18s, train_loss 8.663e-03, test_loss 5.522e-01, train_acc 99.9, test_acc 88.1
step 2000, dt 4.24s, train_loss 2.781e-04, test_loss 7.057e-01, train_acc 100.0, test_acc 88.3
step 3000, dt 4.24s, train_loss 1.453e-04, test_loss 7.768e-01, train_acc 100.0, test_acc 88.1
step 4000, dt 4.30s, train_loss 7.187e-05, test_loss 8.369e-01, train_acc 100.0, test_acc 88.4
step 5000, dt 4.31s, train_loss 3.377e-05, test_loss 8.902e-01, train_acc 100.0, test_acc 88.0
step 6000, dt 4.37s, train_loss 1.520e-05, test_loss 9.419e-01, train_acc 100.0, test_acc 88.1
step 7000, dt 4.40s, train_loss 1.018e-05, test_loss 9.930e-01, train_acc 100.0, test_acc 88.0


 51%|█████     | 51/100 [29:26<28:17, 34.65s/it]

step 8000, dt 4.39s, train_loss 7.546e-06, test_loss 1.044e+00, train_acc 100.0, test_acc 87.7
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 4.720e-02, test_loss 8.996e-01, train_acc 98.4, test_acc 82.3
step 2000, dt 4.24s, train_loss 4.483e-04, test_loss 8.747e-01, train_acc 100.0, test_acc 86.2
step 3000, dt 4.29s, train_loss 1.517e-04, test_loss 9.600e-01, train_acc 100.0, test_acc 86.8
step 4000, dt 4.36s, train_loss 4.235e-05, test_loss 1.021e+00, train_acc 100.0, test_acc 86.4
step 5000, dt 4.41s, train_loss 2.396e-05, test_loss 1.077e+00, train_acc 100.0, test_acc 86.3
step 6000, dt 4.41s, train_loss 1.852e-05, test_loss 1.133e+00, train_acc 100.0, test_acc 85.9
step 7000, dt 4.39s, train_loss 1.032e-05, test_loss 1.187e+00, train_acc 100.0, test_acc 85.9


 52%|█████▏    | 52/100 [30:01<27:44, 34.67s/it]

step 8000, dt 4.42s, train_loss 6.478e-06, test_loss 1.240e+00, train_acc 100.0, test_acc 85.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 3.737e-02, test_loss 1.090e+00, train_acc 96.0, test_acc 78.7
step 2000, dt 4.27s, train_loss 1.396e-04, test_loss 9.775e-01, train_acc 100.0, test_acc 84.3
step 3000, dt 4.27s, train_loss 8.990e-05, test_loss 1.073e+00, train_acc 100.0, test_acc 84.5
step 4000, dt 4.37s, train_loss 1.230e-04, test_loss 1.144e+00, train_acc 100.0, test_acc 85.0
step 5000, dt 4.38s, train_loss 3.156e-05, test_loss 1.208e+00, train_acc 100.0, test_acc 84.9
step 6000, dt 4.41s, train_loss 2.397e-05, test_loss 1.268e+00, train_acc 100.0, test_acc 84.7
step 7000, dt 4.46s, train_loss 1.002e-05, test_loss 1.327e+00, train_acc 100.0, test_acc 84.5


 53%|█████▎    | 53/100 [30:36<27:11, 34.70s/it]

step 8000, dt 4.45s, train_loss 7.764e-06, test_loss 1.388e+00, train_acc 100.0, test_acc 84.4
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 1.558e-03, test_loss 7.131e-01, train_acc 100.0, test_acc 85.8
step 2000, dt 4.19s, train_loss 2.692e-04, test_loss 8.210e-01, train_acc 100.0, test_acc 86.4
step 3000, dt 4.25s, train_loss 1.193e-04, test_loss 8.944e-01, train_acc 100.0, test_acc 86.4
step 4000, dt 4.25s, train_loss 5.327e-05, test_loss 9.561e-01, train_acc 100.0, test_acc 86.6
step 5000, dt 4.28s, train_loss 2.178e-05, test_loss 1.011e+00, train_acc 100.0, test_acc 86.5
step 6000, dt 4.28s, train_loss 1.279e-05, test_loss 1.063e+00, train_acc 100.0, test_acc 86.4
step 7000, dt 4.28s, train_loss 3.864e-06, test_loss 1.115e+00, train_acc 100.0, test_acc 86.0


 54%|█████▍    | 54/100 [31:10<26:27, 34.51s/it]

step 8000, dt 4.33s, train_loss 4.733e-06, test_loss 1.167e+00, train_acc 100.0, test_acc 86.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 6.950e-03, test_loss 7.818e-01, train_acc 99.9, test_acc 82.8
step 2000, dt 4.22s, train_loss 3.303e-04, test_loss 9.385e-01, train_acc 100.0, test_acc 83.1
step 3000, dt 4.19s, train_loss 1.042e-04, test_loss 1.037e+00, train_acc 100.0, test_acc 83.1
step 4000, dt 4.27s, train_loss 7.703e-05, test_loss 1.120e+00, train_acc 100.0, test_acc 83.3
step 5000, dt 4.25s, train_loss 2.424e-05, test_loss 1.193e+00, train_acc 100.0, test_acc 83.5
step 6000, dt 4.35s, train_loss 1.778e-05, test_loss 1.263e+00, train_acc 100.0, test_acc 83.7
step 7000, dt 4.38s, train_loss 1.626e-05, test_loss 1.328e+00, train_acc 100.0, test_acc 83.8


 55%|█████▌    | 55/100 [31:44<25:48, 34.40s/it]

step 8000, dt 4.31s, train_loss 7.742e-06, test_loss 1.395e+00, train_acc 100.0, test_acc 83.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 5.378e-03, test_loss 8.294e-01, train_acc 100.0, test_acc 84.6
step 2000, dt 4.25s, train_loss 2.169e-04, test_loss 9.791e-01, train_acc 100.0, test_acc 85.7
step 3000, dt 4.27s, train_loss 1.635e-04, test_loss 1.068e+00, train_acc 100.0, test_acc 85.6
step 4000, dt 4.34s, train_loss 5.409e-05, test_loss 1.141e+00, train_acc 100.0, test_acc 86.1
step 5000, dt 4.28s, train_loss 2.329e-05, test_loss 1.210e+00, train_acc 100.0, test_acc 86.1
step 6000, dt 4.45s, train_loss 1.825e-05, test_loss 1.274e+00, train_acc 100.0, test_acc 86.0
step 7000, dt 4.49s, train_loss 1.094e-05, test_loss 1.337e+00, train_acc 100.0, test_acc 86.1


 56%|█████▌    | 56/100 [32:19<25:19, 34.53s/it]

step 8000, dt 4.59s, train_loss 5.206e-06, test_loss 1.399e+00, train_acc 100.0, test_acc 86.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 3.418e-03, test_loss 6.636e-01, train_acc 100.0, test_acc 84.4
step 2000, dt 4.18s, train_loss 9.722e-04, test_loss 8.617e-01, train_acc 100.0, test_acc 85.0
step 3000, dt 4.22s, train_loss 3.394e-04, test_loss 9.682e-01, train_acc 100.0, test_acc 85.0
step 4000, dt 4.23s, train_loss 7.197e-05, test_loss 1.057e+00, train_acc 100.0, test_acc 85.2
step 5000, dt 4.25s, train_loss 5.637e-05, test_loss 1.134e+00, train_acc 100.0, test_acc 84.8
step 6000, dt 4.30s, train_loss 2.541e-05, test_loss 1.204e+00, train_acc 100.0, test_acc 84.9
step 7000, dt 4.43s, train_loss 1.127e-05, test_loss 1.271e+00, train_acc 100.0, test_acc 84.7


 57%|█████▋    | 57/100 [32:53<24:41, 34.46s/it]

step 8000, dt 4.46s, train_loss 4.030e-06, test_loss 1.335e+00, train_acc 100.0, test_acc 84.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.12s, train_loss 1.311e-02, test_loss 6.265e-01, train_acc 99.5, test_acc 86.4
step 2000, dt 4.36s, train_loss 1.410e-04, test_loss 6.643e-01, train_acc 100.0, test_acc 87.5
step 3000, dt 4.41s, train_loss 6.354e-05, test_loss 7.124e-01, train_acc 100.0, test_acc 87.0
step 4000, dt 4.56s, train_loss 5.103e-05, test_loss 7.542e-01, train_acc 100.0, test_acc 87.5
step 5000, dt 4.59s, train_loss 9.217e-06, test_loss 7.951e-01, train_acc 100.0, test_acc 87.5
step 6000, dt 4.54s, train_loss 1.329e-05, test_loss 8.322e-01, train_acc 100.0, test_acc 87.6
step 7000, dt 4.56s, train_loss 4.653e-06, test_loss 8.712e-01, train_acc 100.0, test_acc 87.7


 58%|█████▊    | 58/100 [33:29<24:23, 34.86s/it]

step 8000, dt 4.63s, train_loss 5.863e-06, test_loss 9.091e-01, train_acc 100.0, test_acc 87.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.24s, train_loss 2.149e-01, test_loss 9.148e-01, train_acc 95.4, test_acc 79.8
step 2000, dt 4.23s, train_loss 6.382e-04, test_loss 1.103e+00, train_acc 100.0, test_acc 84.3
step 3000, dt 4.38s, train_loss 2.011e-04, test_loss 1.223e+00, train_acc 100.0, test_acc 84.0
step 4000, dt 4.31s, train_loss 1.256e-04, test_loss 1.316e+00, train_acc 100.0, test_acc 83.9
step 5000, dt 4.38s, train_loss 5.035e-05, test_loss 1.404e+00, train_acc 100.0, test_acc 83.7
step 6000, dt 4.45s, train_loss 2.370e-05, test_loss 1.487e+00, train_acc 100.0, test_acc 83.8
step 7000, dt 4.50s, train_loss 1.422e-05, test_loss 1.566e+00, train_acc 100.0, test_acc 84.0


 59%|█████▉    | 59/100 [34:04<23:52, 34.93s/it]

step 8000, dt 4.60s, train_loss 7.816e-06, test_loss 1.636e+00, train_acc 100.0, test_acc 83.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.23s, train_loss 2.166e-02, test_loss 9.677e-01, train_acc 97.1, test_acc 82.6
step 2000, dt 4.25s, train_loss 2.132e-04, test_loss 8.160e-01, train_acc 100.0, test_acc 85.9
step 3000, dt 4.34s, train_loss 1.266e-04, test_loss 8.912e-01, train_acc 100.0, test_acc 86.1
step 4000, dt 4.29s, train_loss 5.143e-05, test_loss 9.476e-01, train_acc 100.0, test_acc 86.1
step 5000, dt 4.25s, train_loss 1.995e-05, test_loss 9.971e-01, train_acc 100.0, test_acc 86.2
step 6000, dt 4.31s, train_loss 2.359e-05, test_loss 1.048e+00, train_acc 100.0, test_acc 85.8
step 7000, dt 4.28s, train_loss 6.393e-06, test_loss 1.099e+00, train_acc 100.0, test_acc 85.8


 60%|██████    | 60/100 [34:38<23:08, 34.72s/it]

step 8000, dt 4.27s, train_loss 5.944e-06, test_loss 1.150e+00, train_acc 100.0, test_acc 86.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 2.310e-02, test_loss 7.669e-01, train_acc 99.8, test_acc 83.3
step 2000, dt 4.24s, train_loss 3.594e-04, test_loss 9.228e-01, train_acc 100.0, test_acc 84.3
step 3000, dt 4.24s, train_loss 1.505e-04, test_loss 1.011e+00, train_acc 100.0, test_acc 83.9
step 4000, dt 4.28s, train_loss 8.536e-05, test_loss 1.083e+00, train_acc 100.0, test_acc 83.5
step 5000, dt 4.30s, train_loss 3.027e-05, test_loss 1.153e+00, train_acc 100.0, test_acc 83.2
step 6000, dt 4.45s, train_loss 1.705e-05, test_loss 1.221e+00, train_acc 100.0, test_acc 83.1
step 7000, dt 4.39s, train_loss 7.509e-06, test_loss 1.283e+00, train_acc 100.0, test_acc 83.2


 61%|██████    | 61/100 [35:13<22:31, 34.67s/it]

step 8000, dt 4.43s, train_loss 5.702e-06, test_loss 1.346e+00, train_acc 100.0, test_acc 83.4
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.13s, train_loss 6.959e-03, test_loss 1.098e+00, train_acc 98.7, test_acc 82.3
step 2000, dt 4.29s, train_loss 2.004e-01, test_loss 1.079e+00, train_acc 98.9, test_acc 82.9
step 3000, dt 4.29s, train_loss 4.063e-05, test_loss 1.180e+00, train_acc 100.0, test_acc 86.0
step 4000, dt 4.45s, train_loss 1.673e-05, test_loss 1.221e+00, train_acc 100.0, test_acc 86.2
step 5000, dt 4.43s, train_loss 9.609e-06, test_loss 1.254e+00, train_acc 100.0, test_acc 86.0
step 6000, dt 4.39s, train_loss 1.369e-05, test_loss 1.288e+00, train_acc 100.0, test_acc 86.0
step 7000, dt 4.55s, train_loss 4.487e-06, test_loss 1.324e+00, train_acc 100.0, test_acc 85.6


 62%|██████▏   | 62/100 [35:48<22:02, 34.81s/it]

step 8000, dt 4.59s, train_loss 4.660e-06, test_loss 1.361e+00, train_acc 100.0, test_acc 85.2
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.23s, train_loss 6.287e-02, test_loss 9.508e-01, train_acc 98.9, test_acc 79.8
step 2000, dt 4.27s, train_loss 3.450e-04, test_loss 1.186e+00, train_acc 100.0, test_acc 80.7
step 3000, dt 4.30s, train_loss 2.786e-04, test_loss 1.311e+00, train_acc 100.0, test_acc 80.5
step 4000, dt 4.34s, train_loss 1.208e-04, test_loss 1.414e+00, train_acc 100.0, test_acc 80.5
step 5000, dt 4.34s, train_loss 6.398e-05, test_loss 1.504e+00, train_acc 100.0, test_acc 80.4
step 6000, dt 4.37s, train_loss 3.566e-05, test_loss 1.586e+00, train_acc 100.0, test_acc 80.4
step 7000, dt 4.39s, train_loss 1.049e-05, test_loss 1.660e+00, train_acc 100.0, test_acc 80.3


 63%|██████▎   | 63/100 [36:23<21:26, 34.78s/it]

step 8000, dt 4.47s, train_loss 7.405e-06, test_loss 1.741e+00, train_acc 100.0, test_acc 80.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.12s, train_loss 6.627e-03, test_loss 8.290e-01, train_acc 99.7, test_acc 84.2
step 2000, dt 4.27s, train_loss 1.898e-04, test_loss 9.162e-01, train_acc 100.0, test_acc 86.6
step 3000, dt 4.24s, train_loss 1.672e-04, test_loss 9.969e-01, train_acc 100.0, test_acc 86.2
step 4000, dt 4.22s, train_loss 3.690e-05, test_loss 1.068e+00, train_acc 100.0, test_acc 86.4
step 5000, dt 4.29s, train_loss 2.875e-05, test_loss 1.135e+00, train_acc 100.0, test_acc 86.4
step 6000, dt 4.26s, train_loss 1.274e-05, test_loss 1.199e+00, train_acc 100.0, test_acc 86.2
step 7000, dt 4.28s, train_loss 7.975e-06, test_loss 1.261e+00, train_acc 100.0, test_acc 86.1


 64%|██████▍   | 64/100 [36:57<20:42, 34.51s/it]

step 8000, dt 4.23s, train_loss 6.679e-06, test_loss 1.322e+00, train_acc 100.0, test_acc 86.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 3.155e-02, test_loss 8.955e-01, train_acc 99.3, test_acc 83.0
step 2000, dt 4.23s, train_loss 3.050e-04, test_loss 9.788e-01, train_acc 100.0, test_acc 86.3
step 3000, dt 4.28s, train_loss 1.455e-04, test_loss 1.053e+00, train_acc 100.0, test_acc 86.2
step 4000, dt 4.33s, train_loss 3.994e-05, test_loss 1.108e+00, train_acc 100.0, test_acc 86.7
step 5000, dt 4.27s, train_loss 3.594e-05, test_loss 1.157e+00, train_acc 100.0, test_acc 86.6
step 6000, dt 4.28s, train_loss 1.913e-05, test_loss 1.206e+00, train_acc 100.0, test_acc 86.1
step 7000, dt 4.35s, train_loss 9.426e-06, test_loss 1.257e+00, train_acc 100.0, test_acc 86.2


 65%|██████▌   | 65/100 [37:31<20:06, 34.47s/it]

step 8000, dt 4.42s, train_loss 6.519e-06, test_loss 1.308e+00, train_acc 100.0, test_acc 86.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.23s, train_loss 8.886e-03, test_loss 9.079e-01, train_acc 99.9, test_acc 83.3
step 2000, dt 4.21s, train_loss 3.534e-04, test_loss 1.062e+00, train_acc 100.0, test_acc 83.5
step 3000, dt 4.27s, train_loss 1.386e-04, test_loss 1.163e+00, train_acc 100.0, test_acc 83.1
step 4000, dt 4.35s, train_loss 6.190e-05, test_loss 1.249e+00, train_acc 100.0, test_acc 83.3
step 5000, dt 4.38s, train_loss 1.886e-05, test_loss 1.323e+00, train_acc 100.0, test_acc 83.3
step 6000, dt 4.40s, train_loss 2.328e-05, test_loss 1.396e+00, train_acc 100.0, test_acc 83.8
step 7000, dt 4.39s, train_loss 1.051e-05, test_loss 1.466e+00, train_acc 100.0, test_acc 83.7


 66%|██████▌   | 66/100 [38:06<19:34, 34.54s/it]

step 8000, dt 4.46s, train_loss 5.855e-06, test_loss 1.532e+00, train_acc 100.0, test_acc 83.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.14s, train_loss 4.532e-02, test_loss 1.225e+00, train_acc 95.8, test_acc 78.1
step 2000, dt 4.26s, train_loss 4.330e-04, test_loss 1.243e+00, train_acc 100.0, test_acc 83.4
step 3000, dt 4.36s, train_loss 1.820e-04, test_loss 1.389e+00, train_acc 100.0, test_acc 83.1
step 4000, dt 4.38s, train_loss 1.032e-04, test_loss 1.486e+00, train_acc 100.0, test_acc 83.3
step 5000, dt 4.42s, train_loss 4.100e-05, test_loss 1.573e+00, train_acc 100.0, test_acc 83.3
step 6000, dt 4.44s, train_loss 1.809e-05, test_loss 1.656e+00, train_acc 100.0, test_acc 83.2
step 7000, dt 4.40s, train_loss 9.652e-06, test_loss 1.738e+00, train_acc 100.0, test_acc 83.2


 67%|██████▋   | 67/100 [38:41<19:03, 34.65s/it]

step 8000, dt 4.49s, train_loss 6.507e-06, test_loss 1.820e+00, train_acc 100.0, test_acc 83.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 6.121e-02, test_loss 8.683e-01, train_acc 97.6, test_acc 81.6
step 2000, dt 4.20s, train_loss 3.477e-04, test_loss 9.908e-01, train_acc 100.0, test_acc 84.5
step 3000, dt 4.31s, train_loss 1.369e-04, test_loss 1.091e+00, train_acc 100.0, test_acc 85.0
step 4000, dt 4.23s, train_loss 9.882e-05, test_loss 1.171e+00, train_acc 100.0, test_acc 85.0
step 5000, dt 4.31s, train_loss 4.418e-05, test_loss 1.243e+00, train_acc 100.0, test_acc 85.0
step 6000, dt 4.34s, train_loss 1.963e-05, test_loss 1.315e+00, train_acc 100.0, test_acc 84.6
step 7000, dt 4.32s, train_loss 1.610e-05, test_loss 1.386e+00, train_acc 100.0, test_acc 84.4


 68%|██████▊   | 68/100 [39:15<18:25, 34.55s/it]

step 8000, dt 4.38s, train_loss 7.601e-06, test_loss 1.453e+00, train_acc 100.0, test_acc 84.4
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 9.271e-03, test_loss 9.727e-01, train_acc 99.4, test_acc 81.8
step 2000, dt 4.14s, train_loss 3.081e-04, test_loss 1.050e+00, train_acc 100.0, test_acc 84.0
step 3000, dt 4.19s, train_loss 6.039e-05, test_loss 1.151e+00, train_acc 100.0, test_acc 84.2
step 4000, dt 4.20s, train_loss 5.550e-05, test_loss 1.231e+00, train_acc 100.0, test_acc 84.4
step 5000, dt 4.25s, train_loss 3.549e-05, test_loss 1.302e+00, train_acc 100.0, test_acc 84.3
step 6000, dt 4.23s, train_loss 1.630e-05, test_loss 1.372e+00, train_acc 100.0, test_acc 84.2
step 7000, dt 4.28s, train_loss 8.717e-06, test_loss 1.441e+00, train_acc 100.0, test_acc 84.3


 69%|██████▉   | 69/100 [39:49<17:43, 34.30s/it]

step 8000, dt 4.28s, train_loss 7.260e-06, test_loss 1.509e+00, train_acc 100.0, test_acc 84.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 4.373e-02, test_loss 8.986e-01, train_acc 99.7, test_acc 81.8
step 2000, dt 4.20s, train_loss 6.922e-04, test_loss 8.672e-01, train_acc 99.8, test_acc 84.5
step 3000, dt 4.45s, train_loss 9.627e-05, test_loss 9.061e-01, train_acc 100.0, test_acc 85.4
step 4000, dt 4.44s, train_loss 1.764e-05, test_loss 9.562e-01, train_acc 100.0, test_acc 85.2
step 5000, dt 4.45s, train_loss 1.958e-05, test_loss 9.997e-01, train_acc 100.0, test_acc 85.6
step 6000, dt 4.51s, train_loss 8.025e-06, test_loss 1.039e+00, train_acc 100.0, test_acc 85.2
step 7000, dt 4.48s, train_loss 1.058e-05, test_loss 1.077e+00, train_acc 100.0, test_acc 85.5


 70%|███████   | 70/100 [40:24<17:17, 34.58s/it]

step 8000, dt 4.51s, train_loss 3.043e-06, test_loss 1.117e+00, train_acc 100.0, test_acc 85.4
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 2.376e-02, test_loss 8.833e-01, train_acc 99.9, test_acc 81.5
step 2000, dt 4.39s, train_loss 5.387e-04, test_loss 1.178e+00, train_acc 100.0, test_acc 81.0
step 3000, dt 4.35s, train_loss 1.256e-04, test_loss 1.313e+00, train_acc 100.0, test_acc 80.9
step 4000, dt 4.28s, train_loss 1.201e-04, test_loss 1.426e+00, train_acc 100.0, test_acc 81.1
step 5000, dt 4.33s, train_loss 2.927e-05, test_loss 1.526e+00, train_acc 100.0, test_acc 80.8
step 6000, dt 4.38s, train_loss 2.104e-05, test_loss 1.615e+00, train_acc 100.0, test_acc 80.7
step 7000, dt 4.50s, train_loss 1.978e-05, test_loss 1.698e+00, train_acc 100.0, test_acc 80.1


 71%|███████   | 71/100 [40:59<16:46, 34.70s/it]

step 8000, dt 4.57s, train_loss 1.032e-05, test_loss 1.780e+00, train_acc 100.0, test_acc 80.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 3.575e-02, test_loss 8.814e-01, train_acc 99.3, test_acc 81.6
step 2000, dt 4.27s, train_loss 2.777e-04, test_loss 1.124e+00, train_acc 100.0, test_acc 82.6
step 3000, dt 4.25s, train_loss 1.571e-04, test_loss 1.234e+00, train_acc 100.0, test_acc 82.9
step 4000, dt 4.40s, train_loss 4.546e-05, test_loss 1.312e+00, train_acc 100.0, test_acc 83.0
step 5000, dt 4.43s, train_loss 3.744e-05, test_loss 1.388e+00, train_acc 100.0, test_acc 82.7
step 6000, dt 4.47s, train_loss 1.601e-05, test_loss 1.463e+00, train_acc 100.0, test_acc 82.6
step 7000, dt 4.54s, train_loss 8.583e-06, test_loss 1.534e+00, train_acc 100.0, test_acc 82.9


 72%|███████▏  | 72/100 [41:34<16:16, 34.86s/it]

step 8000, dt 4.71s, train_loss 6.496e-06, test_loss 1.605e+00, train_acc 100.0, test_acc 83.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.12s, train_loss 4.937e-02, test_loss 8.210e-01, train_acc 96.5, test_acc 81.3
step 2000, dt 4.17s, train_loss 4.082e-04, test_loss 9.571e-01, train_acc 100.0, test_acc 85.3
step 3000, dt 4.26s, train_loss 1.983e-04, test_loss 1.045e+00, train_acc 100.0, test_acc 84.2
step 4000, dt 4.32s, train_loss 8.549e-05, test_loss 1.120e+00, train_acc 100.0, test_acc 84.7
step 5000, dt 4.40s, train_loss 4.767e-05, test_loss 1.192e+00, train_acc 100.0, test_acc 84.7
step 6000, dt 4.41s, train_loss 1.903e-05, test_loss 1.264e+00, train_acc 100.0, test_acc 84.6
step 7000, dt 4.44s, train_loss 1.045e-05, test_loss 1.333e+00, train_acc 100.0, test_acc 84.7


 73%|███████▎  | 73/100 [42:09<15:39, 34.80s/it]

step 8000, dt 4.53s, train_loss 6.540e-06, test_loss 1.397e+00, train_acc 100.0, test_acc 85.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.23s, train_loss 2.332e-02, test_loss 8.378e-01, train_acc 99.8, test_acc 82.6
step 2000, dt 4.24s, train_loss 3.229e-03, test_loss 1.009e+00, train_acc 99.4, test_acc 83.4
step 3000, dt 4.26s, train_loss 1.313e-04, test_loss 9.713e-01, train_acc 100.0, test_acc 86.5
step 4000, dt 4.32s, train_loss 6.100e-05, test_loss 1.027e+00, train_acc 100.0, test_acc 86.4
step 5000, dt 4.25s, train_loss 3.822e-05, test_loss 1.075e+00, train_acc 100.0, test_acc 86.0
step 6000, dt 4.31s, train_loss 2.697e-05, test_loss 1.120e+00, train_acc 100.0, test_acc 86.2
step 7000, dt 4.35s, train_loss 6.622e-06, test_loss 1.164e+00, train_acc 100.0, test_acc 85.9


 74%|███████▍  | 74/100 [42:43<15:00, 34.64s/it]

step 8000, dt 4.32s, train_loss 5.095e-06, test_loss 1.208e+00, train_acc 100.0, test_acc 85.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 3.525e-02, test_loss 8.935e-01, train_acc 97.3, test_acc 81.2
step 2000, dt 4.25s, train_loss 5.023e-04, test_loss 1.241e+00, train_acc 100.0, test_acc 80.7
step 3000, dt 4.27s, train_loss 1.526e-04, test_loss 1.372e+00, train_acc 100.0, test_acc 79.9
step 4000, dt 4.24s, train_loss 4.977e-05, test_loss 1.460e+00, train_acc 100.0, test_acc 79.8
step 5000, dt 4.28s, train_loss 6.004e-05, test_loss 1.540e+00, train_acc 100.0, test_acc 79.8
step 6000, dt 4.25s, train_loss 2.604e-05, test_loss 1.617e+00, train_acc 100.0, test_acc 80.2
step 7000, dt 4.32s, train_loss 1.272e-05, test_loss 1.692e+00, train_acc 100.0, test_acc 80.2


 75%|███████▌  | 75/100 [43:17<14:22, 34.51s/it]

step 8000, dt 4.36s, train_loss 6.473e-06, test_loss 1.769e+00, train_acc 100.0, test_acc 80.2
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.14s, train_loss 7.016e-03, test_loss 8.966e-01, train_acc 99.9, test_acc 81.9
step 2000, dt 4.25s, train_loss 4.148e-04, test_loss 1.144e+00, train_acc 100.0, test_acc 82.6
step 3000, dt 4.23s, train_loss 7.245e-05, test_loss 1.270e+00, train_acc 100.0, test_acc 82.7
step 4000, dt 4.23s, train_loss 8.017e-05, test_loss 1.369e+00, train_acc 100.0, test_acc 82.3
step 5000, dt 4.24s, train_loss 3.046e-05, test_loss 1.457e+00, train_acc 100.0, test_acc 82.2
step 6000, dt 4.27s, train_loss 1.895e-05, test_loss 1.544e+00, train_acc 100.0, test_acc 82.3
step 7000, dt 4.35s, train_loss 9.931e-06, test_loss 1.627e+00, train_acc 100.0, test_acc 82.2


 76%|███████▌  | 76/100 [43:51<13:45, 34.39s/it]

step 8000, dt 4.40s, train_loss 5.153e-06, test_loss 1.705e+00, train_acc 100.0, test_acc 82.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 4.085e-03, test_loss 7.239e-01, train_acc 99.7, test_acc 85.2
step 2000, dt 4.18s, train_loss 1.267e-04, test_loss 8.770e-01, train_acc 100.0, test_acc 85.5
step 3000, dt 4.31s, train_loss 1.046e-04, test_loss 9.477e-01, train_acc 100.0, test_acc 85.2
step 4000, dt 4.32s, train_loss 6.572e-05, test_loss 1.004e+00, train_acc 100.0, test_acc 84.9
step 5000, dt 4.35s, train_loss 1.711e-05, test_loss 1.056e+00, train_acc 100.0, test_acc 84.8
step 6000, dt 4.43s, train_loss 1.644e-05, test_loss 1.106e+00, train_acc 100.0, test_acc 85.0
step 7000, dt 4.47s, train_loss 1.018e-05, test_loss 1.156e+00, train_acc 100.0, test_acc 85.0


 77%|███████▋  | 77/100 [44:26<13:14, 34.52s/it]

step 8000, dt 4.56s, train_loss 4.014e-06, test_loss 1.206e+00, train_acc 100.0, test_acc 84.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.26s, train_loss 1.253e-01, test_loss 8.658e-01, train_acc 96.6, test_acc 81.7
step 2000, dt 4.29s, train_loss 4.359e-04, test_loss 8.403e-01, train_acc 100.0, test_acc 86.1
step 3000, dt 4.30s, train_loss 9.423e-05, test_loss 9.220e-01, train_acc 100.0, test_acc 86.4
step 4000, dt 4.36s, train_loss 5.463e-05, test_loss 9.871e-01, train_acc 100.0, test_acc 86.2
step 5000, dt 4.48s, train_loss 4.305e-05, test_loss 1.051e+00, train_acc 100.0, test_acc 86.0
step 6000, dt 4.41s, train_loss 1.801e-05, test_loss 1.115e+00, train_acc 100.0, test_acc 86.0
step 7000, dt 4.43s, train_loss 1.002e-05, test_loss 1.178e+00, train_acc 100.0, test_acc 85.8


 78%|███████▊  | 78/100 [45:01<12:42, 34.65s/it]

step 8000, dt 4.41s, train_loss 8.091e-06, test_loss 1.239e+00, train_acc 100.0, test_acc 85.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 1.558e-02, test_loss 6.569e-01, train_acc 98.7, test_acc 84.6
step 2000, dt 4.22s, train_loss 2.705e-04, test_loss 8.514e-01, train_acc 100.0, test_acc 85.7
step 3000, dt 4.25s, train_loss 1.126e-04, test_loss 9.429e-01, train_acc 100.0, test_acc 85.9
step 4000, dt 4.28s, train_loss 1.079e-04, test_loss 1.020e+00, train_acc 100.0, test_acc 85.7
step 5000, dt 4.36s, train_loss 3.204e-05, test_loss 1.090e+00, train_acc 100.0, test_acc 85.6
step 6000, dt 4.39s, train_loss 2.874e-05, test_loss 1.154e+00, train_acc 100.0, test_acc 85.7
step 7000, dt 4.39s, train_loss 1.519e-05, test_loss 1.213e+00, train_acc 100.0, test_acc 85.7


 79%|███████▉  | 79/100 [45:36<12:06, 34.60s/it]

step 8000, dt 4.44s, train_loss 6.778e-06, test_loss 1.273e+00, train_acc 100.0, test_acc 85.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 1.920e-02, test_loss 1.179e+00, train_acc 96.9, test_acc 78.9
step 2000, dt 4.31s, train_loss 4.729e-04, test_loss 1.042e+00, train_acc 100.0, test_acc 83.3
step 3000, dt 4.37s, train_loss 8.617e-05, test_loss 1.168e+00, train_acc 100.0, test_acc 83.5
step 4000, dt 4.39s, train_loss 1.550e-04, test_loss 1.267e+00, train_acc 100.0, test_acc 83.3
step 5000, dt 4.49s, train_loss 4.011e-05, test_loss 1.354e+00, train_acc 100.0, test_acc 83.5
step 6000, dt 4.57s, train_loss 2.130e-05, test_loss 1.443e+00, train_acc 100.0, test_acc 83.5
step 7000, dt 4.60s, train_loss 1.455e-05, test_loss 1.535e+00, train_acc 100.0, test_acc 83.3


 80%|████████  | 80/100 [46:11<11:38, 34.91s/it]

step 8000, dt 4.68s, train_loss 7.508e-06, test_loss 1.621e+00, train_acc 100.0, test_acc 83.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 3.937e-02, test_loss 8.471e-01, train_acc 98.4, test_acc 83.1
step 2000, dt 4.24s, train_loss 2.095e-04, test_loss 8.147e-01, train_acc 100.0, test_acc 86.6
step 3000, dt 4.33s, train_loss 1.151e-04, test_loss 8.681e-01, train_acc 100.0, test_acc 86.5
step 4000, dt 4.37s, train_loss 3.808e-05, test_loss 9.126e-01, train_acc 100.0, test_acc 86.7
step 5000, dt 4.47s, train_loss 2.658e-05, test_loss 9.534e-01, train_acc 100.0, test_acc 86.5
step 6000, dt 4.41s, train_loss 9.507e-06, test_loss 9.933e-01, train_acc 100.0, test_acc 86.5
step 7000, dt 4.45s, train_loss 6.662e-06, test_loss 1.032e+00, train_acc 100.0, test_acc 86.6


 81%|████████  | 81/100 [46:46<11:03, 34.94s/it]

step 8000, dt 4.53s, train_loss 6.442e-06, test_loss 1.072e+00, train_acc 100.0, test_acc 86.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.18s, train_loss 6.123e-02, test_loss 9.300e-01, train_acc 97.7, test_acc 83.3
step 2000, dt 4.21s, train_loss 1.978e-03, test_loss 1.003e+00, train_acc 99.7, test_acc 85.4
step 3000, dt 4.39s, train_loss 4.835e-05, test_loss 9.628e-01, train_acc 100.0, test_acc 87.4
step 4000, dt 4.51s, train_loss 2.799e-05, test_loss 9.924e-01, train_acc 100.0, test_acc 87.3
step 5000, dt 4.54s, train_loss 1.461e-05, test_loss 1.016e+00, train_acc 100.0, test_acc 87.1
step 6000, dt 4.51s, train_loss 2.763e-06, test_loss 1.040e+00, train_acc 100.0, test_acc 87.0
step 7000, dt 4.54s, train_loss 5.044e-06, test_loss 1.066e+00, train_acc 100.0, test_acc 86.9


 82%|████████▏ | 82/100 [47:22<10:32, 35.11s/it]

step 8000, dt 4.63s, train_loss 1.450e-06, test_loss 1.091e+00, train_acc 100.0, test_acc 87.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 4.960e-02, test_loss 8.110e-01, train_acc 97.1, test_acc 80.8
step 2000, dt 4.27s, train_loss 2.252e-02, test_loss 1.029e+00, train_acc 99.3, test_acc 83.3
step 3000, dt 4.56s, train_loss 3.181e-04, test_loss 1.051e+00, train_acc 100.0, test_acc 86.1
step 4000, dt 4.71s, train_loss 8.351e-05, test_loss 1.123e+00, train_acc 100.0, test_acc 85.9
step 5000, dt 4.78s, train_loss 5.470e-05, test_loss 1.175e+00, train_acc 100.0, test_acc 85.8
step 6000, dt 4.75s, train_loss 3.704e-05, test_loss 1.222e+00, train_acc 100.0, test_acc 85.7
step 7000, dt 4.79s, train_loss 1.899e-05, test_loss 1.270e+00, train_acc 100.0, test_acc 85.9


 83%|████████▎ | 83/100 [47:59<10:06, 35.68s/it]

step 8000, dt 4.92s, train_loss 1.196e-05, test_loss 1.318e+00, train_acc 100.0, test_acc 85.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 3.506e-02, test_loss 6.431e-01, train_acc 99.7, test_acc 85.1
step 2000, dt 4.22s, train_loss 5.151e-04, test_loss 7.787e-01, train_acc 100.0, test_acc 85.7
step 3000, dt 4.34s, train_loss 2.272e-04, test_loss 8.477e-01, train_acc 100.0, test_acc 85.6
step 4000, dt 4.32s, train_loss 5.539e-05, test_loss 9.071e-01, train_acc 100.0, test_acc 85.8
step 5000, dt 4.37s, train_loss 4.851e-05, test_loss 9.599e-01, train_acc 100.0, test_acc 85.6
step 6000, dt 4.37s, train_loss 2.365e-05, test_loss 1.008e+00, train_acc 100.0, test_acc 85.7
step 7000, dt 4.41s, train_loss 1.835e-05, test_loss 1.055e+00, train_acc 100.0, test_acc 86.1


 84%|████████▍ | 84/100 [48:33<09:26, 35.40s/it]

step 8000, dt 4.51s, train_loss 9.311e-06, test_loss 1.100e+00, train_acc 100.0, test_acc 86.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 2.673e-02, test_loss 1.088e+00, train_acc 96.9, test_acc 77.6
step 2000, dt 4.25s, train_loss 5.992e-04, test_loss 1.194e+00, train_acc 100.0, test_acc 82.3
step 3000, dt 4.29s, train_loss 2.097e-04, test_loss 1.314e+00, train_acc 100.0, test_acc 81.7
step 4000, dt 4.30s, train_loss 9.490e-05, test_loss 1.405e+00, train_acc 100.0, test_acc 82.0
step 5000, dt 4.42s, train_loss 5.444e-05, test_loss 1.497e+00, train_acc 100.0, test_acc 81.9
step 6000, dt 4.42s, train_loss 3.337e-05, test_loss 1.587e+00, train_acc 100.0, test_acc 81.8
step 7000, dt 4.50s, train_loss 1.206e-05, test_loss 1.674e+00, train_acc 100.0, test_acc 82.0


 85%|████████▌ | 85/100 [49:08<08:48, 35.21s/it]

step 8000, dt 4.43s, train_loss 1.038e-05, test_loss 1.759e+00, train_acc 100.0, test_acc 81.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 9.541e-03, test_loss 8.247e-01, train_acc 99.8, test_acc 83.8
step 2000, dt 4.19s, train_loss 4.372e-04, test_loss 9.972e-01, train_acc 100.0, test_acc 84.8
step 3000, dt 4.25s, train_loss 1.616e-04, test_loss 1.104e+00, train_acc 100.0, test_acc 85.2
step 4000, dt 4.21s, train_loss 4.557e-05, test_loss 1.196e+00, train_acc 100.0, test_acc 85.3
step 5000, dt 4.20s, train_loss 4.054e-05, test_loss 1.277e+00, train_acc 100.0, test_acc 85.1
step 6000, dt 4.23s, train_loss 2.615e-05, test_loss 1.353e+00, train_acc 100.0, test_acc 84.7
step 7000, dt 4.28s, train_loss 1.209e-05, test_loss 1.431e+00, train_acc 100.0, test_acc 84.7


 86%|████████▌ | 86/100 [49:42<08:07, 34.81s/it]

step 8000, dt 4.33s, train_loss 7.419e-06, test_loss 1.507e+00, train_acc 100.0, test_acc 84.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 2.624e-02, test_loss 1.064e+00, train_acc 98.4, test_acc 79.7
step 2000, dt 4.19s, train_loss 4.305e-04, test_loss 1.300e+00, train_acc 100.0, test_acc 79.9
step 3000, dt 4.29s, train_loss 1.405e-04, test_loss 1.426e+00, train_acc 100.0, test_acc 80.0
step 4000, dt 4.31s, train_loss 5.776e-05, test_loss 1.527e+00, train_acc 100.0, test_acc 80.1
step 5000, dt 4.40s, train_loss 4.095e-05, test_loss 1.624e+00, train_acc 100.0, test_acc 80.5
step 6000, dt 4.38s, train_loss 2.163e-05, test_loss 1.708e+00, train_acc 100.0, test_acc 80.5
step 7000, dt 4.50s, train_loss 1.083e-05, test_loss 1.789e+00, train_acc 100.0, test_acc 80.5


 87%|████████▋ | 87/100 [50:17<07:32, 34.80s/it]

step 8000, dt 4.53s, train_loss 5.327e-06, test_loss 1.869e+00, train_acc 100.0, test_acc 80.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.23s, train_loss 6.307e-02, test_loss 7.450e-01, train_acc 99.0, test_acc 85.1
step 2000, dt 4.29s, train_loss 2.547e-04, test_loss 6.781e-01, train_acc 100.0, test_acc 87.8
step 3000, dt 4.40s, train_loss 1.116e-04, test_loss 7.477e-01, train_acc 100.0, test_acc 88.1
step 4000, dt 4.52s, train_loss 6.815e-05, test_loss 8.011e-01, train_acc 100.0, test_acc 87.9
step 5000, dt 4.53s, train_loss 2.650e-05, test_loss 8.498e-01, train_acc 100.0, test_acc 88.3
step 6000, dt 4.68s, train_loss 1.552e-05, test_loss 8.948e-01, train_acc 100.0, test_acc 88.1
step 7000, dt 4.74s, train_loss 7.136e-06, test_loss 9.378e-01, train_acc 100.0, test_acc 87.7


 88%|████████▊ | 88/100 [50:53<07:02, 35.22s/it]

step 8000, dt 4.81s, train_loss 6.090e-06, test_loss 9.786e-01, train_acc 100.0, test_acc 87.8
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 9.082e-03, test_loss 9.090e-01, train_acc 99.9, test_acc 82.8
step 2000, dt 4.20s, train_loss 1.611e-04, test_loss 9.973e-01, train_acc 100.0, test_acc 83.6
step 3000, dt 4.25s, train_loss 9.503e-05, test_loss 1.081e+00, train_acc 100.0, test_acc 83.2
step 4000, dt 4.21s, train_loss 5.621e-05, test_loss 1.151e+00, train_acc 100.0, test_acc 82.8
step 5000, dt 4.24s, train_loss 2.955e-05, test_loss 1.214e+00, train_acc 100.0, test_acc 83.2
step 6000, dt 4.29s, train_loss 1.780e-05, test_loss 1.276e+00, train_acc 100.0, test_acc 83.3
step 7000, dt 4.30s, train_loss 8.107e-06, test_loss 1.338e+00, train_acc 100.0, test_acc 83.6


 89%|████████▉ | 89/100 [51:27<06:23, 34.86s/it]

step 8000, dt 4.33s, train_loss 3.531e-06, test_loss 1.395e+00, train_acc 100.0, test_acc 83.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.18s, train_loss 3.748e-02, test_loss 8.521e-01, train_acc 97.9, test_acc 81.3
step 2000, dt 4.29s, train_loss 8.863e-04, test_loss 1.010e+00, train_acc 100.0, test_acc 83.1
step 3000, dt 4.38s, train_loss 3.409e-04, test_loss 1.139e+00, train_acc 100.0, test_acc 83.5
step 4000, dt 4.38s, train_loss 1.250e-04, test_loss 1.234e+00, train_acc 100.0, test_acc 83.6
step 5000, dt 4.44s, train_loss 6.796e-05, test_loss 1.319e+00, train_acc 100.0, test_acc 83.7
step 6000, dt 4.56s, train_loss 3.587e-05, test_loss 1.398e+00, train_acc 100.0, test_acc 83.7
step 7000, dt 4.55s, train_loss 2.744e-05, test_loss 1.475e+00, train_acc 100.0, test_acc 83.5


 90%|█████████ | 90/100 [52:03<05:50, 35.05s/it]

step 8000, dt 4.70s, train_loss 1.056e-05, test_loss 1.551e+00, train_acc 100.0, test_acc 83.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 6.382e-02, test_loss 1.105e+00, train_acc 98.0, test_acc 79.7
step 2000, dt 4.23s, train_loss 8.643e-04, test_loss 1.052e+00, train_acc 100.0, test_acc 85.2
step 3000, dt 4.39s, train_loss 1.830e-04, test_loss 1.164e+00, train_acc 100.0, test_acc 85.8
step 4000, dt 4.37s, train_loss 6.322e-05, test_loss 1.249e+00, train_acc 100.0, test_acc 85.8
step 5000, dt 4.43s, train_loss 3.126e-05, test_loss 1.327e+00, train_acc 100.0, test_acc 85.7
step 6000, dt 4.43s, train_loss 1.987e-05, test_loss 1.406e+00, train_acc 100.0, test_acc 85.4
step 7000, dt 4.53s, train_loss 1.688e-05, test_loss 1.479e+00, train_acc 100.0, test_acc 85.4


 91%|█████████ | 91/100 [52:38<05:15, 35.06s/it]

step 8000, dt 4.50s, train_loss 1.009e-05, test_loss 1.548e+00, train_acc 100.0, test_acc 85.3
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 4.656e-03, test_loss 7.407e-01, train_acc 99.9, test_acc 82.8
step 2000, dt 4.18s, train_loss 4.937e-04, test_loss 9.594e-01, train_acc 100.0, test_acc 83.4
step 3000, dt 4.20s, train_loss 1.421e-04, test_loss 1.058e+00, train_acc 100.0, test_acc 83.4
step 4000, dt 4.28s, train_loss 6.938e-05, test_loss 1.144e+00, train_acc 100.0, test_acc 83.7
step 5000, dt 4.27s, train_loss 2.206e-05, test_loss 1.222e+00, train_acc 100.0, test_acc 83.5
step 6000, dt 4.29s, train_loss 2.179e-05, test_loss 1.295e+00, train_acc 100.0, test_acc 83.8
step 7000, dt 4.29s, train_loss 1.582e-05, test_loss 1.365e+00, train_acc 100.0, test_acc 83.7


 92%|█████████▏| 92/100 [53:12<04:37, 34.73s/it]

step 8000, dt 4.28s, train_loss 6.556e-06, test_loss 1.432e+00, train_acc 100.0, test_acc 83.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.17s, train_loss 3.794e-02, test_loss 1.164e+00, train_acc 96.2, test_acc 79.3
step 2000, dt 4.20s, train_loss 3.574e-04, test_loss 1.166e+00, train_acc 100.0, test_acc 83.8
step 3000, dt 4.32s, train_loss 1.078e-04, test_loss 1.305e+00, train_acc 100.0, test_acc 84.0
step 4000, dt 4.33s, train_loss 4.829e-05, test_loss 1.412e+00, train_acc 100.0, test_acc 83.9
step 5000, dt 4.47s, train_loss 2.813e-05, test_loss 1.513e+00, train_acc 100.0, test_acc 83.5
step 6000, dt 4.41s, train_loss 1.516e-05, test_loss 1.608e+00, train_acc 100.0, test_acc 82.9
step 7000, dt 4.48s, train_loss 1.144e-05, test_loss 1.704e+00, train_acc 100.0, test_acc 82.7


 93%|█████████▎| 93/100 [53:47<04:03, 34.79s/it]

step 8000, dt 4.55s, train_loss 5.327e-06, test_loss 1.797e+00, train_acc 100.0, test_acc 82.5
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 4.694e-02, test_loss 1.035e+00, train_acc 98.4, test_acc 78.9
step 2000, dt 4.33s, train_loss 3.409e-04, test_loss 1.057e+00, train_acc 100.0, test_acc 82.0
step 3000, dt 4.28s, train_loss 1.662e-04, test_loss 1.153e+00, train_acc 100.0, test_acc 81.9
step 4000, dt 4.37s, train_loss 1.266e-04, test_loss 1.238e+00, train_acc 100.0, test_acc 81.7
step 5000, dt 4.40s, train_loss 3.984e-05, test_loss 1.319e+00, train_acc 100.0, test_acc 81.8
step 6000, dt 4.48s, train_loss 3.210e-05, test_loss 1.396e+00, train_acc 100.0, test_acc 81.4
step 7000, dt 4.47s, train_loss 1.164e-05, test_loss 1.469e+00, train_acc 100.0, test_acc 81.2


 94%|█████████▍| 94/100 [54:22<03:29, 34.88s/it]

step 8000, dt 4.54s, train_loss 1.033e-05, test_loss 1.542e+00, train_acc 100.0, test_acc 80.9
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.20s, train_loss 1.978e-02, test_loss 1.263e+00, train_acc 96.8, test_acc 76.0
step 2000, dt 4.19s, train_loss 1.863e-02, test_loss 1.306e+00, train_acc 98.2, test_acc 79.1
step 3000, dt 4.35s, train_loss 1.235e-04, test_loss 1.133e+00, train_acc 100.0, test_acc 85.6
step 4000, dt 4.39s, train_loss 3.240e-05, test_loss 1.202e+00, train_acc 100.0, test_acc 85.7
step 5000, dt 4.36s, train_loss 4.831e-05, test_loss 1.262e+00, train_acc 100.0, test_acc 85.5
step 6000, dt 4.41s, train_loss 1.451e-05, test_loss 1.315e+00, train_acc 100.0, test_acc 85.4
step 7000, dt 4.43s, train_loss 8.782e-06, test_loss 1.365e+00, train_acc 100.0, test_acc 85.3


 95%|█████████▌| 95/100 [54:56<02:54, 34.87s/it]

step 8000, dt 4.52s, train_loss 3.838e-06, test_loss 1.414e+00, train_acc 100.0, test_acc 85.1
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.21s, train_loss 1.520e-02, test_loss 9.093e-01, train_acc 99.6, test_acc 82.1
step 2000, dt 4.22s, train_loss 3.480e-04, test_loss 1.054e+00, train_acc 100.0, test_acc 83.8
step 3000, dt 4.32s, train_loss 8.855e-05, test_loss 1.139e+00, train_acc 100.0, test_acc 84.2
step 4000, dt 4.39s, train_loss 7.379e-05, test_loss 1.212e+00, train_acc 100.0, test_acc 84.6
step 5000, dt 4.44s, train_loss 2.321e-05, test_loss 1.275e+00, train_acc 100.0, test_acc 85.1
step 6000, dt 4.45s, train_loss 1.840e-05, test_loss 1.337e+00, train_acc 100.0, test_acc 85.1
step 7000, dt 4.55s, train_loss 8.465e-06, test_loss 1.396e+00, train_acc 100.0, test_acc 84.8


 96%|█████████▌| 96/100 [55:32<02:19, 34.95s/it]

step 8000, dt 4.55s, train_loss 3.709e-06, test_loss 1.453e+00, train_acc 100.0, test_acc 85.0
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.25s, train_loss 3.171e-03, test_loss 6.964e-01, train_acc 99.8, test_acc 84.6
step 2000, dt 4.21s, train_loss 2.728e-04, test_loss 8.447e-01, train_acc 100.0, test_acc 86.1
step 3000, dt 4.22s, train_loss 7.667e-05, test_loss 9.168e-01, train_acc 100.0, test_acc 85.7
step 4000, dt 4.28s, train_loss 3.766e-05, test_loss 9.858e-01, train_acc 100.0, test_acc 85.4
step 5000, dt 4.23s, train_loss 3.964e-05, test_loss 1.051e+00, train_acc 100.0, test_acc 85.4
step 6000, dt 4.29s, train_loss 2.419e-05, test_loss 1.113e+00, train_acc 100.0, test_acc 85.4
step 7000, dt 4.25s, train_loss 9.082e-06, test_loss 1.175e+00, train_acc 100.0, test_acc 85.4


 97%|█████████▋| 97/100 [56:06<01:44, 34.67s/it]

step 8000, dt 4.28s, train_loss 6.817e-06, test_loss 1.237e+00, train_acc 100.0, test_acc 85.4
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.15s, train_loss 8.169e-02, test_loss 8.641e-01, train_acc 97.9, test_acc 82.6
step 2000, dt 4.27s, train_loss 4.189e-04, test_loss 8.303e-01, train_acc 100.0, test_acc 86.0
step 3000, dt 4.30s, train_loss 2.739e-04, test_loss 9.217e-01, train_acc 100.0, test_acc 85.1
step 4000, dt 4.45s, train_loss 9.008e-05, test_loss 9.929e-01, train_acc 100.0, test_acc 84.8
step 5000, dt 4.49s, train_loss 3.438e-05, test_loss 1.055e+00, train_acc 100.0, test_acc 85.0
step 6000, dt 4.59s, train_loss 2.408e-05, test_loss 1.111e+00, train_acc 100.0, test_acc 85.0
step 7000, dt 4.66s, train_loss 6.938e-06, test_loss 1.163e+00, train_acc 100.0, test_acc 84.7


 98%|█████████▊| 98/100 [56:41<01:09, 34.99s/it]

step 8000, dt 4.79s, train_loss 8.310e-06, test_loss 1.216e+00, train_acc 100.0, test_acc 84.6
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.19s, train_loss 2.097e-01, test_loss 9.012e-01, train_acc 98.2, test_acc 80.9
step 2000, dt 4.27s, train_loss 5.939e-04, test_loss 1.247e+00, train_acc 100.0, test_acc 81.4
step 3000, dt 4.33s, train_loss 2.003e-04, test_loss 1.386e+00, train_acc 100.0, test_acc 81.3
step 4000, dt 4.39s, train_loss 8.989e-05, test_loss 1.488e+00, train_acc 100.0, test_acc 81.4
step 5000, dt 4.43s, train_loss 2.764e-05, test_loss 1.576e+00, train_acc 100.0, test_acc 81.3
step 6000, dt 4.49s, train_loss 2.160e-05, test_loss 1.656e+00, train_acc 100.0, test_acc 81.4
step 7000, dt 4.42s, train_loss 1.258e-05, test_loss 1.734e+00, train_acc 100.0, test_acc 81.2


 99%|█████████▉| 99/100 [57:16<00:35, 35.03s/it]

step 8000, dt 4.60s, train_loss 1.080e-05, test_loss 1.810e+00, train_acc 100.0, test_acc 81.2
Initialized ConvBase model with 5210 parameters
step 1000, dt 4.16s, train_loss 1.139e-01, test_loss 1.003e+00, train_acc 98.2, test_acc 78.9
step 2000, dt 4.24s, train_loss 2.885e-04, test_loss 1.169e+00, train_acc 100.0, test_acc 81.3
step 3000, dt 4.23s, train_loss 1.902e-04, test_loss 1.307e+00, train_acc 100.0, test_acc 81.0
step 4000, dt 4.29s, train_loss 6.407e-05, test_loss 1.417e+00, train_acc 100.0, test_acc 80.9
step 5000, dt 4.33s, train_loss 5.100e-05, test_loss 1.512e+00, train_acc 100.0, test_acc 80.8
step 6000, dt 4.36s, train_loss 2.455e-05, test_loss 1.604e+00, train_acc 100.0, test_acc 81.0
step 7000, dt 4.45s, train_loss 1.078e-05, test_loss 1.692e+00, train_acc 100.0, test_acc 81.3


100%|██████████| 100/100 [57:51<00:00, 34.72s/it]

step 8000, dt 4.52s, train_loss 8.071e-06, test_loss 1.777e+00, train_acc 100.0, test_acc 81.3





In [14]:
# for sub_index, subset in enumerate(subsets):
#     if 123 in subset['indices']:
#       print("x is in subset with index: ", sub_index)
#       # trainset_mask[subset_index]  = True

In [15]:
# np.shape(data['x'])[0]
len(list_of_mlp_models)

100

# Memorization

In [None]:

from mnist1d import train

def estimate_mem_infl(list_of_models, subsets, x_i_index):
  '''Computes memorization and influence estimates for a single x_i from the 
  dataset. 
  Parameters: list of all models and their results as well as the subset index 
  in the list of the subsets, the list of subsets, and the index of an x in 
  the training set for which we are looking to find memorization for.'''

  # list of all subset indices where the x_i is present
  # trainset_mask = np.zeros(train_data_size, dtype=np.bool)
  x_in_subsets= []
  x_not_in_subsets = []

  # Check in which subsets is the x_i present
  for sub_index, subset in enumerate(subsets):
    if x_i_index in subset['indices']:
      # Append the index to the list of indices for the chosen subsets 
      x_in_subsets.append(sub_index)
    else:
      # Append the index of a subset where x is NOT present
      x_not_in_subsets.append(sub_index)

  x_in_picked_models = []
  x_not_in_picked_models = []

  # print(len(x_not_in_subsets))
  # Take the models that are trained based on those subsets where x is present
  for index in x_in_subsets:
    x_in_picked_models.append(list_of_models[index])

  # Take the models that are trained based on those subsets where x_i is NOT present
  for index in x_not_in_subsets:
    x_not_in_picked_models.append(list_of_models[index])

  # picked_models = [model for index,model in enumerate(list_of_models) if index] 

  # Get the image and its label
  # # Convert splits into the proper forms
  # x_train, x_test = torch.Tensor(subsets[subset_index]['x']), torch.Tensor(subsets[subset_index]['x_test'])
  # y_train, y_test = torch.LongTensor(subsets[subset_index]['y']), torch.LongTensor(subsets[subset_index]['y_test'])
  x_train, x_test = torch.Tensor(data['x']), torch.Tensor(data['x_test'])
  y_train, y_test = torch.LongTensor(data['y']), torch.LongTensor(data['y_test'])

  input_image = x_train[x_i_index]
  image_label = y_train[x_i_index]

  # Count correct predictions
  num_correct_pred = 0

  # Check how many of these models are not making the error in classification
  for model in x_in_picked_models:
    
    prediction = model(input_image).argmax(-1).cpu().numpy()
    target = image_label.cpu().numpy().astype(np.float32)

    # Check if the predicion is the same as the label
    # If so, increment the count of correct rpedictions
    if prediction == target:
      num_correct_pred += 1
  
  # Take the fraction of the number of models that are predicting correctly
  # over the total number of models.
  # The result is the first probability in the equation .

  p_x_i_in = num_correct_pred / len(x_in_subsets)




  # Count correct predictions
  num_correct_pred = 0

  # Check how many of these models are not making the error in classification
  for model in x_not_in_picked_models:
    
    prediction = model(input_image).argmax(-1).cpu().numpy()
    target = image_label.cpu().numpy().astype(np.float32)

    # Check if the predicion is the same as the label
    # If so, increment the count of correct rpedictions
    if prediction == target:
      num_correct_pred += 1
  
  # Take the fraction of the number of models that are predicting correctly
  # over the total number of models.
  # The result is the first probability in the equation .

  p_x_i_not_in = num_correct_pred / len(x_not_in_subsets)


  memorization_estimate_x_i = p_x_i_in - p_x_i_not_in

  #*************** Influence of x_i on every j in test data *****************

  #Create a list for influence of x_i on each x_j
  x_i_inluence_list = []

  # # Check the influence by the models created based on datasets where x_i is present
  # # for j in range(np.shape(x_test)[0]):
  # for j in range(np.shape(x_test)[0]):
  #   # Take a single image from the testing part of the dataset
  #   input_image = x_test[j]
  #   image_label = y_test[j]

  #   # Count correct predictions
  #   num_correct_pred_x_in = 0
  #   num_correct_pred_x_not_in = 0

  #   # check how each model performs from the models where x_i is present
  #   for modelo in x_in_picked_models:  
  #       prediction = modelo(input_image).argmax(-1).cpu().numpy()
  #       target = image_label.cpu().numpy().astype(np.float32)

  #       # Check if the predicion is the same as the label
  #       # If so, increment the count of correct rpedictions
  #       if prediction == target:
  #         num_correct_pred_x_in += 1
    
  #   #Calculate the influence of x_i on the x_j from the test data
  #   infl_x_i_in_s_on_x_j = num_correct_pred_x_in / np.shape(x_test)[0]

  #   # check how each model performs from the models where x_i is NOT present
  #   for modelo in x_not_in_picked_models:  
  #       prediction = modelo(input_image).argmax(-1).cpu().numpy()
  #       target = image_label.cpu().numpy().astype(np.float32)

  #       # Check if the predicion is the same as the label
  #       # If so, increment the count of correct rpedictions
  #       if prediction == target:
  #         num_correct_pred_x_not_in += 1

  #   #Calculate the influence of x_i on the x_j from the test data
  #   infl_x_i_not_in_s_on_x_j = num_correct_pred_x_not_in / np.shape(x_test)[0]

  #   # Append the influence of x_i on every x_j from the test data
  #   x_i_inluence_list.append(infl_x_i_not_in_s_on_x_j)


  return memorization_estimate_x_i, x_i_inluence_list


# keep a list of all memorization values for histogram
mem_values_mlp = []
infl_values_mlp = []

# For every x_i in the training dataset find its memorization value
# and list of influences on each x_j from the testing dataset.
for x_i in tqdm(range(t)): #np.shape(data['x'])[0])
  # print()  
  mem, infl = estimate_mem_infl(list_of_mlp_models,subsets,x_i )

  # print("\nmemorization value for x_i is" , mem )
  # print("\n Its influence on every point in test data is: ", infl)

  # save the memorization in the list of mem values
  mem_values_mlp.append(mem)

  # save the influence in the list of infl_values_mlp
  # infl_values_mlp.append(infl)

mem_values_conv_base = []
infl_values_conv_base = []

# DO the same for ConvBase Model
for x_i in tqdm(range(t)): #np.shape(data['x'])[0])
  # print()  
  mem, infl = estimate_mem_infl(list_of_ConvBase_models ,subsets,x_i )

  # print("\nmemorization value for x_i is" , mem )
  # print("\n Its influence on every point in test data is: ", infl)

  # save the memorization in the list of mem values
  mem_values_conv_base.append(mem)

  # save the influence in the list of  infl_values_conv_base
  # infl_values_conv_base.append(infl)

100%|██████████| 100/100 [30:18<00:00, 18.18s/it]
 10%|█         | 10/100 [05:52<52:52, 35.25s/it]


KeyboardInterrupt: ignored

# Histogram of Memorization MLP Model

In [None]:
# Fixing random state for reproducibility
np.random.seed(19680801)
# the histogram of the data
n, bins, patches = plt.hist(mem_values_mlp, 20, facecolor='g', alpha=0.75)

plt.title('Memorizaztion Values Distribution')
plt.xlabel('Memorizaztion Values')
plt.ylabel('Freequency')
plt.xlim(-1, 1)
plt.savefig("MLP_model_histogram.png")
plt.show()


# Histogram of insfluence of X0 on the test data samples (MLP)

In [None]:
# Fixing random state for reproducibility
# np.random.seed(19680801)
# # the histogram of the data
# n, bins, patches = plt.hist(infl_values_mlp[0], 20, facecolor='g', alpha=0.75)

# plt.title('Influence Values Distribution MLP')
# plt.xlabel('Influence Values')
# plt.ylabel('Freequency')
# plt.xlim(-1, 1)
# plt.show()
# plt.savefig("MLP_model_histogram_influence.png")

# Histogram of Memorization ConvBase Model

In [None]:
# Fixing random state for reproducibility
np.random.seed(19680801)
# the histogram of the data
n, bins, patches = plt.hist(mem_values_conv_base, 20, facecolor='g', alpha=0.75)

plt.title('Memorizaztion Values Distribution')
plt.xlabel('Memorizaztion Values')
plt.ylabel('Freequency')
plt.xlim(-1, 1)
plt.savefig("ConvBase_model_histogram.png")
plt.show()


# Histogram of insfluence of X0 on the test data samples (ConvBase)

In [None]:
# Fixing random state for reproducibility
# np.random.seed(19680801)
# # the histogram of the data
# n, bins, patches = plt.hist(infl_values_conv_base[0], 20, facecolor='g', alpha=0.75)

# plt.title('Influence Values Distribution Conv Base')
# plt.xlabel('Influence Values')
# plt.ylabel('Freequency')
# plt.xlim(-1, 1)
# plt.show()
# plt.savefig("ConvBase_model_histogram_influence.png")

# Creatign a new dataset with more samples with a high memorization

In [None]:
# transformations of the templates which will make them harder to fit
def pad(x, padding):
    low, high = padding
    p = low + int(np.random.rand()*(high-low+1))
    return np.concatenate([x, np.zeros((p))])

def shear(x, scale=10):
    coeff = scale*(np.random.rand() - 0.5)
    return x - coeff*np.linspace(-0.5,.5,len(x))

def translate(x, max_translation):
    k = np.random.choice(max_translation)
    return np.concatenate([x[-k:], x[:-k]])

def corr_noise_like(x, scale):
    noise = scale * np.random.randn(*x.shape)
    return gaussian_filter(noise, 2)

def iid_noise_like(x, scale):
    noise = scale * np.random.randn(*x.shape)
    return noise

def interpolate(x, N):
    scale = np.linspace(0,1,len(x))
    new_scale = np.linspace(0,1,N)
    new_x = interp1d(scale, x, axis=0, kind='linear')(new_scale)
    return new_x

def transform(x, y, args, eps=1e-8):
    new_x = pad(x+eps, args.padding) # pad
    new_x = interpolate(new_x, args.template_len + args.padding[-1])  # dilate
    new_y = interpolate(y, args.template_len + args.padding[-1])
    new_x *= (1 + args.scale_coeff*(np.random.rand() - 0.5))  # scale
    new_x = translate(new_x, args.max_translation)  #translate
    
    # add noise
    mask = new_x != 0
    new_x = mask*new_x + (1-mask)*corr_noise_like(new_x, args.corr_noise_scale)
    new_x = new_x + iid_noise_like(new_x, args.iid_noise_scale)
    
    # shear and interpolate
    new_x = shear(new_x, args.shear_scale)
    new_x = interpolate(new_x, args.final_seq_length) # subsample
    new_y = interpolate(new_y, args.final_seq_length)
    return new_x, new_y

def get_dataset_args(as_dict=False):
    arg_dict = {'num_samples': 5000,
            'train_split': 0.8,
            'template_len': 12,
            'padding': [36,60],
            'scale_coeff': .4, 
            'max_translation': 48,
            'corr_noise_scale': 0.25,
            'iid_noise_scale': 2e-2,
            'shear_scale': 0.75,
            'shuffle_seq': False,
            'final_seq_length': 40,
            'seed': 42}
    return arg_dict if as_dict else ObjectView(arg_dict)

def apply_ablations(arg_dict, n=7):
    ablations = [('shear_scale', 0),
                ('iid_noise_scale', 0),
                ('corr_noise_scale', 0),
                 ('max_translation', 1),
                 ('scale_coeff', 0),
                 ('padding', [arg_dict['padding'][-1], arg_dict['padding'][-1]]),
                 ('padding', [0, 0]),]
    num_ablations = min(n, len(ablations))
    for i in range(num_ablations):
        k, v = ablations[i]
        arg_dict[k] = v
    return arg_dict

templates = get_templates()
for i, n in enumerate(reversed(range(8))):
    np.random.seed(0)
    arg_dict = get_dataset_args(as_dict=True)
    arg_dict = apply_ablations(arg_dict, n=n)
    args = ObjectView(arg_dict)
    do_transform = args.padding[0] != 0
    # fig = plot_signals(templates['x'], templates['t'], labels=None if do_transform else templates['y'],
    #              args=args, ratio=2.2 if do_transform else 0.8,
    #              do_transform=do_transform)
#     fig.savefig(PROJECT_DIR + 'static/transform_{}.png'.format(i))

def make_dataset(args=None, template=None, ):
    templates = get_templates() if template is None else template
    args = get_dataset_args() if args is None else args
    np.random.seed(args.seed) # reproducibility
    
    xs, ys = [], []
    samples_per_class = args.num_samples // len(templates['y'])
    for label_ix in range(len(templates['y'])):
        for example_ix in range(samples_per_class):
            x = templates['x'][label_ix]
            t = templates['t']
            y = templates['y'][label_ix]
            x, new_t = transform(x, t, args) # new_t transformation is same each time
            xs.append(x) ; ys.append(y)
    
    batch_shuffle = np.random.permutation(len(ys)) # shuffle batch dimension
    xs = np.stack(xs)[batch_shuffle]
    ys = np.stack(ys)[batch_shuffle]
    
    if args.shuffle_seq: # maybe shuffle the spatial dimension
        seq_shuffle = np.random.permutation(args.final_seq_length)
        xs = xs[...,seq_shuffle]
    
    new_t = new_t/xs.std()
    xs = (xs-xs.mean())/xs.std() # center the dataset & set standard deviation to 1

    # train / test split
    split_ix = int(len(ys)*args.train_split)
    dataset = {'x': xs[:split_ix], 'x_test': xs[split_ix:],
               'y': ys[:split_ix], 'y_test': ys[split_ix:],
               't':new_t, 'templates': templates}
    return dataset

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

args = get_dataset_args()
set_seed(args.seed)
args.corr_noise_scale = 0.5
args.iid_noise_scale = 2e-1
args.max_translation = 58 # Change these values to get more examples with a high
                          # memorization value
data = make_dataset(args=args)    # make the dataset

# Apply the estimates function on the new dataset

In [None]:
# generate a random subset of indices for the training and test data
random_num_generator = np.random.RandomState(15)

# Generate subset of random indices of size m from (0,train_data_size) without replacement.
random_indices = np.random.choice(train_data_size, size = m, replace = False)

print(len(random_indices))

train_images = data["x"][random_indices]
print(np.shape(train_images))

subsets = create_subsets(data,t,m)


# get the model info
args = get_model_args()

# list to keep all the models
list_of_mlp_models = []
list_of_ConvBase_models = []

# list ot keep all the training results
trained_mlp_model_results = []
trained_ConvBase_model_results = []


# MLP base
list_of_mlp_models, trained_mlp_model_results = train_MLPBase_models(t,subsets,args)

# ConvBase
list_of_ConvBase_models, trained_ConvBase_model_results = train_ConvBase_models(t,subsets,args)


# keep a list of all memorization values for histogram
mem_values_mlp_modified = []
infl_values_mlp_modified = []

# For every x_i in the training dataset find its memorization value
# and list of influences on each x_j from the testing dataset.
for x_i in tqdm(range(t)): 
  mem, infl = estimate_mem_infl(list_of_mlp_models,subsets,x_i )

  # save the memorization in the list of mem values
  mem_values_mlp_modified.append(mem)

  # save the influence in the list of infl_values_mlp_modified
  # infl_values_mlp_modified.append(infl)


mem_values_conv_base_modified = []
infl_values_conv_base_modified = []

# DO the same for ConvBase Model
for x_i in tqdm(range(t)): 
  mem, infl = estimate_mem_infl(list_of_ConvBase_models ,subsets,x_i )

  # save the memorization in the list of mem values
  mem_values_conv_base_modified.append(mem)

  # save the influence in the list of infl_values_conv_base_modified
  # infl_values_conv_base_modified.append(infl)

# Histogram of Memorization MLP Model on Modified Dataset

# Histogram of Memorization MLP Model

In [None]:
# Fixing random state for reproducibility
np.random.seed(19680801)
# the histogram of the data
n, bins, patches = plt.hist(mem_values_mlp_modified, 20, facecolor='g', alpha=0.75)

plt.title('Memorizaztion Values Distribution')
plt.xlabel('Memorizaztion Values')
plt.ylabel('Freequency')
plt.xlim(-1, 1)
plt.savefig("MLP_model_histogram_modified.png")
plt.show()


# Histogram of insfluence of X0 on the test data samples (MLP)

In [None]:
# Fixing random state for reproducibility
# np.random.seed(19680801)
# # the histogram of the data
# n, bins, patches = plt.hist(infl_values_mlp_modified[0], 20, facecolor='g', alpha=0.75)

# plt.title('Influence Values Distribution MLP')
# plt.xlabel('Influence Values')
# plt.ylabel('Freequency')
# plt.xlim(-1, 1)
# plt.show()
# plt.savefig("MLP_model_histogram_influence_modified.png")

# Histogram of Memorization ConvBase Model

In [None]:
# Fixing random state for reproducibility
np.random.seed(19680801)
# the histogram of the data
n, bins, patches = plt.hist(mem_values_conv_base_modified, 20, facecolor='g', alpha=0.75)

plt.title('Memorizaztion Values Distribution')
plt.xlabel('Memorizaztion Values')
plt.ylabel('Freequency')
plt.xlim(-1, 1)
plt.savefig("ConvBase_model_histogram_modified.png")
plt.show()


# Histogram of insfluence of X0 on the test data samples (ConvBase)

In [None]:
# # Fixing random state for reproducibility
# np.random.seed(19680801)
# # the histogram of the data
# n, bins, patches = plt.hist(infl_values_conv_base_modified[0], 20, facecolor='g', alpha=0.75)

# plt.title('Influence Values Distribution Conv Base')
# plt.xlabel('Influence Values')
# plt.ylabel('Freequency')
# plt.xlim(-1, 1)
# plt.show()
# plt.savefig("ConvBase_model_histogram_influence_modified.png")

In [None]:
# # Fixing random state for reproducibility
# np.random.seed(19680801)
# # the histogram of the data
# n, bins, patches = plt.hist(mem_values_mlp_modified, 20, facecolor='g', alpha=0.75)

# plt.title('Memorizaztion Values Distribution')
# plt.xlabel('Memorizaztion Values')
# plt.ylabel('Freequency')
# plt.xlim(-1, 1)
# plt.show()
# plt.savefig("MLP_model_histogram_modified.png")

In [None]:
# # Fixing random state for reproducibility
# np.random.seed(19680801)
# # the histogram of the data
# n, bins, patches = plt.hist(mem_values_conv_base_modified, 20, facecolor='g', alpha=0.75)

# plt.title('Memorizaztion Values Distribution')
# plt.xlabel('Memorizaztion Values')
# plt.ylabel('Freequency')
# plt.xlim(-1, 1)
# plt.show()
# plt.savefig("ConvBase_model_histogram_modified.png")