# Model & Dataset

In [None]:
! git clone https://github.com/Tikquuss/mlp_grokking

In [None]:
%cd mlp_grokking

In [None]:
! pip install -r requirements.txt

In [None]:
from src.modeling import Model
from src.dataset import get_dataloader
from src.trainer import train
from src.utils import AttrDict

In [None]:
# Hyper-parameters of this run. They also appear in the descriptive
# (commented-out) group name below.
train_pct = 80
weight_decay = 0.0
representation_lr = 0.001
decoder_lr = 0.001
representation_dropout = 0.0
decoder_dropout = 0.0
opt = "adam"

# Descriptive alternative that encodes the hyper-parameters in the group name:
#group_name=f"tdf={train_pct}-wd={weight_decay}-r_lr={representation_lr}-d_lr={decoder_lr}-r_d={representation_dropout}-d_d={decoder_dropout}-opt={opt}"
group_name = "1"


random_seed = 0
operator = "+"
modular = False

# Relative path: the notebook has changed into the mlp_grokking checkout, so
# this points one level above it. Later cells read /content/log_files, i.e.
# they assume the repository was cloned under /content.
log_dir = "../log_files"

p = 10
task = "classification"

params = AttrDict({
    ### Main parameters
    "task" : task,
    # With the values above this is "classification_1", the folder name the
    # later checkpoint-listing and tensorboard cells hard-code.
    "exp_id" : f"{task}_{group_name}",
    "log_dir" : f"{log_dir}/{random_seed}",

    ### Model
    "emb_dim" : 8,
    "hidden_dim" : 16,
    "n_layers" : 1,
    "representation_dropout" : representation_dropout,
    "decoder_dropout" : decoder_dropout,
    "pad_index" : None,
    "p" : p,

    ### Dataset
    "operator" : operator,
    "modular" : modular,
    "train_pct" : train_pct,
    "batch_size" : 512,

    ### Optimizer
    "optimizer" : f"{opt},weight_decay={weight_decay},beta1=0.9,beta2=0.99,eps=0.00000001",
    "representation_lr" : representation_lr,
    "decoder_lr" : decoder_lr,

    ### LR Scheduler
    "lr_scheduler" : None,
    #"lr_scheduler" : "reduce_lr_on_plateau,factor=0.2,patience=20,min_lr=0.00005,mode=min,monitor=val_loss",

    ### Training
    "max_epochs" : 10,
    "validation_metrics" : "val_loss",
    "checkpoint_path" : None,
    "model_name": "",
    "every_n_epochs":25,
    "every_n_epochs_show":25,
    "early_stopping_patience":1e9,
    "save_top_k":-1,

    # Wandb
    "use_wandb" : False,
    "wandb_entity" : "grokking_ppsp",
    "wandb_project" : f"toy_model_grokking_op={operator}-p={p}-task={task}-mod={modular}",
    "group_name" : group_name,

    "group_vars" : None,

    ### Intrinsic Dimension Estimation
    "ID_params" : {},
    #"ID_params": {"method" : "mle", "k":2},
    #"ID_params": {"method" : "twonn"},

    # Devices & Seed
    "accelerator" : "auto",
    "devices" : "auto",
    "random_seed": random_seed,

    ### Early_stopping (for grokking) : Stop the training `patience` epochs after the `metric` has reached the value `metric_threshold`
    #"early_stopping_grokking" : None,
    "early_stopping_grokking" : "patience=int(1000),metric=str(val_acc),metric_threshold=float(90.0)",

})
params["weight_decay"] = weight_decay
# Boolean flag derived from the task name; it is forwarded to get_dataloader.
params["regression"] = task == "regression"
# get_dataloader returns four values; the loaders are reused by the
# loss-landscape cells further down.
train_loader, val_loader, dataloader, data_infos = get_dataloader(
    p, train_pct, regression=params.regression, operator=params.operator,
    modular=params.modular, batch_size=params.batch_size, num_workers=2,
)
print(data_infos)
# Stored on params so it travels with the configuration handed to train / Model.
params["data_infos"] = data_infos

In [None]:
# Run training with the configuration and loaders built above. `train` returns
# two values, bound here as the model and a result object.
model, result = train(params, train_loader, val_loader)

In [None]:
#! rm -r /content/log_files/0

In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/log_files/0/classification_1/lightning_logs

In [None]:
import re 

def sorted_nicely(l):
    """Return the items of `l` as a list sorted in natural (human) order.

    Runs of ASCII digits are compared as integers and everything else as
    text, so "epoch=2" sorts before "epoch=10".
    Adapted from https://stackoverflow.com/a/2669120/11814682

    Args:
        l: iterable of strings.

    Returns:
        A new list with the same items in natural order.
    """
    def natural_key(key):
        # With a single capture group, re.split alternates
        # text, digits, text, ...: odd positions are exactly the [0-9]+ runs.
        # Converting by position (instead of str.isdigit) keeps text chunks
        # such as "²" or "٣" as strings, which str.isdigit would wrongly
        # send through int().
        chunks = re.split('([0-9]+)', key)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    return sorted(l, key=natural_key)

In [None]:
import os

In [None]:
# Folder the checkpoints are read from (log_dir / random_seed / exp_id of the run above).
pretrained_folder = "/content/log_files/0/classification_1"

model_files = os.listdir(pretrained_folder)
# Keep only files named "epoch=<int>-val_loss=<int>.<int>.ckpt".
# The pattern is a raw string and the dot before "ckpt" is escaped, so only a
# literal ".ckpt" suffix is accepted.
model_files = [f for f in model_files if re.match(r'^epoch=[0-9]+-val_loss=[0-9]+\.[0-9]+\.ckpt$', f)]
# Natural order, so that epoch=10 comes after epoch=2.
model_files = sorted_nicely(model_files)
model_files = [pretrained_folder + "/" + f for f in model_files]

# Last expression of the cell: displays the resulting list of checkpoint paths.
model_files

In [None]:
# Restore the first checkpoint of the sorted list. The returned object is not
# stored, only shown as the cell output. Raises IndexError if model_files is empty.
Model.load_from_checkpoint(model_files[0])

In [None]:
%cd /content/

# loss-landscape

In [None]:
#! rm -r loss-landscape

In [None]:
! git clone https://github.com/Tikquuss/loss-landscape

In [None]:
%cd loss-landscape

In [None]:
! pip install -r requirements.txt

In [None]:
from utils import AttrDict

In [None]:
# Class passed to plot_surface, plot_trajectory and plot_hessian_eigen below.
lightning_module_class = Model

## code

### plot_surface

In [None]:
from plot_surface import plot_surface

In [None]:
# Options handed to plot_surface. The keys mirror the command-line flags of the
# loss-landscape scripts (see the commented-out parser.add_argument lines below).
args = AttrDict({ 

    # NOTE(review): the comments on 'mpi' and 'cuda' were swapped in the original;
    # confirm that MPI enabled / CUDA disabled is the intended setting.
    'mpi' : True, # use mpi
    'cuda' : False, # use cuda
    'threads' : 2, # number of threads
    'ngpu' : 1, # number of GPUs to use for each rank, useful for data parallel evaluation

    # data parameters

    'raw_data' :False, # no data preprocessing
    'data_split' : 1, # the number of splits for the dataloader
    'split_idx' : 0, # the index of data splits for the dataloader

    # model parameters

    # parser.add_argument('--model', default='resnet56', help='model name')
    # parser.add_argument('--model_folder', default='', help='the common folder that contains model_file and model_file2')
    'model_file' : model_files[0], # path to the trained model file (first checkpoint of the sorted list)
    #'model_file2' : model_files[-1], # use (model_file2 - model_file) as the xdirection
    'model_file2' : "", # use (model_file2 - model_file) as the x-direction; left empty here
    'model_file3' : "", # presumably (model_file3 - model_file) as the y-direction -- TODO confirm in plot_surface; left empty here
    #'loss_name' : 'crossentropy', # help='loss functions: crossentropy | mse')

    # direction parameters

    'dir_file' : '',  # name of the direction file, or the path to an existing direction file
    'dir_type' : 'weights', # direction type: weights | states (including BN's running_mean/var)
    'x' : '-1:1:51', # a string with format xmin:xmax:xnum
    #'y' : None, # a string with format ymin:ymax:ynum
    'y' : '-1:1:51', # a string with format ymin:ymax:ynum
    'xnorm' : '', # direction normalization: filter | layer | weight
    'ynorm' : '', # direction normalization: filter | layer | weight
    'xignore' : '', # ignore bias and BN parameters: biasbn
    'yignore' : '', # ignore bias and BN parameters: biasbn
    'same_dir' : False, # use the same random direction for both x-axis and y-axis
    'idx' : 0, # the index for the repeatness experiment
    'surf_file' : '', # customize the name of surface file, could be an existing file

    # plot parameters

    'proj_file' : '', # the .h5 file containing the projected optimization trajectory
    'loss_max' : 40, # maximum value to show in 1D plot
    'vmax' : 10, # maximum value to map
    'vmin' : 0.1, # minimum value to map
    'vlevel' : 0.5, # plot contours every vlevel
    'show' : True, # show plotted figures
    'log' : False, # use log scale for loss values
    'plot' : True, # plot figures after computation
})

In [None]:
# Data the surface is evaluated on; swap the two lines to use the validation
# loader instead. This rebinds `dataloader`, replacing the value returned by
# get_dataloader earlier in the notebook.
dataloader = train_loader
#dataloader = val_loader

# plot_surface returns two values, bound here as dir_file and surf_file
# (presumably the direction and surface file paths -- confirm in plot_surface).
dir_file, surf_file = plot_surface(args, lightning_module_class, dataloader, metrics = ['test_loss', 'test_acc'])

### plot_trajectory

In [None]:
from plot_trajectory import plot_trajectory

In [None]:
# Options handed to plot_trajectory. This rebinds `args`, replacing the
# plot_surface options defined above.
args = AttrDict({ 
    'model_folder' : pretrained_folder, # folder of the models to be projected
    'dir_type' : 'weights', # direction type: weights (all weights except bias and BN params) | states (include BN.running_mean/var)
    'ignore' : '', # ignore bias and BN params: biasbn (no bias or bn)
    'save_epoch' : 1, # save models every few epochs

    'dir_file' : '', # load the direction file for projection
})

In [None]:
#os.remove("/content/log_files/0/classification_1/PCA_weights_save_epoch=1/directions.h5_proj_cos.h5")

In [None]:
# Called with the sorted checkpoint list. Returns two values, bound here as
# proj_file and dir_file; this overwrites the dir_file returned by plot_surface.
proj_file, dir_file = plot_trajectory(args, model_files, lightning_module_class)

### plot_hessian_eigen

In [None]:
from plot_hessian_eigen import plot_hessian_eigen

In [None]:
# Options handed to plot_hessian_eigen. This rebinds `args` once more. Unlike
# the plot_surface options, 'model_file2' is set to the last checkpoint and
# 'y' is None.
args = AttrDict({ 

    # NOTE(review): the comments on 'mpi' and 'cuda' were swapped in the original;
    # confirm that MPI enabled / CUDA disabled is the intended setting.
    'mpi' : True, # use mpi
    'cuda' : False, # use cuda
    'threads' : 2, # number of threads
    'ngpu' : 1, # number of GPUs to use for each rank, useful for data parallel evaluation

    # data parameters

    'raw_data' :False, # no data preprocessing
    'data_split' : 1, # the number of splits for the dataloader
    'split_idx' : 0, # the index of data splits for the dataloader

    # model parameters

    # parser.add_argument('--model', default='resnet56', help='model name')
    # parser.add_argument('--model_folder', default='', help='the common folder that contains model_file and model_file2')
    'model_file' : model_files[0], # path to the trained model file (first checkpoint of the sorted list)
    'model_file2' : model_files[-1], # use (model_file2 - model_file) as the x-direction (last checkpoint of the sorted list)
    'model_file3' : "", # presumably (model_file3 - model_file) as the y-direction -- TODO confirm in plot_hessian_eigen; left empty here
    #'loss_name' : 'crossentropy', # help='loss functions: crossentropy | mse')

    # direction parameters

    'dir_file' : '',  # name of the direction file, or the path to an existing direction file
    'dir_type' : 'weights', # direction type: weights | states (including BN's running_mean/var)
    'x' : '-1:1:51', # a string with format xmin:xmax:xnum
    'y' : None, # a string with format ymin:ymax:ynum
    #'y' : '-1:1:51', # a string with format ymin:ymax:ynum
    'xnorm' : '', # direction normalization: filter | layer | weight
    'ynorm' : '', # direction normalization: filter | layer | weight
    'xignore' : '', # ignore bias and BN parameters: biasbn
    'yignore' : '', # ignore bias and BN parameters: biasbn
    'same_dir' : False, # use the same random direction for both x-axis and y-axis
    'idx' : 0, # the index for the repeatness experiment
    'surf_file' : '', # customize the name of surface file, could be an existing file

    # plot parameters

    'show' : True, # show plotted figures
    'plot' : True, # plot figures after computation
})

In [None]:
# Data passed to plot_hessian_eigen below; swap the two lines to use the
# validation loader instead.
dataloader = train_loader
#dataloader = val_loader

In [None]:
def get_loss(pl_module, batch):
    """Compute the loss of `pl_module` on one batch.

    Args:
        pl_module: object exposing `forward(x)`, which must return exactly
            three values, and `criterion(input=..., target=...)`.
        batch: pair `(inputs, targets)`.

    Returns:
        Whatever `pl_module.criterion` returns for the first forward output
        and the batch targets.
    """
    inputs, targets = batch
    # Only the first of the three forward outputs feeds the criterion.
    predictions, _, _ = pl_module.forward(inputs)
    return pl_module.criterion(input=predictions, target=targets)

In [None]:
# get_loss is passed as the callback that turns a (module, batch) pair into a
# loss. The two returned values overwrite dir_file / surf_file from the earlier cells.
dir_file, surf_file = plot_hessian_eigen(args, lightning_module_class, dataloader, get_loss)

## from scratch

In [None]:
from torch.nn.utils import parameters_to_vector, vector_to_parameters

In [None]:
# Flatten all parameters of the model returned by train into a single vector.
theta = parameters_to_vector(model.parameters())

In [None]:
# Build a second model from the same configuration, then overwrite its
# parameters with the values stored in theta (the flattened parameters of `model`).
model_2 = Model(params)
vector_to_parameters(theta, model_2.parameters())