In [1]:
# Run-mode switch for this notebook.
#   use_config = True  -> the `if use_config:` cell trains once from `config_path`.
#   use_config = False -> the `if not use_config:` cell launches the W&B sweep.
# Previously used alternative:
#   use_config = False
#   config_path = "config/config.yaml"
use_config, config_path = True, "config/config-nopretrain.yaml"

In [2]:
# Search space handed to `wandb.sweep`: Bayesian search maximizing "val/acc".
# Each entry under "parameters" lists the candidate values for one sweep key;
# "epochs" is a single fixed value.
sweep_config = dict(
    method="bayes",
    metric=dict(name="val/acc", goal="maximize"),
    parameters={
        # Wider candidate list: ["resnet18", "resnet50", "resnest14d", "resnest26d"]
        "model_name": {"values": ["resnet50", "resnest26d"]},
        # Wider candidate list: ["AdamW", "SGD", "Adam", "SAM", "Muon"]
        "optimizer_name": {"values": ["Muon"]},
        "scheduler_name": {"values": ["ReduceLROnPlateau"]},
        # Earlier candidates: [2e-4, 3e-4, 5e-4, 1e-4]
        "lr_adam": {"values": [0.009]},
        # Earlier candidates: [0.02, 0.05]
        "weight_decay_adam": {"values": [5e-4]},
        # Pretrain candidates: [0.05, 0.01, 0.02]
        "lr_sam": {"values": [0.05, 0.025]},
        # Pretrain candidates: [0.05, 0.01, 0.02]
        "lr_sgd": {"values": [0.1, 0.05, 0.025]},
        "weight_decay_sgd": {"values": [5e-4, 1e-4, 1e-3]},
        "epochs": {"value": 200},
        "rho": {"values": [0.05, 0.1]},
        "gamma": {"values": [0.1, 0.2]},
    },
)

In [3]:
import yaml
import argparse
import sys
import random
import numpy as np
import torch

from data.loader import get_dataloaders
from models.model_factory import create_model
from optim_scheduler.optim_factory import get_optimizer
from optim_scheduler.scheduler_factory import get_scheduler
from training.trainer import Trainer

In [4]:
import os

# Export the notebook's file name through the WANDB_NOTEBOOK_NAME variable
# (presumably read by wandb to label runs started from this notebook).
os.environ.update({'WANDB_NOTEBOOK_NAME': 'tema3.ipynb'})

In [5]:
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch RNGs with the same value.

    Args:
        seed: Integer seed applied to every generator.

    When CUDA is available the seed is also applied to all CUDA devices.
    """
    # Same order as before: stdlib, NumPy, then PyTorch (CPU).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [6]:
def load_config(path):
    """Load a YAML configuration file.

    Args:
        path: Path of the YAML file to read.

    Returns:
        The Python object `yaml.safe_load` builds from the file's content.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # The context manager closes the handle as soon as parsing finishes;
    # a bare `open(path)` would stay open until garbage collection.
    with open(path) as config_file:
        return yaml.safe_load(config_file)

In [7]:
import torch.distributed as dist

def init_dist_if_needed():
    """Create a single-process `torch.distributed` group if none exists yet.

    Does nothing when the distributed package is unavailable or a process
    group is already initialized. Otherwise initializes a one-member group
    (rank 0, world size 1) using the gloo backend on a local TCP endpoint.
    """
    # `and` short-circuits, so is_initialized() is only queried when the
    # distributed package is actually available.
    needs_init = dist.is_available() and not dist.is_initialized()
    if needs_init:
        dist.init_process_group(
            backend="gloo",
            init_method="tcp://127.0.0.1:29500",
            rank=0,
            world_size=1,
        )

In [None]:
# Single-run mode: train one model with the settings in the YAML file at
# `config_path`. Skipped entirely when `use_config` is False.
if use_config:
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default=config_path)
    # An explicit empty argument list is parsed, so `--config` always takes
    # its default (`config_path`) regardless of the kernel's command line.
    args = parser.parse_args(args=[])  
    
    cfg = load_config(args.config)
    # Seed before data loaders and model are built; falls back to 42.
    set_seed(cfg.get('seed', 42))
    
    # The config's 'device' entry wins; otherwise use CUDA when available.
    device = torch.device(cfg.get('device','cuda' if torch.cuda.is_available() else 'cpu'))

    print(cfg['model']['name'])
    
    is_muon = (cfg['optimizer']['name'] == "Muon")
    is_sam = (cfg['optimizer']['name'] == "SAM")
    
    # A process group is only set up for Muon.
    # NOTE(review): presumably Muon needs torch.distributed initialized — confirm.
    if is_muon:
        init_dist_if_needed()
    
    train_loader, val_loader, test_loader, num_classes = get_dataloaders(cfg)
    
    model = create_model(cfg, num_classes)
    model.to(device, non_blocking=True)
    # Compile the model with TorchScript; the scripted module is what gets trained.
    model = torch.jit.script(model)
    
    optim_obj = get_optimizer(cfg['optimizer']['name'], model, cfg)
    # The scheduler is optional: no (or empty) 'scheduler' section -> None.
    scheduler = get_scheduler(cfg['scheduler']['name'], optim_obj, cfg) if cfg.get('scheduler') else None
    
    # `is_sam` is forwarded to the Trainer (assumed to switch on SAM-specific stepping — TODO confirm).
    trainer = Trainer(model, optim_obj, scheduler, device, cfg, is_sam=is_sam)
    trainer.fit(train_loader, val_loader, test_loader, cfg['training']['epochs']) 


resnest26d


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmonicagabrielarepede[0m ([33mmonicagabrielarepede-universitatea-alexandru-ioan-cuza-d[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch 1: train_loss=4.3543 train_acc=1.3657 val_loss=3.9632 val_acc=0.1530 test_loss=4.0167 test_acc=0.1491 time=35.3s
Epoch 2: train_loss=4.2230 train_acc=1.6662 val_loss=3.7950 val_acc=0.1992 test_loss=3.7809 test_acc=0.2002 time=36.6s
Epoch 3: train_loss=4.1363 train_acc=1.8470 val_loss=3.6946 val_acc=0.2224 test_loss=3.7060 test_acc=0.2120 time=36.6s
Epoch 4: train_loss=4.0450 train_acc=1.5730 val_loss=3.5571 val_acc=0.2654 test_loss=3.5593 test_acc=0.2578 time=34.8s


In [None]:
# One-time setup: installs the Muon optimizer package (uncomment to run).
#pip install git+https://github.com/KellerJordan/Muon

In [None]:
# One-time setup: fetches the SAM optimizer repository (uncomment to run).
#!git clone https://github.com/davda54/sam.git

In [None]:
import wandb
def train_func(cfg_base):
    """Run one W&B sweep trial: assemble the run config, then train.

    Opens a wandb run initialized with `cfg_base`, copies the sweep-chosen
    values (`model_name`, `optimizer_name`, `scheduler_name`, `epochs`,
    `gamma`, plus the optimizer-specific learning rate, weight decay and
    `rho`) into the nested `model` / `optimizer` / `scheduler` / `training`
    sections, builds data loaders, model, optimizer and scheduler, and calls
    `Trainer.fit`.

    Args:
        cfg_base: Base configuration passed to `wandb.init(config=...)`.
            The `model`, `optimizer`, `scheduler` and `training` sections
            are indexed directly, so they must be present.
    """
    # Sweep key that carries the learning rate / weight decay per optimizer.
    # NOTE(review): optimizers missing from these tables (e.g. "Adam") keep the
    # lr / weight_decay from the base config — confirm that is intended.
    lr_key_by_optimizer = {
        "AdamW": "lr_adam",
        "Muon": "lr_adam",
        "SGD": "lr_sgd",
        "SAM": "lr_sam",
    }
    wd_key_by_optimizer = {
        "AdamW": "weight_decay_adam",
        "Muon": "weight_decay_adam",
        "SGD": "weight_decay_sgd",
        "SAM": "weight_decay_sgd",
    }

    with wandb.init(config=cfg_base):
        cfg = wandb.config
        # Seed before any data loader or model is created; falls back to 42.
        set_seed(cfg.get('seed', 42))

        # The config's 'device' entry wins; otherwise use CUDA when available.
        device = torch.device(cfg.get('device', 'cuda' if torch.cuda.is_available() else 'cpu'))

        # Copy the flat sweep parameters into the nested config sections.
        cfg['model']['name'] = cfg.model_name
        cfg['optimizer']['name'] = cfg.optimizer_name
        cfg['scheduler']['name'] = cfg.scheduler_name
        cfg['training']['epochs'] = cfg.epochs
        cfg["scheduler"]["gamma"] = float(cfg.gamma)

        # Read the name back from the nested section, as the lookups below key on it.
        optimizer_name = cfg['optimizer']['name']

        if optimizer_name in lr_key_by_optimizer:
            # Attribute access keeps the "missing sweep key raises" behavior.
            cfg['optimizer']['lr'] = float(getattr(cfg, lr_key_by_optimizer[optimizer_name]))
        if optimizer_name in wd_key_by_optimizer:
            # Cast like lr/gamma/rho so the optimizer always receives a float;
            # 0.01 is used when the sweep does not define the key.
            cfg['optimizer']['weight_decay'] = float(
                cfg.get(wd_key_by_optimizer[optimizer_name], 0.01)
            )

        is_muon = (optimizer_name == "Muon")
        is_sam = (optimizer_name == "SAM")

        if is_sam:
            cfg["optimizer"]["rho"] = float(cfg.rho)

        # A process group is only set up for Muon.
        if is_muon:
            init_dist_if_needed()

        train_loader, val_loader, test_loader, num_classes = get_dataloaders(cfg)
        model = create_model(cfg, num_classes)

        model.to(device, non_blocking=True)
        # Compile the model with TorchScript; the scripted module is what gets trained.
        model = torch.jit.script(model)

        optim_obj = get_optimizer(optimizer_name, model, cfg)
        # The scheduler is optional: no (or empty) 'scheduler' section -> None.
        scheduler = get_scheduler(cfg['scheduler']['name'], optim_obj, cfg) if cfg.get('scheduler') else None

        trainer = Trainer(model, optim_obj, scheduler, device, cfg, is_sam=is_sam)
        trainer.fit(train_loader, val_loader, test_loader, cfg['training']['epochs'])
        

In [None]:
# Sweep mode: runs only when single-run mode is off and this notebook is the
# main module.
if not use_config and __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default=config_path)
    # An explicit empty argument list is parsed, so `--config` takes its default.
    args = parser.parse_args(args=[])

    cfg_base = load_config(args.config)

    sweep_id = wandb.sweep(
        sweep=sweep_config,
        project='cifar100-sweep3_nopretrain',
    )
    # The agent invokes train_func with the shared base config, at most 70 times.
    wandb.agent(
        sweep_id,
        function=lambda: train_func(cfg_base),
        count=70,
    )