In [1]:
!module load cuda/11.6.2
!module load cudnn/8.6.0.163-cuda11
!pip install -q --use-feature=2020-resolver pynvml zstandard datasets psutil transformers torch bitsandbytes accelerate loralib peft wandb  
!echo "Hostname: $(hostname)"
!echo "Processor: $(lscpu | grep 'Model name' | awk -F ':' '{print $2}' | xargs)"
!echo "RAM: $(free -h | grep 'Mem:' | awk '{print $4}')"

You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Hostname: gr025.hpc.nyu.edu
Processor: Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
RAM: 21Gi


In [2]:
!echo "GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader)"
!echo "GPU Memory: $(nvidia-smi | grep MiB |  awk '{print $9 $10 $11}')"

GPU: Quadro RTX 8000
GPU Memory: 0MiB/46080MiB


In [3]:
# Change net ID here to use your scratch folder
ENV = "dev"
NET_ID = "sa7055"
DATA_PATH =  f"/scratch/{NET_ID}/fine_tuning" 
ROOT_PATH = f"/scratch/{NET_ID}/fine_tuning/{ENV}"

# Global configurations
confige = {
    "DATASET_URL": "https://the-eye.eu/public/AI/pile_v2/data",
    "DATASET_NAME": "NIH_ExPORTER_awarded_grant_text",
    "NUM_WORKERS": 8,
    "DATASET_SPLIT_RATIO": 0.9,
    "PADDING_STRATEGY": "max_length",
    "MAX_TOKENS": 512,
    "MIN_GENERATION": 512,
    "MODEL_NAME": "facebook/opt-125m",
    "TOKENIZED_NAME": "opt_2700m_512",
    "BATCH_SIZE": 64,
    "NUM_EPOCHS": 30,
    "LEARNING_RATE": 5e-4,
    "MIN_LEARNING_RATE": 5e-5,
    "EPSILON": 1e-8,
    "BETAS": (0.9,0.95),
    "GRADIENT_CLIP": 1.0,
    "WEIGHT_DECAY": 0.01,
    "DECAY_STYLE": "cosine", #not used currently
    "WARMUP_RATIO": 0.003,
    "SAMPLING_INTERVAL": 20,
    "CHECKPOINTING_INTERVAL": 100,
    "VALIDATION_INTERVAL": 500,
    "GRADIENT_ACCUMULATION_STEPS": 4, #TODO: need to bring this back
    
    "DYNAMIC_LR": False,
    "PEFT": False,
}

from peft import LoraConfig, PeftConfig, get_peft_model 
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM"
)

# Ensure that packages can be found
import sys
sys.path.insert(0, f"/home/{NET_ID}/.local/lib/python3.8/site-packages")

# Ensure that GPU can be found
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
# os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Setup logging
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s')

# Packages for profiling
import inspect
import math
import random
import psutil
from time import time
from tqdm import tqdm
import tqdm.notebook as tq
from pynvml import *

# Packages for data loading
from datasets import load_dataset, load_from_disk, DatasetDict, Dataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Core packages
import torch
torch.backends.cudnn.benchmark = True
torch.cuda.empty_cache()
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(False)
logging.info(f"Is Flash Attention Enabled: {torch.backends.cuda.flash_sdp_enabled()}")
logging.info(f"Is Mem Efficient SDP Enabled: {torch.backends.cuda.mem_efficient_sdp_enabled()}")
logging.info(f"Is Math SDP Enabled: {torch.backends.cuda.math_sdp_enabled()}")


from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_skip_modules=["lm_head"],
    llm_int8_threshold=3.0
)
from transformers.optimization import Adafactor
import bitsandbytes.optim as bnb_optim


# Get GPU Utilization
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    logging.info(f"GPU memory occupied: {info.used//1024**2} MB.")
    

# Returns RAM usage in MB
def get_ram_usage():
    return psutil.Process().memory_info().rss / (1024 * 1024)

# Returns number of trainable parameters and percentage
def print_trainable_parameters(model):
        trainable_params = 0
        all_param = 0
        for _, param in model.named_parameters():
            all_param += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        logger.info(
            f"Parameters: Trainable- {trainable_params/1e6:.2f}M|| All- {all_param/1e6:.2f}M || Trainable%- {100 * trainable_params / all_param}"
        )

#Takes a batch of inputs and runs the tokenizer on them
def tokenize_function(examples, tokenizer):
    return tokenizer(
        examples["text"],
        padding=confige["PADDING_STRATEGY"],
        truncation=True,
        max_length=confige["MAX_TOKENS"],
        return_attention_mask=True
    )

# Tokenizes dataset and creates train and validation split
def preprocess_data(dataset, tokenizer):
    tokenized_dataset_path = f"{DATA_PATH}/datasets/tokenized_{confige['DATASET_NAME']}_{confige['TOKENIZED_NAME']}"
    train_dataset_path = f"{tokenized_dataset_path}_train"
    valid_dataset_path = f"{tokenized_dataset_path}_valid"
    if os.path.exists(train_dataset_path) and os.path.exists(valid_dataset_path):
        logger.info(f"Loading dataset from disk...")
        start_time = time()
        train_dataset = load_from_disk(train_dataset_path)
        valid_dataset = load_from_disk(valid_dataset_path)
        elapsed_time = time() - start_time
        logger.info(f"Time taken to load dataset from : {elapsed_time:.2f} seconds")
        return train_dataset, valid_dataset
        
    logger.info(f"Tokenizing the dataset...")
    start_time = time()
    try:
        tokenized_dataset = load_from_disk(tokenized_dataset_path)
    except Exception as e:
        logging.error(e)
        tokenized_dataset = dataset.map(
            tokenize_function,
            fn_kwargs={'tokenizer': tokenizer},
            batched=True,
            num_proc=8,
            remove_columns=["text", "meta"],
        )
        tokenized_dataset.save_to_disk(tokenized_dataset_path)

    elapsed_time = time() - start_time
    logger.info(f"Time taken to tokenize the dataset: {elapsed_time:.2f} seconds")

    logger.info(f"Splitting the dataset...")
    start_time = time()
    
    if os.path.exists(train_dataset_path) and os.path.exists(valid_dataset_path):
        train_dataset = load_from_disk(train_dataset_path)
        valid_dataset = load_from_disk(valid_dataset_path)
    else:
        train_size = int(confige["DATASET_SPLIT_RATIO"] * len(tokenized_dataset))
        datasets = DatasetDict({
            'train': Dataset.from_dict(tokenized_dataset[:train_size]),
            'valid': Dataset.from_dict(tokenized_dataset[train_size:])
        })
        train_dataset = datasets['train']
        valid_dataset = datasets['valid']
        train_dataset.save_to_disk(train_dataset_path)
        valid_dataset.save_to_disk(valid_dataset_path)
    elapsed_time = time() - start_time
    logger.info(f"Time taken to split the datasets (or load pre-split datasets): {elapsed_time:.2f} seconds")
    
    return train_dataset, valid_dataset

# Creates data loaders
def create_dataloaders(train_dataset, valid_dataset, data_collator):
    logger.info(f"Creating data loaders...")
    start_time = time()
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=confige["BATCH_SIZE"],
                                  num_workers=confige["NUM_WORKERS"],
                                  collate_fn=data_collator,
                                  pin_memory=True)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=SequentialSampler(valid_dataset),
                                  batch_size=confige["BATCH_SIZE"],
                                  num_workers=confige["NUM_WORKERS"],
                                  collate_fn=data_collator,
                                  pin_memory=True)
    elapsed_time = time() - start_time
    logging.info(f"Time taken to create data loaders: {elapsed_time:.2f} seconds")
    return train_dataloader, valid_dataloader

# Fetches tokenizer relevant to the model
def create_or_load_tokenizer(checkpointed_path=None):
    if checkpointed_path:
        tokenizer = AutoTokenizer.from_pretrained(checkpointed_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(confige["MODEL_NAME"], cache_dir=f"{DATA_PATH}/datasets")
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = 'left'
    return tokenizer

# Data preparation
def run_data_pipeline(tokenizer, load_from_file=False):
    # Measure how much RAM is being used before anything runs
    ram_usage = get_ram_usage()
    logging.info(f"Baseline: RAM used: {ram_usage:.2f} MB")

    # Load data, either from url or from datasets folder
    data_file_url = f"{confige['DATASET_URL']}/{confige['DATASET_NAME']}.jsonl.zst"
    try:
        if load_from_file:
            raise Exception
        dataset = load_dataset("json",
                               data_files=data_file_url,
                               num_proc=confige["NUM_WORKERS"],
                               split="train",
                               cache_dir=f"{DATA_PATH}/datasets")
    except Exception as e:
        logging.error(e)
        dataset = load_dataset("json",
                               data_files=f"{DATA_PATH}/datasets/{confige['DATASET_NAME']}.jsonl.zst",
                               num_proc=confige["NUM_WORKERS"],
                               split="train",
                               cache_dir=f"{DATA_PATH}/datasets")

    # Measurements relevant to the dataset
    ram_usage = get_ram_usage()
    logging.info(f"RAM used: {ram_usage:.2f} MB")
    logging.info(f"Dataset sample: {dataset[10]}")
    size_gb = dataset.dataset_size / (1024 ** 3)
    logging.info(f"Dataset size (cache file) : {size_gb:.2f} GB")

    # Fetch a tokenizer and tokenize + split the dataset
    train_dataset, valid_dataset = preprocess_data(dataset, tokenizer)

    # Create a data collator and use it to make data loaders
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    train_dataloader, valid_dataloader = create_dataloaders(train_dataset, valid_dataset, data_collator)

    return {
        "TRAIN_DATASET": train_dataset,
        "VALIDATION_DATASET": valid_dataset,
        "TRAIN_DATALOADER": train_dataloader,
        "VALIDATION_DATALOADER": valid_dataloader,
        "TOKENIZER": tokenizer
    }

#Get optimizer
def fetch_optimizer(model):
    # separate out all parameters to those that will and won't experience regularizing weight decay
    decay = set()
    no_decay = set()
    whitelist_weight_modules = (torch.nn.Linear, )
    blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
    for mn, m in model.named_modules():
        for pn, p in m.named_parameters():
            fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
            if pn.endswith('bias'):
                # all biases will not be decayed
                no_decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                # weights of whitelist modules will be weight decayed
                decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                # weights of blacklist modules will NOT be weight decayed
                no_decay.add(fpn)
    
    head_layers = set(['lm_head.weight', '_orig_mod.lm_head.weight', 'base_model.model.lm_head.0.weight'])
    decay = set([d for d in decay if d not in head_layers])

    # validate that we considered every parameter
    param_dict = {pn: p for pn, p in model.named_parameters()}
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
    assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                % (str(param_dict.keys() - union_params), )

    # create the pytorch optimizer object
    optim_groups = [
        {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": confige["WEIGHT_DECAY"]},
        {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
    ]
    
    if(config["PEFT"]):
        optimizer = bnb_optim.AdamW(optim_groups, lr=confige["LEARNING_RATE"], betas=confige["BETAS"], weight_decay=config["WEIGHT_DECAY"], optim_bits=8)
        manager = bnb_optim.GlobalOptimManager.get_instance()

        skipped = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Embedding):
                skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                manager.register_module_override(module, "weight", {"optim_bits": 32})
                logger.info(f"bitsandbytes: will optimize {module} in fp32")
        logger.info(f"Quantizing: Skipped: {skipped/2**20}M params")
    else:
        # new PyTorch nightly has a new 'fused' option for AdamW that is much faster, only works for floating point values
        use_fused = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        logger.info(f"Using fused AdamW: {use_fused}")
        fused_arg_dict = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=confige["LEARNING_RATE"], betas=confige["BETAS"], weight_decay=confige["WEIGHT_DECAY"], **fused_arg_dict)

    return optimizer

# Get learning rate per iteration
def get_lr(it, max_iters):
    warmup_iters = int(config["WARMUP_RATIO"]*max_iters)
    if it < warmup_iters:
        return confige["LEARNING_RATE"] * it / warmup_iters
    if it > max_iters:
        return confige["MIN_LEARNING_RATE"]
    
    #Cosine decay after warmup phase is over
    decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return confige["MIN_LEARNING_RATE"] + coeff * (confige["LEARNING_RATE"] - confige["MIN_LEARNING_RATE"])


# Create model
def create_or_load_model(checkpointed_path=None, quantized=confige["PEFT"], frozen=False, cast_layer_norm_to_fp32=False, cast_output_to_fp32=False):
    class CastOutputToFloat(torch.nn.Sequential):
        def forward(self, x): return super().forward(x).to(torch.float32)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if checkpointed_path:
        model = AutoModelForCausalLM.from_pretrained(checkpointed_path)
        model.to(device)
    else:
        configuration = AutoConfig.from_pretrained(confige["MODEL_NAME"])
        
        if quantized:
             model = AutoModelForCausalLM.from_pretrained(confige["MODEL_NAME"], config=configuration, load_in_8bit=True, device_map='auto', quantization_config=quantization_config)
        else:
            model = AutoModelForCausalLM.from_pretrained(confige["MODEL_NAME"], config=configuration)
            model.to(device)
            
        if frozen:
            for param in model.parameters():
                param.requires_grad = False
                
        if cast_layer_norm_to_fp32:
            for param in model.parameters():
                if param.ndim == 1:
                    param.data = param.data.to(torch.float32)
                
    #Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    
    if cast_output_to_fp32:
        model.lm_head = CastOutputToFloat(model.lm_head)
    
    # Log details
    logger.info(f"Model: {confige['MODEL_NAME']}")
    print_trainable_parameters(model)
    logger.info(f"Memory Memory Footprint: {model.get_memory_footprint() / 1e6:,} MB")
    logger.info(f"Model is on device: {model.device}")
    
    model.config.use_cache = False
    return model, device

# Use the model to generate text
def generate(model, inputs):
    output_sequence = model.generate(
        **inputs,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        min_length=confige["MIN_GENERATION"],
        max_length=2*confige["MIN_GENERATION"],
        top_p=0.95,
        num_return_sequences=1
    )
    return output_sequence
    
def inference(model, tokenizer, device, quantized=confige["PEFT"]):
    # Put the model in eval mode and enable caching
    model.config.use_cache = True
    model.eval()
    
    inputs = tokenizer(tokenizer.eos_token+"This is", return_tensors="pt").to(device)
    # Generate a sequence of text tokens
    with torch.no_grad():
        if quantized:
            with torch.cuda.amp.autocast():
                output_sequence = generate(model, inputs)
        else:
            output_sequence = generate(model, inputs)
        

    # Decode the tokens to text
    generated_text = tokenizer.decode(output_sequence[0], 
                                      skip_special_tokens=True).replace('\n', '').replace('\t', ' ')

    # Put the model back into train mode and disable caching
    model.train()
    model.config.use_cache = False
    
    return generated_text

# Evaluate the model on a data loader
def validate(model, device, valid_dataloader):
    model.eval()
    total_eval_loss = 0.0
    counter = 0
    for index, batch in tqdm(enumerate(valid_dataloader,1)):
        if counter<5:
                print_gpu_utilization()
                counter+=1
        batch = {k: v.pin_memory().to(device, non_blocking=True) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            loss = outputs.loss
        total_eval_loss += loss.item()
        avg_eval_loss = total_eval_loss / index
        logging.info(f"Validation: Batch {index}/{len(valid_dataloader)}, Loss: {avg_eval_loss:.4f}")

    perplexity = torch.exp(torch.as_tensor(avg_eval_loss)).item()
    model.train()
    return avg_eval_loss, perplexity



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/sa7055/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/sa7055/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


2023-05-05 08:59:04,248 - INFO - <module>:85 - Is Flash Attention Enabled: True
2023-05-05 08:59:04,249 - INFO - <module>:86 - Is Mem Efficient SDP Enabled: False
2023-05-05 08:59:04,249 - INFO - <module>:87 - Is Math SDP Enabled: False


In [4]:
# Log in to your W&B account
import wandb
wandb.login()

2023-05-05 08:59:04,458 - ERROR - notebook_metadata:231 - Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msujaymanutd07[0m ([33mproject_work[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
sweep_config = {
    'method': 'random'
    }

In [6]:
metric = {
    'name': 'loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

In [7]:
parameters_dict = {
    "GRADIENT_ACCUMULATION_STEPS": {
    'values': [4,8,16]
        },    
    "LEARNING_RATE": {
    "values": [5e-5, 5e-4]
},
    "GRADIENT_CLIP": {
          'values': [0.5, 1, 2.5,5]
        },
    "BETAS":{
        'values':[(0.85,0.9),(0.9,0.95)]
    },
    "WEIGHT_DECAY": {
        'values': [0.001,0.01]
    },
    "SAMPLING_INTERVAL": {
        'values':[10,20,50]
    },
    "NUM_EPOCHS": {
        'value': 1}
    
    }

sweep_config['parameters'] = parameters_dict

In [8]:
import pprint

pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'BETAS': {'values': [(0.85, 0.9), (0.9, 0.95)]},
                'GRADIENT_ACCUMULATION_STEPS': {'values': [4, 8, 16]},
                'GRADIENT_CLIP': {'values': [0.5, 1, 2.5, 5]},
                'LEARNING_RATE': {'values': [5e-05, 0.0005]},
                'NUM_EPOCHS': {'value': 1},
                'SAMPLING_INTERVAL': {'values': [10, 20, 50]},
                'WEIGHT_DECAY': {'values': [0.001, 0.01]}}}


In [9]:

# Train the model
def train(model, device, data_dict, start_epoch=1, start_iteration_number=0):
    with wandb.init(config=sweep_config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        

        
        train_dataloader = data_dict["TRAIN_DATALOADER"]
        valid_dataloader = data_dict["VALIDATION_DATALOADER"]
        tokenizer = data_dict["TOKENIZER"]

        # Scaler for mixed precision training
        scaler = torch.cuda.amp.GradScaler(enabled=True)

        # Early stopping
        patience = 5
        min_loss = float("inf")
        epochs_since_min_loss = 0

        optimizer = Adafactor(model.parameters(), lr=config["LEARNING_RATE"], scale_parameter=False, relative_step=False, warmup_init=False)
        model.train()

        max_iters = len(train_dataloader)*config["NUM_EPOCHS"] 
        learning_rate = config["LEARNING_RATE"]
        VALIDATION_INTERVAL = 500
        # Go through each epoch
        iteration_number = start_iteration_number
        for epoch in tqdm(range(start_epoch,config["NUM_EPOCHS"]+1)):
            iteration_number_per_epoch = 0
            running_loss = 0.0
            logging.info(f"Epoch: {epoch}/{config['NUM_EPOCHS']}")

            #Go through each batch in the data loader
            for index, batch in tqdm(enumerate(train_dataloader, 1), total=len(train_dataloader)):
                iteration_number+=1
                optimizer.zero_grad(set_to_none=True)

                # For the initial warmup phase, keep an eye on the GPU utilization
                if iteration_number_per_epoch<5:
                    print_gpu_utilization()
                    iteration_number_per_epoch+=1

                #Validate the model at each validation interval
                if index%VALIDATION_INTERVAL==0:
                    logging.info("Running Validation...")
                    avg_eval_loss, perplexity = validate(model, device, valid_dataloader)
                    logging.info(f"Batch {index}/{len(train_dataloader)}, Validation Loss: {avg_eval_loss:.4f}, Perplexity: {perplexity:.2f}")
                    with open(f"{log_save_path}/validation.log", "a") as f:
                        f.write(f"{epoch}\t{index}\t{avg_eval_loss}\t{perplexity}\n")

                #Load batches in a non-blocking manner
                batch = {k: v.pin_memory().to(device, non_blocking=True) for k, v in batch.items()}

                #Forward pass using mixed precision training
                with torch.cuda.amp.autocast(dtype=torch.float16):
                    outputs = model(**batch)
                    loss = outputs.loss
                    loss = loss / config["GRADIENT_ACCUMULATION_STEPS"]

                # Log the loss
                running_loss += (loss.item()*config["GRADIENT_ACCUMULATION_STEPS"])
                avg_loss = running_loss / index
                logging.info(f"Batch {index}/{len(train_dataloader)}, Loss: {avg_loss:.4f}, Learning Rate: {learning_rate}")
                wandb.log({"loss": avg_loss, "epoch": epoch})
                # Backward pass
                scaler.scale(loss).backward()

                if index % config["GRADIENT_ACCUMULATION_STEPS"] == 0:
                    # Gradient clipping mechanism
                    if "GRADIENT_CLIP" in config:
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), config["GRADIENT_CLIP"])
                        scaler.step(optimizer)
                    scaler.update()


            # After each epoch, check if the training loss has improved
            if avg_loss < min_loss:
                min_loss = avg_loss
                epochs_since_min_loss = 0
            else:
                epochs_since_min_loss += 1

            # Early stopping mechanism
            if epochs_since_min_loss >= patience:
                logger.info("Early stopping triggered. No improvement in training loss for {} epochs.".format(patience))
                break

In [10]:
import wandb


sweep_id = wandb.sweep(sweep_config,project = "fine_tuning")

print_gpu_utilization()
checkpointed_path = None
tokenizer = create_or_load_tokenizer(checkpointed_path=checkpointed_path)
data_dict = run_data_pipeline(tokenizer, load_from_file=False)


def train_agent():
    wandb.init(config=sweep_config)
    config = wandb.config
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #model = MyModel(config.input_size, config.hidden_size, config.output_size)
    model, device = create_or_load_model(checkpointed_path=checkpointed_path)
    train(model, device, data_dict)

wandb.agent(sweep_id, function=train_agent, count=20)


2023-05-05 08:59:08,216 - INFO - print_gpu_utilization:106 - GPU memory occupied: 526 MB.


Create sweep with ID: rz49gtgw
Sweep URL: https://wandb.ai/project_work/fine_tuning/sweeps/rz49gtgw


2023-05-05 08:59:08,625 - INFO - run_data_pipeline:222 - Baseline: RAM used: 436.96 MB
2023-05-05 08:59:11,819 - INFO - run_data_pipeline:244 - RAM used: 445.11 MB
2023-05-05 08:59:11,822 - INFO - run_data_pipeline:245 - Dataset sample: {'meta': {'APPLICATION_ID': 100075}, 'text': "ACF's Office of Refugee Resettlement (ORR) administers a variety of social service programs intended to connect newly resettled refugees with critical resources, help them become economically self-sufficient, and help them integrate into American society. One such program is the Refugee Cash Assistance (RCA) program, which provides both financial support and social services to newly resettled refugees. Refugee Cash Assistance is similar to TANF in that both are cash assistance programs that provide services aimed at promoting self-sufficiency; however the content, mode of delivery and rules surrounding these services vary significantly by state and locality. Some counties and states have reportedly integrate

2023-05-05 08:59:33,915 - INFO - create_or_load_model:370 - Model: facebook/opt-125m
2023-05-05 08:59:33,917 - INFO - print_trainable_parameters:121 - Parameters: Trainable- 125.24M|| All- 125.24M || Trainable%- 100.0
2023-05-05 08:59:33,918 - INFO - create_or_load_model:372 - Memory Memory Footprint: 500.957184 MB
2023-05-05 08:59:33,919 - INFO - create_or_load_model:373 - Model is on device: cuda:0


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666820525036504, max=1.0)…

  0%|          | 0/1 [00:00<?, ?it/s]2023-05-05 08:59:39,647 - INFO - train:33 - Epoch: 1/1

  0%|          | 0/13861 [00:00<?, ?it/s][A2023-05-05 08:59:44,699 - INFO - print_gpu_utilization:106 - GPU memory occupied: 1604 MB.
2023-05-05 08:59:53,123 - INFO - train:65 - Batch 1/13861, Loss: 3.3928, Learning Rate: 5e-05

  0%|          | 1/13861 [00:14<54:37:31, 14.19s/it][A2023-05-05 08:59:54,475 - INFO - print_gpu_utilization:106 - GPU memory occupied: 27930 MB.
2023-05-05 08:59:54,905 - INFO - train:65 - Batch 2/13861, Loss: 3.3873, Learning Rate: 5e-05

  0%|          | 2/13861 [00:15<26:11:15,  6.80s/it][A2023-05-05 08:59:56,107 - INFO - print_gpu_utilization:106 - GPU memory occupied: 34202 MB.
2023-05-05 08:59:56,538 - INFO - train:65 - Batch 3/13861, Loss: 3.3769, Learning Rate: 5e-05

  0%|          | 3/13861 [00:17<17:05:14,  4.44s/it][A2023-05-05 08:59:57,733 - INFO - print_gpu_utilization:106 - GPU memory occupied: 34202 MB.
2023-05-05 08:59:58,164 - INFO - train:65 - Ba


  0%|          | 51/13861 [01:36<6:16:14,  1.63s/it][A2023-05-05 09:01:17,697 - INFO - train:65 - Batch 52/13861, Loss: 3.3523, Learning Rate: 5e-05

  0%|          | 52/13861 [01:38<6:15:51,  1.63s/it][A2023-05-05 09:01:19,327 - INFO - train:65 - Batch 53/13861, Loss: 3.3526, Learning Rate: 5e-05

  0%|          | 53/13861 [01:40<6:15:35,  1.63s/it][A2023-05-05 09:01:20,956 - INFO - train:65 - Batch 54/13861, Loss: 3.3526, Learning Rate: 5e-05

  0%|          | 54/13861 [01:41<6:15:23,  1.63s/it][A2023-05-05 09:01:22,587 - INFO - train:65 - Batch 55/13861, Loss: 3.3520, Learning Rate: 5e-05

  0%|          | 55/13861 [01:43<6:15:16,  1.63s/it][A2023-05-05 09:01:24,217 - INFO - train:65 - Batch 56/13861, Loss: 3.3505, Learning Rate: 5e-05

  0%|          | 56/13861 [01:45<6:18:28,  1.64s/it][A2023-05-05 09:01:25,894 - INFO - train:65 - Batch 57/13861, Loss: 3.3520, Learning Rate: 5e-05

  0%|          | 57/13861 [01:46<6:17:28,  1.64s/it][A2023-05-05 09:01:27,525 - INFO - train


  1%|          | 159/13861 [04:33<6:13:43,  1.64s/it][A2023-05-05 09:04:14,694 - INFO - train:65 - Batch 160/13861, Loss: 3.3185, Learning Rate: 5e-05

  1%|          | 160/13861 [04:35<6:16:46,  1.65s/it][A2023-05-05 09:04:16,376 - INFO - train:65 - Batch 161/13861, Loss: 3.3181, Learning Rate: 5e-05

  1%|          | 161/13861 [04:37<6:15:45,  1.65s/it][A2023-05-05 09:04:18,012 - INFO - train:65 - Batch 162/13861, Loss: 3.3178, Learning Rate: 5e-05

  1%|          | 162/13861 [04:38<6:14:58,  1.64s/it][A2023-05-05 09:04:19,646 - INFO - train:65 - Batch 163/13861, Loss: 3.3176, Learning Rate: 5e-05

  1%|          | 163/13861 [04:40<6:14:31,  1.64s/it][A2023-05-05 09:04:21,283 - INFO - train:65 - Batch 164/13861, Loss: 3.3170, Learning Rate: 5e-05

  1%|          | 164/13861 [04:42<6:14:11,  1.64s/it][A2023-05-05 09:04:22,919 - INFO - train:65 - Batch 165/13861, Loss: 3.3166, Learning Rate: 5e-05

  1%|          | 165/13861 [04:43<6:13:56,  1.64s/it][A2023-05-05 09:04:24,555 -


  2%|▏         | 213/13861 [06:02<6:12:45,  1.64s/it][A2023-05-05 09:05:43,363 - INFO - train:65 - Batch 214/13861, Loss: 3.2967, Learning Rate: 5e-05

  2%|▏         | 214/13861 [06:04<6:12:32,  1.64s/it][A2023-05-05 09:05:45,000 - INFO - train:65 - Batch 215/13861, Loss: 3.2968, Learning Rate: 5e-05

  2%|▏         | 215/13861 [06:05<6:12:24,  1.64s/it][A2023-05-05 09:05:46,636 - INFO - train:65 - Batch 216/13861, Loss: 3.2968, Learning Rate: 5e-05

  2%|▏         | 216/13861 [06:07<6:15:29,  1.65s/it][A2023-05-05 09:05:48,319 - INFO - train:65 - Batch 217/13861, Loss: 3.2964, Learning Rate: 5e-05

  2%|▏         | 217/13861 [06:09<6:14:28,  1.65s/it][A2023-05-05 09:05:49,955 - INFO - train:65 - Batch 218/13861, Loss: 3.2964, Learning Rate: 5e-05

  2%|▏         | 218/13861 [06:10<6:13:43,  1.64s/it][A2023-05-05 09:05:51,591 - INFO - train:65 - Batch 219/13861, Loss: 3.2965, Learning Rate: 5e-05

  2%|▏         | 219/13861 [06:12<6:13:16,  1.64s/it][A2023-05-05 09:05:53,229 -


  2%|▏         | 267/13861 [07:31<6:12:06,  1.64s/it][A2023-05-05 09:07:12,085 - INFO - train:65 - Batch 268/13861, Loss: 3.2836, Learning Rate: 5e-05

  2%|▏         | 268/13861 [07:33<6:11:46,  1.64s/it][A2023-05-05 09:07:13,723 - INFO - train:65 - Batch 269/13861, Loss: 3.2833, Learning Rate: 5e-05

  2%|▏         | 269/13861 [07:34<6:11:30,  1.64s/it][A2023-05-05 09:07:15,360 - INFO - train:65 - Batch 270/13861, Loss: 3.2829, Learning Rate: 5e-05

  2%|▏         | 270/13861 [07:36<6:11:18,  1.64s/it][A2023-05-05 09:07:16,998 - INFO - train:65 - Batch 271/13861, Loss: 3.2828, Learning Rate: 5e-05

  2%|▏         | 271/13861 [07:37<6:11:06,  1.64s/it][A2023-05-05 09:07:18,634 - INFO - train:65 - Batch 272/13861, Loss: 3.2825, Learning Rate: 5e-05

  2%|▏         | 272/13861 [07:39<6:14:10,  1.65s/it][A2023-05-05 09:07:20,319 - INFO - train:65 - Batch 273/13861, Loss: 3.2819, Learning Rate: 5e-05

  2%|▏         | 273/13861 [07:41<6:13:06,  1.65s/it][A2023-05-05 09:07:21,956 -


  2%|▏         | 321/13861 [09:00<6:11:46,  1.65s/it][A2023-05-05 09:08:40,829 - INFO - train:65 - Batch 322/13861, Loss: 3.2758, Learning Rate: 5e-05

  2%|▏         | 322/13861 [09:01<6:11:03,  1.64s/it][A2023-05-05 09:08:42,466 - INFO - train:65 - Batch 323/13861, Loss: 3.2756, Learning Rate: 5e-05

  2%|▏         | 323/13861 [09:03<6:10:31,  1.64s/it][A2023-05-05 09:08:44,103 - INFO - train:65 - Batch 324/13861, Loss: 3.2757, Learning Rate: 5e-05

  2%|▏         | 324/13861 [09:05<6:10:12,  1.64s/it][A2023-05-05 09:08:45,741 - INFO - train:65 - Batch 325/13861, Loss: 3.2754, Learning Rate: 5e-05

  2%|▏         | 325/13861 [09:06<6:09:55,  1.64s/it][A2023-05-05 09:08:47,378 - INFO - train:65 - Batch 326/13861, Loss: 3.2754, Learning Rate: 5e-05

  2%|▏         | 326/13861 [09:08<6:09:43,  1.64s/it][A2023-05-05 09:08:49,016 - INFO - train:65 - Batch 327/13861, Loss: 3.2753, Learning Rate: 5e-05

  2%|▏         | 327/13861 [09:09<6:09:40,  1.64s/it][A2023-05-05 09:08:50,654 -


  3%|▎         | 375/13861 [10:28<6:08:19,  1.64s/it][A2023-05-05 09:10:09,533 - INFO - train:65 - Batch 376/13861, Loss: 3.2673, Learning Rate: 5e-05

  3%|▎         | 376/13861 [10:30<6:11:21,  1.65s/it][A2023-05-05 09:10:11,217 - INFO - train:65 - Batch 377/13861, Loss: 3.2669, Learning Rate: 5e-05

  3%|▎         | 377/13861 [10:32<6:10:20,  1.65s/it][A2023-05-05 09:10:12,854 - INFO - train:65 - Batch 378/13861, Loss: 3.2668, Learning Rate: 5e-05

  3%|▎         | 378/13861 [10:33<6:09:32,  1.64s/it][A2023-05-05 09:10:14,491 - INFO - train:65 - Batch 379/13861, Loss: 3.2666, Learning Rate: 5e-05

  3%|▎         | 379/13861 [10:35<6:09:05,  1.64s/it][A2023-05-05 09:10:16,129 - INFO - train:65 - Batch 380/13861, Loss: 3.2663, Learning Rate: 5e-05

  3%|▎         | 380/13861 [10:37<6:08:42,  1.64s/it][A2023-05-05 09:10:17,767 - INFO - train:65 - Batch 381/13861, Loss: 3.2661, Learning Rate: 5e-05

  3%|▎         | 381/13861 [10:38<6:08:26,  1.64s/it][A2023-05-05 09:10:19,405 -


  3%|▎         | 429/13861 [11:57<6:07:11,  1.64s/it][A2023-05-05 09:11:38,293 - INFO - train:65 - Batch 430/13861, Loss: 3.2566, Learning Rate: 5e-05

  3%|▎         | 430/13861 [11:59<6:06:56,  1.64s/it][A2023-05-05 09:11:39,930 - INFO - train:65 - Batch 431/13861, Loss: 3.2564, Learning Rate: 5e-05

  3%|▎         | 431/13861 [12:00<6:06:46,  1.64s/it][A2023-05-05 09:11:41,567 - INFO - train:65 - Batch 432/13861, Loss: 3.2563, Learning Rate: 5e-05

  3%|▎         | 432/13861 [12:02<6:09:45,  1.65s/it][A2023-05-05 09:11:43,250 - INFO - train:65 - Batch 433/13861, Loss: 3.2562, Learning Rate: 5e-05

  3%|▎         | 433/13861 [12:04<6:08:45,  1.65s/it][A2023-05-05 09:11:44,889 - INFO - train:65 - Batch 434/13861, Loss: 3.2562, Learning Rate: 5e-05

  3%|▎         | 434/13861 [12:05<6:08:02,  1.64s/it][A2023-05-05 09:11:46,526 - INFO - train:65 - Batch 435/13861, Loss: 3.2560, Learning Rate: 5e-05

  3%|▎         | 435/13861 [12:07<6:07:32,  1.64s/it][A2023-05-05 09:11:48,163 -


  3%|▎         | 483/13861 [13:26<6:06:19,  1.64s/it][A2023-05-05 09:13:07,061 - INFO - train:65 - Batch 484/13861, Loss: 3.2503, Learning Rate: 5e-05

  3%|▎         | 484/13861 [13:27<6:05:54,  1.64s/it][A2023-05-05 09:13:08,698 - INFO - train:65 - Batch 485/13861, Loss: 3.2501, Learning Rate: 5e-05

  3%|▎         | 485/13861 [13:29<6:05:39,  1.64s/it][A2023-05-05 09:13:10,336 - INFO - train:65 - Batch 486/13861, Loss: 3.2502, Learning Rate: 5e-05

  4%|▎         | 486/13861 [13:31<6:05:27,  1.64s/it][A2023-05-05 09:13:11,973 - INFO - train:65 - Batch 487/13861, Loss: 3.2502, Learning Rate: 5e-05

  4%|▎         | 487/13861 [13:32<6:05:18,  1.64s/it][A2023-05-05 09:13:13,611 - INFO - train:65 - Batch 488/13861, Loss: 3.2500, Learning Rate: 5e-05

  4%|▎         | 488/13861 [13:34<6:08:16,  1.65s/it][A2023-05-05 09:13:15,295 - INFO - train:65 - Batch 489/13861, Loss: 3.2499, Learning Rate: 5e-05

  4%|▎         | 489/13861 [13:36<6:07:17,  1.65s/it][A2023-05-05 09:13:16,932 -



44it [00:47,  1.05s/it][A[A2023-05-05 09:14:21,916 - INFO - validate:433 - Validation: Batch 45/1541, Loss: 3.2862


45it [00:48,  1.05s/it][A[A2023-05-05 09:14:22,972 - INFO - validate:433 - Validation: Batch 46/1541, Loss: 3.2884


46it [00:49,  1.05s/it][A[A2023-05-05 09:14:24,020 - INFO - validate:433 - Validation: Batch 47/1541, Loss: 3.2902


47it [00:50,  1.05s/it][A[A2023-05-05 09:14:25,079 - INFO - validate:433 - Validation: Batch 48/1541, Loss: 3.2944


48it [00:51,  1.05s/it][A[A2023-05-05 09:14:26,124 - INFO - validate:433 - Validation: Batch 49/1541, Loss: 3.2936


49it [00:52,  1.05s/it][A[A2023-05-05 09:14:27,161 - INFO - validate:433 - Validation: Batch 50/1541, Loss: 3.2956


50it [00:53,  1.05s/it][A[A2023-05-05 09:14:28,208 - INFO - validate:433 - Validation: Batch 51/1541, Loss: 3.2935


51it [00:54,  1.05s/it][A[A2023-05-05 09:14:29,264 - INFO - validate:433 - Validation: Batch 52/1541, Loss: 3.2919


52it [00:56,  1.05s/it][A[A2023-05-05 09:14:



113it [02:00,  1.06s/it][A[A2023-05-05 09:15:35,005 - INFO - validate:433 - Validation: Batch 114/1541, Loss: 3.2065


114it [02:01,  1.06s/it][A[A2023-05-05 09:15:36,066 - INFO - validate:433 - Validation: Batch 115/1541, Loss: 3.2069


115it [02:02,  1.06s/it][A[A2023-05-05 09:15:37,119 - INFO - validate:433 - Validation: Batch 116/1541, Loss: 3.2067


116it [02:03,  1.06s/it][A[A2023-05-05 09:15:38,177 - INFO - validate:433 - Validation: Batch 117/1541, Loss: 3.2060


117it [02:04,  1.06s/it][A[A2023-05-05 09:15:39,236 - INFO - validate:433 - Validation: Batch 118/1541, Loss: 3.2059


118it [02:05,  1.06s/it][A[A2023-05-05 09:15:40,296 - INFO - validate:433 - Validation: Batch 119/1541, Loss: 3.2057


119it [02:07,  1.06s/it][A[A2023-05-05 09:15:41,357 - INFO - validate:433 - Validation: Batch 120/1541, Loss: 3.2044


120it [02:08,  1.06s/it][A[A2023-05-05 09:15:42,421 - INFO - validate:433 - Validation: Batch 121/1541, Loss: 3.2041


121it [02:09,  1.06s/it][A[A



181it [03:12,  1.06s/it][A[A2023-05-05 09:16:47,022 - INFO - validate:433 - Validation: Batch 182/1541, Loss: 3.1938


182it [03:13,  1.06s/it][A[A2023-05-05 09:16:48,082 - INFO - validate:433 - Validation: Batch 183/1541, Loss: 3.1934


183it [03:14,  1.06s/it][A[A2023-05-05 09:16:49,141 - INFO - validate:433 - Validation: Batch 184/1541, Loss: 3.1931


184it [03:15,  1.06s/it][A[A2023-05-05 09:16:50,200 - INFO - validate:433 - Validation: Batch 185/1541, Loss: 3.1931


185it [03:16,  1.06s/it][A[A2023-05-05 09:16:51,256 - INFO - validate:433 - Validation: Batch 186/1541, Loss: 3.1937


186it [03:17,  1.06s/it][A[A2023-05-05 09:16:52,314 - INFO - validate:433 - Validation: Batch 187/1541, Loss: 3.1937


187it [03:19,  1.06s/it][A[A2023-05-05 09:16:53,376 - INFO - validate:433 - Validation: Batch 188/1541, Loss: 3.1936


188it [03:20,  1.06s/it][A[A2023-05-05 09:16:54,435 - INFO - validate:433 - Validation: Batch 189/1541, Loss: 3.1932


189it [03:21,  1.06s/it][A[A



249it [04:24,  1.06s/it][A[A2023-05-05 09:17:59,059 - INFO - validate:433 - Validation: Batch 250/1541, Loss: 3.1848


250it [04:25,  1.06s/it][A[A2023-05-05 09:18:00,119 - INFO - validate:433 - Validation: Batch 251/1541, Loss: 3.1848


251it [04:26,  1.06s/it][A[A2023-05-05 09:18:01,177 - INFO - validate:433 - Validation: Batch 252/1541, Loss: 3.1846


252it [04:27,  1.06s/it][A[A2023-05-05 09:18:02,237 - INFO - validate:433 - Validation: Batch 253/1541, Loss: 3.1843


253it [04:28,  1.06s/it][A[A2023-05-05 09:18:03,299 - INFO - validate:433 - Validation: Batch 254/1541, Loss: 3.1842


254it [04:30,  1.06s/it][A[A2023-05-05 09:18:04,358 - INFO - validate:433 - Validation: Batch 255/1541, Loss: 3.1842


255it [04:31,  1.06s/it][A[A2023-05-05 09:18:05,417 - INFO - validate:433 - Validation: Batch 256/1541, Loss: 3.1841


256it [04:32,  1.06s/it][A[A2023-05-05 09:18:06,475 - INFO - validate:433 - Validation: Batch 257/1541, Loss: 3.1842


257it [04:33,  1.06s/it][A[A



317it [05:36,  1.06s/it][A[A2023-05-05 09:19:11,077 - INFO - validate:433 - Validation: Batch 318/1541, Loss: 3.1821


318it [05:37,  1.06s/it][A[A2023-05-05 09:19:12,137 - INFO - validate:433 - Validation: Batch 319/1541, Loss: 3.1821


319it [05:38,  1.06s/it][A[A2023-05-05 09:19:13,197 - INFO - validate:433 - Validation: Batch 320/1541, Loss: 3.1821


320it [05:39,  1.06s/it][A[A2023-05-05 09:19:14,256 - INFO - validate:433 - Validation: Batch 321/1541, Loss: 3.1821


321it [05:40,  1.06s/it][A[A2023-05-05 09:19:15,315 - INFO - validate:433 - Validation: Batch 322/1541, Loss: 3.1822


322it [05:42,  1.06s/it][A[A2023-05-05 09:19:16,373 - INFO - validate:433 - Validation: Batch 323/1541, Loss: 3.1822


323it [05:43,  1.06s/it][A[A2023-05-05 09:19:17,433 - INFO - validate:433 - Validation: Batch 324/1541, Loss: 3.1819


324it [05:44,  1.06s/it][A[A2023-05-05 09:19:18,494 - INFO - validate:433 - Validation: Batch 325/1541, Loss: 3.1818


325it [05:45,  1.06s/it][A[A



385it [06:48,  1.06s/it][A[A2023-05-05 09:20:23,087 - INFO - validate:433 - Validation: Batch 386/1541, Loss: 3.1829


386it [06:49,  1.06s/it][A[A2023-05-05 09:20:24,144 - INFO - validate:433 - Validation: Batch 387/1541, Loss: 3.1828


387it [06:50,  1.06s/it][A[A2023-05-05 09:20:25,205 - INFO - validate:433 - Validation: Batch 388/1541, Loss: 3.1825


388it [06:51,  1.06s/it][A[A2023-05-05 09:20:26,266 - INFO - validate:433 - Validation: Batch 389/1541, Loss: 3.1826


389it [06:53,  1.06s/it][A[A2023-05-05 09:20:27,324 - INFO - validate:433 - Validation: Batch 390/1541, Loss: 3.1827


390it [06:54,  1.06s/it][A[A2023-05-05 09:20:28,384 - INFO - validate:433 - Validation: Batch 391/1541, Loss: 3.1829


391it [06:55,  1.06s/it][A[A2023-05-05 09:20:29,440 - INFO - validate:433 - Validation: Batch 392/1541, Loss: 3.1828


392it [06:56,  1.06s/it][A[A2023-05-05 09:20:30,500 - INFO - validate:433 - Validation: Batch 393/1541, Loss: 3.1828


393it [06:57,  1.06s/it][A[A



453it [08:00,  1.06s/it][A[A2023-05-05 09:21:35,100 - INFO - validate:433 - Validation: Batch 454/1541, Loss: 3.1825


454it [08:01,  1.06s/it][A[A2023-05-05 09:21:36,158 - INFO - validate:433 - Validation: Batch 455/1541, Loss: 3.1827


455it [08:02,  1.06s/it][A[A2023-05-05 09:21:37,218 - INFO - validate:433 - Validation: Batch 456/1541, Loss: 3.1826


456it [08:03,  1.06s/it][A[A2023-05-05 09:21:38,276 - INFO - validate:433 - Validation: Batch 457/1541, Loss: 3.1825


457it [08:05,  1.06s/it][A[A2023-05-05 09:21:39,336 - INFO - validate:433 - Validation: Batch 458/1541, Loss: 3.1824


458it [08:06,  1.06s/it][A[A2023-05-05 09:21:40,395 - INFO - validate:433 - Validation: Batch 459/1541, Loss: 3.1822


459it [08:07,  1.06s/it][A[A2023-05-05 09:21:41,457 - INFO - validate:433 - Validation: Batch 460/1541, Loss: 3.1821


460it [08:08,  1.06s/it][A[A2023-05-05 09:21:42,515 - INFO - validate:433 - Validation: Batch 461/1541, Loss: 3.1820


461it [08:09,  1.06s/it][A[A



521it [09:12,  1.06s/it][A[A2023-05-05 09:22:47,109 - INFO - validate:433 - Validation: Batch 522/1541, Loss: 3.1811


522it [09:13,  1.06s/it][A[A2023-05-05 09:22:48,168 - INFO - validate:433 - Validation: Batch 523/1541, Loss: 3.1810


523it [09:14,  1.06s/it][A[A2023-05-05 09:22:49,227 - INFO - validate:433 - Validation: Batch 524/1541, Loss: 3.1810


524it [09:15,  1.06s/it][A[A2023-05-05 09:22:50,285 - INFO - validate:433 - Validation: Batch 525/1541, Loss: 3.1810


525it [09:17,  1.06s/it][A[A2023-05-05 09:22:51,346 - INFO - validate:433 - Validation: Batch 526/1541, Loss: 3.1810


526it [09:18,  1.06s/it][A[A2023-05-05 09:22:52,405 - INFO - validate:433 - Validation: Batch 527/1541, Loss: 3.1810


527it [09:19,  1.06s/it][A[A2023-05-05 09:22:53,464 - INFO - validate:433 - Validation: Batch 528/1541, Loss: 3.1810


528it [09:20,  1.06s/it][A[A2023-05-05 09:22:54,523 - INFO - validate:433 - Validation: Batch 529/1541, Loss: 3.1811


529it [09:21,  1.06s/it][A[A



589it [10:24,  1.06s/it][A[A2023-05-05 09:23:59,153 - INFO - validate:433 - Validation: Batch 590/1541, Loss: 3.1808


590it [10:25,  1.06s/it][A[A2023-05-05 09:24:00,211 - INFO - validate:433 - Validation: Batch 591/1541, Loss: 3.1807


591it [10:26,  1.06s/it][A[A2023-05-05 09:24:01,273 - INFO - validate:433 - Validation: Batch 592/1541, Loss: 3.1807


592it [10:28,  1.06s/it][A[A2023-05-05 09:24:02,331 - INFO - validate:433 - Validation: Batch 593/1541, Loss: 3.1807


593it [10:29,  1.06s/it][A[A2023-05-05 09:24:03,394 - INFO - validate:433 - Validation: Batch 594/1541, Loss: 3.1807


594it [10:30,  1.06s/it][A[A2023-05-05 09:24:04,450 - INFO - validate:433 - Validation: Batch 595/1541, Loss: 3.1809


595it [10:31,  1.06s/it][A[A2023-05-05 09:24:05,507 - INFO - validate:433 - Validation: Batch 596/1541, Loss: 3.1809


596it [10:32,  1.06s/it][A[A2023-05-05 09:24:06,565 - INFO - validate:433 - Validation: Batch 597/1541, Loss: 3.1810


597it [10:33,  1.06s/it][A[A



657it [11:36,  1.06s/it][A[A2023-05-05 09:25:11,160 - INFO - validate:433 - Validation: Batch 658/1541, Loss: 3.1816


658it [11:37,  1.06s/it][A[A2023-05-05 09:25:12,220 - INFO - validate:433 - Validation: Batch 659/1541, Loss: 3.1817


659it [11:38,  1.06s/it][A[A2023-05-05 09:25:13,278 - INFO - validate:433 - Validation: Batch 660/1541, Loss: 3.1817


660it [11:40,  1.06s/it][A[A2023-05-05 09:25:14,335 - INFO - validate:433 - Validation: Batch 661/1541, Loss: 3.1819


661it [11:41,  1.06s/it][A[A2023-05-05 09:25:15,393 - INFO - validate:433 - Validation: Batch 662/1541, Loss: 3.1820


662it [11:42,  1.06s/it][A[A2023-05-05 09:25:16,450 - INFO - validate:433 - Validation: Batch 663/1541, Loss: 3.1820


663it [11:43,  1.06s/it][A[A2023-05-05 09:25:17,509 - INFO - validate:433 - Validation: Batch 664/1541, Loss: 3.1821


664it [11:44,  1.06s/it][A[A2023-05-05 09:25:18,571 - INFO - validate:433 - Validation: Batch 665/1541, Loss: 3.1822


665it [11:45,  1.06s/it][A[A



725it [12:48,  1.06s/it][A[A2023-05-05 09:26:23,062 - INFO - validate:433 - Validation: Batch 726/1541, Loss: 3.1891


726it [12:49,  1.06s/it][A[A2023-05-05 09:26:24,118 - INFO - validate:433 - Validation: Batch 727/1541, Loss: 3.1892


727it [12:50,  1.06s/it][A[A2023-05-05 09:26:25,173 - INFO - validate:433 - Validation: Batch 728/1541, Loss: 3.1893


728it [12:51,  1.06s/it][A[A2023-05-05 09:26:26,230 - INFO - validate:433 - Validation: Batch 729/1541, Loss: 3.1893


729it [12:52,  1.06s/it][A[A2023-05-05 09:26:27,282 - INFO - validate:433 - Validation: Batch 730/1541, Loss: 3.1894


730it [12:54,  1.06s/it][A[A2023-05-05 09:26:28,342 - INFO - validate:433 - Validation: Batch 731/1541, Loss: 3.1895


731it [12:55,  1.06s/it][A[A2023-05-05 09:26:29,400 - INFO - validate:433 - Validation: Batch 732/1541, Loss: 3.1896


732it [12:56,  1.06s/it][A[A2023-05-05 09:26:30,455 - INFO - validate:433 - Validation: Batch 733/1541, Loss: 3.1897


733it [12:57,  1.06s/it][A[A



793it [14:00,  1.05s/it][A[A2023-05-05 09:27:34,738 - INFO - validate:433 - Validation: Batch 794/1541, Loss: 3.1900


794it [14:01,  1.05s/it][A[A2023-05-05 09:27:35,789 - INFO - validate:433 - Validation: Batch 795/1541, Loss: 3.1899


795it [14:02,  1.05s/it][A[A2023-05-05 09:27:36,845 - INFO - validate:433 - Validation: Batch 796/1541, Loss: 3.1899


796it [14:03,  1.05s/it][A[A2023-05-05 09:27:37,898 - INFO - validate:433 - Validation: Batch 797/1541, Loss: 3.1899


797it [14:04,  1.05s/it][A[A2023-05-05 09:27:38,953 - INFO - validate:433 - Validation: Batch 798/1541, Loss: 3.1899


798it [14:05,  1.05s/it][A[A2023-05-05 09:27:40,008 - INFO - validate:433 - Validation: Batch 799/1541, Loss: 3.1898


799it [14:06,  1.05s/it][A[A2023-05-05 09:27:41,061 - INFO - validate:433 - Validation: Batch 800/1541, Loss: 3.1897


800it [14:07,  1.05s/it][A[A2023-05-05 09:27:42,115 - INFO - validate:433 - Validation: Batch 801/1541, Loss: 3.1896


801it [14:08,  1.05s/it][A[A



861it [15:12,  1.06s/it][A[A2023-05-05 09:28:46,474 - INFO - validate:433 - Validation: Batch 862/1541, Loss: 3.1885


862it [15:13,  1.06s/it][A[A2023-05-05 09:28:47,533 - INFO - validate:433 - Validation: Batch 863/1541, Loss: 3.1884


863it [15:14,  1.06s/it][A[A2023-05-05 09:28:48,590 - INFO - validate:433 - Validation: Batch 864/1541, Loss: 3.1884


864it [15:15,  1.06s/it][A[A2023-05-05 09:28:49,646 - INFO - validate:433 - Validation: Batch 865/1541, Loss: 3.1885


865it [15:16,  1.06s/it][A[A2023-05-05 09:28:50,700 - INFO - validate:433 - Validation: Batch 866/1541, Loss: 3.1886


866it [15:17,  1.06s/it][A[A2023-05-05 09:28:51,754 - INFO - validate:433 - Validation: Batch 867/1541, Loss: 3.1886


867it [15:18,  1.06s/it][A[A2023-05-05 09:28:52,810 - INFO - validate:433 - Validation: Batch 868/1541, Loss: 3.1888


868it [15:19,  1.06s/it][A[A2023-05-05 09:28:53,870 - INFO - validate:433 - Validation: Batch 869/1541, Loss: 3.1888


869it [15:20,  1.06s/it][A[A



929it [16:23,  1.06s/it][A[A2023-05-05 09:29:58,302 - INFO - validate:433 - Validation: Batch 930/1541, Loss: 3.1867


930it [16:25,  1.06s/it][A[A2023-05-05 09:29:59,357 - INFO - validate:433 - Validation: Batch 931/1541, Loss: 3.1867


931it [16:26,  1.06s/it][A[A2023-05-05 09:30:00,411 - INFO - validate:433 - Validation: Batch 932/1541, Loss: 3.1866


932it [16:27,  1.06s/it][A[A2023-05-05 09:30:01,467 - INFO - validate:433 - Validation: Batch 933/1541, Loss: 3.1866


933it [16:28,  1.06s/it][A[A2023-05-05 09:30:02,524 - INFO - validate:433 - Validation: Batch 934/1541, Loss: 3.1866


934it [16:29,  1.06s/it][A[A2023-05-05 09:30:03,577 - INFO - validate:433 - Validation: Batch 935/1541, Loss: 3.1865


935it [16:30,  1.06s/it][A[A2023-05-05 09:30:04,633 - INFO - validate:433 - Validation: Batch 936/1541, Loss: 3.1865


936it [16:31,  1.06s/it][A[A2023-05-05 09:30:05,689 - INFO - validate:433 - Validation: Batch 937/1541, Loss: 3.1865


937it [16:32,  1.06s/it][A[A



997it [17:35,  1.05s/it][A[A2023-05-05 09:31:10,016 - INFO - validate:433 - Validation: Batch 998/1541, Loss: 3.1855


998it [17:36,  1.05s/it][A[A2023-05-05 09:31:11,070 - INFO - validate:433 - Validation: Batch 999/1541, Loss: 3.1855


999it [17:37,  1.05s/it][A[A2023-05-05 09:31:12,127 - INFO - validate:433 - Validation: Batch 1000/1541, Loss: 3.1854


1000it [17:38,  1.05s/it][A[A2023-05-05 09:31:13,181 - INFO - validate:433 - Validation: Batch 1001/1541, Loss: 3.1854


1001it [17:39,  1.05s/it][A[A2023-05-05 09:31:14,234 - INFO - validate:433 - Validation: Batch 1002/1541, Loss: 3.1854


1002it [17:40,  1.05s/it][A[A2023-05-05 09:31:15,286 - INFO - validate:433 - Validation: Batch 1003/1541, Loss: 3.1855


1003it [17:42,  1.05s/it][A[A2023-05-05 09:31:16,341 - INFO - validate:433 - Validation: Batch 1004/1541, Loss: 3.1855


1004it [17:43,  1.05s/it][A[A2023-05-05 09:31:17,390 - INFO - validate:433 - Validation: Batch 1005/1541, Loss: 3.1855


1005it [17:44,  1.0



1064it [18:46,  1.06s/it][A[A2023-05-05 09:32:20,730 - INFO - validate:433 - Validation: Batch 1065/1541, Loss: 3.1868


1065it [18:47,  1.06s/it][A[A2023-05-05 09:32:21,785 - INFO - validate:433 - Validation: Batch 1066/1541, Loss: 3.1870


1066it [18:48,  1.06s/it][A[A2023-05-05 09:32:22,846 - INFO - validate:433 - Validation: Batch 1067/1541, Loss: 3.1870


1067it [18:49,  1.06s/it][A[A2023-05-05 09:32:23,901 - INFO - validate:433 - Validation: Batch 1068/1541, Loss: 3.1870


1068it [18:50,  1.06s/it][A[A2023-05-05 09:32:24,957 - INFO - validate:433 - Validation: Batch 1069/1541, Loss: 3.1870


1069it [18:51,  1.06s/it][A[A2023-05-05 09:32:26,012 - INFO - validate:433 - Validation: Batch 1070/1541, Loss: 3.1872


1070it [18:52,  1.06s/it][A[A2023-05-05 09:32:27,065 - INFO - validate:433 - Validation: Batch 1071/1541, Loss: 3.1873


1071it [18:53,  1.06s/it][A[A2023-05-05 09:32:28,123 - INFO - validate:433 - Validation: Batch 1072/1541, Loss: 3.1874


1072it [18:54,



1131it [19:57,  1.06s/it][A[A2023-05-05 09:33:31,547 - INFO - validate:433 - Validation: Batch 1132/1541, Loss: 3.1872


1132it [19:58,  1.06s/it][A[A2023-05-05 09:33:32,606 - INFO - validate:433 - Validation: Batch 1133/1541, Loss: 3.1872


1133it [19:59,  1.06s/it][A[A2023-05-05 09:33:33,663 - INFO - validate:433 - Validation: Batch 1134/1541, Loss: 3.1872


1134it [20:00,  1.06s/it][A[A2023-05-05 09:33:34,718 - INFO - validate:433 - Validation: Batch 1135/1541, Loss: 3.1872


1135it [20:01,  1.06s/it][A[A2023-05-05 09:33:35,774 - INFO - validate:433 - Validation: Batch 1136/1541, Loss: 3.1872


1136it [20:02,  1.06s/it][A[A2023-05-05 09:33:36,829 - INFO - validate:433 - Validation: Batch 1137/1541, Loss: 3.1872


1137it [20:03,  1.06s/it][A[A2023-05-05 09:33:37,884 - INFO - validate:433 - Validation: Batch 1138/1541, Loss: 3.1872


1138it [20:04,  1.06s/it][A[A2023-05-05 09:33:38,941 - INFO - validate:433 - Validation: Batch 1139/1541, Loss: 3.1872


1139it [20:05,



1198it [21:07,  1.05s/it][A[A2023-05-05 09:34:42,237 - INFO - validate:433 - Validation: Batch 1199/1541, Loss: 3.1865


1199it [21:08,  1.05s/it][A[A2023-05-05 09:34:43,291 - INFO - validate:433 - Validation: Batch 1200/1541, Loss: 3.1865


1200it [21:10,  1.05s/it][A[A2023-05-05 09:34:44,345 - INFO - validate:433 - Validation: Batch 1201/1541, Loss: 3.1866


1201it [21:11,  1.05s/it][A[A2023-05-05 09:34:45,399 - INFO - validate:433 - Validation: Batch 1202/1541, Loss: 3.1865


1202it [21:12,  1.05s/it][A[A2023-05-05 09:34:46,454 - INFO - validate:433 - Validation: Batch 1203/1541, Loss: 3.1865


1203it [21:13,  1.05s/it][A[A2023-05-05 09:34:47,508 - INFO - validate:433 - Validation: Batch 1204/1541, Loss: 3.1864


1204it [21:14,  1.06s/it][A[A2023-05-05 09:34:48,562 - INFO - validate:433 - Validation: Batch 1205/1541, Loss: 3.1864


1205it [21:15,  1.05s/it][A[A2023-05-05 09:34:49,616 - INFO - validate:433 - Validation: Batch 1206/1541, Loss: 3.1863


1206it [21:16,



1265it [22:18,  1.06s/it][A[A2023-05-05 09:35:52,882 - INFO - validate:433 - Validation: Batch 1266/1541, Loss: 3.1858


1266it [22:19,  1.06s/it][A[A2023-05-05 09:35:53,935 - INFO - validate:433 - Validation: Batch 1267/1541, Loss: 3.1858


1267it [22:20,  1.05s/it][A[A2023-05-05 09:35:54,989 - INFO - validate:433 - Validation: Batch 1268/1541, Loss: 3.1857


1268it [22:21,  1.05s/it][A[A2023-05-05 09:35:56,043 - INFO - validate:433 - Validation: Batch 1269/1541, Loss: 3.1857


1269it [22:22,  1.05s/it][A[A2023-05-05 09:35:57,100 - INFO - validate:433 - Validation: Batch 1270/1541, Loss: 3.1857


1270it [22:23,  1.06s/it][A[A2023-05-05 09:35:58,153 - INFO - validate:433 - Validation: Batch 1271/1541, Loss: 3.1857


1271it [22:24,  1.05s/it][A[A2023-05-05 09:35:59,210 - INFO - validate:433 - Validation: Batch 1272/1541, Loss: 3.1857


1272it [22:25,  1.06s/it][A[A2023-05-05 09:36:00,264 - INFO - validate:433 - Validation: Batch 1273/1541, Loss: 3.1858


1273it [22:27,



1332it [23:29,  1.06s/it][A[A2023-05-05 09:37:03,665 - INFO - validate:433 - Validation: Batch 1333/1541, Loss: 3.1873


1333it [23:30,  1.06s/it][A[A2023-05-05 09:37:04,721 - INFO - validate:433 - Validation: Batch 1334/1541, Loss: 3.1873


1334it [23:31,  1.06s/it][A[A2023-05-05 09:37:05,781 - INFO - validate:433 - Validation: Batch 1335/1541, Loss: 3.1873


1335it [23:32,  1.06s/it][A[A2023-05-05 09:37:06,840 - INFO - validate:433 - Validation: Batch 1336/1541, Loss: 3.1873


1336it [23:33,  1.06s/it][A[A2023-05-05 09:37:07,902 - INFO - validate:433 - Validation: Batch 1337/1541, Loss: 3.1873


1337it [23:34,  1.06s/it][A[A2023-05-05 09:37:08,961 - INFO - validate:433 - Validation: Batch 1338/1541, Loss: 3.1873


1338it [23:35,  1.06s/it][A[A2023-05-05 09:37:10,016 - INFO - validate:433 - Validation: Batch 1339/1541, Loss: 3.1873


1339it [23:36,  1.06s/it][A[A2023-05-05 09:37:11,073 - INFO - validate:433 - Validation: Batch 1340/1541, Loss: 3.1874


1340it [23:37,



1399it [24:40,  1.06s/it][A[A2023-05-05 09:38:14,565 - INFO - validate:433 - Validation: Batch 1400/1541, Loss: 3.1870


1400it [24:41,  1.06s/it][A[A2023-05-05 09:38:15,625 - INFO - validate:433 - Validation: Batch 1401/1541, Loss: 3.1869


1401it [24:42,  1.06s/it][A[A2023-05-05 09:38:16,688 - INFO - validate:433 - Validation: Batch 1402/1541, Loss: 3.1869


1402it [24:43,  1.06s/it][A[A2023-05-05 09:38:17,749 - INFO - validate:433 - Validation: Batch 1403/1541, Loss: 3.1868


1403it [24:44,  1.06s/it][A[A2023-05-05 09:38:18,810 - INFO - validate:433 - Validation: Batch 1404/1541, Loss: 3.1869


1404it [24:45,  1.06s/it][A[A2023-05-05 09:38:19,871 - INFO - validate:433 - Validation: Batch 1405/1541, Loss: 3.1868


1405it [24:46,  1.06s/it][A[A2023-05-05 09:38:20,933 - INFO - validate:433 - Validation: Batch 1406/1541, Loss: 3.1868


1406it [24:47,  1.06s/it][A[A2023-05-05 09:38:21,994 - INFO - validate:433 - Validation: Batch 1407/1541, Loss: 3.1868


1407it [24:48,



1466it [25:51,  1.06s/it][A[A2023-05-05 09:39:25,636 - INFO - validate:433 - Validation: Batch 1467/1541, Loss: 3.1861


1467it [25:52,  1.06s/it][A[A2023-05-05 09:39:26,696 - INFO - validate:433 - Validation: Batch 1468/1541, Loss: 3.1862


1468it [25:53,  1.06s/it][A[A2023-05-05 09:39:27,757 - INFO - validate:433 - Validation: Batch 1469/1541, Loss: 3.1862


1469it [25:54,  1.06s/it][A[A2023-05-05 09:39:28,816 - INFO - validate:433 - Validation: Batch 1470/1541, Loss: 3.1862


1470it [25:55,  1.06s/it][A[A2023-05-05 09:39:29,875 - INFO - validate:433 - Validation: Batch 1471/1541, Loss: 3.1863


1471it [25:56,  1.06s/it][A[A2023-05-05 09:39:30,934 - INFO - validate:433 - Validation: Batch 1472/1541, Loss: 3.1863


1472it [25:57,  1.06s/it][A[A2023-05-05 09:39:31,993 - INFO - validate:433 - Validation: Batch 1473/1541, Loss: 3.1863


1473it [25:58,  1.06s/it][A[A2023-05-05 09:39:33,055 - INFO - validate:433 - Validation: Batch 1474/1541, Loss: 3.1863


1474it [25:59,



1533it [27:02,  1.06s/it][A[A2023-05-05 09:40:36,550 - INFO - validate:433 - Validation: Batch 1534/1541, Loss: 3.1877


1534it [27:03,  1.06s/it][A[A2023-05-05 09:40:37,608 - INFO - validate:433 - Validation: Batch 1535/1541, Loss: 3.1877


1535it [27:04,  1.06s/it][A[A2023-05-05 09:40:38,661 - INFO - validate:433 - Validation: Batch 1536/1541, Loss: 3.1876


1536it [27:05,  1.06s/it][A[A2023-05-05 09:40:39,719 - INFO - validate:433 - Validation: Batch 1537/1541, Loss: 3.1876


1537it [27:06,  1.06s/it][A[A2023-05-05 09:40:40,778 - INFO - validate:433 - Validation: Batch 1538/1541, Loss: 3.1876


1538it [27:07,  1.06s/it][A[A2023-05-05 09:40:41,834 - INFO - validate:433 - Validation: Batch 1539/1541, Loss: 3.1875


1539it [27:08,  1.06s/it][A[A2023-05-05 09:40:42,888 - INFO - validate:433 - Validation: Batch 1540/1541, Loss: 3.1875


1540it [27:09,  1.06s/it][A[A2023-05-05 09:40:42,988 - INFO - validate:433 - Validation: Batch 1541/1541, Loss: 3.1873
1541it [27:09,  

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▆▇▇▇▇▇▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁

0,1
epoch,1.0
loss,3.24953


Run jajg94a9 errored: NameError("name 'log_save_path' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run jajg94a9 errored: NameError("name 'log_save_path' is not defined")
[34m[1mwandb[0m: Agent Starting Run: vblbgrzx with config:
[34m[1mwandb[0m: 	BETAS: [0.85, 0.9]
[34m[1mwandb[0m: 	GRADIENT_ACCUMULATION_STEPS: 4
[34m[1mwandb[0m: 	GRADIENT_CLIP: 2.5
[34m[1mwandb[0m: 	LEARNING_RATE: 5e-05
[34m[1mwandb[0m: 	NUM_EPOCHS: 1
[34m[1mwandb[0m: 	SAMPLING_INTERVAL: 50
[34m[1mwandb[0m: 	WEIGHT_DECAY: 0.01
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


2023-05-05 09:40:58,784 - INFO - create_or_load_model:370 - Model: facebook/opt-125m
2023-05-05 09:40:58,785 - INFO - print_trainable_parameters:121 - Parameters: Trainable- 125.24M|| All- 125.24M || Trainable%- 100.0
2023-05-05 09:40:58,787 - INFO - create_or_load_model:372 - Memory Memory Footprint: 500.957184 MB
2023-05-05 09:40:58,787 - INFO - create_or_load_model:373 - Model is on device: cuda:0


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0166681188546742, max=1.0))…

  0%|          | 0/1 [00:00<?, ?it/s]2023-05-05 09:41:05,311 - INFO - train:33 - Epoch: 1/1

  0%|          | 0/13861 [00:00<?, ?it/s][A2023-05-05 09:41:08,206 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36468 MB.
2023-05-05 09:41:08,657 - INFO - train:65 - Batch 1/13861, Loss: 3.2953, Learning Rate: 5e-05

  0%|          | 1/13861 [00:04<15:29:15,  4.02s/it][A2023-05-05 09:41:09,864 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36468 MB.
2023-05-05 09:41:10,298 - INFO - train:65 - Batch 2/13861, Loss: 3.2902, Learning Rate: 5e-05

  0%|          | 2/13861 [00:05<10:04:52,  2.62s/it][A2023-05-05 09:41:11,500 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36468 MB.
2023-05-05 09:41:11,934 - INFO - train:65 - Batch 3/13861, Loss: 3.2898, Learning Rate: 5e-05

  0%|          | 3/13861 [00:07<8:21:06,  2.17s/it] [A2023-05-05 09:41:13,135 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36468 MB.
2023-05-05 09:41:13,569 - INFO - train:65 - B


  0%|          | 51/13861 [01:26<6:18:30,  1.64s/it][A2023-05-05 09:42:32,758 - INFO - train:65 - Batch 52/13861, Loss: 3.3344, Learning Rate: 5e-05

  0%|          | 52/13861 [01:28<6:21:11,  1.66s/it][A2023-05-05 09:42:34,442 - INFO - train:65 - Batch 53/13861, Loss: 3.3338, Learning Rate: 5e-05

  0%|          | 53/13861 [01:29<6:19:51,  1.65s/it][A2023-05-05 09:42:36,080 - INFO - train:65 - Batch 54/13861, Loss: 3.3334, Learning Rate: 5e-05

  0%|          | 54/13861 [01:31<6:18:56,  1.65s/it][A2023-05-05 09:42:37,717 - INFO - train:65 - Batch 55/13861, Loss: 3.3336, Learning Rate: 5e-05

  0%|          | 55/13861 [01:33<6:18:13,  1.64s/it][A2023-05-05 09:42:39,354 - INFO - train:65 - Batch 56/13861, Loss: 3.3325, Learning Rate: 5e-05

  0%|          | 56/13861 [01:34<6:21:01,  1.66s/it][A2023-05-05 09:42:41,038 - INFO - train:65 - Batch 57/13861, Loss: 3.3323, Learning Rate: 5e-05

  0%|          | 57/13861 [01:36<6:19:41,  1.65s/it][A2023-05-05 09:42:42,675 - INFO - train


  1%|          | 159/13861 [04:24<6:15:41,  1.65s/it][A2023-05-05 09:45:30,870 - INFO - train:65 - Batch 160/13861, Loss: 3.2709, Learning Rate: 5e-05

  1%|          | 160/13861 [04:26<6:18:32,  1.66s/it][A2023-05-05 09:45:32,556 - INFO - train:65 - Batch 161/13861, Loss: 3.2706, Learning Rate: 5e-05

  1%|          | 161/13861 [04:27<6:17:08,  1.65s/it][A2023-05-05 09:45:34,194 - INFO - train:65 - Batch 162/13861, Loss: 3.2702, Learning Rate: 5e-05

  1%|          | 162/13861 [04:29<6:16:12,  1.65s/it][A2023-05-05 09:45:35,832 - INFO - train:65 - Batch 163/13861, Loss: 3.2699, Learning Rate: 5e-05

  1%|          | 163/13861 [04:31<6:15:30,  1.64s/it][A2023-05-05 09:45:37,471 - INFO - train:65 - Batch 164/13861, Loss: 3.2691, Learning Rate: 5e-05

  1%|          | 164/13861 [04:32<6:18:14,  1.66s/it][A2023-05-05 09:45:39,155 - INFO - train:65 - Batch 165/13861, Loss: 3.2686, Learning Rate: 5e-05

  1%|          | 165/13861 [04:34<6:16:58,  1.65s/it][A2023-05-05 09:45:40,794 -


  2%|▏         | 213/13861 [05:53<6:15:41,  1.65s/it][A2023-05-05 09:47:00,017 - INFO - train:65 - Batch 214/13861, Loss: 3.2525, Learning Rate: 5e-05

  2%|▏         | 214/13861 [05:55<6:14:51,  1.65s/it][A2023-05-05 09:47:01,657 - INFO - train:65 - Batch 215/13861, Loss: 3.2524, Learning Rate: 5e-05

  2%|▏         | 215/13861 [05:57<6:14:14,  1.65s/it][A2023-05-05 09:47:03,296 - INFO - train:65 - Batch 216/13861, Loss: 3.2524, Learning Rate: 5e-05

  2%|▏         | 216/13861 [05:58<6:16:58,  1.66s/it][A2023-05-05 09:47:04,981 - INFO - train:65 - Batch 217/13861, Loss: 3.2521, Learning Rate: 5e-05

  2%|▏         | 217/13861 [06:00<6:15:36,  1.65s/it][A2023-05-05 09:47:06,620 - INFO - train:65 - Batch 218/13861, Loss: 3.2518, Learning Rate: 5e-05

  2%|▏         | 218/13861 [06:01<6:14:42,  1.65s/it][A2023-05-05 09:47:08,258 - INFO - train:65 - Batch 219/13861, Loss: 3.2516, Learning Rate: 5e-05

  2%|▏         | 219/13861 [06:03<6:14:00,  1.64s/it][A2023-05-05 09:47:09,897 -


  2%|▏         | 267/13861 [07:22<6:12:41,  1.64s/it][A2023-05-05 09:48:29,120 - INFO - train:65 - Batch 268/13861, Loss: 3.2401, Learning Rate: 5e-05

  2%|▏         | 268/13861 [07:24<6:15:23,  1.66s/it][A2023-05-05 09:48:30,805 - INFO - train:65 - Batch 269/13861, Loss: 3.2399, Learning Rate: 5e-05

  2%|▏         | 269/13861 [07:26<6:14:08,  1.65s/it][A2023-05-05 09:48:32,444 - INFO - train:65 - Batch 270/13861, Loss: 3.2398, Learning Rate: 5e-05

  2%|▏         | 270/13861 [07:27<6:13:16,  1.65s/it][A2023-05-05 09:48:34,083 - INFO - train:65 - Batch 271/13861, Loss: 3.2394, Learning Rate: 5e-05

  2%|▏         | 271/13861 [07:29<6:12:35,  1.65s/it][A2023-05-05 09:48:35,721 - INFO - train:65 - Batch 272/13861, Loss: 3.2390, Learning Rate: 5e-05

  2%|▏         | 272/13861 [07:31<6:15:13,  1.66s/it][A2023-05-05 09:48:37,405 - INFO - train:65 - Batch 273/13861, Loss: 3.2385, Learning Rate: 5e-05

  2%|▏         | 273/13861 [07:32<6:13:55,  1.65s/it][A2023-05-05 09:48:39,044 -


  2%|▏         | 321/13861 [08:51<6:12:29,  1.65s/it][A2023-05-05 09:49:58,254 - INFO - train:65 - Batch 322/13861, Loss: 3.2280, Learning Rate: 5e-05

  2%|▏         | 322/13861 [08:53<6:11:37,  1.65s/it][A2023-05-05 09:49:59,892 - INFO - train:65 - Batch 323/13861, Loss: 3.2278, Learning Rate: 5e-05

  2%|▏         | 323/13861 [08:55<6:10:59,  1.64s/it][A2023-05-05 09:50:01,530 - INFO - train:65 - Batch 324/13861, Loss: 3.2277, Learning Rate: 5e-05

  2%|▏         | 324/13861 [08:56<6:13:44,  1.66s/it][A2023-05-05 09:50:03,215 - INFO - train:65 - Batch 325/13861, Loss: 3.2273, Learning Rate: 5e-05

  2%|▏         | 325/13861 [08:58<6:12:25,  1.65s/it][A2023-05-05 09:50:04,852 - INFO - train:65 - Batch 326/13861, Loss: 3.2271, Learning Rate: 5e-05

  2%|▏         | 326/13861 [09:00<6:11:32,  1.65s/it][A2023-05-05 09:50:06,490 - INFO - train:65 - Batch 327/13861, Loss: 3.2269, Learning Rate: 5e-05

  2%|▏         | 327/13861 [09:01<6:10:51,  1.64s/it][A2023-05-05 09:50:08,128 -


  3%|▎         | 375/13861 [10:21<6:09:27,  1.64s/it][A2023-05-05 09:51:27,289 - INFO - train:65 - Batch 376/13861, Loss: 3.2177, Learning Rate: 5e-05

  3%|▎         | 376/13861 [10:22<6:12:07,  1.66s/it][A2023-05-05 09:51:28,973 - INFO - train:65 - Batch 377/13861, Loss: 3.2176, Learning Rate: 5e-05

  3%|▎         | 377/13861 [10:24<6:10:51,  1.65s/it][A2023-05-05 09:51:30,611 - INFO - train:65 - Batch 378/13861, Loss: 3.2174, Learning Rate: 5e-05

  3%|▎         | 378/13861 [10:25<6:10:01,  1.65s/it][A2023-05-05 09:51:32,248 - INFO - train:65 - Batch 379/13861, Loss: 3.2171, Learning Rate: 5e-05

  3%|▎         | 379/13861 [10:27<6:09:23,  1.64s/it][A2023-05-05 09:51:33,887 - INFO - train:65 - Batch 380/13861, Loss: 3.2168, Learning Rate: 5e-05

  3%|▎         | 380/13861 [10:29<6:12:07,  1.66s/it][A2023-05-05 09:51:35,572 - INFO - train:65 - Batch 381/13861, Loss: 3.2165, Learning Rate: 5e-05

  3%|▎         | 381/13861 [10:30<6:10:50,  1.65s/it][A2023-05-05 09:51:37,209 -


  3%|▎         | 429/13861 [11:50<6:09:21,  1.65s/it][A2023-05-05 09:52:56,366 - INFO - train:65 - Batch 430/13861, Loss: 3.2106, Learning Rate: 5e-05

  3%|▎         | 430/13861 [11:51<6:08:29,  1.65s/it][A2023-05-05 09:52:58,003 - INFO - train:65 - Batch 431/13861, Loss: 3.2106, Learning Rate: 5e-05

  3%|▎         | 431/13861 [11:53<6:07:51,  1.64s/it][A2023-05-05 09:52:59,641 - INFO - train:65 - Batch 432/13861, Loss: 3.2106, Learning Rate: 5e-05

  3%|▎         | 432/13861 [11:55<6:10:31,  1.66s/it][A2023-05-05 09:53:01,324 - INFO - train:65 - Batch 433/13861, Loss: 3.2102, Learning Rate: 5e-05

  3%|▎         | 433/13861 [11:56<6:09:14,  1.65s/it][A2023-05-05 09:53:02,961 - INFO - train:65 - Batch 434/13861, Loss: 3.2101, Learning Rate: 5e-05

  3%|▎         | 434/13861 [11:58<6:08:20,  1.65s/it][A2023-05-05 09:53:04,597 - INFO - train:65 - Batch 435/13861, Loss: 3.2098, Learning Rate: 5e-05

  3%|▎         | 435/13861 [11:59<6:07:40,  1.64s/it][A2023-05-05 09:53:06,233 -


  3%|▎         | 483/13861 [13:19<6:06:24,  1.64s/it][A2023-05-05 09:54:25,376 - INFO - train:65 - Batch 484/13861, Loss: 3.2009, Learning Rate: 5e-05

  3%|▎         | 484/13861 [13:20<6:09:15,  1.66s/it][A2023-05-05 09:54:27,060 - INFO - train:65 - Batch 485/13861, Loss: 3.2008, Learning Rate: 5e-05

  3%|▎         | 485/13861 [13:22<6:07:57,  1.65s/it][A2023-05-05 09:54:28,698 - INFO - train:65 - Batch 486/13861, Loss: 3.2008, Learning Rate: 5e-05

  4%|▎         | 486/13861 [13:24<6:07:03,  1.65s/it][A2023-05-05 09:54:30,335 - INFO - train:65 - Batch 487/13861, Loss: 3.2005, Learning Rate: 5e-05

  4%|▎         | 487/13861 [13:25<6:06:25,  1.64s/it][A2023-05-05 09:54:31,973 - INFO - train:65 - Batch 488/13861, Loss: 3.2004, Learning Rate: 5e-05

  4%|▎         | 488/13861 [13:27<6:09:01,  1.66s/it][A2023-05-05 09:54:33,655 - INFO - train:65 - Batch 489/13861, Loss: 3.2004, Learning Rate: 5e-05

  4%|▎         | 489/13861 [13:29<6:07:47,  1.65s/it][A2023-05-05 09:54:35,293 -



44it [00:46,  1.04s/it][A[A2023-05-05 09:55:39,086 - INFO - validate:433 - Validation: Batch 45/1541, Loss: 3.2389


45it [00:47,  1.04s/it][A[A2023-05-05 09:55:40,138 - INFO - validate:433 - Validation: Batch 46/1541, Loss: 3.2418


46it [00:48,  1.05s/it][A[A2023-05-05 09:55:41,177 - INFO - validate:433 - Validation: Batch 47/1541, Loss: 3.2437


47it [00:49,  1.04s/it][A[A2023-05-05 09:55:42,229 - INFO - validate:433 - Validation: Batch 48/1541, Loss: 3.2480


48it [00:50,  1.05s/it][A[A2023-05-05 09:55:43,266 - INFO - validate:433 - Validation: Batch 49/1541, Loss: 3.2472


49it [00:51,  1.04s/it][A[A2023-05-05 09:55:44,294 - INFO - validate:433 - Validation: Batch 50/1541, Loss: 3.2489


50it [00:52,  1.04s/it][A[A2023-05-05 09:55:45,333 - INFO - validate:433 - Validation: Batch 51/1541, Loss: 3.2467


51it [00:53,  1.04s/it][A[A2023-05-05 09:55:46,380 - INFO - validate:433 - Validation: Batch 52/1541, Loss: 3.2448


52it [00:54,  1.04s/it][A[A2023-05-05 09:55:



113it [01:58,  1.05s/it][A[A2023-05-05 09:56:51,574 - INFO - validate:433 - Validation: Batch 114/1541, Loss: 3.1473


114it [01:59,  1.05s/it][A[A2023-05-05 09:56:52,626 - INFO - validate:433 - Validation: Batch 115/1541, Loss: 3.1476


115it [02:00,  1.05s/it][A[A2023-05-05 09:56:53,663 - INFO - validate:433 - Validation: Batch 116/1541, Loss: 3.1472


116it [02:01,  1.05s/it][A[A2023-05-05 09:56:54,712 - INFO - validate:433 - Validation: Batch 117/1541, Loss: 3.1464


117it [02:02,  1.05s/it][A[A2023-05-05 09:56:55,763 - INFO - validate:433 - Validation: Batch 118/1541, Loss: 3.1462


118it [02:04,  1.05s/it][A[A2023-05-05 09:56:56,818 - INFO - validate:433 - Validation: Batch 119/1541, Loss: 3.1459


119it [02:05,  1.05s/it][A[A2023-05-05 09:56:57,870 - INFO - validate:433 - Validation: Batch 120/1541, Loss: 3.1445


120it [02:06,  1.05s/it][A[A2023-05-05 09:56:58,922 - INFO - validate:433 - Validation: Batch 121/1541, Loss: 3.1441


121it [02:07,  1.05s/it][A[A



181it [03:10,  1.05s/it][A[A2023-05-05 09:58:03,078 - INFO - validate:433 - Validation: Batch 182/1541, Loss: 3.1311


182it [03:11,  1.05s/it][A[A2023-05-05 09:58:04,133 - INFO - validate:433 - Validation: Batch 183/1541, Loss: 3.1307


183it [03:12,  1.05s/it][A[A2023-05-05 09:58:05,189 - INFO - validate:433 - Validation: Batch 184/1541, Loss: 3.1303


184it [03:13,  1.05s/it][A[A2023-05-05 09:58:06,244 - INFO - validate:433 - Validation: Batch 185/1541, Loss: 3.1304


185it [03:14,  1.06s/it][A[A2023-05-05 09:58:07,297 - INFO - validate:433 - Validation: Batch 186/1541, Loss: 3.1310


186it [03:15,  1.05s/it][A[A2023-05-05 09:58:08,349 - INFO - validate:433 - Validation: Batch 187/1541, Loss: 3.1309


187it [03:16,  1.05s/it][A[A2023-05-05 09:58:09,403 - INFO - validate:433 - Validation: Batch 188/1541, Loss: 3.1309


188it [03:17,  1.05s/it][A[A2023-05-05 09:58:10,456 - INFO - validate:433 - Validation: Batch 189/1541, Loss: 3.1304


189it [03:18,  1.05s/it][A[A



249it [04:22,  1.06s/it][A[A2023-05-05 09:59:14,939 - INFO - validate:433 - Validation: Batch 250/1541, Loss: 3.1209


250it [04:23,  1.06s/it][A[A2023-05-05 09:59:15,999 - INFO - validate:433 - Validation: Batch 251/1541, Loss: 3.1209


251it [04:24,  1.06s/it][A[A2023-05-05 09:59:17,059 - INFO - validate:433 - Validation: Batch 252/1541, Loss: 3.1206


252it [04:25,  1.06s/it][A[A2023-05-05 09:59:18,120 - INFO - validate:433 - Validation: Batch 253/1541, Loss: 3.1203


253it [04:26,  1.06s/it][A[A2023-05-05 09:59:19,178 - INFO - validate:433 - Validation: Batch 254/1541, Loss: 3.1203


254it [04:27,  1.06s/it][A[A2023-05-05 09:59:20,236 - INFO - validate:433 - Validation: Batch 255/1541, Loss: 3.1202


255it [04:28,  1.06s/it][A[A2023-05-05 09:59:21,297 - INFO - validate:433 - Validation: Batch 256/1541, Loss: 3.1201


256it [04:29,  1.06s/it][A[A2023-05-05 09:59:22,357 - INFO - validate:433 - Validation: Batch 257/1541, Loss: 3.1202


257it [04:30,  1.06s/it][A[A



317it [05:34,  1.06s/it][A[A2023-05-05 10:00:26,942 - INFO - validate:433 - Validation: Batch 318/1541, Loss: 3.1177


318it [05:35,  1.06s/it][A[A2023-05-05 10:00:27,999 - INFO - validate:433 - Validation: Batch 319/1541, Loss: 3.1178


319it [05:36,  1.06s/it][A[A2023-05-05 10:00:29,057 - INFO - validate:433 - Validation: Batch 320/1541, Loss: 3.1178


320it [05:37,  1.06s/it][A[A2023-05-05 10:00:30,117 - INFO - validate:433 - Validation: Batch 321/1541, Loss: 3.1177


321it [05:38,  1.06s/it][A[A2023-05-05 10:00:31,177 - INFO - validate:433 - Validation: Batch 322/1541, Loss: 3.1179


322it [05:39,  1.06s/it][A[A2023-05-05 10:00:32,237 - INFO - validate:433 - Validation: Batch 323/1541, Loss: 3.1179


323it [05:40,  1.06s/it][A[A2023-05-05 10:00:33,296 - INFO - validate:433 - Validation: Batch 324/1541, Loss: 3.1176


324it [05:41,  1.06s/it][A[A2023-05-05 10:00:34,355 - INFO - validate:433 - Validation: Batch 325/1541, Loss: 3.1175


325it [05:42,  1.06s/it][A[A



385it [06:46,  1.06s/it][A[A2023-05-05 10:01:38,932 - INFO - validate:433 - Validation: Batch 386/1541, Loss: 3.1184


386it [06:47,  1.06s/it][A[A2023-05-05 10:01:39,989 - INFO - validate:433 - Validation: Batch 387/1541, Loss: 3.1183


387it [06:48,  1.06s/it][A[A2023-05-05 10:01:41,050 - INFO - validate:433 - Validation: Batch 388/1541, Loss: 3.1180


388it [06:49,  1.06s/it][A[A2023-05-05 10:01:42,111 - INFO - validate:433 - Validation: Batch 389/1541, Loss: 3.1181


389it [06:50,  1.06s/it][A[A2023-05-05 10:01:43,169 - INFO - validate:433 - Validation: Batch 390/1541, Loss: 3.1182


390it [06:51,  1.06s/it][A[A2023-05-05 10:01:44,229 - INFO - validate:433 - Validation: Batch 391/1541, Loss: 3.1184


391it [06:52,  1.06s/it][A[A2023-05-05 10:01:45,286 - INFO - validate:433 - Validation: Batch 392/1541, Loss: 3.1182


392it [06:53,  1.06s/it][A[A2023-05-05 10:01:46,346 - INFO - validate:433 - Validation: Batch 393/1541, Loss: 3.1183


393it [06:54,  1.06s/it][A[A



453it [07:58,  1.06s/it][A[A2023-05-05 10:02:50,955 - INFO - validate:433 - Validation: Batch 454/1541, Loss: 3.1181


454it [07:59,  1.06s/it][A[A2023-05-05 10:02:52,016 - INFO - validate:433 - Validation: Batch 455/1541, Loss: 3.1183


455it [08:00,  1.06s/it][A[A2023-05-05 10:02:53,075 - INFO - validate:433 - Validation: Batch 456/1541, Loss: 3.1182


456it [08:01,  1.06s/it][A[A2023-05-05 10:02:54,134 - INFO - validate:433 - Validation: Batch 457/1541, Loss: 3.1182


457it [08:02,  1.06s/it][A[A2023-05-05 10:02:55,195 - INFO - validate:433 - Validation: Batch 458/1541, Loss: 3.1180


458it [08:03,  1.06s/it][A[A2023-05-05 10:02:56,254 - INFO - validate:433 - Validation: Batch 459/1541, Loss: 3.1178


459it [08:04,  1.06s/it][A[A2023-05-05 10:02:57,314 - INFO - validate:433 - Validation: Batch 460/1541, Loss: 3.1177


460it [08:05,  1.06s/it][A[A2023-05-05 10:02:58,375 - INFO - validate:433 - Validation: Batch 461/1541, Loss: 3.1176


461it [08:06,  1.06s/it][A[A



521it [09:10,  1.06s/it][A[A2023-05-05 10:04:02,972 - INFO - validate:433 - Validation: Batch 522/1541, Loss: 3.1164


522it [09:11,  1.06s/it][A[A2023-05-05 10:04:04,032 - INFO - validate:433 - Validation: Batch 523/1541, Loss: 3.1163


523it [09:12,  1.06s/it][A[A2023-05-05 10:04:05,091 - INFO - validate:433 - Validation: Batch 524/1541, Loss: 3.1164


524it [09:13,  1.06s/it][A[A2023-05-05 10:04:06,149 - INFO - validate:433 - Validation: Batch 525/1541, Loss: 3.1163


525it [09:14,  1.06s/it][A[A2023-05-05 10:04:07,207 - INFO - validate:433 - Validation: Batch 526/1541, Loss: 3.1163


526it [09:15,  1.06s/it][A[A2023-05-05 10:04:08,269 - INFO - validate:433 - Validation: Batch 527/1541, Loss: 3.1163


527it [09:16,  1.06s/it][A[A2023-05-05 10:04:09,329 - INFO - validate:433 - Validation: Batch 528/1541, Loss: 3.1163


528it [09:17,  1.06s/it][A[A2023-05-05 10:04:10,387 - INFO - validate:433 - Validation: Batch 529/1541, Loss: 3.1163


529it [09:18,  1.06s/it][A[A



589it [10:22,  1.06s/it][A[A2023-05-05 10:05:15,015 - INFO - validate:433 - Validation: Batch 590/1541, Loss: 3.1159


590it [10:23,  1.06s/it][A[A2023-05-05 10:05:16,072 - INFO - validate:433 - Validation: Batch 591/1541, Loss: 3.1158


591it [10:24,  1.06s/it][A[A2023-05-05 10:05:17,132 - INFO - validate:433 - Validation: Batch 592/1541, Loss: 3.1158


592it [10:25,  1.06s/it][A[A2023-05-05 10:05:18,188 - INFO - validate:433 - Validation: Batch 593/1541, Loss: 3.1158


593it [10:26,  1.06s/it][A[A2023-05-05 10:05:19,252 - INFO - validate:433 - Validation: Batch 594/1541, Loss: 3.1159


594it [10:27,  1.06s/it][A[A2023-05-05 10:05:20,310 - INFO - validate:433 - Validation: Batch 595/1541, Loss: 3.1160


595it [10:28,  1.06s/it][A[A2023-05-05 10:05:21,367 - INFO - validate:433 - Validation: Batch 596/1541, Loss: 3.1161


596it [10:29,  1.06s/it][A[A2023-05-05 10:05:22,425 - INFO - validate:433 - Validation: Batch 597/1541, Loss: 3.1161


597it [10:30,  1.06s/it][A[A



657it [11:34,  1.06s/it][A[A2023-05-05 10:06:27,023 - INFO - validate:433 - Validation: Batch 658/1541, Loss: 3.1168


658it [11:35,  1.06s/it][A[A2023-05-05 10:06:28,084 - INFO - validate:433 - Validation: Batch 659/1541, Loss: 3.1169


659it [11:36,  1.06s/it][A[A2023-05-05 10:06:29,142 - INFO - validate:433 - Validation: Batch 660/1541, Loss: 3.1169


660it [11:37,  1.06s/it][A[A2023-05-05 10:06:30,198 - INFO - validate:433 - Validation: Batch 661/1541, Loss: 3.1171


661it [11:38,  1.06s/it][A[A2023-05-05 10:06:31,254 - INFO - validate:433 - Validation: Batch 662/1541, Loss: 3.1172


662it [11:39,  1.06s/it][A[A2023-05-05 10:06:32,311 - INFO - validate:433 - Validation: Batch 663/1541, Loss: 3.1172


663it [11:40,  1.06s/it][A[A2023-05-05 10:06:33,370 - INFO - validate:433 - Validation: Batch 664/1541, Loss: 3.1173


664it [11:41,  1.06s/it][A[A2023-05-05 10:06:34,431 - INFO - validate:433 - Validation: Batch 665/1541, Loss: 3.1175


665it [11:42,  1.06s/it][A[A



725it [12:46,  1.06s/it][A[A2023-05-05 10:07:38,974 - INFO - validate:433 - Validation: Batch 726/1541, Loss: 3.1254


726it [12:47,  1.06s/it][A[A2023-05-05 10:07:40,031 - INFO - validate:433 - Validation: Batch 727/1541, Loss: 3.1255


727it [12:48,  1.06s/it][A[A2023-05-05 10:07:41,089 - INFO - validate:433 - Validation: Batch 728/1541, Loss: 3.1256


728it [12:49,  1.06s/it][A[A2023-05-05 10:07:42,148 - INFO - validate:433 - Validation: Batch 729/1541, Loss: 3.1256


729it [12:50,  1.06s/it][A[A2023-05-05 10:07:43,202 - INFO - validate:433 - Validation: Batch 730/1541, Loss: 3.1257


730it [12:51,  1.06s/it][A[A2023-05-05 10:07:44,262 - INFO - validate:433 - Validation: Batch 731/1541, Loss: 3.1258


731it [12:52,  1.06s/it][A[A2023-05-05 10:07:45,323 - INFO - validate:433 - Validation: Batch 732/1541, Loss: 3.1260


732it [12:53,  1.06s/it][A[A2023-05-05 10:07:46,380 - INFO - validate:433 - Validation: Batch 733/1541, Loss: 3.1260


733it [12:54,  1.06s/it][A[A



793it [13:58,  1.06s/it][A[A2023-05-05 10:08:50,899 - INFO - validate:433 - Validation: Batch 794/1541, Loss: 3.1265


794it [13:59,  1.06s/it][A[A2023-05-05 10:08:51,957 - INFO - validate:433 - Validation: Batch 795/1541, Loss: 3.1264


795it [14:00,  1.06s/it][A[A2023-05-05 10:08:53,016 - INFO - validate:433 - Validation: Batch 796/1541, Loss: 3.1263


796it [14:01,  1.06s/it][A[A2023-05-05 10:08:54,075 - INFO - validate:433 - Validation: Batch 797/1541, Loss: 3.1263


797it [14:02,  1.06s/it][A[A2023-05-05 10:08:55,135 - INFO - validate:433 - Validation: Batch 798/1541, Loss: 3.1264


798it [14:03,  1.06s/it][A[A2023-05-05 10:08:56,200 - INFO - validate:433 - Validation: Batch 799/1541, Loss: 3.1263


799it [14:04,  1.06s/it][A[A2023-05-05 10:08:57,259 - INFO - validate:433 - Validation: Batch 800/1541, Loss: 3.1262


800it [14:05,  1.06s/it][A[A2023-05-05 10:08:58,320 - INFO - validate:433 - Validation: Batch 801/1541, Loss: 3.1261


801it [14:06,  1.06s/it][A[A



861it [15:10,  1.06s/it][A[A2023-05-05 10:10:02,954 - INFO - validate:433 - Validation: Batch 862/1541, Loss: 3.1250


862it [15:11,  1.06s/it][A[A2023-05-05 10:10:04,012 - INFO - validate:433 - Validation: Batch 863/1541, Loss: 3.1249


863it [15:12,  1.06s/it][A[A2023-05-05 10:10:05,072 - INFO - validate:433 - Validation: Batch 864/1541, Loss: 3.1249


864it [15:13,  1.06s/it][A[A2023-05-05 10:10:06,132 - INFO - validate:433 - Validation: Batch 865/1541, Loss: 3.1250


865it [15:14,  1.06s/it][A[A2023-05-05 10:10:07,189 - INFO - validate:433 - Validation: Batch 866/1541, Loss: 3.1251


866it [15:15,  1.06s/it][A[A2023-05-05 10:10:08,249 - INFO - validate:433 - Validation: Batch 867/1541, Loss: 3.1251


867it [15:16,  1.06s/it][A[A2023-05-05 10:10:09,310 - INFO - validate:433 - Validation: Batch 868/1541, Loss: 3.1253


868it [15:17,  1.06s/it][A[A2023-05-05 10:10:10,371 - INFO - validate:433 - Validation: Batch 869/1541, Loss: 3.1253


869it [15:18,  1.06s/it][A[A



929it [16:22,  1.06s/it][A[A2023-05-05 10:11:15,036 - INFO - validate:433 - Validation: Batch 930/1541, Loss: 3.1232


930it [16:23,  1.06s/it][A[A2023-05-05 10:11:16,096 - INFO - validate:433 - Validation: Batch 931/1541, Loss: 3.1231


931it [16:24,  1.06s/it][A[A2023-05-05 10:11:17,157 - INFO - validate:433 - Validation: Batch 932/1541, Loss: 3.1231


932it [16:25,  1.06s/it][A[A2023-05-05 10:11:18,218 - INFO - validate:433 - Validation: Batch 933/1541, Loss: 3.1230


933it [16:26,  1.06s/it][A[A2023-05-05 10:11:19,276 - INFO - validate:433 - Validation: Batch 934/1541, Loss: 3.1230


934it [16:27,  1.06s/it][A[A2023-05-05 10:11:20,333 - INFO - validate:433 - Validation: Batch 935/1541, Loss: 3.1229


935it [16:28,  1.06s/it][A[A2023-05-05 10:11:21,393 - INFO - validate:433 - Validation: Batch 936/1541, Loss: 3.1229


936it [16:29,  1.06s/it][A[A2023-05-05 10:11:22,454 - INFO - validate:433 - Validation: Batch 937/1541, Loss: 3.1229


937it [16:30,  1.06s/it][A[A



997it [17:34,  1.06s/it][A[A2023-05-05 10:12:27,081 - INFO - validate:433 - Validation: Batch 998/1541, Loss: 3.1217


998it [17:35,  1.06s/it][A[A2023-05-05 10:12:28,141 - INFO - validate:433 - Validation: Batch 999/1541, Loss: 3.1216


999it [17:36,  1.06s/it][A[A2023-05-05 10:12:29,200 - INFO - validate:433 - Validation: Batch 1000/1541, Loss: 3.1215


1000it [17:37,  1.06s/it][A[A2023-05-05 10:12:30,259 - INFO - validate:433 - Validation: Batch 1001/1541, Loss: 3.1215


1001it [17:38,  1.06s/it][A[A2023-05-05 10:12:31,318 - INFO - validate:433 - Validation: Batch 1002/1541, Loss: 3.1215


1002it [17:39,  1.06s/it][A[A2023-05-05 10:12:32,377 - INFO - validate:433 - Validation: Batch 1003/1541, Loss: 3.1216


1003it [17:40,  1.06s/it][A[A2023-05-05 10:12:33,438 - INFO - validate:433 - Validation: Batch 1004/1541, Loss: 3.1217


1004it [17:41,  1.06s/it][A[A2023-05-05 10:12:34,496 - INFO - validate:433 - Validation: Batch 1005/1541, Loss: 3.1216


1005it [17:42,  1.0



1064it [18:45,  1.06s/it][A[A2023-05-05 10:13:38,007 - INFO - validate:433 - Validation: Batch 1065/1541, Loss: 3.1234


1065it [18:46,  1.06s/it][A[A2023-05-05 10:13:39,064 - INFO - validate:433 - Validation: Batch 1066/1541, Loss: 3.1235


1066it [18:47,  1.06s/it][A[A2023-05-05 10:13:40,125 - INFO - validate:433 - Validation: Batch 1067/1541, Loss: 3.1235


1067it [18:48,  1.06s/it][A[A2023-05-05 10:13:41,180 - INFO - validate:433 - Validation: Batch 1068/1541, Loss: 3.1235


1068it [18:49,  1.06s/it][A[A2023-05-05 10:13:42,237 - INFO - validate:433 - Validation: Batch 1069/1541, Loss: 3.1236


1069it [18:50,  1.06s/it][A[A2023-05-05 10:13:43,294 - INFO - validate:433 - Validation: Batch 1070/1541, Loss: 3.1237


1070it [18:51,  1.06s/it][A[A2023-05-05 10:13:44,349 - INFO - validate:433 - Validation: Batch 1071/1541, Loss: 3.1239


1071it [18:52,  1.06s/it][A[A2023-05-05 10:13:45,403 - INFO - validate:433 - Validation: Batch 1072/1541, Loss: 3.1240


1072it [18:53,



1131it [19:56,  1.06s/it][A[A2023-05-05 10:14:48,927 - INFO - validate:433 - Validation: Batch 1132/1541, Loss: 3.1242


1132it [19:57,  1.06s/it][A[A2023-05-05 10:14:49,987 - INFO - validate:433 - Validation: Batch 1133/1541, Loss: 3.1241


1133it [19:58,  1.06s/it][A[A2023-05-05 10:14:51,050 - INFO - validate:433 - Validation: Batch 1134/1541, Loss: 3.1242


1134it [19:59,  1.06s/it][A[A2023-05-05 10:14:52,109 - INFO - validate:433 - Validation: Batch 1135/1541, Loss: 3.1241


1135it [20:00,  1.06s/it][A[A2023-05-05 10:14:53,168 - INFO - validate:433 - Validation: Batch 1136/1541, Loss: 3.1241


1136it [20:01,  1.06s/it][A[A2023-05-05 10:14:54,227 - INFO - validate:433 - Validation: Batch 1137/1541, Loss: 3.1241


1137it [20:02,  1.06s/it][A[A2023-05-05 10:14:55,284 - INFO - validate:433 - Validation: Batch 1138/1541, Loss: 3.1241


1138it [20:03,  1.06s/it][A[A2023-05-05 10:14:56,345 - INFO - validate:433 - Validation: Batch 1139/1541, Loss: 3.1241


1139it [20:04,



1198it [21:07,  1.06s/it][A[A2023-05-05 10:15:59,896 - INFO - validate:433 - Validation: Batch 1199/1541, Loss: 3.1235


1199it [21:08,  1.06s/it][A[A2023-05-05 10:16:00,952 - INFO - validate:433 - Validation: Batch 1200/1541, Loss: 3.1235


1200it [21:09,  1.06s/it][A[A2023-05-05 10:16:02,011 - INFO - validate:433 - Validation: Batch 1201/1541, Loss: 3.1236


1201it [21:10,  1.06s/it][A[A2023-05-05 10:16:03,073 - INFO - validate:433 - Validation: Batch 1202/1541, Loss: 3.1235


1202it [21:11,  1.06s/it][A[A2023-05-05 10:16:04,133 - INFO - validate:433 - Validation: Batch 1203/1541, Loss: 3.1235


1203it [21:12,  1.06s/it][A[A2023-05-05 10:16:05,194 - INFO - validate:433 - Validation: Batch 1204/1541, Loss: 3.1235


1204it [21:13,  1.06s/it][A[A2023-05-05 10:16:06,250 - INFO - validate:433 - Validation: Batch 1205/1541, Loss: 3.1234


1205it [21:14,  1.06s/it][A[A2023-05-05 10:16:07,311 - INFO - validate:433 - Validation: Batch 1206/1541, Loss: 3.1233


1206it [21:15,



1265it [22:18,  1.06s/it][A[A2023-05-05 10:17:10,884 - INFO - validate:433 - Validation: Batch 1266/1541, Loss: 3.1226


1266it [22:19,  1.06s/it][A[A2023-05-05 10:17:11,942 - INFO - validate:433 - Validation: Batch 1267/1541, Loss: 3.1227


1267it [22:20,  1.06s/it][A[A2023-05-05 10:17:13,000 - INFO - validate:433 - Validation: Batch 1268/1541, Loss: 3.1226


1268it [22:21,  1.06s/it][A[A2023-05-05 10:17:14,058 - INFO - validate:433 - Validation: Batch 1269/1541, Loss: 3.1226


1269it [22:22,  1.06s/it][A[A2023-05-05 10:17:15,117 - INFO - validate:433 - Validation: Batch 1270/1541, Loss: 3.1226


1270it [22:23,  1.06s/it][A[A2023-05-05 10:17:16,177 - INFO - validate:433 - Validation: Batch 1271/1541, Loss: 3.1226


1271it [22:24,  1.06s/it][A[A2023-05-05 10:17:17,237 - INFO - validate:433 - Validation: Batch 1272/1541, Loss: 3.1226


1272it [22:25,  1.06s/it][A[A2023-05-05 10:17:18,298 - INFO - validate:433 - Validation: Batch 1273/1541, Loss: 3.1227


1273it [22:26,



1332it [23:29,  1.06s/it][A[A2023-05-05 10:18:21,802 - INFO - validate:433 - Validation: Batch 1333/1541, Loss: 3.1246


1333it [23:30,  1.06s/it][A[A2023-05-05 10:18:22,859 - INFO - validate:433 - Validation: Batch 1334/1541, Loss: 3.1245


1334it [23:31,  1.06s/it][A[A2023-05-05 10:18:23,918 - INFO - validate:433 - Validation: Batch 1335/1541, Loss: 3.1245


1335it [23:32,  1.06s/it][A[A2023-05-05 10:18:24,977 - INFO - validate:433 - Validation: Batch 1336/1541, Loss: 3.1245


1336it [23:33,  1.06s/it][A[A2023-05-05 10:18:26,039 - INFO - validate:433 - Validation: Batch 1337/1541, Loss: 3.1246


1337it [23:34,  1.06s/it][A[A2023-05-05 10:18:27,100 - INFO - validate:433 - Validation: Batch 1338/1541, Loss: 3.1246


1338it [23:35,  1.06s/it][A[A2023-05-05 10:18:28,152 - INFO - validate:433 - Validation: Batch 1339/1541, Loss: 3.1246


1339it [23:36,  1.06s/it][A[A2023-05-05 10:18:29,210 - INFO - validate:433 - Validation: Batch 1340/1541, Loss: 3.1246


1340it [23:37,



1399it [24:39,  1.06s/it][A[A2023-05-05 10:19:32,705 - INFO - validate:433 - Validation: Batch 1400/1541, Loss: 3.1243


1400it [24:40,  1.06s/it][A[A2023-05-05 10:19:33,765 - INFO - validate:433 - Validation: Batch 1401/1541, Loss: 3.1242


1401it [24:42,  1.06s/it][A[A2023-05-05 10:19:34,825 - INFO - validate:433 - Validation: Batch 1402/1541, Loss: 3.1242


1402it [24:43,  1.06s/it][A[A2023-05-05 10:19:35,885 - INFO - validate:433 - Validation: Batch 1403/1541, Loss: 3.1241


1403it [24:44,  1.06s/it][A[A2023-05-05 10:19:36,948 - INFO - validate:433 - Validation: Batch 1404/1541, Loss: 3.1241


1404it [24:45,  1.06s/it][A[A2023-05-05 10:19:38,009 - INFO - validate:433 - Validation: Batch 1405/1541, Loss: 3.1241


1405it [24:46,  1.06s/it][A[A2023-05-05 10:19:39,070 - INFO - validate:433 - Validation: Batch 1406/1541, Loss: 3.1241


1406it [24:47,  1.06s/it][A[A2023-05-05 10:19:40,132 - INFO - validate:433 - Validation: Batch 1407/1541, Loss: 3.1241


1407it [24:48,



1466it [25:50,  1.06s/it][A[A2023-05-05 10:20:43,761 - INFO - validate:433 - Validation: Batch 1467/1541, Loss: 3.1235


1467it [25:52,  1.06s/it][A[A2023-05-05 10:20:44,822 - INFO - validate:433 - Validation: Batch 1468/1541, Loss: 3.1235


1468it [25:53,  1.06s/it][A[A2023-05-05 10:20:45,883 - INFO - validate:433 - Validation: Batch 1469/1541, Loss: 3.1236


1469it [25:54,  1.06s/it][A[A2023-05-05 10:20:46,941 - INFO - validate:433 - Validation: Batch 1470/1541, Loss: 3.1235


1470it [25:55,  1.06s/it][A[A2023-05-05 10:20:48,001 - INFO - validate:433 - Validation: Batch 1471/1541, Loss: 3.1236


1471it [25:56,  1.06s/it][A[A2023-05-05 10:20:49,059 - INFO - validate:433 - Validation: Batch 1472/1541, Loss: 3.1236


1472it [25:57,  1.06s/it][A[A2023-05-05 10:20:50,120 - INFO - validate:433 - Validation: Batch 1473/1541, Loss: 3.1236


1473it [25:58,  1.06s/it][A[A2023-05-05 10:20:51,178 - INFO - validate:433 - Validation: Batch 1474/1541, Loss: 3.1237


1474it [25:59,



1533it [27:01,  1.06s/it][A[A2023-05-05 10:21:54,649 - INFO - validate:433 - Validation: Batch 1534/1541, Loss: 3.1253


1534it [27:02,  1.06s/it][A[A2023-05-05 10:21:55,708 - INFO - validate:433 - Validation: Batch 1535/1541, Loss: 3.1253


1535it [27:03,  1.06s/it][A[A2023-05-05 10:21:56,761 - INFO - validate:433 - Validation: Batch 1536/1541, Loss: 3.1253


1536it [27:05,  1.06s/it][A[A2023-05-05 10:21:57,819 - INFO - validate:433 - Validation: Batch 1537/1541, Loss: 3.1252


1537it [27:06,  1.06s/it][A[A2023-05-05 10:21:58,877 - INFO - validate:433 - Validation: Batch 1538/1541, Loss: 3.1252


1538it [27:07,  1.06s/it][A[A2023-05-05 10:21:59,932 - INFO - validate:433 - Validation: Batch 1539/1541, Loss: 3.1251


1539it [27:08,  1.06s/it][A[A2023-05-05 10:22:00,983 - INFO - validate:433 - Validation: Batch 1540/1541, Loss: 3.1251


1540it [27:09,  1.05s/it][A[A2023-05-05 10:22:01,081 - INFO - validate:433 - Validation: Batch 1541/1541, Loss: 3.1250
1541it [27:09,  

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,▅██▇▆▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁

0,1
epoch,1.0
loss,3.19879


Run vblbgrzx errored: NameError("name 'log_save_path' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run vblbgrzx errored: NameError("name 'log_save_path' is not defined")
[34m[1mwandb[0m: Agent Starting Run: lxibxgbl with config:
[34m[1mwandb[0m: 	BETAS: [0.9, 0.95]
[34m[1mwandb[0m: 	GRADIENT_ACCUMULATION_STEPS: 4
[34m[1mwandb[0m: 	GRADIENT_CLIP: 2.5
[34m[1mwandb[0m: 	LEARNING_RATE: 0.0005
[34m[1mwandb[0m: 	NUM_EPOCHS: 1
[34m[1mwandb[0m: 	SAMPLING_INTERVAL: 10
[34m[1mwandb[0m: 	WEIGHT_DECAY: 0.001
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668344933229187, max=1.0…

2023-05-05 10:22:13,808 - INFO - create_or_load_model:370 - Model: facebook/opt-125m
2023-05-05 10:22:13,809 - INFO - print_trainable_parameters:121 - Parameters: Trainable- 125.24M|| All- 125.24M || Trainable%- 100.0
2023-05-05 10:22:13,811 - INFO - create_or_load_model:372 - Memory Memory Footprint: 500.957184 MB
2023-05-05 10:22:13,811 - INFO - create_or_load_model:373 - Model is on device: cuda:0


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668331304875512, max=1.0…

  0%|          | 0/1 [00:00<?, ?it/s]2023-05-05 10:22:19,656 - INFO - train:33 - Epoch: 1/1

  0%|          | 0/13861 [00:00<?, ?it/s][A2023-05-05 10:22:20,238 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36470 MB.
2023-05-05 10:22:20,687 - INFO - train:65 - Batch 1/13861, Loss: 3.2951, Learning Rate: 0.0005

  0%|          | 1/13861 [00:01<6:51:09,  1.78s/it][A2023-05-05 10:22:21,891 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36470 MB.
2023-05-05 10:22:22,326 - INFO - train:65 - Batch 2/13861, Loss: 3.3077, Learning Rate: 0.0005

  0%|          | 2/13861 [00:03<6:31:49,  1.70s/it][A2023-05-05 10:22:23,528 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36470 MB.
2023-05-05 10:22:23,963 - INFO - train:65 - Batch 3/13861, Loss: 3.3197, Learning Rate: 0.0005

  0%|          | 3/13861 [00:05<6:25:36,  1.67s/it][A2023-05-05 10:22:25,166 - INFO - print_gpu_utilization:106 - GPU memory occupied: 36470 MB.
2023-05-05 10:22:25,599 - INFO - train:65 - B


  0%|          | 51/13861 [01:24<6:18:22,  1.64s/it][A2023-05-05 10:23:44,715 - INFO - train:65 - Batch 52/13861, Loss: 7.1112, Learning Rate: 0.0005

  0%|          | 52/13861 [01:25<6:21:09,  1.66s/it][A2023-05-05 10:23:46,400 - INFO - train:65 - Batch 53/13861, Loss: 7.1058, Learning Rate: 0.0005

  0%|          | 53/13861 [01:27<6:19:50,  1.65s/it][A2023-05-05 10:23:48,038 - INFO - train:65 - Batch 54/13861, Loss: 7.1000, Learning Rate: 0.0005

  0%|          | 54/13861 [01:29<6:18:55,  1.65s/it][A2023-05-05 10:23:49,678 - INFO - train:65 - Batch 55/13861, Loss: 7.0942, Learning Rate: 0.0005

  0%|          | 55/13861 [01:30<6:18:33,  1.65s/it][A2023-05-05 10:23:51,317 - INFO - train:65 - Batch 56/13861, Loss: 7.0905, Learning Rate: 0.0005

  0%|          | 56/13861 [01:32<6:21:18,  1.66s/it][A2023-05-05 10:23:53,002 - INFO - train:65 - Batch 57/13861, Loss: 7.0684, Learning Rate: 0.0005

  0%|          | 57/13861 [01:34<6:19:58,  1.65s/it][A2023-05-05 10:23:54,641 - INFO -


  1%|          | 105/13861 [02:53<6:18:35,  1.65s/it][A2023-05-05 10:25:13,830 - INFO - train:65 - Batch 106/13861, Loss: 6.3827, Learning Rate: 0.0005

  1%|          | 106/13861 [02:54<6:17:39,  1.65s/it][A2023-05-05 10:25:15,468 - INFO - train:65 - Batch 107/13861, Loss: 6.3716, Learning Rate: 0.0005

  1%|          | 107/13861 [02:56<6:16:57,  1.64s/it][A2023-05-05 10:25:17,106 - INFO - train:65 - Batch 108/13861, Loss: 6.3608, Learning Rate: 0.0005

  1%|          | 108/13861 [02:58<6:19:41,  1.66s/it][A2023-05-05 10:25:18,790 - INFO - train:65 - Batch 109/13861, Loss: 6.3470, Learning Rate: 0.0005

  1%|          | 109/13861 [02:59<6:18:23,  1.65s/it][A2023-05-05 10:25:20,428 - INFO - train:65 - Batch 110/13861, Loss: 6.3331, Learning Rate: 0.0005

  1%|          | 110/13861 [03:01<6:17:30,  1.65s/it][A2023-05-05 10:25:22,067 - INFO - train:65 - Batch 111/13861, Loss: 6.3198, Learning Rate: 0.0005

  1%|          | 111/13861 [03:03<6:16:50,  1.64s/it][A2023-05-05 10:25:23

  2%|▏         | 211/13861 [05:48<6:14:02,  1.64s/it][A2023-05-05 10:28:08,664 - INFO - train:65 - Batch 212/13861, Loss: 5.4618, Learning Rate: 0.0005

  2%|▏         | 212/13861 [05:49<6:16:44,  1.66s/it][A2023-05-05 10:28:10,348 - INFO - train:65 - Batch 213/13861, Loss: 5.4552, Learning Rate: 0.0005

  2%|▏         | 213/13861 [05:51<6:15:30,  1.65s/it][A2023-05-05 10:28:11,986 - INFO - train:65 - Batch 214/13861, Loss: 5.4489, Learning Rate: 0.0005

  2%|▏         | 214/13861 [05:53<6:14:38,  1.65s/it][A2023-05-05 10:28:13,625 - INFO - train:65 - Batch 215/13861, Loss: 5.4428, Learning Rate: 0.0005

  2%|▏         | 215/13861 [05:54<6:14:00,  1.64s/it][A2023-05-05 10:28:15,264 - INFO - train:65 - Batch 216/13861, Loss: 5.4369, Learning Rate: 0.0005

  2%|▏         | 216/13861 [05:56<6:16:46,  1.66s/it][A2023-05-05 10:28:16,948 - INFO - train:65 - Batch 217/13861, Loss: 5.4304, Learning Rate: 0.0005

  2%|▏         | 217/13861 [05:58<6:15:24,  1.65s/it][A2023-05-05 10:28:18,

  2%|▏         | 317/13861 [08:42<6:12:43,  1.65s/it][A2023-05-05 10:31:03,532 - INFO - train:65 - Batch 318/13861, Loss: 4.9304, Learning Rate: 0.0005

  2%|▏         | 318/13861 [08:44<6:11:46,  1.65s/it][A2023-05-05 10:31:05,169 - INFO - train:65 - Batch 319/13861, Loss: 4.9262, Learning Rate: 0.0005

  2%|▏         | 319/13861 [08:46<6:11:08,  1.64s/it][A2023-05-05 10:31:06,808 - INFO - train:65 - Batch 320/13861, Loss: 4.9222, Learning Rate: 0.0005

  2%|▏         | 320/13861 [08:47<6:13:46,  1.66s/it][A2023-05-05 10:31:08,492 - INFO - train:65 - Batch 321/13861, Loss: 4.9183, Learning Rate: 0.0005

  2%|▏         | 321/13861 [08:49<6:12:28,  1.65s/it][A2023-05-05 10:31:10,129 - INFO - train:65 - Batch 322/13861, Loss: 4.9145, Learning Rate: 0.0005

  2%|▏         | 322/13861 [08:51<6:11:32,  1.65s/it][A2023-05-05 10:31:11,766 - INFO - train:65 - Batch 323/13861, Loss: 4.9106, Learning Rate: 0.0005

  2%|▏         | 323/13861 [08:52<6:10:52,  1.64s/it][A2023-05-05 10:31:13,

  3%|▎         | 423/13861 [11:37<6:07:55,  1.64s/it][A2023-05-05 10:33:58,270 - INFO - train:65 - Batch 424/13861, Loss: 4.6124, Learning Rate: 0.0005

  3%|▎         | 424/13861 [11:39<6:10:39,  1.66s/it][A2023-05-05 10:33:59,955 - INFO - train:65 - Batch 425/13861, Loss: 4.6102, Learning Rate: 0.0005

  3%|▎         | 425/13861 [11:41<6:09:23,  1.65s/it][A2023-05-05 10:34:01,591 - INFO - train:65 - Batch 426/13861, Loss: 4.6083, Learning Rate: 0.0005

  3%|▎         | 426/13861 [11:42<6:08:30,  1.65s/it][A2023-05-05 10:34:03,228 - INFO - train:65 - Batch 427/13861, Loss: 4.6063, Learning Rate: 0.0005

  3%|▎         | 427/13861 [11:44<6:07:51,  1.64s/it][A2023-05-05 10:34:04,865 - INFO - train:65 - Batch 428/13861, Loss: 4.6042, Learning Rate: 0.0005

  3%|▎         | 428/13861 [11:46<6:10:29,  1.65s/it][A2023-05-05 10:34:06,547 - INFO - train:65 - Batch 429/13861, Loss: 4.6021, Learning Rate: 0.0005

  3%|▎         | 429/13861 [11:47<6:09:15,  1.65s/it][A2023-05-05 10:34:08,



35it [00:37,  1.06s/it][A[A2023-05-05 10:36:41,597 - INFO - validate:433 - Validation: Batch 36/1541, Loss: 4.2976


36it [00:38,  1.05s/it][A[A2023-05-05 10:36:42,649 - INFO - validate:433 - Validation: Batch 37/1541, Loss: 4.2980


37it [00:39,  1.05s/it][A[A2023-05-05 10:36:43,709 - INFO - validate:433 - Validation: Batch 38/1541, Loss: 4.2986


38it [00:40,  1.06s/it][A[A2023-05-05 10:36:44,764 - INFO - validate:433 - Validation: Batch 39/1541, Loss: 4.3011


39it [00:41,  1.06s/it][A[A2023-05-05 10:36:45,823 - INFO - validate:433 - Validation: Batch 40/1541, Loss: 4.3043


40it [00:42,  1.06s/it][A[A2023-05-05 10:36:46,877 - INFO - validate:433 - Validation: Batch 41/1541, Loss: 4.3051


41it [00:43,  1.06s/it][A[A2023-05-05 10:36:47,928 - INFO - validate:433 - Validation: Batch 42/1541, Loss: 4.3075


42it [00:44,  1.05s/it][A[A2023-05-05 10:36:48,975 - INFO - validate:433 - Validation: Batch 43/1541, Loss: 4.3123


43it [00:45,  1.05s/it][A[A2023-05-05 10:36:



104it [01:49,  1.06s/it][A[A2023-05-05 10:37:54,371 - INFO - validate:433 - Validation: Batch 105/1541, Loss: 4.2238


105it [01:50,  1.06s/it][A[A2023-05-05 10:37:55,426 - INFO - validate:433 - Validation: Batch 106/1541, Loss: 4.2232


106it [01:51,  1.06s/it][A[A2023-05-05 10:37:56,483 - INFO - validate:433 - Validation: Batch 107/1541, Loss: 4.2225


107it [01:52,  1.06s/it][A[A2023-05-05 10:37:57,539 - INFO - validate:433 - Validation: Batch 108/1541, Loss: 4.2218


108it [01:54,  1.06s/it][A[A2023-05-05 10:37:58,596 - INFO - validate:433 - Validation: Batch 109/1541, Loss: 4.2223


109it [01:55,  1.06s/it][A[A2023-05-05 10:37:59,653 - INFO - validate:433 - Validation: Batch 110/1541, Loss: 4.2210


110it [01:56,  1.06s/it][A[A2023-05-05 10:38:00,714 - INFO - validate:433 - Validation: Batch 111/1541, Loss: 4.2200


111it [01:57,  1.06s/it][A[A2023-05-05 10:38:01,771 - INFO - validate:433 - Validation: Batch 112/1541, Loss: 4.2187


112it [01:58,  1.06s/it][A[A



172it [03:01,  1.06s/it][A[A2023-05-05 10:39:06,255 - INFO - validate:433 - Validation: Batch 173/1541, Loss: 4.2076


173it [03:02,  1.06s/it][A[A2023-05-05 10:39:07,313 - INFO - validate:433 - Validation: Batch 174/1541, Loss: 4.2070


174it [03:03,  1.06s/it][A[A2023-05-05 10:39:08,371 - INFO - validate:433 - Validation: Batch 175/1541, Loss: 4.2066


175it [03:04,  1.06s/it][A[A2023-05-05 10:39:09,427 - INFO - validate:433 - Validation: Batch 176/1541, Loss: 4.2062


176it [03:05,  1.06s/it][A[A2023-05-05 10:39:10,487 - INFO - validate:433 - Validation: Batch 177/1541, Loss: 4.2059


177it [03:06,  1.06s/it][A[A2023-05-05 10:39:11,546 - INFO - validate:433 - Validation: Batch 178/1541, Loss: 4.2057


178it [03:08,  1.06s/it][A[A2023-05-05 10:39:12,605 - INFO - validate:433 - Validation: Batch 179/1541, Loss: 4.2054


179it [03:09,  1.06s/it][A[A2023-05-05 10:39:13,664 - INFO - validate:433 - Validation: Batch 180/1541, Loss: 4.2048


180it [03:10,  1.06s/it][A[A



240it [04:13,  1.06s/it][A[A2023-05-05 10:40:18,236 - INFO - validate:433 - Validation: Batch 241/1541, Loss: 4.1920


241it [04:14,  1.06s/it][A[A2023-05-05 10:40:19,293 - INFO - validate:433 - Validation: Batch 242/1541, Loss: 4.1923


242it [04:15,  1.06s/it][A[A2023-05-05 10:40:20,352 - INFO - validate:433 - Validation: Batch 243/1541, Loss: 4.1919


243it [04:16,  1.06s/it][A[A2023-05-05 10:40:21,410 - INFO - validate:433 - Validation: Batch 244/1541, Loss: 4.1921


244it [04:17,  1.06s/it][A[A2023-05-05 10:40:22,469 - INFO - validate:433 - Validation: Batch 245/1541, Loss: 4.1923


245it [04:18,  1.06s/it][A[A2023-05-05 10:40:23,527 - INFO - validate:433 - Validation: Batch 246/1541, Loss: 4.1923


246it [04:19,  1.06s/it][A[A2023-05-05 10:40:24,585 - INFO - validate:433 - Validation: Batch 247/1541, Loss: 4.1924


247it [04:21,  1.06s/it][A[A2023-05-05 10:40:25,645 - INFO - validate:433 - Validation: Batch 248/1541, Loss: 4.1923


248it [04:22,  1.06s/it][A[A



308it [05:25,  1.06s/it][A[A2023-05-05 10:41:30,199 - INFO - validate:433 - Validation: Batch 309/1541, Loss: 4.1933


309it [05:26,  1.06s/it][A[A2023-05-05 10:41:31,256 - INFO - validate:433 - Validation: Batch 310/1541, Loss: 4.1932


310it [05:27,  1.06s/it][A[A2023-05-05 10:41:32,315 - INFO - validate:433 - Validation: Batch 311/1541, Loss: 4.1931


311it [05:28,  1.06s/it][A[A2023-05-05 10:41:33,370 - INFO - validate:433 - Validation: Batch 312/1541, Loss: 4.1933


312it [05:29,  1.06s/it][A[A2023-05-05 10:41:34,429 - INFO - validate:433 - Validation: Batch 313/1541, Loss: 4.1929


313it [05:30,  1.06s/it][A[A2023-05-05 10:41:35,487 - INFO - validate:433 - Validation: Batch 314/1541, Loss: 4.1927


314it [05:31,  1.06s/it][A[A2023-05-05 10:41:36,546 - INFO - validate:433 - Validation: Batch 315/1541, Loss: 4.1931


315it [05:33,  1.06s/it][A[A2023-05-05 10:41:37,600 - INFO - validate:433 - Validation: Batch 316/1541, Loss: 4.1935


316it [05:34,  1.06s/it][A[A



376it [06:37,  1.06s/it][A[A2023-05-05 10:42:42,145 - INFO - validate:433 - Validation: Batch 377/1541, Loss: 4.1978


377it [06:38,  1.06s/it][A[A2023-05-05 10:42:43,201 - INFO - validate:433 - Validation: Batch 378/1541, Loss: 4.1974


378it [06:39,  1.06s/it][A[A2023-05-05 10:42:44,260 - INFO - validate:433 - Validation: Batch 379/1541, Loss: 4.1974


379it [06:40,  1.06s/it][A[A2023-05-05 10:42:45,315 - INFO - validate:433 - Validation: Batch 380/1541, Loss: 4.1979


380it [06:41,  1.06s/it][A[A2023-05-05 10:42:46,378 - INFO - validate:433 - Validation: Batch 381/1541, Loss: 4.1977


381it [06:42,  1.06s/it][A[A2023-05-05 10:42:47,435 - INFO - validate:433 - Validation: Batch 382/1541, Loss: 4.1980


382it [06:43,  1.06s/it][A[A2023-05-05 10:42:48,495 - INFO - validate:433 - Validation: Batch 383/1541, Loss: 4.1980


383it [06:44,  1.06s/it][A[A2023-05-05 10:42:49,551 - INFO - validate:433 - Validation: Batch 384/1541, Loss: 4.1983


384it [06:46,  1.06s/it][A[A



444it [07:49,  1.06s/it][A[A2023-05-05 10:43:54,094 - INFO - validate:433 - Validation: Batch 445/1541, Loss: 4.2011


445it [07:50,  1.06s/it][A[A2023-05-05 10:43:55,154 - INFO - validate:433 - Validation: Batch 446/1541, Loss: 4.2011


446it [07:51,  1.06s/it][A[A2023-05-05 10:43:56,213 - INFO - validate:433 - Validation: Batch 447/1541, Loss: 4.2012


447it [07:52,  1.06s/it][A[A2023-05-05 10:43:57,272 - INFO - validate:433 - Validation: Batch 448/1541, Loss: 4.2012


448it [07:53,  1.06s/it][A[A2023-05-05 10:43:58,333 - INFO - validate:433 - Validation: Batch 449/1541, Loss: 4.2010


449it [07:54,  1.06s/it][A[A2023-05-05 10:43:59,390 - INFO - validate:433 - Validation: Batch 450/1541, Loss: 4.2010


450it [07:55,  1.06s/it][A[A2023-05-05 10:44:00,447 - INFO - validate:433 - Validation: Batch 451/1541, Loss: 4.2010


451it [07:56,  1.06s/it][A[A2023-05-05 10:44:01,504 - INFO - validate:433 - Validation: Batch 452/1541, Loss: 4.2012


452it [07:57,  1.06s/it][A[A



512it [09:01,  1.06s/it][A[A2023-05-05 10:45:06,074 - INFO - validate:433 - Validation: Batch 513/1541, Loss: 4.2026


513it [09:02,  1.06s/it][A[A2023-05-05 10:45:07,135 - INFO - validate:433 - Validation: Batch 514/1541, Loss: 4.2025


514it [09:03,  1.06s/it][A[A2023-05-05 10:45:08,194 - INFO - validate:433 - Validation: Batch 515/1541, Loss: 4.2022


515it [09:04,  1.06s/it][A[A2023-05-05 10:45:09,253 - INFO - validate:433 - Validation: Batch 516/1541, Loss: 4.2021


516it [09:05,  1.06s/it][A[A2023-05-05 10:45:10,313 - INFO - validate:433 - Validation: Batch 517/1541, Loss: 4.2019


517it [09:06,  1.06s/it][A[A2023-05-05 10:45:11,369 - INFO - validate:433 - Validation: Batch 518/1541, Loss: 4.2019


518it [09:07,  1.06s/it][A[A2023-05-05 10:45:12,430 - INFO - validate:433 - Validation: Batch 519/1541, Loss: 4.2018


519it [09:08,  1.06s/it][A[A2023-05-05 10:45:13,491 - INFO - validate:433 - Validation: Batch 520/1541, Loss: 4.2016


520it [09:09,  1.06s/it][A[A



580it [10:13,  1.06s/it][A[A2023-05-05 10:46:18,093 - INFO - validate:433 - Validation: Batch 581/1541, Loss: 4.2020


581it [10:14,  1.06s/it][A[A2023-05-05 10:46:19,150 - INFO - validate:433 - Validation: Batch 582/1541, Loss: 4.2019


582it [10:15,  1.06s/it][A[A2023-05-05 10:46:20,207 - INFO - validate:433 - Validation: Batch 583/1541, Loss: 4.2020


583it [10:16,  1.06s/it][A[A2023-05-05 10:46:21,265 - INFO - validate:433 - Validation: Batch 584/1541, Loss: 4.2023


584it [10:17,  1.06s/it][A[A2023-05-05 10:46:22,321 - INFO - validate:433 - Validation: Batch 585/1541, Loss: 4.2023


585it [10:18,  1.06s/it][A[A2023-05-05 10:46:23,382 - INFO - validate:433 - Validation: Batch 586/1541, Loss: 4.2023


586it [10:19,  1.06s/it][A[A2023-05-05 10:46:24,442 - INFO - validate:433 - Validation: Batch 587/1541, Loss: 4.2022


587it [10:20,  1.06s/it][A[A2023-05-05 10:46:25,500 - INFO - validate:433 - Validation: Batch 588/1541, Loss: 4.2023


588it [10:21,  1.06s/it][A[A



648it [11:25,  1.06s/it][A[A2023-05-05 10:47:29,895 - INFO - validate:433 - Validation: Batch 649/1541, Loss: 4.2041


649it [11:26,  1.06s/it][A[A2023-05-05 10:47:30,950 - INFO - validate:433 - Validation: Batch 650/1541, Loss: 4.2043


650it [11:27,  1.06s/it][A[A2023-05-05 10:47:32,009 - INFO - validate:433 - Validation: Batch 651/1541, Loss: 4.2044


651it [11:28,  1.06s/it][A[A2023-05-05 10:47:33,066 - INFO - validate:433 - Validation: Batch 652/1541, Loss: 4.2044


652it [11:29,  1.06s/it][A[A2023-05-05 10:47:34,121 - INFO - validate:433 - Validation: Batch 653/1541, Loss: 4.2045


653it [11:30,  1.06s/it][A[A2023-05-05 10:47:35,175 - INFO - validate:433 - Validation: Batch 654/1541, Loss: 4.2047


654it [11:31,  1.06s/it][A[A2023-05-05 10:47:36,229 - INFO - validate:433 - Validation: Batch 655/1541, Loss: 4.2046


655it [11:32,  1.06s/it][A[A2023-05-05 10:47:37,284 - INFO - validate:433 - Validation: Batch 656/1541, Loss: 4.2047


656it [11:33,  1.06s/it][A[A



716it [12:37,  1.05s/it][A[A2023-05-05 10:48:41,678 - INFO - validate:433 - Validation: Batch 717/1541, Loss: 4.2125


717it [12:38,  1.06s/it][A[A2023-05-05 10:48:42,735 - INFO - validate:433 - Validation: Batch 718/1541, Loss: 4.2127


718it [12:39,  1.06s/it][A[A2023-05-05 10:48:43,793 - INFO - validate:433 - Validation: Batch 719/1541, Loss: 4.2128


719it [12:40,  1.06s/it][A[A2023-05-05 10:48:44,850 - INFO - validate:433 - Validation: Batch 720/1541, Loss: 4.2129


720it [12:41,  1.06s/it][A[A2023-05-05 10:48:45,905 - INFO - validate:433 - Validation: Batch 721/1541, Loss: 4.2132


721it [12:42,  1.06s/it][A[A2023-05-05 10:48:46,959 - INFO - validate:433 - Validation: Batch 722/1541, Loss: 4.2135


722it [12:43,  1.06s/it][A[A2023-05-05 10:48:48,018 - INFO - validate:433 - Validation: Batch 723/1541, Loss: 4.2135


723it [12:44,  1.06s/it][A[A2023-05-05 10:48:49,073 - INFO - validate:433 - Validation: Batch 724/1541, Loss: 4.2136


724it [12:45,  1.06s/it][A[A



784it [13:48,  1.06s/it][A[A2023-05-05 10:49:53,506 - INFO - validate:433 - Validation: Batch 785/1541, Loss: 4.2162


785it [13:49,  1.06s/it][A[A2023-05-05 10:49:54,549 - INFO - validate:433 - Validation: Batch 786/1541, Loss: 4.2160


786it [13:51,  1.05s/it][A[A2023-05-05 10:49:55,599 - INFO - validate:433 - Validation: Batch 787/1541, Loss: 4.2159


787it [13:52,  1.05s/it][A[A2023-05-05 10:49:56,656 - INFO - validate:433 - Validation: Batch 788/1541, Loss: 4.2160


788it [13:53,  1.05s/it][A[A2023-05-05 10:49:57,713 - INFO - validate:433 - Validation: Batch 789/1541, Loss: 4.2162


789it [13:54,  1.05s/it][A[A2023-05-05 10:49:58,761 - INFO - validate:433 - Validation: Batch 790/1541, Loss: 4.2163


790it [13:55,  1.05s/it][A[A2023-05-05 10:49:59,818 - INFO - validate:433 - Validation: Batch 791/1541, Loss: 4.2162


791it [13:56,  1.05s/it][A[A2023-05-05 10:50:00,877 - INFO - validate:433 - Validation: Batch 792/1541, Loss: 4.2160


792it [13:57,  1.06s/it][A[A



852it [15:00,  1.06s/it][A[A2023-05-05 10:51:05,414 - INFO - validate:433 - Validation: Batch 853/1541, Loss: 4.2147


853it [15:01,  1.06s/it][A[A2023-05-05 10:51:06,470 - INFO - validate:433 - Validation: Batch 854/1541, Loss: 4.2147


854it [15:02,  1.06s/it][A[A2023-05-05 10:51:07,529 - INFO - validate:433 - Validation: Batch 855/1541, Loss: 4.2147


855it [15:03,  1.06s/it][A[A2023-05-05 10:51:08,583 - INFO - validate:433 - Validation: Batch 856/1541, Loss: 4.2147


856it [15:05,  1.06s/it][A[A2023-05-05 10:51:09,639 - INFO - validate:433 - Validation: Batch 857/1541, Loss: 4.2147


857it [15:06,  1.06s/it][A[A2023-05-05 10:51:10,696 - INFO - validate:433 - Validation: Batch 858/1541, Loss: 4.2147


858it [15:07,  1.06s/it][A[A2023-05-05 10:51:11,756 - INFO - validate:433 - Validation: Batch 859/1541, Loss: 4.2148


859it [15:08,  1.06s/it][A[A2023-05-05 10:51:12,814 - INFO - validate:433 - Validation: Batch 860/1541, Loss: 4.2148


860it [15:09,  1.06s/it][A[A



920it [16:12,  1.06s/it][A[A2023-05-05 10:52:17,377 - INFO - validate:433 - Validation: Batch 921/1541, Loss: 4.2139


921it [16:13,  1.06s/it][A[A2023-05-05 10:52:18,435 - INFO - validate:433 - Validation: Batch 922/1541, Loss: 4.2138


922it [16:14,  1.06s/it][A[A2023-05-05 10:52:19,493 - INFO - validate:433 - Validation: Batch 923/1541, Loss: 4.2137


923it [16:15,  1.06s/it][A[A2023-05-05 10:52:20,552 - INFO - validate:433 - Validation: Batch 924/1541, Loss: 4.2136


924it [16:17,  1.06s/it][A[A2023-05-05 10:52:21,612 - INFO - validate:433 - Validation: Batch 925/1541, Loss: 4.2134


925it [16:18,  1.06s/it][A[A2023-05-05 10:52:22,671 - INFO - validate:433 - Validation: Batch 926/1541, Loss: 4.2134


926it [16:19,  1.06s/it][A[A2023-05-05 10:52:23,730 - INFO - validate:433 - Validation: Batch 927/1541, Loss: 4.2133


927it [16:20,  1.06s/it][A[A2023-05-05 10:52:24,789 - INFO - validate:433 - Validation: Batch 928/1541, Loss: 4.2133


928it [16:21,  1.06s/it][A[A



988it [17:24,  1.06s/it][A[A2023-05-05 10:53:29,335 - INFO - validate:433 - Validation: Batch 989/1541, Loss: 4.2117


989it [17:25,  1.06s/it][A[A2023-05-05 10:53:30,389 - INFO - validate:433 - Validation: Batch 990/1541, Loss: 4.2116


990it [17:26,  1.06s/it][A[A2023-05-05 10:53:31,448 - INFO - validate:433 - Validation: Batch 991/1541, Loss: 4.2117


991it [17:27,  1.06s/it][A[A2023-05-05 10:53:32,506 - INFO - validate:433 - Validation: Batch 992/1541, Loss: 4.2117


992it [17:28,  1.06s/it][A[A2023-05-05 10:53:33,565 - INFO - validate:433 - Validation: Batch 993/1541, Loss: 4.2119


993it [17:30,  1.06s/it][A[A2023-05-05 10:53:34,624 - INFO - validate:433 - Validation: Batch 994/1541, Loss: 4.2119


994it [17:31,  1.06s/it][A[A2023-05-05 10:53:35,682 - INFO - validate:433 - Validation: Batch 995/1541, Loss: 4.2119


995it [17:32,  1.06s/it][A[A2023-05-05 10:53:36,739 - INFO - validate:433 - Validation: Batch 996/1541, Loss: 4.2119


996it [17:33,  1.06s/it][A[A



1055it [18:35,  1.06s/it][A[A2023-05-05 10:54:40,200 - INFO - validate:433 - Validation: Batch 1056/1541, Loss: 4.2156


1056it [18:36,  1.06s/it][A[A2023-05-05 10:54:41,257 - INFO - validate:433 - Validation: Batch 1057/1541, Loss: 4.2156


1057it [18:37,  1.06s/it][A[A2023-05-05 10:54:42,316 - INFO - validate:433 - Validation: Batch 1058/1541, Loss: 4.2158


1058it [18:38,  1.06s/it][A[A2023-05-05 10:54:43,373 - INFO - validate:433 - Validation: Batch 1059/1541, Loss: 4.2158


1059it [18:39,  1.06s/it][A[A2023-05-05 10:54:44,429 - INFO - validate:433 - Validation: Batch 1060/1541, Loss: 4.2159


1060it [18:40,  1.06s/it][A[A2023-05-05 10:54:45,485 - INFO - validate:433 - Validation: Batch 1061/1541, Loss: 4.2160


1061it [18:41,  1.06s/it][A[A2023-05-05 10:54:46,538 - INFO - validate:433 - Validation: Batch 1062/1541, Loss: 4.2161


1062it [18:43,  1.06s/it][A[A2023-05-05 10:54:47,597 - INFO - validate:433 - Validation: Batch 1063/1541, Loss: 4.2162


1063it [18:44,



1122it [19:46,  1.06s/it][A[A2023-05-05 10:55:51,018 - INFO - validate:433 - Validation: Batch 1123/1541, Loss: 4.2196


1123it [19:47,  1.06s/it][A[A2023-05-05 10:55:52,080 - INFO - validate:433 - Validation: Batch 1124/1541, Loss: 4.2197


1124it [19:48,  1.06s/it][A[A2023-05-05 10:55:53,139 - INFO - validate:433 - Validation: Batch 1125/1541, Loss: 4.2197


1125it [19:49,  1.06s/it][A[A2023-05-05 10:55:54,198 - INFO - validate:433 - Validation: Batch 1126/1541, Loss: 4.2197


1126it [19:50,  1.06s/it][A[A2023-05-05 10:55:55,255 - INFO - validate:433 - Validation: Batch 1127/1541, Loss: 4.2197


1127it [19:51,  1.06s/it][A[A2023-05-05 10:55:56,312 - INFO - validate:433 - Validation: Batch 1128/1541, Loss: 4.2197


1128it [19:52,  1.06s/it][A[A2023-05-05 10:55:57,371 - INFO - validate:433 - Validation: Batch 1129/1541, Loss: 4.2197


1129it [19:53,  1.06s/it][A[A2023-05-05 10:55:58,429 - INFO - validate:433 - Validation: Batch 1130/1541, Loss: 4.2198


1130it [19:54,



1189it [20:57,  1.06s/it][A[A2023-05-05 10:57:01,907 - INFO - validate:433 - Validation: Batch 1190/1541, Loss: 4.2200


1190it [20:58,  1.06s/it][A[A2023-05-05 10:57:02,966 - INFO - validate:433 - Validation: Batch 1191/1541, Loss: 4.2199


1191it [20:59,  1.06s/it][A[A2023-05-05 10:57:04,023 - INFO - validate:433 - Validation: Batch 1192/1541, Loss: 4.2200


1192it [21:00,  1.06s/it][A[A2023-05-05 10:57:05,081 - INFO - validate:433 - Validation: Batch 1193/1541, Loss: 4.2200


1193it [21:01,  1.06s/it][A[A2023-05-05 10:57:06,140 - INFO - validate:433 - Validation: Batch 1194/1541, Loss: 4.2201


1194it [21:02,  1.06s/it][A[A2023-05-05 10:57:07,197 - INFO - validate:433 - Validation: Batch 1195/1541, Loss: 4.2201


1195it [21:03,  1.06s/it][A[A2023-05-05 10:57:08,253 - INFO - validate:433 - Validation: Batch 1196/1541, Loss: 4.2202


1196it [21:04,  1.06s/it][A[A2023-05-05 10:57:09,311 - INFO - validate:433 - Validation: Batch 1197/1541, Loss: 4.2202


1197it [21:05,



1256it [22:08,  1.06s/it][A[A2023-05-05 10:58:12,810 - INFO - validate:433 - Validation: Batch 1257/1541, Loss: 4.2192


1257it [22:09,  1.06s/it][A[A2023-05-05 10:58:13,868 - INFO - validate:433 - Validation: Batch 1258/1541, Loss: 4.2192


1258it [22:10,  1.06s/it][A[A2023-05-05 10:58:14,926 - INFO - validate:433 - Validation: Batch 1259/1541, Loss: 4.2192


1259it [22:11,  1.06s/it][A[A2023-05-05 10:58:15,982 - INFO - validate:433 - Validation: Batch 1260/1541, Loss: 4.2192


1260it [22:12,  1.06s/it][A[A2023-05-05 10:58:17,040 - INFO - validate:433 - Validation: Batch 1261/1541, Loss: 4.2192


1261it [22:13,  1.06s/it][A[A2023-05-05 10:58:18,101 - INFO - validate:433 - Validation: Batch 1262/1541, Loss: 4.2193


1262it [22:14,  1.06s/it][A[A2023-05-05 10:58:19,157 - INFO - validate:433 - Validation: Batch 1263/1541, Loss: 4.2193


1263it [22:15,  1.06s/it][A[A2023-05-05 10:58:20,214 - INFO - validate:433 - Validation: Batch 1264/1541, Loss: 4.2193


1264it [22:16,



1323it [23:19,  1.06s/it][A[A2023-05-05 10:59:23,661 - INFO - validate:433 - Validation: Batch 1324/1541, Loss: 4.2236


1324it [23:20,  1.06s/it][A[A2023-05-05 10:59:24,720 - INFO - validate:433 - Validation: Batch 1325/1541, Loss: 4.2236


1325it [23:21,  1.06s/it][A[A2023-05-05 10:59:25,777 - INFO - validate:433 - Validation: Batch 1326/1541, Loss: 4.2236


1326it [23:22,  1.06s/it][A[A2023-05-05 10:59:26,831 - INFO - validate:433 - Validation: Batch 1327/1541, Loss: 4.2236


1327it [23:23,  1.06s/it][A[A2023-05-05 10:59:27,893 - INFO - validate:433 - Validation: Batch 1328/1541, Loss: 4.2236


1328it [23:24,  1.06s/it][A[A2023-05-05 10:59:28,942 - INFO - validate:433 - Validation: Batch 1329/1541, Loss: 4.2236


1329it [23:25,  1.06s/it][A[A2023-05-05 10:59:29,998 - INFO - validate:433 - Validation: Batch 1330/1541, Loss: 4.2237


1330it [23:26,  1.06s/it][A[A2023-05-05 10:59:31,053 - INFO - validate:433 - Validation: Batch 1331/1541, Loss: 4.2238


1331it [23:27,



1390it [24:29,  1.05s/it][A[A2023-05-05 11:00:34,397 - INFO - validate:433 - Validation: Batch 1391/1541, Loss: 4.2242


1391it [24:30,  1.06s/it][A[A2023-05-05 11:00:35,454 - INFO - validate:433 - Validation: Batch 1392/1541, Loss: 4.2242


1392it [24:31,  1.06s/it][A[A2023-05-05 11:00:36,513 - INFO - validate:433 - Validation: Batch 1393/1541, Loss: 4.2242


1393it [24:32,  1.06s/it][A[A2023-05-05 11:00:37,568 - INFO - validate:433 - Validation: Batch 1394/1541, Loss: 4.2242


1394it [24:34,  1.06s/it][A[A2023-05-05 11:00:38,624 - INFO - validate:433 - Validation: Batch 1395/1541, Loss: 4.2243


1395it [24:35,  1.06s/it][A[A2023-05-05 11:00:39,680 - INFO - validate:433 - Validation: Batch 1396/1541, Loss: 4.2243


1396it [24:36,  1.06s/it][A[A2023-05-05 11:00:40,737 - INFO - validate:433 - Validation: Batch 1397/1541, Loss: 4.2243


1397it [24:37,  1.06s/it][A[A2023-05-05 11:00:41,792 - INFO - validate:433 - Validation: Batch 1398/1541, Loss: 4.2242


1398it [24:38,



1457it [25:40,  1.06s/it][A[A2023-05-05 11:01:45,227 - INFO - validate:433 - Validation: Batch 1458/1541, Loss: 4.2235


1458it [25:41,  1.06s/it][A[A2023-05-05 11:01:46,286 - INFO - validate:433 - Validation: Batch 1459/1541, Loss: 4.2235


1459it [25:42,  1.06s/it][A[A2023-05-05 11:01:47,342 - INFO - validate:433 - Validation: Batch 1460/1541, Loss: 4.2236


1460it [25:43,  1.06s/it][A[A2023-05-05 11:01:48,397 - INFO - validate:433 - Validation: Batch 1461/1541, Loss: 4.2236


1461it [25:44,  1.06s/it][A[A2023-05-05 11:01:49,454 - INFO - validate:433 - Validation: Batch 1462/1541, Loss: 4.2235


1462it [25:45,  1.06s/it][A[A2023-05-05 11:01:50,510 - INFO - validate:433 - Validation: Batch 1463/1541, Loss: 4.2236


1463it [25:46,  1.06s/it][A[A2023-05-05 11:01:51,567 - INFO - validate:433 - Validation: Batch 1464/1541, Loss: 4.2237


1464it [25:48,  1.06s/it][A[A2023-05-05 11:01:52,626 - INFO - validate:433 - Validation: Batch 1465/1541, Loss: 4.2237


1465it [25:49,



1524it [26:51,  1.05s/it][A[A2023-05-05 11:02:55,890 - INFO - validate:433 - Validation: Batch 1525/1541, Loss: 4.2273


1525it [26:52,  1.05s/it][A[A2023-05-05 11:02:56,941 - INFO - validate:433 - Validation: Batch 1526/1541, Loss: 4.2273


1526it [26:53,  1.05s/it][A[A2023-05-05 11:02:57,994 - INFO - validate:433 - Validation: Batch 1527/1541, Loss: 4.2274


1527it [26:54,  1.05s/it][A[A2023-05-05 11:02:59,050 - INFO - validate:433 - Validation: Batch 1528/1541, Loss: 4.2274


1528it [26:55,  1.05s/it][A[A2023-05-05 11:03:00,101 - INFO - validate:433 - Validation: Batch 1529/1541, Loss: 4.2274


1529it [26:56,  1.05s/it][A[A2023-05-05 11:03:01,153 - INFO - validate:433 - Validation: Batch 1530/1541, Loss: 4.2275


1530it [26:57,  1.05s/it][A[A2023-05-05 11:03:02,208 - INFO - validate:433 - Validation: Batch 1531/1541, Loss: 4.2275


1531it [26:58,  1.05s/it][A[A2023-05-05 11:03:03,261 - INFO - validate:433 - Validation: Batch 1532/1541, Loss: 4.2275


1532it [26:59,

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,▁▆▇██▇▇▇▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃

0,1
epoch,1.0
loss,4.4985


Run lxibxgbl errored: NameError("name 'log_save_path' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run lxibxgbl errored: NameError("name 'log_save_path' is not defined")
[34m[1mwandb[0m: Agent Starting Run: 54scx9ox with config:
[34m[1mwandb[0m: 	BETAS: [0.9, 0.95]
[34m[1mwandb[0m: 	GRADIENT_ACCUMULATION_STEPS: 16
[34m[1mwandb[0m: 	GRADIENT_CLIP: 0.5
[34m[1mwandb[0m: 	LEARNING_RATE: 0.0005
[34m[1mwandb[0m: 	NUM_EPOCHS: 1
[34m[1mwandb[0m: 	SAMPLING_INTERVAL: 50
[34m[1mwandb[0m: 	WEIGHT_DECAY: 0.01
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


2023-05-05 11:03:23,171 - INFO - create_or_load_model:370 - Model: facebook/opt-125m
2023-05-05 11:03:23,173 - INFO - print_trainable_parameters:121 - Parameters: Trainable- 125.24M|| All- 125.24M || Trainable%- 100.0
2023-05-05 11:03:23,174 - INFO - create_or_load_model:372 - Memory Memory Footprint: 500.957184 MB
2023-05-05 11:03:23,175 - INFO - create_or_load_model:373 - Model is on device: cuda:0


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668335534632206, max=1.0…

  0%|          | 0/1 [00:00<?, ?it/s]2023-05-05 11:03:29,147 - INFO - train:33 - Epoch: 1/1

  0%|          | 0/13861 [00:00<?, ?it/s][A2023-05-05 11:03:31,907 - INFO - print_gpu_utilization:106 - GPU memory occupied: 42744 MB.
2023-05-05 11:03:32,364 - INFO - train:65 - Batch 1/13861, Loss: 3.3663, Learning Rate: 0.0005

  0%|          | 1/13861 [00:03<15:08:07,  3.93s/it][A2023-05-05 11:03:33,566 - INFO - print_gpu_utilization:106 - GPU memory occupied: 42744 MB.
2023-05-05 11:03:33,999 - INFO - train:65 - Batch 2/13861, Loss: 3.3459, Learning Rate: 0.0005
  0%|          | 1/13861 [00:05<21:52:31,  5.68s/it]
  0%|          | 0/1 [00:06<?, ?it/s]


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁
loss,█▁

0,1
epoch,1.0
loss,3.34591


Run 54scx9ox errored: OutOfMemoryError('CUDA out of memory. Tried to allocate 6.12 GiB (GPU 0; 44.49 GiB total capacity; 24.75 GiB already allocated; 3.26 GiB free; 40.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF')
[34m[1mwandb[0m: [32m[41mERROR[0m Run 54scx9ox errored: OutOfMemoryError('CUDA out of memory. Tried to allocate 6.12 GiB (GPU 0; 44.49 GiB total capacity; 24.75 GiB already allocated; 3.26 GiB free; 40.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF')
[34m[1mwandb[0m: Agent Starting Run: z6k3cveu with config:
[34m[1mwandb[0m: 	BETAS: [0.9, 0.95]
[34m[1mwandb[0m: 	GRADIENT_ACCUMULATION_STEPS: 4
[34m[1mwandb[0m: 	GRADIENT_CLIP: 5
[34m[1mwandb[0m: 	LEA

2023-05-05 11:03:48,261 - INFO - create_or_load_model:370 - Model: facebook/opt-125m
2023-05-05 11:03:48,263 - INFO - print_trainable_parameters:121 - Parameters: Trainable- 125.24M|| All- 125.24M || Trainable%- 100.0
2023-05-05 11:03:48,264 - INFO - create_or_load_model:372 - Memory Memory Footprint: 500.957184 MB
2023-05-05 11:03:48,265 - INFO - create_or_load_model:373 - Model is on device: cuda:0


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668346136187513, max=1.0…

  0%|          | 0/1 [00:00<?, ?it/s]2023-05-05 11:03:54,116 - INFO - train:33 - Epoch: 1/1

  0%|          | 0/13861 [00:00<?, ?it/s][A2023-05-05 11:03:57,067 - INFO - print_gpu_utilization:106 - GPU memory occupied: 42744 MB.
  0%|          | 0/13861 [00:03<?, ?it/s]
  0%|          | 0/1 [00:04<?, ?it/s]


Run z6k3cveu errored: OutOfMemoryError('CUDA out of memory. Tried to allocate 6.12 GiB (GPU 0; 44.49 GiB total capacity; 36.08 GiB already allocated; 199.12 MiB free; 43.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF')
[34m[1mwandb[0m: [32m[41mERROR[0m Run z6k3cveu errored: OutOfMemoryError('CUDA out of memory. Tried to allocate 6.12 GiB (GPU 0; 44.49 GiB total capacity; 36.08 GiB already allocated; 199.12 MiB free; 43.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF')
[34m[1mwandb[0m: Agent Starting Run: qi6081bf with config:
[34m[1mwandb[0m: 	BETAS: [0.9, 0.95]
[34m[1mwandb[0m: 	GRADIENT_ACCUMULATION_STEPS: 4
[34m[1mwandb[0m: 	GRADIENT_CLIP: 0.5
[34m[1mwandb[0m

2023-05-05 11:04:09,488 - INFO - create_or_load_model:370 - Model: facebook/opt-125m
2023-05-05 11:04:09,490 - INFO - print_trainable_parameters:121 - Parameters: Trainable- 125.24M|| All- 125.24M || Trainable%- 100.0
2023-05-05 11:04:09,492 - INFO - create_or_load_model:372 - Memory Memory Footprint: 500.957184 MB
2023-05-05 11:04:09,492 - INFO - create_or_load_model:373 - Model is on device: cuda:0


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668279888108373, max=1.0…

  0%|          | 0/1 [00:00<?, ?it/s]2023-05-05 11:04:14,837 - INFO - train:33 - Epoch: 1/1

  0%|          | 0/13861 [00:00<?, ?it/s][A2023-05-05 11:04:16,310 - INFO - print_gpu_utilization:106 - GPU memory occupied: 45880 MB.
  0%|          | 0/13861 [00:01<?, ?it/s]
  0%|          | 0/1 [00:02<?, ?it/s]


VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Run qi6081bf errored: OutOfMemoryError('CUDA out of memory. Tried to allocate 3.06 GiB (GPU 0; 44.49 GiB total capacity; 38.06 GiB already allocated; 191.12 MiB free; 43.55 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF')
[34m[1mwandb[0m: [32m[41mERROR[0m Run qi6081bf errored: OutOfMemoryError('CUDA out of memory. Tried to allocate 3.06 GiB (GPU 0; 44.49 GiB total capacity; 38.06 GiB already allocated; 191.12 MiB free; 43.55 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF')
Detected 5 failed runs in a row at start, killing sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 5 failed runs in a row at start, killing sweep.
[34m[1mwandb[0m: To change this value set WANDB_AGENT_M