In [None]:
from huggingface_hub import login
import os
# Credentials must never be written into the notebook. Take the token from the
# HF_TOKEN environment variable and fall back to a non-echoing interactive
# prompt when the variable is missing or blank.
_hf_token = os.environ.get("HF_TOKEN", "").strip()
if not _hf_token:
    from getpass import getpass

    _hf_token = getpass("Hugging Face token: ").strip()
if not _hf_token:
    raise ValueError("A Hugging Face token is required (set HF_TOKEN or enter it when prompted)")

# get_data() reads HF_TOKEN from the environment, so keep it populated there too.
os.environ["HF_TOKEN"] = _hf_token
login(token=_hf_token)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Code from RMU utils
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import random
random.seed(0)
from datasets import load_dataset

################################
##### Activation functions #####
################################

def forward_with_cache(model, inputs, module, no_grad=True):
    """Run ``model(**inputs)`` and return what ``module`` produced during that pass.

    A temporary forward hook is registered on ``module`` to record its output.
    The hook is always removed again, even when the forward pass raises, so a
    failed call cannot leave a stale hook attached to the module.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        inputs: Mapping of keyword arguments for ``model``.
        module: Module (exposing ``register_forward_hook``) whose output is captured.
        no_grad: When True the forward pass runs under ``torch.no_grad()``.

    Returns:
        The first output recorded for ``module``. If the module returns a tuple,
        only its first element is kept.

    Raises:
        IndexError: If ``module`` was never executed during the forward pass.
    """
    cache = []

    def hook(module, input, output):
        # Tuple outputs: keep only the first element.
        cache.append(output[0] if isinstance(output, tuple) else output)
        return None

    hook_handle = module.register_forward_hook(hook)
    try:
        if no_grad:
            with torch.no_grad():
                _ = model(**inputs)
        else:
            _ = model(**inputs)
    finally:
        # Detach the hook on success and on failure alike.
        hook_handle.remove()

    return cache[0]
    
#######################################
##### Model and data loading code #####
#######################################


def get_params(model, layer_ids, param_ids):
    """Collect selected parameters from selected layers of ``model``.

    For every ``layer_id`` in ``layer_ids`` (in the given order), the parameters
    of ``model.model.layers[layer_id]`` are enumerated and those whose position
    is contained in ``param_ids`` are kept.

    Returns:
        A flat list of the selected parameters, grouped by layer.
    """
    return [
        param
        for layer_id in layer_ids
        for position, param in enumerate(model.model.layers[layer_id].parameters())
        if position in param_ids
    ]


def load_model(model_name_or_path):
    """Load a causal language model and its tokenizer.

    The model is loaded with ``device_map="auto"`` and ``trust_remote_code=True``.
    Its dtype is ``"auto"`` when CUDA is available and supports bfloat16,
    otherwise ``torch.float16``.

    The (slow) tokenizer is set to pad on the left, and its pad, mask, sep and
    cls token ids are all set to the EOS token id.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        torch_dtype = "auto"
    else:
        torch_dtype = torch.float16

    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch_dtype,
        trust_remote_code=True,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        use_fast=False,
    )

    # Every special-token slot is pointed at EOS.
    eos_id = tokenizer.eos_token_id
    tokenizer.pad_token_id = eos_id
    tokenizer.padding_side = "left"
    for special_slot in ("mask_token_id", "sep_token_id", "cls_token_id"):
        setattr(tokenizer, special_slot, eos_id)

    return model, tokenizer

def get_data(forget_corpora, retain_corpora, min_len=50, max_len=2000, batch_size=4):
    """Load the forget and retain corpora as lists of text batches.

    Args:
        forget_corpora: Corpus names for the forget set.
        retain_corpora: Corpus names for the retain set.
        min_len: Only texts with strictly more than this many characters are kept.
        max_len: Accepted for interface compatibility; not used by this function.
        batch_size: Number of texts per batch (the last batch may be shorter).

    Returns:
        ``(forget, retain)`` where each is a list with one entry per corpus, and
        each entry is a list of batches (lists of strings).
    """
    def get_dataset(name):
        if name == "wikitext":
            records = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
        else:
            assert os.getenv("HF_TOKEN"), "HF_TOKEN is not set"
            # Full names ("cais/wmdp-bio-forget-corpus") are used as-is; short
            # names ("bio-forget-corpus") get the "cais/wmdp-" prefix.
            dataset_name = name if "/" in name else f"cais/wmdp-{name}"
            records = load_dataset(dataset_name, split="train", token=os.getenv("HF_TOKEN"))

        texts = [str(record['text']) for record in records if len(record['text']) > min_len]
        # Slice the filtered texts into consecutive batches.
        return [texts[start:start + batch_size] for start in range(0, len(texts), batch_size)]

    forget_batches = [get_dataset(corpus) for corpus in forget_corpora]
    retain_batches = [get_dataset(corpus) for corpus in retain_corpora]
    return forget_batches, retain_batches

In [3]:
# RMU code
import os
import datetime

import numpy as np
import torch
from torch.optim import AdamW
import tqdm as tqdm

def run_rmu(
    updated_model,
    frozen_model,
    tokenizer,
    forget_data_list,
    retain_data_list,
    args,
):
    """Fine-tune ``updated_model`` with the RMU objective, then save it.

    Topics are visited round-robin; each step uses one forget batch and one
    retain batch of the current topic. The loss is the sum of:

    * unlearn loss: MSE between the updated model's activations at the hooked
      module on the forget batch and a fixed random control vector scaled by
      ``args.steering_coeff_list[topic]``;
    * retain loss: ``args.alpha[topic]`` times the MSE between the updated and
      frozen models' activations on the retain batch.

    Args:
        updated_model: Model being trained (put into train mode here).
        frozen_model: Reference model, only run under ``no_grad``.
        tokenizer: Tokenizer used for both batches; its ``truncation_side`` is
            temporarily set to ``"right"`` and restored afterwards.
        forget_data_list: One list of text batches per forget topic.
        retain_data_list: One list of text batches per topic, index-aligned
            with ``forget_data_list``.
        args: Namespace providing ``layer_ids``, ``param_ids``, ``lr``,
            ``module_str``, ``layer_id``, ``steering_coeff_list``, ``alpha``,
            ``max_num_batches``, ``verbose``, ``output_dir`` and
            ``model_name_or_path``.

    Side effects:
        Prints progress, updates ``updated_model`` in place and writes the model
        and tokenizer to ``args.output_dir`` (or a timestamped directory under
        ``rmu-models/`` when ``output_dir`` is falsy).
    """
    rmu_config = vars(args)
    print("====rmu Config====")
    print("\n".join(f"{k}={v}" for k,v in rmu_config.items()))
    print("=====")

    updated_model = updated_model.train()
    params = get_params(updated_model, args.layer_ids, args.param_ids)
    optimizer = AdamW(params, lr=args.lr)
    # NOTE(review): eval() executes args.module_str as Python code. Only pass
    # trusted templates; the formatted expression refers to the local names
    # `frozen_model` / `updated_model`.
    frozen_module = eval(
        args.module_str.format(model_name="frozen_model", layer_id=args.layer_id)
    )
    updated_module = eval(
        args.module_str.format(model_name="updated_model", layer_id=args.layer_id)
    )

    # One fixed random direction per forget topic, scaled by its steering coefficient.
    control_vectors_list = []
    for i in range(len(forget_data_list)):
        random_vector = torch.rand(1,1, updated_model.config.hidden_size, dtype=updated_model.dtype, device=updated_model.device)
        control_vec = random_vector / torch.norm(random_vector) * args.steering_coeff_list[i]
        control_vectors_list.append(control_vec)

    num_batches = min(
        args.max_num_batches,
        min([len(f) for f in forget_data_list]),
        min([len(r) for r in retain_data_list]),
    )

    truncation_side = tokenizer.truncation_side
    tokenizer.truncation_side="right"

    # try/finally guarantees the caller's tokenizer setting is restored even if
    # a training step fails (e.g. CUDA out of memory).
    try:
        for epoch in range(1):
            print(f"======= Epoch {epoch} =======")
            with tqdm.tqdm(total=num_batches) as pbar:
                for idx in range(num_batches):
                    topic_idx = idx % len(forget_data_list)
                    batch_idx = idx // len(forget_data_list)
                    control_vec = control_vectors_list[topic_idx]
                    unlearn_batch = forget_data_list[topic_idx][batch_idx]
                    retain_batch = retain_data_list[topic_idx][batch_idx]

                    # Unlearning loss
                    max_length = 512 if topic_idx == 0 else 768
                    # Moved to the model device, like the retain inputs below.
                    unlearn_inputs = tokenizer(
                        unlearn_batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length
                    ).to(updated_model.device)
                    updated_forget_activations = forward_with_cache(
                        updated_model, unlearn_inputs, module=updated_module, no_grad=False
                    ).to(updated_model.device)

                    unlearn_loss = torch.nn.functional.mse_loss(
                        updated_forget_activations, control_vec
                    )

                    # Retain loss
                    retain_inputs = tokenizer(
                        retain_batch, return_tensors="pt", padding=True, truncation=True, max_length=512
                    ).to(updated_model.device)
                    updated_retain_activations = forward_with_cache(
                        updated_model, retain_inputs, module=updated_module, no_grad=False
                    ).to(updated_model.device)
                    frozen_retain_activations = forward_with_cache(
                        frozen_model, retain_inputs, module=frozen_module, no_grad=True
                    ).to(updated_model.device)

                    retain_loss = torch.nn.functional.mse_loss(
                        updated_retain_activations, frozen_retain_activations
                    )
                    retain_loss *= args.alpha[topic_idx]

                    # Update model
                    loss = unlearn_loss + retain_loss
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    # NOTE: the value labelled "param_change" is the mean absolute
                    # gradient of the first trained parameter, not a weight delta.
                    print(f"loss: {loss.item():.4g} | unlearn_loss: {unlearn_loss.item():.4g} | retain_loss: {retain_loss.item():.4g} | param_change: {params[0].grad.abs().mean().item():.4g}")

                    # ======= Logging ======
                    if args.verbose:
                        frozen_forget_activations = forward_with_cache(frozen_model, unlearn_inputs, module=frozen_module, no_grad=True).to(updated_model.device)
                        unlearn_cosine= torch.nn.functional.cosine_similarity(updated_forget_activations, frozen_forget_activations, dim=-1).mean()
                        retain_cosine = torch.nn.functional.cosine_similarity(updated_retain_activations, frozen_retain_activations, dim=-1).mean()

                        print(f"unlearn_cosine_sim={unlearn_cosine.item()}")
                        print(f"retain_cosine_sim={retain_cosine.item()}")
                        print(f"Topic {topic_idx} updated_forget_activations.norm=",torch.mean(updated_forget_activations.norm(dim=-1).mean(dim=1), dim=0).item())
                        print(f"Topic {topic_idx} frozen_forget_activations.norm=",torch.mean(frozen_forget_activations.norm(dim=-1).mean(dim=1), dim=0).item())
                        print(f"Topic {topic_idx} updated_retain_activations.norm=",torch.mean(updated_retain_activations.norm(dim=-1).mean(dim=1), dim=0).item())
                        print(f"Topic {topic_idx} frozen_retain_activations.norm=",torch.mean(frozen_retain_activations.norm(dim=-1).mean(dim=1), dim=0).item())

                    pbar.update(1)
    finally:
        tokenizer.truncation_side = truncation_side

    # Save model
    if args.output_dir:
        path = args.output_dir
    else:
        date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        path = f"rmu-models/{args.model_name_or_path}_alpha-{args.alpha}_batches-{num_batches}_layer-{args.layer_id}_{date}"
    updated_model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    print(f"Saved model to {path}")


def get_args(argv=None):
    """Parse the RMU command-line options and expand comma-separated values.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            ``sys.argv[1:]`` is parsed, exactly as before. Passing an explicit
            list (e.g. ``[]``) lets notebooks obtain the defaults without
            overwriting ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` in which ``retain_corpora``, ``forget_corpora``,
        ``alpha``, ``layer_ids`` and ``param_ids`` have been converted to lists,
        and ``steering_coeff_list`` holds the parsed floats of ``steering_coeffs``.
    """
    import argparse

    def _split_csv(raw, cast=str):
        # Tolerate spaces after commas ("a, b") so corpus names stay clean.
        return [cast(item.strip()) for item in raw.split(",")]

    parser = argparse.ArgumentParser()
    ### Model arguments
    parser.add_argument(
        "--model_name_or_path", type=str, default= "google/gemma-2-2b" # changed to gemma
    )
    parser.add_argument(
        "--module_str", type=str, default="{model_name}.model.layers[{layer_id}]"
    )
    parser.add_argument(
        "--output_dir", type=str, default="./my_unlearned_models-rmu"
    )
    ### Data arguments
    parser.add_argument(
        "--retain_corpora",
        type=str,
        default="wikitext,wikitext",
        help="comma-separated list of corpora to retain",
    )
    parser.add_argument(
        "--forget_corpora",
        type=str,
        default="bio-forget-corpus,cyber-forget-corpus",
        help="comma-separated list of corpora to forget",
    )
    ### rmu hyperparameters
    parser.add_argument("--alpha", type=str, default="100,100", help="retain weight")
    parser.add_argument(
        "--steering_coeffs",
        type=str,
        default="20,20",
        help="Steer vector weight in order of topic",
    )
    parser.add_argument("--lr", type=float, default=5e-5, help="learning rate")
    parser.add_argument("--min_len", type=int, default=0)
    parser.add_argument("--max_len", type=int, default=2000)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--max_num_batches", type=int, default=80)
    parser.add_argument("--layer_id", type=int, default=7, help="layer to unlearn")
    parser.add_argument("--layer_ids", type=str, default="5,6,7", help="update layers")
    parser.add_argument("--param_ids", type=str, default="6", help="update params")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--verbose", action="store_true", help="Logging the activations norms and cosine at each step")

    # parse_args(None) falls back to sys.argv[1:], preserving the old behaviour.
    args = parser.parse_args(argv)
    args.retain_corpora = _split_csv(args.retain_corpora)
    args.forget_corpora = _split_csv(args.forget_corpora)
    args.steering_coeff_list = _split_csv(args.steering_coeffs, float)
    args.alpha = _split_csv(args.alpha, float)
    args.layer_ids = _split_csv(args.layer_ids, int)
    args.param_ids = _split_csv(args.param_ids, int)
    return args




In [4]:
import sys
# Replace argv with a lone empty program name so argparse does not try to
# interpret the notebook kernel's own arguments.
sys.argv = [""]

# With no arguments left to parse, this yields the default configuration.
args = get_args()

# Seed value taken from the parsed configuration.
SEED = args.seed

In [5]:
def get_args_notebook(
    model_name_or_path="google/gemma-2-2b",
    module_str="{model_name}.model.layers[{layer_id}]",
    output_dir="./my_unlearned_models-rmu",
    retain_corpora=None,
    forget_corpora=None,
    alpha=None,
    steering_coeffs=None,
    lr=5e-5,
    min_len=0,
    max_len=2000,
    batch_size=4,
    max_num_batches=80,
    layer_id=7,
    layer_ids=None,
    param_ids=None,
    seed=42,
    verbose=False
):
    """Build the run configuration from keyword arguments (no argparse).

    List-valued options default to ``None`` and are replaced by a fresh default
    list on every call; an explicitly passed list (even an empty one) is used
    as given. ``steering_coeffs`` is stored as ``steering_coeff_list``.

    Returns:
        A plain attribute container whose ``vars()`` lists the settings.
    """
    class Args:
        pass

    def _pick(value, fallback):
        # Only None triggers the fallback; empty lists are kept.
        return fallback if value is None else value

    settings = {
        # Model arguments
        "model_name_or_path": model_name_or_path,
        "module_str": module_str,
        "output_dir": output_dir,
        # Data arguments
        "retain_corpora": _pick(retain_corpora, ["wikitext", "wikitext"]),
        "forget_corpora": _pick(forget_corpora, ["bio-forget-corpus", "cyber-forget-corpus"]),
        # RMU hyperparameters
        "alpha": _pick(alpha, [100.0, 100.0]),
        "steering_coeff_list": _pick(steering_coeffs, [20.0, 20.0]),
        "lr": lr,
        "min_len": min_len,
        "max_len": max_len,
        "batch_size": batch_size,
        "max_num_batches": max_num_batches,
        "layer_id": layer_id,
        "layer_ids": _pick(layer_ids, [5, 6, 7]),
        "param_ids": _pick(param_ids, [6]),
        "seed": seed,
        "verbose": verbose,
    }

    args = Args()
    for key, value in settings.items():
        setattr(args, key, value)
    return args

In [6]:
def run_max_entropy(
    updated_model,
    frozen_model,
    tokenizer,
    forget_data_list,
    retain_data_list,
    args,
):
    """Fine-tune ``updated_model`` to maximise predictive entropy on forget data.

    Topics are visited round-robin; each step uses one forget batch and one
    retain batch of the current topic. The loss is the sum of:

    * unlearn loss: the negative mean entropy of the updated model's next-token
      distribution over the non-padding positions of the forget batch;
    * retain loss: ``args.alpha[topic]`` times the MSE between the updated and
      frozen models' activations at the hooked module on the retain batch.

    Args:
        updated_model: Model being trained (put into train mode here).
        frozen_model: Reference model, only run under ``no_grad``.
        tokenizer: Tokenizer used for both batches; its ``truncation_side`` is
            temporarily set to ``"right"`` and restored afterwards.
        forget_data_list: One list of text batches per forget topic.
        retain_data_list: One list of text batches per topic, index-aligned
            with ``forget_data_list``.
        args: Namespace providing ``layer_ids``, ``param_ids``, ``lr``,
            ``module_str``, ``layer_id``, ``alpha``, ``max_num_batches``,
            ``verbose``, ``output_dir`` and ``model_name_or_path``.

    Side effects:
        Prints progress, updates ``updated_model`` in place and writes the model
        and tokenizer to ``args.output_dir`` (or a timestamped directory under
        ``maxent-models/`` when ``output_dir`` is falsy).
    """
    max_entropy_config = vars(args)
    print("====MaxEntropy Config====")
    print("\n".join(f"{k}={v}" for k,v in max_entropy_config.items()))
    print("=====")

    updated_model = updated_model.train()
    params = get_params(updated_model, args.layer_ids, args.param_ids)
    optimizer = AdamW(params, lr=args.lr)
    # NOTE(review): eval() executes args.module_str as Python code. Only pass
    # trusted templates; the formatted expression refers to the local names
    # `frozen_model` / `updated_model`.
    frozen_module = eval(
        args.module_str.format(model_name="frozen_model", layer_id=args.layer_id)
    )
    updated_module = eval(
        args.module_str.format(model_name="updated_model", layer_id=args.layer_id)
    )

    num_batches = min(
        args.max_num_batches,
        min([len(f) for f in forget_data_list]),
        min([len(r) for r in retain_data_list]),
    )

    truncation_side = tokenizer.truncation_side
    tokenizer.truncation_side="right"

    # try/finally guarantees the caller's tokenizer setting is restored even if
    # a training step fails (e.g. CUDA out of memory).
    try:
        for epoch in range(1):
            print(f"======= Epoch {epoch} =======")
            with tqdm.tqdm(total=num_batches) as pbar:
                for idx in range(num_batches):
                    topic_idx = idx % len(forget_data_list)
                    batch_idx = idx // len(forget_data_list)
                    unlearn_batch = forget_data_list[topic_idx][batch_idx]
                    retain_batch = retain_data_list[topic_idx][batch_idx]

                    # Clear GPU cache before each step
                    torch.cuda.empty_cache()

                    # Unlearning loss - Maximum Entropy
                    max_length = 512 if topic_idx == 0 else 768
                    unlearn_inputs = tokenizer(
                        unlearn_batch, return_tensors="pt", padding=True, truncation=True, max_length=max_length
                    ).to(updated_model.device)

                    # Only the logits are needed. No `labels` are passed, so the
                    # model does not compute a cross-entropy loss that would go
                    # unused and cost extra memory.
                    outputs = updated_model(**unlearn_inputs)
                    logits = outputs.logits

                    attention_mask = unlearn_inputs["attention_mask"]

                    # Entropy from log-probabilities, computed in float32:
                    # H = -sum(p * log(p)) = -sum(exp(log_p) * log_p)
                    with torch.amp.autocast('cuda'):  # Use mixed precision
                        log_probs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
                        entropy = -(log_probs.exp() * log_probs).sum(dim=-1)  # (batch_size, seq_len)

                    # Drop local references as early as possible
                    del logits, log_probs, outputs
                    torch.cuda.empty_cache()

                    # Average over sequence and batch (only on non-padding tokens)
                    masked_entropy = entropy * attention_mask
                    avg_entropy = masked_entropy.sum() / attention_mask.sum()

                    # Maximize entropy = minimize negative entropy
                    unlearn_loss = -avg_entropy

                    del entropy, masked_entropy
                    torch.cuda.empty_cache()

                    # Retain loss
                    retain_inputs = tokenizer(
                        retain_batch, return_tensors="pt", padding=True, truncation=True, max_length=512
                    ).to(updated_model.device)
                    updated_retain_activations = forward_with_cache(
                        updated_model, retain_inputs, module=updated_module, no_grad=False
                    ).to(updated_model.device)
                    frozen_retain_activations = forward_with_cache(
                        frozen_model, retain_inputs, module=frozen_module, no_grad=True
                    ).to(updated_model.device)

                    retain_loss = torch.nn.functional.mse_loss(
                        updated_retain_activations, frozen_retain_activations
                    )
                    retain_loss *= args.alpha[topic_idx]

                    # Update model
                    loss = unlearn_loss + retain_loss
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    # NOTE: the value labelled "param_change" is the mean absolute
                    # gradient of the first trained parameter, not a weight delta.
                    print(f"loss: {loss.item():.4g} | unlearn_loss: {unlearn_loss.item():.4g} | retain_loss: {retain_loss.item():.4g} | avg_entropy: {avg_entropy.item():.4g} | param_change: {params[0].grad.abs().mean().item():.4g}")

                    # Clean up
                    del updated_retain_activations, frozen_retain_activations, retain_inputs, unlearn_inputs
                    torch.cuda.empty_cache()

                    # ======= Logging ======
                    if args.verbose:
                        print(f"Topic {topic_idx} avg_entropy={avg_entropy.item()}")

                    pbar.update(1)
    finally:
        tokenizer.truncation_side = truncation_side

    # Save model
    if args.output_dir:
        path = args.output_dir
    else:
        date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        path = f"maxent-models/{args.model_name_or_path}_alpha-{args.alpha}_batches-{num_batches}_layer-{args.layer_id}_{date}"
    updated_model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    print(f"Saved model to {path}")

In [None]:
# import torch.nn as nn
# import gc

# def get_batch_loss(logits, labels):
#     """
#     Compute per-sample loss (used in NPO).
#     Returns a tensor of shape (batch_size,) with loss for each sample.
#     """
#     shifted_labels = labels[..., 1:].contiguous()
#     logits = logits[..., :-1, :].contiguous()

#     loss_function = nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
#     # Get the sum loss for each sequence in a batch
#     loss = loss_function(logits.transpose(-1, -2), shifted_labels).sum(dim=-1)

#     return loss


# def run_npo(
#     model,
#     tokenizer,
#     forget_data_list,
#     retain_data_list,
#     lr: float = 1e-5,
#     num_epochs: int = 1,
#     beta: float = 0.1,
#     alpha: float = 1.0,
#     max_length: int = 512,
#     output_dir: str = None,
# ):
#     """
#     Ultra memory-efficient NPO - precomputes reference losses then trains.

#     NPO Loss: L_npo = -log(sigmoid(beta * neg_log_ratios)) * 2 / beta + alpha * L_retain
#     where neg_log_ratios = log(p_ref(y_forget|x)) - log(p_theta(y_forget|x))

#     Args:
#         model: The model to unlearn from (will be modified in-place)
#         tokenizer: Tokenizer for the model
#         forget_data_list: List of text samples to forget
#         retain_data_list: List of text samples to retain
#         lr: Learning rate (default 1e-5)
#         num_epochs: Number of epochs (default 1)
#         beta: NPO beta parameter controlling strength (default 0.1)
#         alpha: Weight for retain loss (default 1.0)
#         max_length: Maximum sequence length
#         output_dir: Directory to save the unlearned model

#     Returns:
#         The unlearned model
#     """
#     device = next(model.parameters()).device

#     print("====NPO Config (Ultra Memory Efficient)====")
#     print(f"lr={lr}, epochs={num_epochs}, beta={beta}, alpha={alpha}, max_length={max_length}")
#     print(f"forget_samples={len(forget_data_list)}, retain_samples={len(retain_data_list)}")
#     print("==================")

#     # Enable gradient checkpointing to save memory during backprop
#     if hasattr(model, 'gradient_checkpointing_enable'):
#         print("Enabling gradient checkpointing for memory efficiency...")
#         model.gradient_checkpointing_enable()

#     # PHASE 1: Precompute reference losses (original model)
#     print("\nPhase 1: Computing reference losses from original model...")
#     model.eval()
#     reference_losses = []
    
#     num_batches = min(len(forget_data_list), len(retain_data_list))
    
#     with torch.no_grad():
#         for idx in tqdm.tqdm(range(num_batches), desc="Computing reference"):
#             forget_text = forget_data_list[idx]
#             if isinstance(forget_text, str):
#                 forget_text = [forget_text]
            
#             forget_inputs = tokenizer(
#                 forget_text, return_tensors="pt",
#                 padding=True, truncation=True, max_length=max_length
#             ).to(device)
            
#             outputs = model(**forget_inputs, labels=forget_inputs["input_ids"])
#             loss_per_sample = get_batch_loss(outputs.logits, forget_inputs["input_ids"])
            
#             # Store on CPU to save GPU memory
#             reference_losses.append(loss_per_sample.detach().cpu())
            
#             # Cleanup
#             del forget_inputs, outputs, loss_per_sample
            
#             if idx % 10 == 0:
#                 torch.cuda.empty_cache()
    
#     print(f"Precomputed {len(reference_losses)} reference losses")
#     torch.cuda.synchronize()
#     torch.cuda.empty_cache()
#     gc.collect()
    
#     # PHASE 2: Training with precomputed reference
#     print("\nPhase 2: Training with NPO loss...")
#     model.train()
#     optimizer = AdamW(model.parameters(), lr=lr)

#     for epoch in range(num_epochs):
#         print(f"\n======= Epoch {epoch + 1}/{num_epochs} =======")

#         with tqdm.tqdm(total=num_batches, desc=f"Epoch {epoch + 1}") as pbar:
#             for idx in range(num_batches):
#                 # Get samples
#                 forget_text = forget_data_list[idx]
#                 retain_text = retain_data_list[idx]

#                 if isinstance(forget_text, str):
#                     forget_text = [forget_text]
#                 if isinstance(retain_text, str):
#                     retain_text = [retain_text]

#                 # Tokenize forget data
#                 forget_inputs = tokenizer(
#                     forget_text, return_tensors="pt",
#                     padding=True, truncation=True, max_length=max_length
#                 ).to(device)

#                 # Get current model loss
#                 forget_outputs = model(**forget_inputs, labels=forget_inputs["input_ids"])
#                 forget_loss_current = get_batch_loss(forget_outputs.logits, forget_inputs["input_ids"])

#                 # Get precomputed reference loss
#                 forget_loss_oracle = reference_losses[idx].to(device)

#                 # Negative log-likelihood ratios
#                 neg_log_ratios = forget_loss_current - forget_loss_oracle

#                 # NPO loss: -log(sigmoid(beta * neg_log_ratios)) * 2 / beta
#                 npo_loss = -torch.nn.functional.logsigmoid(beta * neg_log_ratios).mean() * 2 / beta

#                 # Clean up forget computation before retain
#                 del forget_inputs, forget_outputs, forget_loss_current, forget_loss_oracle, neg_log_ratios
#                 torch.cuda.empty_cache()

#                 # Retain loss (standard cross-entropy)
#                 retain_inputs = tokenizer(
#                     retain_text, return_tensors="pt",
#                     padding=True, truncation=True, max_length=max_length
#                 ).to(device)

#                 retain_outputs = model(**retain_inputs, labels=retain_inputs["input_ids"])
#                 retain_loss = retain_outputs.loss

#                 # Total loss
#                 loss = npo_loss + alpha * retain_loss

#                 optimizer.zero_grad()
#                 loss.backward()
#                 optimizer.step()

#                 pbar.update(1)
#                 pbar.set_postfix({
#                     'loss': f'{loss.item():.4f}',
#                     'npo': f'{npo_loss.item():.4f}',
#                     'retain': f'{retain_loss.item():.4f}',
#                 })

#                 # Clean up
#                 del retain_inputs, retain_outputs, npo_loss, retain_loss, loss
                
#                 # Aggressive cleanup every 5 steps
#                 if idx % 5 == 0:
#                     torch.cuda.synchronize()
#                     torch.cuda.empty_cache()

#     # Clean up reference losses
#     del reference_losses
#     gc.collect()
#     torch.cuda.empty_cache()

#     # Disable gradient checkpointing before saving
#     if hasattr(model, 'gradient_checkpointing_disable'):
#         model.gradient_checkpointing_disable()

#     # Save
#     if output_dir:
#         model.save_pretrained(output_dir)
#         tokenizer.save_pretrained(output_dir)
#         print(f"\nSaved model to {output_dir}")

#     return model


# if __name__ == "__main__":
#     print("NPO (Negative Preference Optimization) Unlearning")
#     print("Loss: L_npo = -log(sigmoid(beta * neg_log_ratios)) * 2 / beta + alpha * L_retain")

NPO (Negative Preference Optimization) Unlearning
Loss: L_npo = -log(sigmoid(beta * neg_log_ratios)) * 2 / beta + alpha * L_retain


In [8]:

# args = get_args_notebook(
#     output_dir="./bio/rmu-bio-only",
#     forget_corpora=["bio-forget-corpus"],
#     retain_corpora=["wikitext"],
#     batch_size=2,
#     max_num_batches=1000,  # ← CHANGE THIS FROM 80 TO 1000
# )

# SEED = args.seed
# torch.cuda.manual_seed(SEED)
# torch.cuda.manual_seed_all(SEED)
# torch.manual_seed(SEED)
# np.random.seed(SEED)

# frozen_model, tokenizer = load_model(args.model_name_or_path)
# updated_model, tokenizer = load_model(args.model_name_or_path)
# forget_data_list, retain_data_list = get_data(
#   args.forget_corpora,
#   args.retain_corpora,
#   args.min_len,
#   args.max_len,
#   args.batch_size,
# )
# run_rmu(
#   updated_model,
#   frozen_model,
#   tokenizer,
#   forget_data_list,
#   retain_data_list,
#   args,
# )

In [9]:

# Release unreachable Python objects and cached CUDA memory before loading models.
import gc
gc.collect()
torch.cuda.empty_cache()

# Bio-only maximum-entropy run; every option not listed keeps its default.
args = get_args_notebook(
    forget_corpora=["bio-forget-corpus"],
    retain_corpora=["wikitext"],
    output_dir="./bio/maxentropy-bio-only",
    alpha=[50.0],  # retain weight lowered from the default 100 to 50
    lr=5e-5,
    batch_size=2,
    max_num_batches=1000,  # raised from the default 80
)

# Seed the torch (CPU and CUDA) and NumPy generators with the configured seed.
SEED = args.seed
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

# Two independent copies of the same checkpoint: a frozen reference and the
# model that gets trained. The tokenizer from the second load is the one kept.
frozen_model, tokenizer = load_model(args.model_name_or_path)
updated_model, tokenizer = load_model(args.model_name_or_path)

forget_data_list, retain_data_list = get_data(
    forget_corpora=args.forget_corpora,
    retain_corpora=args.retain_corpora,
    min_len=args.min_len,
    max_len=args.max_len,
    batch_size=args.batch_size,
)

run_max_entropy(
    updated_model=updated_model,
    frozen_model=frozen_model,
    tokenizer=tokenizer,
    forget_data_list=forget_data_list,
    retain_data_list=retain_data_list,
    args=args,
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.76it/s]
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.74it/s]


====MaxEntropy Config====
model_name_or_path=google/gemma-2-2b
module_str={model_name}.model.layers[{layer_id}]
output_dir=./bio/maxentropy-bio-only
retain_corpora=['wikitext']
forget_corpora=['bio-forget-corpus']
alpha=[50.0]
steering_coeff_list=[20.0, 20.0]
lr=5e-05
min_len=0
max_len=2000
batch_size=2
max_num_batches=1000
layer_id=7
layer_ids=[5, 6, 7]
param_ids=[6]
seed=42
verbose=False
=====


  0%|          | 0/1000 [00:00<?, ?it/s]

loss: -1.699 | unlearn_loss: -1.699 | retain_loss: 0 | avg_entropy: 1.699 | param_change: 0.0001511


  0%|          | 1/1000 [00:02<40:49,  2.45s/it]

loss: -1.897 | unlearn_loss: -1.985 | retain_loss: 0.08849 | avg_entropy: 1.985 | param_change: 0.0007667


  0%|          | 2/1000 [00:04<38:36,  2.32s/it]

loss: 22.09 | unlearn_loss: -1.898 | retain_loss: 23.99 | avg_entropy: 1.898 | param_change: 0.05097


  0%|          | 3/1000 [00:06<37:31,  2.26s/it]

loss: 19.91 | unlearn_loss: -2.082 | retain_loss: 21.99 | avg_entropy: 2.082 | param_change: 0.05816


  0%|          | 4/1000 [00:09<37:14,  2.24s/it]

loss: 0.0841 | unlearn_loss: -2.053 | retain_loss: 2.137 | avg_entropy: 2.053 | param_change: 0.004068


  0%|          | 5/1000 [00:11<38:01,  2.29s/it]

loss: -0.7322 | unlearn_loss: -2.291 | retain_loss: 1.558 | avg_entropy: 2.291 | param_change: 0.002698


  1%|          | 6/1000 [00:13<35:57,  2.17s/it]

loss: -1.484 | unlearn_loss: -1.852 | retain_loss: 0.3679 | avg_entropy: 1.852 | param_change: 0.006797


  1%|          | 7/1000 [00:15<34:54,  2.11s/it]

loss: 21.67 | unlearn_loss: -2.246 | retain_loss: 23.92 | avg_entropy: 2.246 | param_change: 0.02279


  1%|          | 8/1000 [00:17<34:30,  2.09s/it]

loss: 16.33 | unlearn_loss: -2.124 | retain_loss: 18.46 | avg_entropy: 2.124 | param_change: 0.01593


  1%|          | 9/1000 [00:19<34:40,  2.10s/it]

loss: 10.5 | unlearn_loss: -2.235 | retain_loss: 12.74 | avg_entropy: 2.235 | param_change: 0.009785


  1%|          | 10/1000 [00:21<35:16,  2.14s/it]

loss: 1.627 | unlearn_loss: -2.08 | retain_loss: 3.708 | avg_entropy: 2.08 | param_change: 0.003536


  1%|          | 11/1000 [00:23<34:45,  2.11s/it]

loss: 1.954 | unlearn_loss: -2.246 | retain_loss: 4.2 | avg_entropy: 2.246 | param_change: 0.005102


  1%|          | 12/1000 [00:25<34:33,  2.10s/it]

loss: 1.573 | unlearn_loss: -2.151 | retain_loss: 3.724 | avg_entropy: 2.151 | param_change: 0.005488


  1%|▏         | 13/1000 [00:28<34:53,  2.12s/it]

loss: 2.426 | unlearn_loss: -1.942 | retain_loss: 4.368 | avg_entropy: 1.942 | param_change: 0.006899


  1%|▏         | 14/1000 [00:30<35:33,  2.16s/it]

loss: 4.243 | unlearn_loss: -2.102 | retain_loss: 6.344 | avg_entropy: 2.102 | param_change: 0.009482


  2%|▏         | 15/1000 [00:32<35:52,  2.19s/it]

loss: 1.03 | unlearn_loss: -2.171 | retain_loss: 3.201 | avg_entropy: 2.171 | param_change: 0.005476


  2%|▏         | 16/1000 [00:34<35:30,  2.17s/it]

loss: 0.7286 | unlearn_loss: -2.181 | retain_loss: 2.91 | avg_entropy: 2.181 | param_change: 0.004793


  2%|▏         | 17/1000 [00:36<35:37,  2.17s/it]

loss: 2.072 | unlearn_loss: -2.283 | retain_loss: 4.355 | avg_entropy: 2.283 | param_change: 0.004959


  2%|▏         | 18/1000 [00:38<34:47,  2.13s/it]

loss: 0.3892 | unlearn_loss: -2.312 | retain_loss: 2.701 | avg_entropy: 2.312 | param_change: 0.00308


  2%|▏         | 19/1000 [00:41<35:23,  2.16s/it]

loss: -0.712 | unlearn_loss: -1.986 | retain_loss: 1.274 | avg_entropy: 1.986 | param_change: 0.001967


  2%|▏         | 20/1000 [00:43<35:39,  2.18s/it]

loss: -1.261 | unlearn_loss: -2.153 | retain_loss: 0.8923 | avg_entropy: 2.153 | param_change: 0.001939


  2%|▏         | 21/1000 [00:45<34:32,  2.12s/it]

loss: 0.8062 | unlearn_loss: -2.457 | retain_loss: 3.263 | avg_entropy: 2.457 | param_change: 0.004541


  2%|▏         | 22/1000 [00:47<34:00,  2.09s/it]

loss: 0.7168 | unlearn_loss: -2.253 | retain_loss: 2.969 | avg_entropy: 2.253 | param_change: 0.004321


  2%|▏         | 23/1000 [00:49<33:43,  2.07s/it]

loss: 0.6568 | unlearn_loss: -2.003 | retain_loss: 2.66 | avg_entropy: 2.003 | param_change: 0.003997


  2%|▏         | 24/1000 [00:51<33:12,  2.04s/it]

loss: -1.174 | unlearn_loss: -2.024 | retain_loss: 0.8499 | avg_entropy: 2.024 | param_change: 0.002015


  2%|▎         | 25/1000 [00:53<33:28,  2.06s/it]

loss: -0.5556 | unlearn_loss: -2.351 | retain_loss: 1.795 | avg_entropy: 2.351 | param_change: 0.003035


  3%|▎         | 26/1000 [00:55<32:59,  2.03s/it]

loss: -1.279 | unlearn_loss: -2.178 | retain_loss: 0.899 | avg_entropy: 2.178 | param_change: 0.001663


  3%|▎         | 27/1000 [00:57<34:21,  2.12s/it]

loss: -1.51 | unlearn_loss: -2.262 | retain_loss: 0.7528 | avg_entropy: 2.262 | param_change: 0.001559


  3%|▎         | 28/1000 [00:59<33:40,  2.08s/it]

loss: -1.2 | unlearn_loss: -2.308 | retain_loss: 1.108 | avg_entropy: 2.308 | param_change: 0.001898


  3%|▎         | 29/1000 [01:01<33:05,  2.04s/it]

loss: -1.88 | unlearn_loss: -2.374 | retain_loss: 0.4941 | avg_entropy: 2.374 | param_change: 0.00136


  3%|▎         | 30/1000 [01:03<33:15,  2.06s/it]

loss: -1.548 | unlearn_loss: -2.155 | retain_loss: 0.6072 | avg_entropy: 2.155 | param_change: 0.001286


  3%|▎         | 31/1000 [01:06<34:31,  2.14s/it]

loss: -1.414 | unlearn_loss: -2.499 | retain_loss: 1.085 | avg_entropy: 2.499 | param_change: 0.002138


  3%|▎         | 32/1000 [01:08<33:39,  2.09s/it]

loss: -1.656 | unlearn_loss: -2.443 | retain_loss: 0.7872 | avg_entropy: 2.443 | param_change: 0.001843


  3%|▎         | 33/1000 [01:10<35:58,  2.23s/it]

loss: -1.808 | unlearn_loss: -2.667 | retain_loss: 0.8596 | avg_entropy: 2.667 | param_change: 0.002044


  3%|▎         | 34/1000 [01:13<36:45,  2.28s/it]

loss: -2.153 | unlearn_loss: -2.635 | retain_loss: 0.4822 | avg_entropy: 2.635 | param_change: 0.001333


  4%|▎         | 35/1000 [01:15<36:48,  2.29s/it]

loss: -2.432 | unlearn_loss: -3.075 | retain_loss: 0.6431 | avg_entropy: 3.075 | param_change: 0.00272


  4%|▎         | 36/1000 [01:17<36:25,  2.27s/it]

loss: -2.525 | unlearn_loss: -3.495 | retain_loss: 0.9705 | avg_entropy: 3.495 | param_change: 0.003844


  4%|▎         | 37/1000 [01:19<36:08,  2.25s/it]

loss: -4.377 | unlearn_loss: -5.015 | retain_loss: 0.6384 | avg_entropy: 5.015 | param_change: 0.005734


  4%|▍         | 38/1000 [01:21<35:15,  2.20s/it]

loss: -1.972 | unlearn_loss: -2.673 | retain_loss: 0.7015 | avg_entropy: 2.673 | param_change: 0.001897


  4%|▍         | 39/1000 [01:24<35:13,  2.20s/it]

loss: -2.382 | unlearn_loss: -3.186 | retain_loss: 0.8036 | avg_entropy: 3.186 | param_change: 0.005003


  4%|▍         | 40/1000 [01:26<35:19,  2.21s/it]

loss: -2.042 | unlearn_loss: -2.777 | retain_loss: 0.7347 | avg_entropy: 2.777 | param_change: 0.003417


  4%|▍         | 41/1000 [01:28<35:43,  2.24s/it]

loss: -5.767 | unlearn_loss: -6.313 | retain_loss: 0.5458 | avg_entropy: 6.313 | param_change: 0.005276


  4%|▍         | 42/1000 [01:31<37:01,  2.32s/it]

loss: -4.84 | unlearn_loss: -5.433 | retain_loss: 0.593 | avg_entropy: 5.433 | param_change: 0.002866


  4%|▍         | 43/1000 [01:33<37:26,  2.35s/it]

loss: -4.73 | unlearn_loss: -5.455 | retain_loss: 0.7245 | avg_entropy: 5.455 | param_change: 0.003239


  4%|▍         | 44/1000 [01:35<37:00,  2.32s/it]

loss: -2.53 | unlearn_loss: -3.219 | retain_loss: 0.6891 | avg_entropy: 3.219 | param_change: 0.002712


  4%|▍         | 45/1000 [01:38<36:47,  2.31s/it]

loss: -2.904 | unlearn_loss: -3.429 | retain_loss: 0.5246 | avg_entropy: 3.429 | param_change: 0.002088


  5%|▍         | 46/1000 [01:40<37:19,  2.35s/it]

loss: -3.387 | unlearn_loss: -4.685 | retain_loss: 1.298 | avg_entropy: 4.685 | param_change: 0.006865


  5%|▍         | 47/1000 [01:42<35:40,  2.25s/it]

loss: -7.263 | unlearn_loss: -8.636 | retain_loss: 1.373 | avg_entropy: 8.636 | param_change: 0.01169


  5%|▍         | 48/1000 [01:44<34:37,  2.18s/it]

loss: -5.405 | unlearn_loss: -6.202 | retain_loss: 0.7971 | avg_entropy: 6.202 | param_change: 0.003149


  5%|▍         | 49/1000 [01:46<33:34,  2.12s/it]

loss: -5.919 | unlearn_loss: -6.717 | retain_loss: 0.7974 | avg_entropy: 6.717 | param_change: 0.00416


  5%|▌         | 50/1000 [01:48<32:50,  2.07s/it]

loss: -6.212 | unlearn_loss: -6.834 | retain_loss: 0.6211 | avg_entropy: 6.834 | param_change: 0.003966


  5%|▌         | 51/1000 [01:50<32:19,  2.04s/it]

loss: -6.612 | unlearn_loss: -7.183 | retain_loss: 0.5711 | avg_entropy: 7.183 | param_change: 0.003163


  5%|▌         | 52/1000 [01:52<31:55,  2.02s/it]

loss: -7.573 | unlearn_loss: -10.07 | retain_loss: 2.498 | avg_entropy: 10.07 | param_change: 0.01154


  5%|▌         | 53/1000 [01:54<31:37,  2.00s/it]

loss: -8.527 | unlearn_loss: -10.29 | retain_loss: 1.76 | avg_entropy: 10.29 | param_change: 0.006162


  5%|▌         | 54/1000 [01:56<31:24,  1.99s/it]

loss: -4.139 | unlearn_loss: -5.321 | retain_loss: 1.181 | avg_entropy: 5.321 | param_change: 0.004314


  6%|▌         | 55/1000 [01:58<31:23,  1.99s/it]

loss: -7.182 | unlearn_loss: -9.433 | retain_loss: 2.251 | avg_entropy: 9.433 | param_change: 0.005603


  6%|▌         | 56/1000 [02:00<32:52,  2.09s/it]

loss: -8.982 | unlearn_loss: -10.24 | retain_loss: 1.262 | avg_entropy: 10.24 | param_change: 0.002698


  6%|▌         | 57/1000 [02:02<32:41,  2.08s/it]

loss: -6.895 | unlearn_loss: -8.202 | retain_loss: 1.308 | avg_entropy: 8.202 | param_change: 0.002605


  6%|▌         | 58/1000 [02:05<33:39,  2.14s/it]

loss: -9.8 | unlearn_loss: -10.65 | retain_loss: 0.8458 | avg_entropy: 10.65 | param_change: 0.002703


  6%|▌         | 59/1000 [02:07<34:06,  2.17s/it]

loss: -9.969 | unlearn_loss: -10.68 | retain_loss: 0.7081 | avg_entropy: 10.68 | param_change: 0.00169


  6%|▌         | 60/1000 [02:09<36:35,  2.34s/it]

loss: -10.16 | unlearn_loss: -10.73 | retain_loss: 0.5708 | avg_entropy: 10.73 | param_change: 0.001374


  6%|▌         | 61/1000 [02:12<36:16,  2.32s/it]

loss: -10.28 | unlearn_loss: -10.83 | retain_loss: 0.5485 | avg_entropy: 10.83 | param_change: 0.001416


  6%|▌         | 62/1000 [02:15<38:33,  2.47s/it]

loss: -10.1 | unlearn_loss: -10.86 | retain_loss: 0.7518 | avg_entropy: 10.86 | param_change: 0.001642


  6%|▋         | 63/1000 [02:17<37:33,  2.41s/it]

loss: -10.35 | unlearn_loss: -10.82 | retain_loss: 0.4725 | avg_entropy: 10.82 | param_change: 0.001318


  6%|▋         | 64/1000 [02:20<39:29,  2.53s/it]

loss: -7.313 | unlearn_loss: -7.995 | retain_loss: 0.6817 | avg_entropy: 7.995 | param_change: 0.00227


  6%|▋         | 65/1000 [02:22<39:14,  2.52s/it]

loss: -7.931 | unlearn_loss: -8.453 | retain_loss: 0.5223 | avg_entropy: 8.453 | param_change: 0.001595


  7%|▋         | 66/1000 [02:25<38:39,  2.48s/it]

loss: -9.835 | unlearn_loss: -10.64 | retain_loss: 0.8088 | avg_entropy: 10.64 | param_change: 0.001862


  7%|▋         | 67/1000 [02:27<37:35,  2.42s/it]

loss: -8.284 | unlearn_loss: -8.842 | retain_loss: 0.5579 | avg_entropy: 8.842 | param_change: 0.002323


  7%|▋         | 68/1000 [02:29<38:09,  2.46s/it]

loss: -10.39 | unlearn_loss: -11.16 | retain_loss: 0.7717 | avg_entropy: 11.16 | param_change: 0.001516


  7%|▋         | 69/1000 [02:32<37:04,  2.39s/it]

loss: -8.685 | unlearn_loss: -9.61 | retain_loss: 0.9258 | avg_entropy: 9.61 | param_change: 0.002001


  7%|▋         | 70/1000 [02:34<37:05,  2.39s/it]

loss: -8.971 | unlearn_loss: -9.589 | retain_loss: 0.6183 | avg_entropy: 9.589 | param_change: 0.001829


  7%|▋         | 71/1000 [02:36<36:44,  2.37s/it]

loss: -9.863 | unlearn_loss: -10.87 | retain_loss: 1.004 | avg_entropy: 10.87 | param_change: 0.002947


  7%|▋         | 72/1000 [02:39<36:54,  2.39s/it]

loss: -8.994 | unlearn_loss: -9.955 | retain_loss: 0.9612 | avg_entropy: 9.955 | param_change: 0.002126


  7%|▋         | 73/1000 [02:41<35:06,  2.27s/it]

loss: -9.27 | unlearn_loss: -10.15 | retain_loss: 0.8764 | avg_entropy: 10.15 | param_change: 0.0023


  7%|▋         | 74/1000 [02:43<35:29,  2.30s/it]

loss: -1.681 | unlearn_loss: -11.36 | retain_loss: 9.679 | avg_entropy: 11.36 | param_change: 0.01522


  8%|▊         | 75/1000 [02:45<34:09,  2.22s/it]

loss: -10.78 | unlearn_loss: -11.44 | retain_loss: 0.6526 | avg_entropy: 11.44 | param_change: 0.001405


  8%|▊         | 76/1000 [02:48<36:20,  2.36s/it]

loss: -10.66 | unlearn_loss: -11.48 | retain_loss: 0.8235 | avg_entropy: 11.48 | param_change: 0.001441


  8%|▊         | 77/1000 [02:50<35:43,  2.32s/it]

loss: -9.343 | unlearn_loss: -10.19 | retain_loss: 0.8449 | avg_entropy: 10.19 | param_change: 0.001717


  8%|▊         | 78/1000 [02:52<34:48,  2.27s/it]

loss: -9.901 | unlearn_loss: -10.63 | retain_loss: 0.7311 | avg_entropy: 10.63 | param_change: 0.00174


  8%|▊         | 79/1000 [02:55<36:20,  2.37s/it]

loss: -9.741 | unlearn_loss: -10.73 | retain_loss: 0.9869 | avg_entropy: 10.73 | param_change: 0.002856


  8%|▊         | 80/1000 [02:57<36:24,  2.37s/it]

loss: -11 | unlearn_loss: -11.56 | retain_loss: 0.5591 | avg_entropy: 11.56 | param_change: 0.001216


  8%|▊         | 81/1000 [03:00<36:41,  2.40s/it]

loss: -10.87 | unlearn_loss: -11.48 | retain_loss: 0.6106 | avg_entropy: 11.48 | param_change: 0.001681


  8%|▊         | 82/1000 [03:02<36:20,  2.38s/it]

loss: -10.23 | unlearn_loss: -11.41 | retain_loss: 1.177 | avg_entropy: 11.41 | param_change: 0.002569


  8%|▊         | 83/1000 [03:04<35:44,  2.34s/it]

loss: -10.74 | unlearn_loss: -11.62 | retain_loss: 0.8802 | avg_entropy: 11.62 | param_change: 0.001421


  8%|▊         | 84/1000 [03:07<36:00,  2.36s/it]

loss: -10.18 | unlearn_loss: -11.61 | retain_loss: 1.437 | avg_entropy: 11.61 | param_change: 0.001373


  8%|▊         | 85/1000 [03:09<34:39,  2.27s/it]

loss: -9.433 | unlearn_loss: -10.29 | retain_loss: 0.8581 | avg_entropy: 10.29 | param_change: 0.002119


  9%|▊         | 86/1000 [03:11<35:20,  2.32s/it]

loss: -11.03 | unlearn_loss: -11.61 | retain_loss: 0.5781 | avg_entropy: 11.61 | param_change: 0.001203


  9%|▊         | 87/1000 [03:13<35:06,  2.31s/it]

loss: -10.22 | unlearn_loss: -11.61 | retain_loss: 1.394 | avg_entropy: 11.61 | param_change: 0.00193


  9%|▉         | 88/1000 [03:15<33:59,  2.24s/it]

loss: -10.89 | unlearn_loss: -11.56 | retain_loss: 0.6731 | avg_entropy: 11.56 | param_change: 0.001562


  9%|▉         | 89/1000 [03:18<33:36,  2.21s/it]

loss: -9.914 | unlearn_loss: -10.52 | retain_loss: 0.6023 | avg_entropy: 10.52 | param_change: 0.001541


  9%|▉         | 90/1000 [03:20<34:08,  2.25s/it]

loss: -10.32 | unlearn_loss: -11.67 | retain_loss: 1.345 | avg_entropy: 11.67 | param_change: 0.001241


  9%|▉         | 91/1000 [03:22<33:24,  2.21s/it]

loss: -10.91 | unlearn_loss: -11.59 | retain_loss: 0.6759 | avg_entropy: 11.59 | param_change: 0.001209


  9%|▉         | 92/1000 [03:24<32:41,  2.16s/it]

loss: -8.93 | unlearn_loss: -11.65 | retain_loss: 2.724 | avg_entropy: 11.65 | param_change: 0.005638


  9%|▉         | 93/1000 [03:26<33:02,  2.19s/it]

loss: -8.273 | unlearn_loss: -10.72 | retain_loss: 2.447 | avg_entropy: 10.72 | param_change: 0.002709


  9%|▉         | 94/1000 [03:28<32:20,  2.14s/it]

loss: -9.196 | unlearn_loss: -11.67 | retain_loss: 2.472 | avg_entropy: 11.67 | param_change: 0.009524


 10%|▉         | 95/1000 [03:31<33:59,  2.25s/it]

loss: -9.925 | unlearn_loss: -10.87 | retain_loss: 0.9443 | avg_entropy: 10.87 | param_change: 0.001718


 10%|▉         | 96/1000 [03:33<33:49,  2.24s/it]

loss: -8.862 | unlearn_loss: -10.58 | retain_loss: 1.719 | avg_entropy: 10.58 | param_change: 0.004297


 10%|▉         | 97/1000 [03:36<35:20,  2.35s/it]

loss: -10.53 | unlearn_loss: -11.59 | retain_loss: 1.058 | avg_entropy: 11.59 | param_change: 0.002086


 10%|▉         | 98/1000 [03:38<36:18,  2.41s/it]

loss: -8.97 | unlearn_loss: -9.929 | retain_loss: 0.9592 | avg_entropy: 9.929 | param_change: 0.002193


 10%|▉         | 99/1000 [03:41<35:52,  2.39s/it]

loss: -6.77 | unlearn_loss: -10.34 | retain_loss: 3.565 | avg_entropy: 10.34 | param_change: 0.003


 10%|█         | 100/1000 [03:43<34:07,  2.27s/it]

loss: -11.03 | unlearn_loss: -11.69 | retain_loss: 0.6614 | avg_entropy: 11.69 | param_change: 0.00141


 10%|█         | 101/1000 [03:45<33:05,  2.21s/it]

loss: -9.87 | unlearn_loss: -11.66 | retain_loss: 1.793 | avg_entropy: 11.66 | param_change: 0.004148


 10%|█         | 102/1000 [03:47<32:31,  2.17s/it]

loss: -10.22 | unlearn_loss: -11.73 | retain_loss: 1.506 | avg_entropy: 11.73 | param_change: 0.001538


 10%|█         | 103/1000 [03:49<31:53,  2.13s/it]

loss: -10.86 | unlearn_loss: -11.69 | retain_loss: 0.8362 | avg_entropy: 11.69 | param_change: 0.001523


 10%|█         | 104/1000 [03:51<32:54,  2.20s/it]

loss: -10.72 | unlearn_loss: -11.74 | retain_loss: 1.021 | avg_entropy: 11.74 | param_change: 0.003468


 10%|█         | 105/1000 [03:54<34:25,  2.31s/it]

loss: -9.838 | unlearn_loss: -10.58 | retain_loss: 0.744 | avg_entropy: 10.58 | param_change: 0.002766


 11%|█         | 106/1000 [03:56<33:36,  2.26s/it]

loss: -11.03 | unlearn_loss: -11.71 | retain_loss: 0.6882 | avg_entropy: 11.71 | param_change: 0.001734


 11%|█         | 107/1000 [03:58<33:51,  2.27s/it]

loss: -10.75 | unlearn_loss: -11.69 | retain_loss: 0.9347 | avg_entropy: 11.69 | param_change: 0.003319


 11%|█         | 108/1000 [04:00<33:52,  2.28s/it]

loss: -10.89 | unlearn_loss: -11.69 | retain_loss: 0.7931 | avg_entropy: 11.69 | param_change: 0.001306


 11%|█         | 109/1000 [04:03<33:50,  2.28s/it]

loss: -10.67 | unlearn_loss: -11.7 | retain_loss: 1.03 | avg_entropy: 11.7 | param_change: 0.002303


 11%|█         | 110/1000 [04:05<33:19,  2.25s/it]

loss: -11.04 | unlearn_loss: -11.74 | retain_loss: 0.7004 | avg_entropy: 11.74 | param_change: 0.001634


 11%|█         | 111/1000 [04:07<32:50,  2.22s/it]

loss: -9.959 | unlearn_loss: -10.95 | retain_loss: 0.9909 | avg_entropy: 10.95 | param_change: 0.002904


 11%|█         | 112/1000 [04:09<32:08,  2.17s/it]

loss: -11.05 | unlearn_loss: -11.73 | retain_loss: 0.6776 | avg_entropy: 11.73 | param_change: 0.002072


 11%|█▏        | 113/1000 [04:12<33:38,  2.28s/it]

loss: -11.01 | unlearn_loss: -11.74 | retain_loss: 0.7297 | avg_entropy: 11.74 | param_change: 0.001688


 11%|█▏        | 114/1000 [04:14<33:39,  2.28s/it]

loss: -10.92 | unlearn_loss: -11.76 | retain_loss: 0.8375 | avg_entropy: 11.76 | param_change: 0.003145


 12%|█▏        | 115/1000 [04:16<33:23,  2.26s/it]

loss: -10.96 | unlearn_loss: -11.72 | retain_loss: 0.7523 | avg_entropy: 11.72 | param_change: 0.00238


 12%|█▏        | 116/1000 [04:19<34:10,  2.32s/it]

loss: -11.14 | unlearn_loss: -11.7 | retain_loss: 0.5567 | avg_entropy: 11.7 | param_change: 0.001511


 12%|█▏        | 117/1000 [04:21<34:40,  2.36s/it]

loss: -10.66 | unlearn_loss: -11.79 | retain_loss: 1.131 | avg_entropy: 11.79 | param_change: 0.005099


 12%|█▏        | 118/1000 [04:23<34:09,  2.32s/it]

loss: -10.88 | unlearn_loss: -11.77 | retain_loss: 0.8857 | avg_entropy: 11.77 | param_change: 0.003969


 12%|█▏        | 119/1000 [04:26<33:49,  2.30s/it]

loss: -10.7 | unlearn_loss: -11.28 | retain_loss: 0.5821 | avg_entropy: 11.28 | param_change: 0.001508


 12%|█▏        | 120/1000 [04:28<34:12,  2.33s/it]

loss: -10.39 | unlearn_loss: -11.75 | retain_loss: 1.358 | avg_entropy: 11.75 | param_change: 0.004611


 12%|█▏        | 121/1000 [04:30<34:13,  2.34s/it]

loss: -10.46 | unlearn_loss: -11.29 | retain_loss: 0.8301 | avg_entropy: 11.29 | param_change: 0.002946


 12%|█▏        | 122/1000 [04:33<34:13,  2.34s/it]

loss: -10.64 | unlearn_loss: -11.27 | retain_loss: 0.6273 | avg_entropy: 11.27 | param_change: 0.001122


 12%|█▏        | 123/1000 [04:35<33:39,  2.30s/it]

loss: -9.328 | unlearn_loss: -10.55 | retain_loss: 1.225 | avg_entropy: 10.55 | param_change: 0.005665


 12%|█▏        | 124/1000 [04:37<33:39,  2.31s/it]

loss: -11 | unlearn_loss: -11.78 | retain_loss: 0.7756 | avg_entropy: 11.78 | param_change: 0.001938


 12%|█▎        | 125/1000 [04:39<32:35,  2.24s/it]

loss: -10.9 | unlearn_loss: -11.78 | retain_loss: 0.8806 | avg_entropy: 11.78 | param_change: 0.003022


 13%|█▎        | 126/1000 [04:42<32:47,  2.25s/it]

loss: -10.81 | unlearn_loss: -11.5 | retain_loss: 0.6872 | avg_entropy: 11.5 | param_change: 0.001862


 13%|█▎        | 127/1000 [04:44<32:56,  2.26s/it]

loss: -10.76 | unlearn_loss: -11.55 | retain_loss: 0.7927 | avg_entropy: 11.55 | param_change: 0.00251


 13%|█▎        | 128/1000 [04:46<32:55,  2.26s/it]

loss: -10.98 | unlearn_loss: -11.7 | retain_loss: 0.721 | avg_entropy: 11.7 | param_change: 0.002253


 13%|█▎        | 129/1000 [04:48<32:52,  2.26s/it]

loss: -10.92 | unlearn_loss: -11.77 | retain_loss: 0.8506 | avg_entropy: 11.77 | param_change: 0.002977


 13%|█▎        | 130/1000 [04:51<33:35,  2.32s/it]

loss: -9.429 | unlearn_loss: -10.35 | retain_loss: 0.9212 | avg_entropy: 10.35 | param_change: 0.00423


 13%|█▎        | 131/1000 [04:53<32:27,  2.24s/it]

loss: -11.07 | unlearn_loss: -11.76 | retain_loss: 0.6928 | avg_entropy: 11.76 | param_change: 0.0018


 13%|█▎        | 132/1000 [04:55<32:25,  2.24s/it]

loss: -10.8 | unlearn_loss: -11.54 | retain_loss: 0.7365 | avg_entropy: 11.54 | param_change: 0.003264


 13%|█▎        | 133/1000 [04:57<32:25,  2.24s/it]

loss: -10.44 | unlearn_loss: -11.23 | retain_loss: 0.7888 | avg_entropy: 11.23 | param_change: 0.001755


 13%|█▎        | 134/1000 [05:00<32:24,  2.25s/it]

loss: -10.65 | unlearn_loss: -11.22 | retain_loss: 0.5734 | avg_entropy: 11.22 | param_change: 0.001901


 14%|█▎        | 135/1000 [05:02<32:41,  2.27s/it]

loss: -10.81 | unlearn_loss: -11.5 | retain_loss: 0.6819 | avg_entropy: 11.5 | param_change: 0.001549


 14%|█▎        | 136/1000 [05:04<31:24,  2.18s/it]

loss: -10.96 | unlearn_loss: -11.64 | retain_loss: 0.6797 | avg_entropy: 11.64 | param_change: 0.002227


 14%|█▎        | 137/1000 [05:06<32:33,  2.26s/it]

loss: -11.14 | unlearn_loss: -11.73 | retain_loss: 0.5887 | avg_entropy: 11.73 | param_change: 0.00158


 14%|█▍        | 138/1000 [05:09<33:34,  2.34s/it]

loss: -11.04 | unlearn_loss: -11.78 | retain_loss: 0.7344 | avg_entropy: 11.78 | param_change: 0.002013


 14%|█▍        | 139/1000 [05:11<33:32,  2.34s/it]

loss: -10.76 | unlearn_loss: -11.27 | retain_loss: 0.5135 | avg_entropy: 11.27 | param_change: 0.001078


 14%|█▍        | 140/1000 [05:13<33:15,  2.32s/it]

loss: -10.34 | unlearn_loss: -10.97 | retain_loss: 0.6358 | avg_entropy: 10.97 | param_change: 0.001783


 14%|█▍        | 141/1000 [05:16<32:30,  2.27s/it]

loss: -10.41 | unlearn_loss: -11.35 | retain_loss: 0.9399 | avg_entropy: 11.35 | param_change: 0.002024


 14%|█▍        | 142/1000 [05:18<32:09,  2.25s/it]

loss: -11.18 | unlearn_loss: -11.71 | retain_loss: 0.5309 | avg_entropy: 11.71 | param_change: 0.001317


 14%|█▍        | 143/1000 [05:20<31:51,  2.23s/it]

loss: -10.23 | unlearn_loss: -11.21 | retain_loss: 0.9748 | avg_entropy: 11.21 | param_change: 0.003102


 14%|█▍        | 144/1000 [05:22<31:48,  2.23s/it]

loss: -11.16 | unlearn_loss: -11.67 | retain_loss: 0.5017 | avg_entropy: 11.67 | param_change: 0.00122


 14%|█▍        | 145/1000 [05:24<31:40,  2.22s/it]

loss: -9.761 | unlearn_loss: -11.35 | retain_loss: 1.592 | avg_entropy: 11.35 | param_change: 0.003748


 15%|█▍        | 146/1000 [05:27<31:02,  2.18s/it]

loss: -8.167 | unlearn_loss: -11.54 | retain_loss: 3.375 | avg_entropy: 11.54 | param_change: 0.005118


 15%|█▍        | 147/1000 [05:29<30:02,  2.11s/it]

loss: -10.95 | unlearn_loss: -11.79 | retain_loss: 0.8395 | avg_entropy: 11.79 | param_change: 0.001751


 15%|█▍        | 148/1000 [05:31<30:44,  2.16s/it]

loss: -10.08 | unlearn_loss: -11.34 | retain_loss: 1.261 | avg_entropy: 11.34 | param_change: 0.002102


 15%|█▍        | 149/1000 [05:33<31:02,  2.19s/it]

loss: -10.43 | unlearn_loss: -11.77 | retain_loss: 1.341 | avg_entropy: 11.77 | param_change: 0.002406


 15%|█▌        | 150/1000 [05:35<31:06,  2.20s/it]

loss: -10.64 | unlearn_loss: -11.72 | retain_loss: 1.08 | avg_entropy: 11.72 | param_change: 0.002997


 15%|█▌        | 151/1000 [05:37<30:44,  2.17s/it]

loss: -10.66 | unlearn_loss: -11.36 | retain_loss: 0.6993 | avg_entropy: 11.36 | param_change: 0.001874


 15%|█▌        | 152/1000 [05:40<31:18,  2.22s/it]

loss: -11.19 | unlearn_loss: -11.83 | retain_loss: 0.6464 | avg_entropy: 11.83 | param_change: 0.001116


 15%|█▌        | 153/1000 [05:42<32:48,  2.32s/it]

loss: -10.67 | unlearn_loss: -11.79 | retain_loss: 1.122 | avg_entropy: 11.79 | param_change: 0.0017


 15%|█▌        | 154/1000 [05:44<31:24,  2.23s/it]

loss: -10.71 | unlearn_loss: -11.59 | retain_loss: 0.8768 | avg_entropy: 11.59 | param_change: 0.001628


 16%|█▌        | 155/1000 [05:46<31:03,  2.21s/it]

loss: -10.74 | unlearn_loss: -11.84 | retain_loss: 1.094 | avg_entropy: 11.84 | param_change: 0.001744


 16%|█▌        | 156/1000 [05:48<30:05,  2.14s/it]

loss: -10.85 | unlearn_loss: -11.71 | retain_loss: 0.8631 | avg_entropy: 11.71 | param_change: 0.002069


 16%|█▌        | 157/1000 [05:51<29:52,  2.13s/it]

loss: -6.383 | unlearn_loss: -11.4 | retain_loss: 5.02 | avg_entropy: 11.4 | param_change: 0.007788


 16%|█▌        | 158/1000 [05:52<29:15,  2.08s/it]

loss: -9.046 | unlearn_loss: -11.44 | retain_loss: 2.398 | avg_entropy: 11.44 | param_change: 0.00514


 16%|█▌        | 159/1000 [05:54<28:40,  2.05s/it]

loss: -9.148 | unlearn_loss: -11.82 | retain_loss: 2.672 | avg_entropy: 11.82 | param_change: 0.005796


 16%|█▌        | 160/1000 [05:56<28:27,  2.03s/it]

loss: -9.265 | unlearn_loss: -11.42 | retain_loss: 2.155 | avg_entropy: 11.42 | param_change: 0.006919


 16%|█▌        | 161/1000 [05:58<28:22,  2.03s/it]

loss: -9.672 | unlearn_loss: -11.49 | retain_loss: 1.813 | avg_entropy: 11.49 | param_change: 0.005232


 16%|█▌        | 162/1000 [06:01<28:20,  2.03s/it]

loss: -6.997 | unlearn_loss: -10.91 | retain_loss: 3.917 | avg_entropy: 10.91 | param_change: 0.007962


 16%|█▋        | 163/1000 [06:02<28:04,  2.01s/it]

loss: -9.674 | unlearn_loss: -11.83 | retain_loss: 2.152 | avg_entropy: 11.83 | param_change: 0.00578


 16%|█▋        | 164/1000 [06:04<27:56,  2.00s/it]

loss: -8.705 | unlearn_loss: -11.4 | retain_loss: 2.691 | avg_entropy: 11.4 | param_change: 0.00899


 16%|█▋        | 165/1000 [06:06<27:49,  2.00s/it]

loss: -9.945 | unlearn_loss: -11.42 | retain_loss: 1.475 | avg_entropy: 11.42 | param_change: 0.002911


 17%|█▋        | 166/1000 [06:08<27:42,  1.99s/it]

loss: -9.208 | unlearn_loss: -11.81 | retain_loss: 2.606 | avg_entropy: 11.81 | param_change: 0.005069


 17%|█▋        | 167/1000 [06:10<27:30,  1.98s/it]

loss: -10.03 | unlearn_loss: -11.86 | retain_loss: 1.833 | avg_entropy: 11.86 | param_change: 0.003432


 17%|█▋        | 168/1000 [06:12<27:31,  1.99s/it]

loss: -9.855 | unlearn_loss: -11.86 | retain_loss: 2.007 | avg_entropy: 11.86 | param_change: 0.007307


 17%|█▋        | 169/1000 [06:14<27:36,  1.99s/it]

loss: -10.63 | unlearn_loss: -11.87 | retain_loss: 1.239 | avg_entropy: 11.87 | param_change: 0.004873


 17%|█▋        | 170/1000 [06:16<27:19,  1.98s/it]

loss: -9.355 | unlearn_loss: -11.41 | retain_loss: 2.058 | avg_entropy: 11.41 | param_change: 0.008597


 17%|█▋        | 171/1000 [06:18<27:29,  1.99s/it]

loss: -10.38 | unlearn_loss: -11.41 | retain_loss: 1.03 | avg_entropy: 11.41 | param_change: 0.003798


 17%|█▋        | 172/1000 [06:20<27:27,  1.99s/it]

loss: -9.836 | unlearn_loss: -11.87 | retain_loss: 2.039 | avg_entropy: 11.87 | param_change: 0.004926


 17%|█▋        | 173/1000 [06:22<27:33,  2.00s/it]

loss: -9.815 | unlearn_loss: -11.1 | retain_loss: 1.289 | avg_entropy: 11.1 | param_change: 0.002698


 17%|█▋        | 174/1000 [06:25<28:48,  2.09s/it]

loss: -11.19 | unlearn_loss: -11.84 | retain_loss: 0.6529 | avg_entropy: 11.84 | param_change: 0.001136


 18%|█▊        | 175/1000 [06:27<28:33,  2.08s/it]

loss: -11.16 | unlearn_loss: -11.84 | retain_loss: 0.6756 | avg_entropy: 11.84 | param_change: 0.001128


 18%|█▊        | 176/1000 [06:29<28:29,  2.07s/it]

loss: -10.99 | unlearn_loss: -11.85 | retain_loss: 0.8609 | avg_entropy: 11.85 | param_change: 0.00144


 18%|█▊        | 177/1000 [06:31<28:46,  2.10s/it]

loss: -10.88 | unlearn_loss: -11.67 | retain_loss: 0.7902 | avg_entropy: 11.67 | param_change: 0.001592


 18%|█▊        | 178/1000 [06:33<28:21,  2.07s/it]

loss: -11.18 | unlearn_loss: -11.75 | retain_loss: 0.5761 | avg_entropy: 11.75 | param_change: 0.00115


 18%|█▊        | 179/1000 [06:35<29:43,  2.17s/it]

loss: -10.89 | unlearn_loss: -11.86 | retain_loss: 0.977 | avg_entropy: 11.86 | param_change: 0.001794


 18%|█▊        | 180/1000 [06:38<29:51,  2.18s/it]

loss: -10.94 | unlearn_loss: -11.88 | retain_loss: 0.9349 | avg_entropy: 11.88 | param_change: 0.001846


 18%|█▊        | 181/1000 [06:40<29:49,  2.19s/it]

loss: -11.04 | unlearn_loss: -11.72 | retain_loss: 0.6728 | avg_entropy: 11.72 | param_change: 0.001182


 18%|█▊        | 182/1000 [06:42<29:00,  2.13s/it]

loss: -11.09 | unlearn_loss: -11.77 | retain_loss: 0.6774 | avg_entropy: 11.77 | param_change: 0.00116


 18%|█▊        | 183/1000 [06:44<30:32,  2.24s/it]

loss: -10.57 | unlearn_loss: -11.23 | retain_loss: 0.6571 | avg_entropy: 11.23 | param_change: 0.001777


 18%|█▊        | 184/1000 [06:46<30:05,  2.21s/it]

loss: -11.12 | unlearn_loss: -11.86 | retain_loss: 0.7408 | avg_entropy: 11.86 | param_change: 0.001216


 18%|█▊        | 185/1000 [06:49<30:29,  2.24s/it]

loss: -11.18 | unlearn_loss: -11.88 | retain_loss: 0.6973 | avg_entropy: 11.88 | param_change: 0.001258


 19%|█▊        | 186/1000 [06:51<29:52,  2.20s/it]

loss: -10.51 | unlearn_loss: -11.28 | retain_loss: 0.7712 | avg_entropy: 11.28 | param_change: 0.00169


 19%|█▊        | 187/1000 [06:53<29:36,  2.18s/it]

loss: -11.04 | unlearn_loss: -11.64 | retain_loss: 0.5996 | avg_entropy: 11.64 | param_change: 0.001466


 19%|█▉        | 188/1000 [06:55<29:47,  2.20s/it]

loss: -11.29 | unlearn_loss: -11.89 | retain_loss: 0.597 | avg_entropy: 11.89 | param_change: 0.001128


 19%|█▉        | 189/1000 [06:57<29:07,  2.15s/it]

loss: -11.18 | unlearn_loss: -11.88 | retain_loss: 0.7071 | avg_entropy: 11.88 | param_change: 0.001605


 19%|█▉        | 190/1000 [06:59<28:38,  2.12s/it]

loss: -11.13 | unlearn_loss: -11.76 | retain_loss: 0.634 | avg_entropy: 11.76 | param_change: 0.00144


 19%|█▉        | 191/1000 [07:01<28:05,  2.08s/it]

loss: -11.29 | unlearn_loss: -11.79 | retain_loss: 0.5048 | avg_entropy: 11.79 | param_change: 0.00112


 19%|█▉        | 192/1000 [07:03<27:56,  2.07s/it]

loss: -11.24 | unlearn_loss: -11.87 | retain_loss: 0.6324 | avg_entropy: 11.87 | param_change: 0.001332


 19%|█▉        | 193/1000 [07:05<27:45,  2.06s/it]

loss: -11.11 | unlearn_loss: -11.85 | retain_loss: 0.7393 | avg_entropy: 11.85 | param_change: 0.001589


 19%|█▉        | 194/1000 [07:07<27:49,  2.07s/it]

loss: -11.26 | unlearn_loss: -11.9 | retain_loss: 0.6434 | avg_entropy: 11.9 | param_change: 0.001542


 20%|█▉        | 195/1000 [07:10<28:03,  2.09s/it]

loss: -11.19 | unlearn_loss: -11.86 | retain_loss: 0.6716 | avg_entropy: 11.86 | param_change: 0.001534


 20%|█▉        | 196/1000 [07:12<27:53,  2.08s/it]

loss: -11.24 | unlearn_loss: -11.84 | retain_loss: 0.6059 | avg_entropy: 11.84 | param_change: 0.001643


 20%|█▉        | 197/1000 [07:14<28:34,  2.14s/it]

loss: -11.41 | unlearn_loss: -11.86 | retain_loss: 0.4507 | avg_entropy: 11.86 | param_change: 0.0008627


 20%|█▉        | 198/1000 [07:16<28:58,  2.17s/it]

loss: -11.36 | unlearn_loss: -11.8 | retain_loss: 0.4416 | avg_entropy: 11.8 | param_change: 0.001165


 20%|█▉        | 199/1000 [07:18<29:24,  2.20s/it]

loss: -11.38 | unlearn_loss: -11.88 | retain_loss: 0.5043 | avg_entropy: 11.88 | param_change: 0.001158


 20%|██        | 200/1000 [07:21<30:58,  2.32s/it]

loss: -11.24 | unlearn_loss: -11.89 | retain_loss: 0.6437 | avg_entropy: 11.89 | param_change: 0.001295


 20%|██        | 201/1000 [07:23<30:01,  2.25s/it]

loss: -10.17 | unlearn_loss: -11.88 | retain_loss: 1.714 | avg_entropy: 11.88 | param_change: 0.003639


 20%|██        | 202/1000 [07:25<28:57,  2.18s/it]

loss: -11.15 | unlearn_loss: -11.91 | retain_loss: 0.7616 | avg_entropy: 11.91 | param_change: 0.001807


 20%|██        | 203/1000 [07:27<28:18,  2.13s/it]

loss: -10.96 | unlearn_loss: -11.65 | retain_loss: 0.6975 | avg_entropy: 11.65 | param_change: 0.001572


 20%|██        | 204/1000 [07:29<27:47,  2.10s/it]

loss: -11.26 | unlearn_loss: -11.9 | retain_loss: 0.6411 | avg_entropy: 11.9 | param_change: 0.001667


 20%|██        | 205/1000 [07:31<27:50,  2.10s/it]

loss: -11.55 | unlearn_loss: -11.89 | retain_loss: 0.3382 | avg_entropy: 11.89 | param_change: 0.0006117


 21%|██        | 206/1000 [07:34<30:26,  2.30s/it]

loss: -11.53 | unlearn_loss: -11.89 | retain_loss: 0.3641 | avg_entropy: 11.89 | param_change: 0.0006433


 21%|██        | 207/1000 [07:36<30:24,  2.30s/it]

loss: -11.41 | unlearn_loss: -11.76 | retain_loss: 0.3549 | avg_entropy: 11.76 | param_change: 0.000862


 21%|██        | 208/1000 [07:39<29:56,  2.27s/it]

loss: -10.94 | unlearn_loss: -11.88 | retain_loss: 0.9424 | avg_entropy: 11.88 | param_change: 0.002476


 21%|██        | 209/1000 [07:41<29:14,  2.22s/it]

loss: -11.35 | unlearn_loss: -11.89 | retain_loss: 0.5396 | avg_entropy: 11.89 | param_change: 0.001539


 21%|██        | 210/1000 [07:43<28:45,  2.18s/it]

loss: -10.51 | unlearn_loss: -11.04 | retain_loss: 0.5252 | avg_entropy: 11.04 | param_change: 0.001808


 21%|██        | 211/1000 [07:45<29:06,  2.21s/it]

loss: -10.81 | unlearn_loss: -11.55 | retain_loss: 0.7393 | avg_entropy: 11.55 | param_change: 0.002071


 21%|██        | 212/1000 [07:47<29:03,  2.21s/it]

loss: -11.29 | unlearn_loss: -11.91 | retain_loss: 0.6119 | avg_entropy: 11.91 | param_change: 0.001787


 21%|██▏       | 213/1000 [07:49<28:25,  2.17s/it]

loss: -11.18 | unlearn_loss: -11.64 | retain_loss: 0.4555 | avg_entropy: 11.64 | param_change: 0.001297


 21%|██▏       | 214/1000 [07:52<28:29,  2.18s/it]

loss: -11.51 | unlearn_loss: -11.91 | retain_loss: 0.3985 | avg_entropy: 11.91 | param_change: 0.001005


 22%|██▏       | 215/1000 [07:54<28:51,  2.21s/it]

loss: -11.14 | unlearn_loss: -11.65 | retain_loss: 0.5079 | avg_entropy: 11.65 | param_change: 0.001752


 22%|██▏       | 216/1000 [07:56<28:35,  2.19s/it]

loss: -11.38 | unlearn_loss: -11.9 | retain_loss: 0.5149 | avg_entropy: 11.9 | param_change: 0.001123


 22%|██▏       | 217/1000 [07:58<28:49,  2.21s/it]

loss: -11.07 | unlearn_loss: -11.58 | retain_loss: 0.5022 | avg_entropy: 11.58 | param_change: 0.001012


 22%|██▏       | 218/1000 [08:00<28:58,  2.22s/it]

loss: -11.52 | unlearn_loss: -11.92 | retain_loss: 0.398 | avg_entropy: 11.92 | param_change: 0.001008


 22%|██▏       | 219/1000 [08:03<29:37,  2.28s/it]

loss: -11.52 | unlearn_loss: -11.91 | retain_loss: 0.3908 | avg_entropy: 11.91 | param_change: 0.001004


 22%|██▏       | 220/1000 [08:05<29:23,  2.26s/it]

loss: -11.2 | unlearn_loss: -11.82 | retain_loss: 0.6138 | avg_entropy: 11.82 | param_change: 0.00271


 22%|██▏       | 221/1000 [08:07<28:13,  2.17s/it]

loss: -11.14 | unlearn_loss: -11.91 | retain_loss: 0.7664 | avg_entropy: 11.91 | param_change: 0.002358


 22%|██▏       | 222/1000 [08:09<27:27,  2.12s/it]

loss: -11.59 | unlearn_loss: -11.91 | retain_loss: 0.3168 | avg_entropy: 11.91 | param_change: 0.0007311


 22%|██▏       | 223/1000 [08:12<30:14,  2.33s/it]

loss: -11.23 | unlearn_loss: -11.89 | retain_loss: 0.6659 | avg_entropy: 11.89 | param_change: 0.001728


 22%|██▏       | 224/1000 [08:14<29:54,  2.31s/it]

loss: -11.38 | unlearn_loss: -11.91 | retain_loss: 0.5265 | avg_entropy: 11.91 | param_change: 0.001011


 22%|██▎       | 225/1000 [08:16<29:41,  2.30s/it]

loss: -11.36 | unlearn_loss: -11.89 | retain_loss: 0.5383 | avg_entropy: 11.89 | param_change: 0.001145


 23%|██▎       | 226/1000 [08:19<30:30,  2.36s/it]

loss: -11.45 | unlearn_loss: -11.91 | retain_loss: 0.4659 | avg_entropy: 11.91 | param_change: 0.001298


 23%|██▎       | 227/1000 [08:21<28:55,  2.25s/it]

loss: -11.34 | unlearn_loss: -11.93 | retain_loss: 0.5995 | avg_entropy: 11.93 | param_change: 0.001576


 23%|██▎       | 228/1000 [08:23<28:08,  2.19s/it]

loss: -11.43 | unlearn_loss: -11.92 | retain_loss: 0.4874 | avg_entropy: 11.92 | param_change: 0.001078


 23%|██▎       | 229/1000 [08:25<27:37,  2.15s/it]

loss: -11.1 | unlearn_loss: -11.93 | retain_loss: 0.8357 | avg_entropy: 11.93 | param_change: 0.002056


 23%|██▎       | 230/1000 [08:27<27:04,  2.11s/it]

loss: -11.23 | unlearn_loss: -11.93 | retain_loss: 0.7066 | avg_entropy: 11.93 | param_change: 0.002321


 23%|██▎       | 231/1000 [08:29<26:45,  2.09s/it]

loss: -11.1 | unlearn_loss: -11.92 | retain_loss: 0.825 | avg_entropy: 11.92 | param_change: 0.004257


 23%|██▎       | 232/1000 [08:31<26:33,  2.07s/it]

loss: -11.23 | unlearn_loss: -11.93 | retain_loss: 0.7018 | avg_entropy: 11.93 | param_change: 0.001688


 23%|██▎       | 233/1000 [08:33<26:34,  2.08s/it]

loss: -11.13 | unlearn_loss: -11.92 | retain_loss: 0.7842 | avg_entropy: 11.92 | param_change: 0.003289


 23%|██▎       | 234/1000 [08:35<26:15,  2.06s/it]

loss: -11.12 | unlearn_loss: -11.94 | retain_loss: 0.823 | avg_entropy: 11.94 | param_change: 0.00285


 24%|██▎       | 235/1000 [08:37<26:03,  2.04s/it]

loss: -11.47 | unlearn_loss: -11.93 | retain_loss: 0.4552 | avg_entropy: 11.93 | param_change: 0.0009269


 24%|██▎       | 236/1000 [08:39<26:10,  2.06s/it]

loss: -10.19 | unlearn_loss: -10.68 | retain_loss: 0.4814 | avg_entropy: 10.68 | param_change: 0.001989


 24%|██▎       | 237/1000 [08:41<26:36,  2.09s/it]

loss: -10.7 | unlearn_loss: -11.3 | retain_loss: 0.6082 | avg_entropy: 11.3 | param_change: 0.001715


 24%|██▍       | 238/1000 [08:44<26:29,  2.09s/it]

loss: -11.38 | unlearn_loss: -11.8 | retain_loss: 0.4202 | avg_entropy: 11.8 | param_change: 0.001139


 24%|██▍       | 239/1000 [08:46<26:11,  2.06s/it]

loss: -11.53 | unlearn_loss: -11.9 | retain_loss: 0.369 | avg_entropy: 11.9 | param_change: 0.00115


 24%|██▍       | 240/1000 [08:48<26:07,  2.06s/it]

loss: -11.62 | unlearn_loss: -11.93 | retain_loss: 0.3091 | avg_entropy: 11.93 | param_change: 0.0007531


 24%|██▍       | 241/1000 [08:50<26:53,  2.13s/it]

loss: -11.44 | unlearn_loss: -11.92 | retain_loss: 0.4778 | avg_entropy: 11.92 | param_change: 0.0008082


 24%|██▍       | 242/1000 [08:52<27:28,  2.17s/it]

loss: -11.36 | unlearn_loss: -11.88 | retain_loss: 0.5194 | avg_entropy: 11.88 | param_change: 0.001116


 24%|██▍       | 243/1000 [08:54<27:46,  2.20s/it]

loss: -10.19 | unlearn_loss: -10.64 | retain_loss: 0.445 | avg_entropy: 10.64 | param_change: 0.001922


 24%|██▍       | 244/1000 [08:56<26:51,  2.13s/it]

loss: -11.58 | unlearn_loss: -11.92 | retain_loss: 0.3338 | avg_entropy: 11.92 | param_change: 0.0006747


 24%|██▍       | 245/1000 [08:59<27:51,  2.21s/it]

loss: -11.09 | unlearn_loss: -11.88 | retain_loss: 0.7968 | avg_entropy: 11.88 | param_change: 0.001855


 25%|██▍       | 246/1000 [09:01<27:37,  2.20s/it]

loss: -11.48 | unlearn_loss: -11.93 | retain_loss: 0.4512 | avg_entropy: 11.93 | param_change: 0.0008698


 25%|██▍       | 247/1000 [09:03<27:08,  2.16s/it]

loss: -10.95 | unlearn_loss: -11.34 | retain_loss: 0.3935 | avg_entropy: 11.34 | param_change: 0.000845


 25%|██▍       | 248/1000 [09:05<27:32,  2.20s/it]

loss: -11.34 | unlearn_loss: -11.92 | retain_loss: 0.5779 | avg_entropy: 11.92 | param_change: 0.001412


 25%|██▍       | 249/1000 [09:07<26:42,  2.13s/it]

loss: -11.28 | unlearn_loss: -11.92 | retain_loss: 0.6402 | avg_entropy: 11.92 | param_change: 0.001495


 25%|██▌       | 250/1000 [09:09<26:14,  2.10s/it]

loss: -11.41 | unlearn_loss: -11.93 | retain_loss: 0.521 | avg_entropy: 11.93 | param_change: 0.001288


 25%|██▌       | 251/1000 [09:11<26:07,  2.09s/it]

loss: -11.2 | unlearn_loss: -11.81 | retain_loss: 0.6181 | avg_entropy: 11.81 | param_change: 0.00188


 25%|██▌       | 252/1000 [09:14<26:12,  2.10s/it]

loss: -11.11 | unlearn_loss: -11.93 | retain_loss: 0.8229 | avg_entropy: 11.93 | param_change: 0.002209


 25%|██▌       | 253/1000 [09:16<25:41,  2.06s/it]

loss: -10.35 | unlearn_loss: -11.2 | retain_loss: 0.8576 | avg_entropy: 11.2 | param_change: 0.002106


 25%|██▌       | 254/1000 [09:17<25:18,  2.04s/it]

loss: -11.36 | unlearn_loss: -11.95 | retain_loss: 0.5922 | avg_entropy: 11.95 | param_change: 0.002006


 26%|██▌       | 255/1000 [09:19<25:14,  2.03s/it]

loss: -11.23 | unlearn_loss: -11.95 | retain_loss: 0.7117 | avg_entropy: 11.95 | param_change: 0.00241


 26%|██▌       | 256/1000 [09:22<25:47,  2.08s/it]

loss: -11.3 | unlearn_loss: -11.96 | retain_loss: 0.6558 | avg_entropy: 11.96 | param_change: 0.001355


 26%|██▌       | 257/1000 [09:24<26:21,  2.13s/it]

loss: -11.51 | unlearn_loss: -11.96 | retain_loss: 0.4458 | avg_entropy: 11.96 | param_change: 0.001131


 26%|██▌       | 258/1000 [09:26<26:21,  2.13s/it]

loss: -11.05 | unlearn_loss: -11.95 | retain_loss: 0.9048 | avg_entropy: 11.95 | param_change: 0.003701


 26%|██▌       | 259/1000 [09:28<26:05,  2.11s/it]

loss: -11.29 | unlearn_loss: -11.72 | retain_loss: 0.4303 | avg_entropy: 11.72 | param_change: 0.001231


 26%|██▌       | 260/1000 [09:30<26:18,  2.13s/it]

loss: -11.14 | unlearn_loss: -11.76 | retain_loss: 0.625 | avg_entropy: 11.76 | param_change: 0.001475


 26%|██▌       | 261/1000 [09:33<26:30,  2.15s/it]

loss: -11.16 | unlearn_loss: -11.49 | retain_loss: 0.327 | avg_entropy: 11.49 | param_change: 0.0007566


 26%|██▌       | 262/1000 [09:35<28:20,  2.30s/it]

loss: -11.24 | unlearn_loss: -11.95 | retain_loss: 0.708 | avg_entropy: 11.95 | param_change: 0.003201


 26%|██▋       | 263/1000 [09:37<27:44,  2.26s/it]

loss: -11.29 | unlearn_loss: -11.87 | retain_loss: 0.5867 | avg_entropy: 11.87 | param_change: 0.001197


 26%|██▋       | 264/1000 [09:39<27:03,  2.21s/it]

loss: -11.25 | unlearn_loss: -11.68 | retain_loss: 0.4318 | avg_entropy: 11.68 | param_change: 0.001123


 26%|██▋       | 265/1000 [09:42<27:05,  2.21s/it]

loss: -6.947 | unlearn_loss: -11.34 | retain_loss: 4.392 | avg_entropy: 11.34 | param_change: 0.01129


 27%|██▋       | 266/1000 [09:44<27:15,  2.23s/it]

loss: -10.45 | unlearn_loss: -11.95 | retain_loss: 1.503 | avg_entropy: 11.95 | param_change: 0.004625


 27%|██▋       | 267/1000 [09:46<26:55,  2.20s/it]

loss: -9.894 | unlearn_loss: -11.87 | retain_loss: 1.975 | avg_entropy: 11.87 | param_change: 0.007479


 27%|██▋       | 268/1000 [09:48<27:43,  2.27s/it]

loss: -9.704 | unlearn_loss: -11.68 | retain_loss: 1.973 | avg_entropy: 11.68 | param_change: 0.005222


 27%|██▋       | 269/1000 [09:51<27:18,  2.24s/it]

loss: -8.885 | unlearn_loss: -11.48 | retain_loss: 2.598 | avg_entropy: 11.48 | param_change: 0.005935


 27%|██▋       | 270/1000 [09:53<28:07,  2.31s/it]

loss: -10.49 | unlearn_loss: -11.93 | retain_loss: 1.439 | avg_entropy: 11.93 | param_change: 0.002546


 27%|██▋       | 271/1000 [09:55<27:24,  2.26s/it]

loss: -9.834 | unlearn_loss: -11.42 | retain_loss: 1.587 | avg_entropy: 11.42 | param_change: 0.003084


 27%|██▋       | 272/1000 [09:58<28:32,  2.35s/it]

loss: -10.28 | unlearn_loss: -11.95 | retain_loss: 1.676 | avg_entropy: 11.95 | param_change: 0.003221


 27%|██▋       | 273/1000 [10:00<27:58,  2.31s/it]

loss: -10.34 | unlearn_loss: -11.91 | retain_loss: 1.568 | avg_entropy: 11.91 | param_change: 0.002122


 27%|██▋       | 274/1000 [10:02<28:03,  2.32s/it]

loss: -10.61 | unlearn_loss: -11.71 | retain_loss: 1.096 | avg_entropy: 11.71 | param_change: 0.001907


 28%|██▊       | 275/1000 [10:05<28:18,  2.34s/it]

loss: -10.57 | unlearn_loss: -11.87 | retain_loss: 1.304 | avg_entropy: 11.87 | param_change: 0.00218


 28%|██▊       | 276/1000 [10:07<28:00,  2.32s/it]

loss: -11.2 | unlearn_loss: -11.96 | retain_loss: 0.7621 | avg_entropy: 11.96 | param_change: 0.001703


 28%|██▊       | 277/1000 [10:09<27:18,  2.27s/it]

loss: -10.79 | unlearn_loss: -11.85 | retain_loss: 1.053 | avg_entropy: 11.85 | param_change: 0.002141


 28%|██▊       | 278/1000 [10:11<27:18,  2.27s/it]

loss: -11.01 | unlearn_loss: -11.92 | retain_loss: 0.9135 | avg_entropy: 11.92 | param_change: 0.001458


 28%|██▊       | 279/1000 [10:14<26:26,  2.20s/it]

loss: -10.66 | unlearn_loss: -11.46 | retain_loss: 0.8006 | avg_entropy: 11.46 | param_change: 0.001331


 28%|██▊       | 280/1000 [10:16<27:05,  2.26s/it]

loss: -10.59 | unlearn_loss: -11.95 | retain_loss: 1.363 | avg_entropy: 11.95 | param_change: 0.001946


 28%|██▊       | 281/1000 [10:18<26:52,  2.24s/it]

loss: -11.13 | unlearn_loss: -11.94 | retain_loss: 0.8082 | avg_entropy: 11.94 | param_change: 0.001742


 28%|██▊       | 282/1000 [10:20<26:45,  2.24s/it]

loss: -11.01 | unlearn_loss: -11.96 | retain_loss: 0.9437 | avg_entropy: 11.96 | param_change: 0.00188


 28%|██▊       | 283/1000 [10:23<26:58,  2.26s/it]

loss: -9.985 | unlearn_loss: -11.61 | retain_loss: 1.622 | avg_entropy: 11.61 | param_change: 0.004359


 28%|██▊       | 284/1000 [10:25<25:44,  2.16s/it]

loss: -10.43 | unlearn_loss: -11.83 | retain_loss: 1.397 | avg_entropy: 11.83 | param_change: 0.005907


 28%|██▊       | 285/1000 [10:27<25:08,  2.11s/it]

loss: -10.92 | unlearn_loss: -11.96 | retain_loss: 1.045 | avg_entropy: 11.96 | param_change: 0.004327


 29%|██▊       | 286/1000 [10:29<24:37,  2.07s/it]

loss: -10 | unlearn_loss: -11.96 | retain_loss: 1.958 | avg_entropy: 11.96 | param_change: 0.008846


 29%|██▊       | 287/1000 [10:31<24:18,  2.04s/it]

loss: -9.954 | unlearn_loss: -11.78 | retain_loss: 1.827 | avg_entropy: 11.78 | param_change: 0.007576


 29%|██▉       | 288/1000 [10:32<23:57,  2.02s/it]

loss: -6.426 | unlearn_loss: -10.79 | retain_loss: 4.369 | avg_entropy: 10.79 | param_change: 0.01107


 29%|██▉       | 289/1000 [10:34<23:38,  2.00s/it]

loss: -10.19 | unlearn_loss: -11.93 | retain_loss: 1.742 | avg_entropy: 11.93 | param_change: 0.005854


 29%|██▉       | 290/1000 [10:36<23:30,  1.99s/it]

loss: -9.905 | unlearn_loss: -11.69 | retain_loss: 1.786 | avg_entropy: 11.69 | param_change: 0.007509


 29%|██▉       | 291/1000 [10:38<23:33,  1.99s/it]

loss: -7.5 | unlearn_loss: -11.16 | retain_loss: 3.658 | avg_entropy: 11.16 | param_change: 0.009087


 29%|██▉       | 292/1000 [10:40<23:24,  1.98s/it]

loss: -9.662 | unlearn_loss: -11.97 | retain_loss: 2.305 | avg_entropy: 11.97 | param_change: 0.007339


 29%|██▉       | 293/1000 [10:42<23:17,  1.98s/it]

loss: -10.03 | unlearn_loss: -11.88 | retain_loss: 1.851 | avg_entropy: 11.88 | param_change: 0.004474


 29%|██▉       | 294/1000 [10:44<23:18,  1.98s/it]

loss: -7.291 | unlearn_loss: -11.88 | retain_loss: 4.59 | avg_entropy: 11.88 | param_change: 0.007762


 30%|██▉       | 295/1000 [10:46<23:22,  1.99s/it]

loss: -9.828 | unlearn_loss: -11.92 | retain_loss: 2.089 | avg_entropy: 11.92 | param_change: 0.005722


 30%|██▉       | 296/1000 [10:48<23:15,  1.98s/it]

loss: -8.547 | unlearn_loss: -11.92 | retain_loss: 3.378 | avg_entropy: 11.92 | param_change: 0.006209


 30%|██▉       | 297/1000 [10:50<23:06,  1.97s/it]

loss: -6.385 | unlearn_loss: -11.89 | retain_loss: 5.502 | avg_entropy: 11.89 | param_change: 0.0085


 30%|██▉       | 298/1000 [10:52<23:10,  1.98s/it]

loss: -6.946 | unlearn_loss: -11.94 | retain_loss: 4.991 | avg_entropy: 11.94 | param_change: 0.01024


 30%|██▉       | 299/1000 [10:54<22:56,  1.96s/it]

loss: -5.833 | unlearn_loss: -11.89 | retain_loss: 6.059 | avg_entropy: 11.89 | param_change: 0.007861


 30%|███       | 300/1000 [10:56<22:56,  1.97s/it]

loss: -8.917 | unlearn_loss: -11.96 | retain_loss: 3.044 | avg_entropy: 11.96 | param_change: 0.005175


 30%|███       | 301/1000 [10:58<22:53,  1.96s/it]

loss: -9.754 | unlearn_loss: -11.96 | retain_loss: 2.205 | avg_entropy: 11.96 | param_change: 0.004028


 30%|███       | 302/1000 [11:00<22:46,  1.96s/it]

loss: -10.39 | unlearn_loss: -11.96 | retain_loss: 1.567 | avg_entropy: 11.96 | param_change: 0.002875


 30%|███       | 303/1000 [11:02<23:55,  2.06s/it]

loss: -8.459 | unlearn_loss: -11.95 | retain_loss: 3.493 | avg_entropy: 11.95 | param_change: 0.005952


 30%|███       | 304/1000 [11:04<24:09,  2.08s/it]

loss: -7.59 | unlearn_loss: -11.97 | retain_loss: 4.383 | avg_entropy: 11.97 | param_change: 0.004886


 30%|███       | 305/1000 [11:07<24:05,  2.08s/it]

loss: -8.727 | unlearn_loss: -11.44 | retain_loss: 2.711 | avg_entropy: 11.44 | param_change: 0.003024


 31%|███       | 306/1000 [11:09<25:04,  2.17s/it]

loss: -9.876 | unlearn_loss: -11.98 | retain_loss: 2.101 | avg_entropy: 11.98 | param_change: 0.002291


 31%|███       | 307/1000 [11:11<25:54,  2.24s/it]

loss: -10.1 | unlearn_loss: -11.92 | retain_loss: 1.826 | avg_entropy: 11.92 | param_change: 0.003096


 31%|███       | 308/1000 [11:14<25:52,  2.24s/it]

loss: -8.904 | unlearn_loss: -11.47 | retain_loss: 2.563 | avg_entropy: 11.47 | param_change: 0.003776


 31%|███       | 309/1000 [11:16<25:14,  2.19s/it]

loss: -9.947 | unlearn_loss: -11.96 | retain_loss: 2.012 | avg_entropy: 11.96 | param_change: 0.002495


 31%|███       | 310/1000 [11:18<26:44,  2.32s/it]

loss: -9.545 | unlearn_loss: -11.98 | retain_loss: 2.435 | avg_entropy: 11.98 | param_change: 0.004063


 31%|███       | 311/1000 [11:20<25:35,  2.23s/it]

loss: -10.38 | unlearn_loss: -11.96 | retain_loss: 1.586 | avg_entropy: 11.96 | param_change: 0.001828


 31%|███       | 312/1000 [11:23<26:40,  2.33s/it]

loss: -10.26 | unlearn_loss: -11.54 | retain_loss: 1.282 | avg_entropy: 11.54 | param_change: 0.001842


 31%|███▏      | 313/1000 [11:25<25:49,  2.26s/it]

loss: -9.448 | unlearn_loss: -11.35 | retain_loss: 1.904 | avg_entropy: 11.35 | param_change: 0.002969


 31%|███▏      | 314/1000 [11:27<26:09,  2.29s/it]

loss: -10.71 | unlearn_loss: -11.96 | retain_loss: 1.25 | avg_entropy: 11.96 | param_change: 0.001849


 32%|███▏      | 315/1000 [11:29<25:20,  2.22s/it]

loss: -9.62 | unlearn_loss: -11.56 | retain_loss: 1.937 | avg_entropy: 11.56 | param_change: 0.00226


 32%|███▏      | 316/1000 [11:31<24:58,  2.19s/it]

loss: -10.43 | unlearn_loss: -11.97 | retain_loss: 1.538 | avg_entropy: 11.97 | param_change: 0.001675


 32%|███▏      | 317/1000 [11:34<25:32,  2.24s/it]

loss: -10.91 | unlearn_loss: -11.98 | retain_loss: 1.062 | avg_entropy: 11.98 | param_change: 0.001633


 32%|███▏      | 318/1000 [11:36<25:23,  2.23s/it]

loss: -10.13 | unlearn_loss: -11.97 | retain_loss: 1.836 | avg_entropy: 11.97 | param_change: 0.002992


 32%|███▏      | 319/1000 [11:38<25:07,  2.21s/it]

loss: -10.73 | unlearn_loss: -11.69 | retain_loss: 0.9608 | avg_entropy: 11.69 | param_change: 0.00158


 32%|███▏      | 320/1000 [11:40<25:20,  2.24s/it]

loss: -10.77 | unlearn_loss: -11.69 | retain_loss: 0.9209 | avg_entropy: 11.69 | param_change: 0.001695


 32%|███▏      | 321/1000 [11:43<25:29,  2.25s/it]

loss: -11.07 | unlearn_loss: -11.95 | retain_loss: 0.8772 | avg_entropy: 11.95 | param_change: 0.001208


 32%|███▏      | 322/1000 [11:45<25:08,  2.23s/it]

loss: -10.83 | unlearn_loss: -11.71 | retain_loss: 0.8844 | avg_entropy: 11.71 | param_change: 0.001227


 32%|███▏      | 323/1000 [11:47<24:32,  2.18s/it]

loss: -11.42 | unlearn_loss: -11.98 | retain_loss: 0.5578 | avg_entropy: 11.98 | param_change: 0.0009693


 32%|███▏      | 324/1000 [11:49<24:51,  2.21s/it]

loss: -10.81 | unlearn_loss: -11.46 | retain_loss: 0.6545 | avg_entropy: 11.46 | param_change: 0.001577


 32%|███▎      | 325/1000 [11:51<24:29,  2.18s/it]

loss: -11.34 | unlearn_loss: -11.99 | retain_loss: 0.6474 | avg_entropy: 11.99 | param_change: 0.001355


 33%|███▎      | 326/1000 [11:54<24:32,  2.18s/it]

loss: -10.57 | unlearn_loss: -11.58 | retain_loss: 1.009 | avg_entropy: 11.58 | param_change: 0.001885


 33%|███▎      | 327/1000 [11:56<25:02,  2.23s/it]

loss: -10.76 | unlearn_loss: -11.94 | retain_loss: 1.188 | avg_entropy: 11.94 | param_change: 0.002407


 33%|███▎      | 328/1000 [11:58<25:01,  2.23s/it]

loss: -11.3 | unlearn_loss: -11.96 | retain_loss: 0.6681 | avg_entropy: 11.96 | param_change: 0.001161


 33%|███▎      | 329/1000 [12:01<25:32,  2.28s/it]

loss: -10.99 | unlearn_loss: -11.82 | retain_loss: 0.8231 | avg_entropy: 11.82 | param_change: 0.001975


 33%|███▎      | 330/1000 [12:03<25:13,  2.26s/it]

loss: -11.36 | unlearn_loss: -11.99 | retain_loss: 0.6221 | avg_entropy: 11.99 | param_change: 0.00139


 33%|███▎      | 331/1000 [12:05<25:46,  2.31s/it]

loss: -10.73 | unlearn_loss: -11.96 | retain_loss: 1.227 | avg_entropy: 11.96 | param_change: 0.003866


 33%|███▎      | 332/1000 [12:08<26:03,  2.34s/it]

loss: -10.99 | unlearn_loss: -11.84 | retain_loss: 0.8539 | avg_entropy: 11.84 | param_change: 0.002181


 33%|███▎      | 333/1000 [12:10<24:56,  2.24s/it]

loss: -10.9 | unlearn_loss: -11.77 | retain_loss: 0.8659 | avg_entropy: 11.77 | param_change: 0.002174


 33%|███▎      | 334/1000 [12:12<23:58,  2.16s/it]

loss: -10.27 | unlearn_loss: -11.97 | retain_loss: 1.696 | avg_entropy: 11.97 | param_change: 0.005444


 34%|███▎      | 335/1000 [12:14<23:23,  2.11s/it]

loss: -11.11 | unlearn_loss: -11.97 | retain_loss: 0.8622 | avg_entropy: 11.97 | param_change: 0.001782


 34%|███▎      | 336/1000 [12:16<23:57,  2.16s/it]

loss: -10.09 | unlearn_loss: -11.25 | retain_loss: 1.153 | avg_entropy: 11.25 | param_change: 0.00373


 34%|███▎      | 337/1000 [12:18<23:21,  2.11s/it]

loss: -11.25 | unlearn_loss: -11.88 | retain_loss: 0.6271 | avg_entropy: 11.88 | param_change: 0.001364


 34%|███▍      | 338/1000 [12:20<24:51,  2.25s/it]

loss: -11.02 | unlearn_loss: -11.7 | retain_loss: 0.6768 | avg_entropy: 11.7 | param_change: 0.001531


 34%|███▍      | 339/1000 [12:23<25:02,  2.27s/it]

loss: -11.18 | unlearn_loss: -11.95 | retain_loss: 0.7664 | avg_entropy: 11.95 | param_change: 0.001475


 34%|███▍      | 340/1000 [12:25<24:22,  2.22s/it]

loss: -11.26 | unlearn_loss: -11.91 | retain_loss: 0.6518 | avg_entropy: 11.91 | param_change: 0.001524


 34%|███▍      | 341/1000 [12:27<23:47,  2.17s/it]

loss: -10.79 | unlearn_loss: -11.77 | retain_loss: 0.9792 | avg_entropy: 11.77 | param_change: 0.002605


 34%|███▍      | 342/1000 [12:29<23:29,  2.14s/it]

loss: -9.84 | unlearn_loss: -11.97 | retain_loss: 2.131 | avg_entropy: 11.97 | param_change: 0.003803


 34%|███▍      | 343/1000 [12:31<22:55,  2.09s/it]

loss: -10.46 | unlearn_loss: -11.67 | retain_loss: 1.209 | avg_entropy: 11.67 | param_change: 0.002738


 34%|███▍      | 344/1000 [12:33<22:45,  2.08s/it]

loss: -11.32 | unlearn_loss: -11.94 | retain_loss: 0.6163 | avg_entropy: 11.94 | param_change: 0.001188


 34%|███▍      | 345/1000 [12:35<23:27,  2.15s/it]

loss: -11.23 | unlearn_loss: -11.99 | retain_loss: 0.7539 | avg_entropy: 11.99 | param_change: 0.002067


 35%|███▍      | 346/1000 [12:38<24:14,  2.22s/it]

loss: -11.09 | unlearn_loss: -11.74 | retain_loss: 0.6424 | avg_entropy: 11.74 | param_change: 0.001635


 35%|███▍      | 347/1000 [12:40<24:45,  2.27s/it]

loss: -11.2 | unlearn_loss: -11.79 | retain_loss: 0.5943 | avg_entropy: 11.79 | param_change: 0.001298


 35%|███▍      | 348/1000 [12:42<24:10,  2.22s/it]

loss: -11.05 | unlearn_loss: -11.94 | retain_loss: 0.8944 | avg_entropy: 11.94 | param_change: 0.00245


 35%|███▍      | 349/1000 [12:44<23:46,  2.19s/it]

loss: -11.43 | unlearn_loss: -11.96 | retain_loss: 0.5292 | avg_entropy: 11.96 | param_change: 0.001406


 35%|███▌      | 350/1000 [12:46<23:21,  2.16s/it]

loss: -11.16 | unlearn_loss: -11.98 | retain_loss: 0.8199 | avg_entropy: 11.98 | param_change: 0.001661


 35%|███▌      | 351/1000 [12:49<23:22,  2.16s/it]

loss: -11.41 | unlearn_loss: -12 | retain_loss: 0.5925 | avg_entropy: 12 | param_change: 0.001354


 35%|███▌      | 352/1000 [12:51<23:41,  2.19s/it]

loss: -11.12 | unlearn_loss: -11.69 | retain_loss: 0.561 | avg_entropy: 11.69 | param_change: 0.001498


 35%|███▌      | 353/1000 [12:53<23:28,  2.18s/it]

loss: -10.81 | unlearn_loss: -11.58 | retain_loss: 0.7703 | avg_entropy: 11.58 | param_change: 0.002082


 35%|███▌      | 354/1000 [12:55<22:58,  2.13s/it]

loss: -11.44 | unlearn_loss: -11.96 | retain_loss: 0.5201 | avg_entropy: 11.96 | param_change: 0.0013


 36%|███▌      | 355/1000 [12:57<22:51,  2.13s/it]

loss: -11.02 | unlearn_loss: -11.9 | retain_loss: 0.8838 | avg_entropy: 11.9 | param_change: 0.002473


 36%|███▌      | 356/1000 [12:59<22:42,  2.12s/it]

loss: -10.96 | unlearn_loss: -11.75 | retain_loss: 0.799 | avg_entropy: 11.75 | param_change: 0.002991


 36%|███▌      | 357/1000 [13:01<22:27,  2.10s/it]

loss: -10.44 | unlearn_loss: -11.74 | retain_loss: 1.292 | avg_entropy: 11.74 | param_change: 0.004718


 36%|███▌      | 358/1000 [13:03<22:11,  2.07s/it]

loss: -10.37 | unlearn_loss: -11.71 | retain_loss: 1.336 | avg_entropy: 11.71 | param_change: 0.005062


 36%|███▌      | 359/1000 [13:05<21:57,  2.05s/it]

loss: -10.17 | unlearn_loss: -11.79 | retain_loss: 1.624 | avg_entropy: 11.79 | param_change: 0.003499


 36%|███▌      | 360/1000 [13:07<21:37,  2.03s/it]

loss: -9.731 | unlearn_loss: -11.57 | retain_loss: 1.838 | avg_entropy: 11.57 | param_change: 0.004119


 36%|███▌      | 361/1000 [13:09<21:25,  2.01s/it]

loss: -10.83 | unlearn_loss: -11.97 | retain_loss: 1.144 | avg_entropy: 11.97 | param_change: 0.00301


 36%|███▌      | 362/1000 [13:11<21:23,  2.01s/it]

loss: -10.7 | unlearn_loss: -11.98 | retain_loss: 1.284 | avg_entropy: 11.98 | param_change: 0.002632


 36%|███▋      | 363/1000 [13:14<22:31,  2.12s/it]

loss: -11.09 | unlearn_loss: -11.97 | retain_loss: 0.8749 | avg_entropy: 11.97 | param_change: 0.001913


 36%|███▋      | 364/1000 [13:16<23:05,  2.18s/it]

loss: -10.42 | unlearn_loss: -11.82 | retain_loss: 1.404 | avg_entropy: 11.82 | param_change: 0.003242


 36%|███▋      | 365/1000 [13:18<23:02,  2.18s/it]

loss: -10.67 | unlearn_loss: -11.76 | retain_loss: 1.093 | avg_entropy: 11.76 | param_change: 0.002725


 37%|███▋      | 366/1000 [13:20<23:12,  2.20s/it]

loss: -7.509 | unlearn_loss: -11.97 | retain_loss: 4.462 | avg_entropy: 11.97 | param_change: 0.008219


 37%|███▋      | 367/1000 [13:22<22:31,  2.14s/it]

loss: -10.62 | unlearn_loss: -12 | retain_loss: 1.389 | avg_entropy: 12 | param_change: 0.003678


 37%|███▋      | 368/1000 [13:24<22:24,  2.13s/it]

loss: -11.09 | unlearn_loss: -11.97 | retain_loss: 0.8762 | avg_entropy: 11.97 | param_change: 0.002083


 37%|███▋      | 369/1000 [13:27<22:21,  2.13s/it]

loss: -8.628 | unlearn_loss: -12.01 | retain_loss: 3.383 | avg_entropy: 12.01 | param_change: 0.006905


 37%|███▋      | 370/1000 [13:29<21:55,  2.09s/it]

loss: -10.81 | unlearn_loss: -12 | retain_loss: 1.19 | avg_entropy: 12 | param_change: 0.00282


 37%|███▋      | 371/1000 [13:31<22:11,  2.12s/it]

loss: -11.03 | unlearn_loss: -12 | retain_loss: 0.9684 | avg_entropy: 12 | param_change: 0.001704


 37%|███▋      | 372/1000 [13:33<22:38,  2.16s/it]

loss: -10.86 | unlearn_loss: -12.01 | retain_loss: 1.154 | avg_entropy: 12.01 | param_change: 0.002901


 37%|███▋      | 373/1000 [13:35<22:45,  2.18s/it]

loss: -11.4 | unlearn_loss: -12.01 | retain_loss: 0.6197 | avg_entropy: 12.01 | param_change: 0.001376


 37%|███▋      | 374/1000 [13:38<24:02,  2.30s/it]

loss: -10.84 | unlearn_loss: -11.78 | retain_loss: 0.9433 | avg_entropy: 11.78 | param_change: 0.002107


 38%|███▊      | 375/1000 [13:40<24:48,  2.38s/it]

loss: -11.14 | unlearn_loss: -12.01 | retain_loss: 0.8778 | avg_entropy: 12.01 | param_change: 0.001294


 38%|███▊      | 376/1000 [13:43<24:12,  2.33s/it]

loss: -11.28 | unlearn_loss: -11.99 | retain_loss: 0.7093 | avg_entropy: 11.99 | param_change: 0.001558


 38%|███▊      | 377/1000 [13:45<24:38,  2.37s/it]

loss: -10.52 | unlearn_loss: -11.73 | retain_loss: 1.211 | avg_entropy: 11.73 | param_change: 0.00232


 38%|███▊      | 378/1000 [13:47<23:56,  2.31s/it]

loss: -11.16 | unlearn_loss: -11.98 | retain_loss: 0.8242 | avg_entropy: 11.98 | param_change: 0.001864


 38%|███▊      | 379/1000 [13:50<24:19,  2.35s/it]

loss: -10.7 | unlearn_loss: -11.53 | retain_loss: 0.8382 | avg_entropy: 11.53 | param_change: 0.001938


 38%|███▊      | 380/1000 [13:52<23:58,  2.32s/it]

loss: -11.09 | unlearn_loss: -12.02 | retain_loss: 0.9302 | avg_entropy: 12.02 | param_change: 0.002367


 38%|███▊      | 381/1000 [13:54<23:15,  2.25s/it]

loss: -11.02 | unlearn_loss: -11.89 | retain_loss: 0.8736 | avg_entropy: 11.89 | param_change: 0.001991


 38%|███▊      | 382/1000 [13:56<23:26,  2.28s/it]

loss: -11.04 | unlearn_loss: -11.86 | retain_loss: 0.8177 | avg_entropy: 11.86 | param_change: 0.002204


 38%|███▊      | 383/1000 [13:59<23:32,  2.29s/it]

loss: -10.85 | unlearn_loss: -11.71 | retain_loss: 0.8626 | avg_entropy: 11.71 | param_change: 0.00229


 38%|███▊      | 384/1000 [14:01<23:31,  2.29s/it]

loss: -11.03 | unlearn_loss: -11.77 | retain_loss: 0.7403 | avg_entropy: 11.77 | param_change: 0.001788


 38%|███▊      | 385/1000 [14:04<24:38,  2.40s/it]

loss: -10.98 | unlearn_loss: -11.93 | retain_loss: 0.9464 | avg_entropy: 11.93 | param_change: 0.002464


 39%|███▊      | 386/1000 [14:06<24:18,  2.38s/it]

loss: -11.46 | unlearn_loss: -11.99 | retain_loss: 0.5335 | avg_entropy: 11.99 | param_change: 0.000891


 39%|███▊      | 387/1000 [14:08<24:30,  2.40s/it]

loss: -10.97 | unlearn_loss: -11.78 | retain_loss: 0.8087 | avg_entropy: 11.78 | param_change: 0.001828


 39%|███▉      | 388/1000 [14:11<23:51,  2.34s/it]

loss: -11.32 | unlearn_loss: -12 | retain_loss: 0.6763 | avg_entropy: 12 | param_change: 0.001469


 39%|███▉      | 389/1000 [14:13<23:38,  2.32s/it]

loss: -11.26 | unlearn_loss: -11.98 | retain_loss: 0.7241 | avg_entropy: 11.98 | param_change: 0.003644


 39%|███▉      | 390/1000 [14:15<22:59,  2.26s/it]

loss: -4.382 | unlearn_loss: -11.96 | retain_loss: 7.583 | avg_entropy: 11.96 | param_change: 0.009524


 39%|███▉      | 391/1000 [14:17<22:08,  2.18s/it]

loss: -9.894 | unlearn_loss: -12.01 | retain_loss: 2.114 | avg_entropy: 12.01 | param_change: 0.003171


 39%|███▉      | 392/1000 [14:19<21:39,  2.14s/it]

loss: -8.93 | unlearn_loss: -11.48 | retain_loss: 2.553 | avg_entropy: 11.48 | param_change: 0.005499


 39%|███▉      | 393/1000 [14:21<21:04,  2.08s/it]

loss: -8.668 | unlearn_loss: -10.91 | retain_loss: 2.242 | avg_entropy: 10.91 | param_change: 0.006658


 39%|███▉      | 394/1000 [14:23<20:48,  2.06s/it]

loss: -10.55 | unlearn_loss: -11.99 | retain_loss: 1.442 | avg_entropy: 11.99 | param_change: 0.002943


 40%|███▉      | 395/1000 [14:25<20:44,  2.06s/it]

loss: -11.01 | unlearn_loss: -12 | retain_loss: 0.9927 | avg_entropy: 12 | param_change: 0.001908


 40%|███▉      | 396/1000 [14:28<22:16,  2.21s/it]

loss: -9.835 | unlearn_loss: -11.36 | retain_loss: 1.524 | avg_entropy: 11.36 | param_change: 0.003473


 40%|███▉      | 397/1000 [14:30<21:31,  2.14s/it]

loss: -9.618 | unlearn_loss: -11.54 | retain_loss: 1.926 | avg_entropy: 11.54 | param_change: 0.003512


 40%|███▉      | 398/1000 [14:32<21:27,  2.14s/it]

loss: -11.1 | unlearn_loss: -12.01 | retain_loss: 0.9071 | avg_entropy: 12.01 | param_change: 0.001589


 40%|███▉      | 399/1000 [14:34<21:47,  2.17s/it]

loss: -10.22 | unlearn_loss: -11.75 | retain_loss: 1.53 | avg_entropy: 11.75 | param_change: 0.002548


 40%|████      | 400/1000 [14:36<21:28,  2.15s/it]

loss: -9.315 | unlearn_loss: -11.69 | retain_loss: 2.378 | avg_entropy: 11.69 | param_change: 0.004447


 40%|████      | 401/1000 [14:38<21:11,  2.12s/it]

loss: -11.22 | unlearn_loss: -12.01 | retain_loss: 0.7936 | avg_entropy: 12.01 | param_change: 0.001915


 40%|████      | 402/1000 [14:40<21:38,  2.17s/it]

loss: -10.8 | unlearn_loss: -11.62 | retain_loss: 0.8192 | avg_entropy: 11.62 | param_change: 0.001794


 40%|████      | 403/1000 [14:43<21:52,  2.20s/it]

loss: -10.57 | unlearn_loss: -11.74 | retain_loss: 1.175 | avg_entropy: 11.74 | param_change: 0.002192


 40%|████      | 404/1000 [14:45<21:53,  2.20s/it]

loss: -10.77 | unlearn_loss: -11.64 | retain_loss: 0.8691 | avg_entropy: 11.64 | param_change: 0.001708


 40%|████      | 405/1000 [14:47<21:23,  2.16s/it]

loss: -11.14 | unlearn_loss: -12 | retain_loss: 0.8626 | avg_entropy: 12 | param_change: 0.001607


 41%|████      | 406/1000 [14:49<21:03,  2.13s/it]

loss: -10.86 | unlearn_loss: -11.76 | retain_loss: 0.905 | avg_entropy: 11.76 | param_change: 0.00139


 41%|████      | 407/1000 [14:51<21:36,  2.19s/it]

loss: -11 | unlearn_loss: -11.75 | retain_loss: 0.7524 | avg_entropy: 11.75 | param_change: 0.001015


 41%|████      | 408/1000 [14:54<22:03,  2.24s/it]

loss: -11.24 | unlearn_loss: -12.02 | retain_loss: 0.7801 | avg_entropy: 12.02 | param_change: 0.001592


 41%|████      | 409/1000 [14:56<22:13,  2.26s/it]

loss: -11.23 | unlearn_loss: -12 | retain_loss: 0.7742 | avg_entropy: 12 | param_change: 0.001138


 41%|████      | 410/1000 [14:58<22:25,  2.28s/it]

loss: -10.95 | unlearn_loss: -11.79 | retain_loss: 0.841 | avg_entropy: 11.79 | param_change: 0.001733


 41%|████      | 411/1000 [15:01<22:04,  2.25s/it]

loss: -11.42 | unlearn_loss: -12.02 | retain_loss: 0.5992 | avg_entropy: 12.02 | param_change: 0.000925


 41%|████      | 412/1000 [15:03<22:16,  2.27s/it]

loss: -11.04 | unlearn_loss: -12.01 | retain_loss: 0.9662 | avg_entropy: 12.01 | param_change: 0.001494


 41%|████▏     | 413/1000 [15:05<22:31,  2.30s/it]

loss: -11.27 | unlearn_loss: -12.02 | retain_loss: 0.7511 | avg_entropy: 12.02 | param_change: 0.001013


 41%|████▏     | 414/1000 [15:07<22:14,  2.28s/it]

loss: -11.08 | unlearn_loss: -12.02 | retain_loss: 0.9474 | avg_entropy: 12.02 | param_change: 0.001442


 42%|████▏     | 415/1000 [15:10<22:22,  2.29s/it]

loss: -11.3 | unlearn_loss: -12.02 | retain_loss: 0.7141 | avg_entropy: 12.02 | param_change: 0.00115


 42%|████▏     | 416/1000 [15:12<22:27,  2.31s/it]

loss: -11.35 | unlearn_loss: -12.02 | retain_loss: 0.6651 | avg_entropy: 12.02 | param_change: 0.0008303


 42%|████▏     | 417/1000 [15:14<22:16,  2.29s/it]

loss: -11.18 | unlearn_loss: -12.03 | retain_loss: 0.8429 | avg_entropy: 12.03 | param_change: 0.00131


 42%|████▏     | 418/1000 [15:17<22:22,  2.31s/it]

loss: -11.09 | unlearn_loss: -12.03 | retain_loss: 0.9396 | avg_entropy: 12.03 | param_change: 0.001316


 42%|████▏     | 419/1000 [15:19<21:39,  2.24s/it]

loss: -11.38 | unlearn_loss: -11.87 | retain_loss: 0.491 | avg_entropy: 11.87 | param_change: 0.000975


 42%|████▏     | 420/1000 [15:21<21:09,  2.19s/it]

loss: -11.4 | unlearn_loss: -11.98 | retain_loss: 0.5821 | avg_entropy: 11.98 | param_change: 0.00116


 42%|████▏     | 421/1000 [15:23<20:51,  2.16s/it]

loss: -11.17 | unlearn_loss: -11.89 | retain_loss: 0.7155 | avg_entropy: 11.89 | param_change: 0.001219


 42%|████▏     | 422/1000 [15:25<20:23,  2.12s/it]

loss: -11.02 | unlearn_loss: -11.79 | retain_loss: 0.773 | avg_entropy: 11.79 | param_change: 0.001297


 42%|████▏     | 423/1000 [15:27<20:05,  2.09s/it]

loss: -11.3 | unlearn_loss: -12.03 | retain_loss: 0.7306 | avg_entropy: 12.03 | param_change: 0.00135


 42%|████▏     | 424/1000 [15:30<22:04,  2.30s/it]

loss: -11.26 | unlearn_loss: -12 | retain_loss: 0.7389 | avg_entropy: 12 | param_change: 0.001005


 42%|████▎     | 425/1000 [15:32<22:13,  2.32s/it]

loss: -11.01 | unlearn_loss: -11.8 | retain_loss: 0.7898 | avg_entropy: 11.8 | param_change: 0.001019


 43%|████▎     | 426/1000 [15:34<21:43,  2.27s/it]

loss: -11.06 | unlearn_loss: -11.82 | retain_loss: 0.7598 | avg_entropy: 11.82 | param_change: 0.001403


 43%|████▎     | 427/1000 [15:37<21:24,  2.24s/it]

loss: -10.28 | unlearn_loss: -11.78 | retain_loss: 1.499 | avg_entropy: 11.78 | param_change: 0.004531


 43%|████▎     | 428/1000 [15:39<20:41,  2.17s/it]

loss: -11.38 | unlearn_loss: -11.89 | retain_loss: 0.516 | avg_entropy: 11.89 | param_change: 0.001166


 43%|████▎     | 429/1000 [15:41<20:31,  2.16s/it]

loss: -11.18 | unlearn_loss: -12.02 | retain_loss: 0.836 | avg_entropy: 12.02 | param_change: 0.002461


 43%|████▎     | 430/1000 [15:43<20:15,  2.13s/it]

loss: -11.24 | unlearn_loss: -11.73 | retain_loss: 0.4885 | avg_entropy: 11.73 | param_change: 0.001048


 43%|████▎     | 431/1000 [15:45<20:25,  2.15s/it]

loss: -10.97 | unlearn_loss: -11.44 | retain_loss: 0.4691 | avg_entropy: 11.44 | param_change: 0.0009005


 43%|████▎     | 432/1000 [15:47<20:44,  2.19s/it]

loss: -11.56 | unlearn_loss: -12.03 | retain_loss: 0.4699 | avg_entropy: 12.03 | param_change: 0.0008529


 43%|████▎     | 433/1000 [15:50<21:34,  2.28s/it]

loss: -11.43 | unlearn_loss: -12.03 | retain_loss: 0.5999 | avg_entropy: 12.03 | param_change: 0.0008256


 43%|████▎     | 434/1000 [15:52<22:14,  2.36s/it]

loss: -11.47 | unlearn_loss: -12.04 | retain_loss: 0.5657 | avg_entropy: 12.04 | param_change: 0.0008465


 44%|████▎     | 435/1000 [15:54<21:25,  2.28s/it]

loss: -11.32 | unlearn_loss: -12.03 | retain_loss: 0.718 | avg_entropy: 12.03 | param_change: 0.001214


 44%|████▎     | 436/1000 [15:56<21:09,  2.25s/it]

loss: -11.35 | unlearn_loss: -11.91 | retain_loss: 0.5636 | avg_entropy: 11.91 | param_change: 0.001173


 44%|████▎     | 437/1000 [15:58<20:23,  2.17s/it]

loss: -11.24 | unlearn_loss: -11.92 | retain_loss: 0.6804 | avg_entropy: 11.92 | param_change: 0.001334


 44%|████▍     | 438/1000 [16:01<20:34,  2.20s/it]

loss: -11.36 | unlearn_loss: -11.93 | retain_loss: 0.5638 | avg_entropy: 11.93 | param_change: 0.00115


 44%|████▍     | 439/1000 [16:03<20:05,  2.15s/it]

loss: -11.31 | unlearn_loss: -11.93 | retain_loss: 0.6259 | avg_entropy: 11.93 | param_change: 0.001432


 44%|████▍     | 440/1000 [16:05<20:03,  2.15s/it]

loss: -11.3 | unlearn_loss: -11.97 | retain_loss: 0.6746 | avg_entropy: 11.97 | param_change: 0.001126


 44%|████▍     | 441/1000 [16:07<20:20,  2.18s/it]

loss: -11.43 | unlearn_loss: -12.02 | retain_loss: 0.5898 | avg_entropy: 12.02 | param_change: 0.001062


 44%|████▍     | 442/1000 [16:10<21:05,  2.27s/it]

loss: -11.29 | unlearn_loss: -12 | retain_loss: 0.7099 | avg_entropy: 12 | param_change: 0.001199


 44%|████▍     | 443/1000 [16:12<21:10,  2.28s/it]

loss: -11.09 | unlearn_loss: -12.03 | retain_loss: 0.9363 | avg_entropy: 12.03 | param_change: 0.002772


 44%|████▍     | 444/1000 [16:14<20:46,  2.24s/it]

loss: -11.41 | unlearn_loss: -12.03 | retain_loss: 0.6192 | avg_entropy: 12.03 | param_change: 0.001154


 44%|████▍     | 445/1000 [16:16<20:59,  2.27s/it]

loss: -11.46 | unlearn_loss: -12.03 | retain_loss: 0.5709 | avg_entropy: 12.03 | param_change: 0.001079


 45%|████▍     | 446/1000 [16:19<20:40,  2.24s/it]

loss: -11.05 | unlearn_loss: -11.74 | retain_loss: 0.6894 | avg_entropy: 11.74 | param_change: 0.001262


 45%|████▍     | 447/1000 [16:21<21:08,  2.29s/it]

loss: -11.1 | unlearn_loss: -11.98 | retain_loss: 0.8857 | avg_entropy: 11.98 | param_change: 0.001281


 45%|████▍     | 448/1000 [16:24<21:49,  2.37s/it]

loss: -11.35 | unlearn_loss: -12.01 | retain_loss: 0.6618 | avg_entropy: 12.01 | param_change: 0.001403


 45%|████▍     | 449/1000 [16:26<20:53,  2.28s/it]

loss: -11.11 | unlearn_loss: -11.64 | retain_loss: 0.5328 | avg_entropy: 11.64 | param_change: 0.001312


 45%|████▌     | 450/1000 [16:28<22:14,  2.43s/it]

loss: -11.31 | unlearn_loss: -11.74 | retain_loss: 0.4292 | avg_entropy: 11.74 | param_change: 0.001236


 45%|████▌     | 451/1000 [16:30<21:06,  2.31s/it]

loss: -10.99 | unlearn_loss: -11.99 | retain_loss: 0.9959 | avg_entropy: 11.99 | param_change: 0.002341


 45%|████▌     | 452/1000 [16:32<20:20,  2.23s/it]

loss: -11.2 | unlearn_loss: -12.02 | retain_loss: 0.8157 | avg_entropy: 12.02 | param_change: 0.002041


 45%|████▌     | 453/1000 [16:35<19:52,  2.18s/it]

loss: -11.26 | unlearn_loss: -11.95 | retain_loss: 0.6953 | avg_entropy: 11.95 | param_change: 0.001525


 45%|████▌     | 454/1000 [16:37<19:34,  2.15s/it]

loss: -11.3 | unlearn_loss: -12 | retain_loss: 0.6962 | avg_entropy: 12 | param_change: 0.001524


 46%|████▌     | 455/1000 [16:39<20:01,  2.20s/it]

loss: -11.49 | unlearn_loss: -12.01 | retain_loss: 0.5121 | avg_entropy: 12.01 | param_change: 0.001169


 46%|████▌     | 456/1000 [16:41<20:32,  2.27s/it]

loss: -11.22 | unlearn_loss: -11.8 | retain_loss: 0.5853 | avg_entropy: 11.8 | param_change: 0.001409


 46%|████▌     | 457/1000 [16:44<20:06,  2.22s/it]

loss: -11.55 | unlearn_loss: -12.02 | retain_loss: 0.4648 | avg_entropy: 12.02 | param_change: 0.0009524


 46%|████▌     | 458/1000 [16:46<20:19,  2.25s/it]

loss: -11.43 | unlearn_loss: -11.96 | retain_loss: 0.5324 | avg_entropy: 11.96 | param_change: 0.001413


 46%|████▌     | 459/1000 [16:48<20:14,  2.24s/it]

loss: -11.32 | unlearn_loss: -12.04 | retain_loss: 0.7265 | avg_entropy: 12.04 | param_change: 0.001921


 46%|████▌     | 460/1000 [16:50<20:05,  2.23s/it]

loss: -11.4 | unlearn_loss: -12.04 | retain_loss: 0.64 | avg_entropy: 12.04 | param_change: 0.001268


 46%|████▌     | 461/1000 [16:52<19:47,  2.20s/it]

loss: -11.28 | unlearn_loss: -12.03 | retain_loss: 0.7541 | avg_entropy: 12.03 | param_change: 0.002146


 46%|████▌     | 462/1000 [16:54<19:21,  2.16s/it]

loss: -10.85 | unlearn_loss: -11.4 | retain_loss: 0.5487 | avg_entropy: 11.4 | param_change: 0.001198


 46%|████▋     | 463/1000 [16:57<19:42,  2.20s/it]

loss: -9.084 | unlearn_loss: -12.01 | retain_loss: 2.93 | avg_entropy: 12.01 | param_change: 0.004447


 46%|████▋     | 464/1000 [16:59<19:07,  2.14s/it]

loss: -11.55 | unlearn_loss: -12.04 | retain_loss: 0.4926 | avg_entropy: 12.04 | param_change: 0.001477


 46%|████▋     | 465/1000 [17:01<19:10,  2.15s/it]

loss: -11.14 | unlearn_loss: -12.03 | retain_loss: 0.8924 | avg_entropy: 12.03 | param_change: 0.002411


 47%|████▋     | 466/1000 [17:03<18:54,  2.12s/it]

loss: -9.69 | unlearn_loss: -12.02 | retain_loss: 2.329 | avg_entropy: 12.02 | param_change: 0.004292


 47%|████▋     | 467/1000 [17:05<18:29,  2.08s/it]

loss: -10.36 | unlearn_loss: -11.57 | retain_loss: 1.208 | avg_entropy: 11.57 | param_change: 0.004395


 47%|████▋     | 468/1000 [17:07<18:46,  2.12s/it]

loss: -11.52 | unlearn_loss: -12.04 | retain_loss: 0.5162 | avg_entropy: 12.04 | param_change: 0.001528


 47%|████▋     | 469/1000 [17:09<18:43,  2.12s/it]

loss: -11.39 | unlearn_loss: -11.81 | retain_loss: 0.4164 | avg_entropy: 11.81 | param_change: 0.0008927


 47%|████▋     | 470/1000 [17:12<18:59,  2.15s/it]

loss: -11 | unlearn_loss: -11.93 | retain_loss: 0.9295 | avg_entropy: 11.93 | param_change: 0.002452


 47%|████▋     | 471/1000 [17:13<18:25,  2.09s/it]

loss: -5.347 | unlearn_loss: -12.01 | retain_loss: 6.664 | avg_entropy: 12.01 | param_change: 0.01038


 47%|████▋     | 472/1000 [17:15<18:05,  2.06s/it]

loss: -10.85 | unlearn_loss: -11.6 | retain_loss: 0.7471 | avg_entropy: 11.6 | param_change: 0.001736


 47%|████▋     | 473/1000 [17:17<18:02,  2.05s/it]

loss: -10.21 | unlearn_loss: -11.89 | retain_loss: 1.689 | avg_entropy: 11.89 | param_change: 0.00332


 47%|████▋     | 474/1000 [17:20<18:05,  2.06s/it]

loss: -10.45 | unlearn_loss: -12 | retain_loss: 1.548 | avg_entropy: 12 | param_change: 0.002902


 48%|████▊     | 475/1000 [17:22<18:21,  2.10s/it]

loss: -11.4 | unlearn_loss: -12.01 | retain_loss: 0.6168 | avg_entropy: 12.01 | param_change: 0.001015


 48%|████▊     | 476/1000 [17:24<18:40,  2.14s/it]

loss: -11.41 | unlearn_loss: -12.02 | retain_loss: 0.6119 | avg_entropy: 12.02 | param_change: 0.001239


 48%|████▊     | 477/1000 [17:26<19:27,  2.23s/it]

loss: -10.57 | unlearn_loss: -11.77 | retain_loss: 1.197 | avg_entropy: 11.77 | param_change: 0.00237


 48%|████▊     | 478/1000 [17:29<19:33,  2.25s/it]

loss: -11.28 | unlearn_loss: -11.94 | retain_loss: 0.6665 | avg_entropy: 11.94 | param_change: 0.0009477


 48%|████▊     | 479/1000 [17:31<19:43,  2.27s/it]

loss: -11.29 | unlearn_loss: -12.01 | retain_loss: 0.7239 | avg_entropy: 12.01 | param_change: 0.001136


 48%|████▊     | 480/1000 [17:33<20:07,  2.32s/it]

loss: -10.77 | unlearn_loss: -11.75 | retain_loss: 0.9782 | avg_entropy: 11.75 | param_change: 0.001771


 48%|████▊     | 481/1000 [17:36<19:50,  2.29s/it]

loss: -10.1 | unlearn_loss: -12.03 | retain_loss: 1.934 | avg_entropy: 12.03 | param_change: 0.003639


 48%|████▊     | 482/1000 [17:38<20:13,  2.34s/it]

loss: -10.98 | unlearn_loss: -12.02 | retain_loss: 1.042 | avg_entropy: 12.02 | param_change: 0.001765


 48%|████▊     | 483/1000 [17:41<20:23,  2.37s/it]

loss: -11.05 | unlearn_loss: -12.05 | retain_loss: 0.9934 | avg_entropy: 12.05 | param_change: 0.001713


 48%|████▊     | 484/1000 [17:43<19:57,  2.32s/it]

loss: -11.26 | unlearn_loss: -12.05 | retain_loss: 0.7864 | avg_entropy: 12.05 | param_change: 0.001492


 48%|████▊     | 485/1000 [17:45<20:13,  2.36s/it]

loss: -10.64 | unlearn_loss: -11.85 | retain_loss: 1.213 | avg_entropy: 11.85 | param_change: 0.002744


 49%|████▊     | 486/1000 [17:48<20:27,  2.39s/it]

loss: -11.07 | unlearn_loss: -12.03 | retain_loss: 0.9651 | avg_entropy: 12.03 | param_change: 0.001152


 49%|████▊     | 487/1000 [17:50<19:42,  2.31s/it]

loss: -10.96 | unlearn_loss: -12.05 | retain_loss: 1.086 | avg_entropy: 12.05 | param_change: 0.002214


 49%|████▉     | 488/1000 [17:52<19:52,  2.33s/it]

loss: -7.309 | unlearn_loss: -12.03 | retain_loss: 4.726 | avg_entropy: 12.03 | param_change: 0.004459


 49%|████▉     | 489/1000 [17:54<19:01,  2.23s/it]

loss: -3.343 | unlearn_loss: -12.03 | retain_loss: 8.683 | avg_entropy: 12.03 | param_change: 0.01139


 49%|████▉     | 490/1000 [17:56<18:23,  2.16s/it]

loss: -1.073 | unlearn_loss: -12.05 | retain_loss: 10.97 | avg_entropy: 12.05 | param_change: 0.008242


 49%|████▉     | 491/1000 [17:58<17:55,  2.11s/it]

loss: -10.15 | unlearn_loss: -11.91 | retain_loss: 1.757 | avg_entropy: 11.91 | param_change: 0.00452


 49%|████▉     | 492/1000 [18:00<17:59,  2.13s/it]

loss: -10.17 | unlearn_loss: -12.04 | retain_loss: 1.862 | avg_entropy: 12.04 | param_change: 0.002602


 49%|████▉     | 493/1000 [18:03<18:12,  2.15s/it]

loss: -10.91 | unlearn_loss: -11.87 | retain_loss: 0.9565 | avg_entropy: 11.87 | param_change: 0.00177


 49%|████▉     | 494/1000 [18:05<18:40,  2.21s/it]

loss: -10.09 | unlearn_loss: -12.05 | retain_loss: 1.956 | avg_entropy: 12.05 | param_change: 0.0035


 50%|████▉     | 495/1000 [18:07<18:37,  2.21s/it]

loss: -10.05 | unlearn_loss: -11.69 | retain_loss: 1.645 | avg_entropy: 11.69 | param_change: 0.003408


 50%|████▉     | 496/1000 [18:09<18:44,  2.23s/it]

loss: -8.696 | unlearn_loss: -11.85 | retain_loss: 3.155 | avg_entropy: 11.85 | param_change: 0.006752


 50%|████▉     | 497/1000 [18:12<18:44,  2.24s/it]

loss: -10.21 | unlearn_loss: -12.02 | retain_loss: 1.813 | avg_entropy: 12.02 | param_change: 0.004288


 50%|████▉     | 498/1000 [18:14<18:40,  2.23s/it]

loss: -7.56 | unlearn_loss: -12.04 | retain_loss: 4.482 | avg_entropy: 12.04 | param_change: 0.01019


 50%|████▉     | 499/1000 [18:16<18:12,  2.18s/it]

loss: -9.867 | unlearn_loss: -12.03 | retain_loss: 2.159 | avg_entropy: 12.03 | param_change: 0.004451


 50%|█████     | 500/1000 [18:18<18:48,  2.26s/it]

loss: -10.39 | unlearn_loss: -11.82 | retain_loss: 1.425 | avg_entropy: 11.82 | param_change: 0.002794


 50%|█████     | 501/1000 [18:21<19:04,  2.29s/it]

loss: -10.74 | unlearn_loss: -11.89 | retain_loss: 1.159 | avg_entropy: 11.89 | param_change: 0.001495


 50%|█████     | 502/1000 [18:23<19:17,  2.32s/it]

loss: -7.984 | unlearn_loss: -11.85 | retain_loss: 3.87 | avg_entropy: 11.85 | param_change: 0.006211


 50%|█████     | 503/1000 [18:26<19:31,  2.36s/it]

loss: -10.64 | unlearn_loss: -11.98 | retain_loss: 1.339 | avg_entropy: 11.98 | param_change: 0.001538


 50%|█████     | 504/1000 [18:28<19:17,  2.33s/it]

loss: -10.36 | unlearn_loss: -11.99 | retain_loss: 1.631 | avg_entropy: 11.99 | param_change: 0.003457


 50%|█████     | 505/1000 [18:30<19:22,  2.35s/it]

loss: -8.635 | unlearn_loss: -11.48 | retain_loss: 2.842 | avg_entropy: 11.48 | param_change: 0.004078


 51%|█████     | 506/1000 [18:32<19:01,  2.31s/it]

loss: -10.48 | unlearn_loss: -12.03 | retain_loss: 1.552 | avg_entropy: 12.03 | param_change: 0.001947


 51%|█████     | 507/1000 [18:35<19:23,  2.36s/it]

loss: -10.28 | unlearn_loss: -12.04 | retain_loss: 1.757 | avg_entropy: 12.04 | param_change: 0.003079


 51%|█████     | 508/1000 [18:37<18:57,  2.31s/it]

loss: -9.394 | unlearn_loss: -11.85 | retain_loss: 2.452 | avg_entropy: 11.85 | param_change: 0.002804


 51%|█████     | 509/1000 [18:40<19:35,  2.39s/it]

loss: -10.25 | unlearn_loss: -11.86 | retain_loss: 1.607 | avg_entropy: 11.86 | param_change: 0.002876


 51%|█████     | 510/1000 [18:42<19:11,  2.35s/it]

loss: -10.93 | unlearn_loss: -12.01 | retain_loss: 1.087 | avg_entropy: 12.01 | param_change: 0.00143


 51%|█████     | 511/1000 [18:44<18:47,  2.31s/it]

loss: -9.873 | unlearn_loss: -12.04 | retain_loss: 2.164 | avg_entropy: 12.04 | param_change: 0.004356


 51%|█████     | 512/1000 [18:46<18:41,  2.30s/it]

loss: -10.57 | unlearn_loss: -12 | retain_loss: 1.427 | avg_entropy: 12 | param_change: 0.002509


 51%|█████▏    | 513/1000 [18:49<18:10,  2.24s/it]

loss: -10.5 | unlearn_loss: -11.76 | retain_loss: 1.259 | avg_entropy: 11.76 | param_change: 0.001859


 51%|█████▏    | 514/1000 [18:51<18:03,  2.23s/it]

loss: -10.5 | unlearn_loss: -12.02 | retain_loss: 1.513 | avg_entropy: 12.02 | param_change: 0.002354


 52%|█████▏    | 515/1000 [18:53<18:15,  2.26s/it]

loss: -10.62 | unlearn_loss: -12.05 | retain_loss: 1.421 | avg_entropy: 12.05 | param_change: 0.001838


 52%|█████▏    | 516/1000 [18:55<18:13,  2.26s/it]

loss: -11.16 | unlearn_loss: -12.05 | retain_loss: 0.8865 | avg_entropy: 12.05 | param_change: 0.001158


 52%|█████▏    | 517/1000 [18:58<18:07,  2.25s/it]

loss: -10.41 | unlearn_loss: -12 | retain_loss: 1.591 | avg_entropy: 12 | param_change: 0.00179


 52%|█████▏    | 518/1000 [19:00<18:33,  2.31s/it]

loss: -10.99 | unlearn_loss: -11.8 | retain_loss: 0.8152 | avg_entropy: 11.8 | param_change: 0.001517


 52%|█████▏    | 519/1000 [19:03<18:57,  2.37s/it]

loss: -11.12 | unlearn_loss: -11.92 | retain_loss: 0.7988 | avg_entropy: 11.92 | param_change: 0.001728


 52%|█████▏    | 520/1000 [19:05<19:21,  2.42s/it]

loss: -10.45 | unlearn_loss: -12.04 | retain_loss: 1.589 | avg_entropy: 12.04 | param_change: 0.002919


 52%|█████▏    | 521/1000 [19:08<19:30,  2.44s/it]

loss: -10.74 | unlearn_loss: -11.67 | retain_loss: 0.9301 | avg_entropy: 11.67 | param_change: 0.001316


 52%|█████▏    | 522/1000 [19:10<19:13,  2.41s/it]

loss: -10.84 | unlearn_loss: -11.97 | retain_loss: 1.128 | avg_entropy: 11.97 | param_change: 0.002266


 52%|█████▏    | 523/1000 [19:12<18:17,  2.30s/it]

loss: -11.24 | unlearn_loss: -12.04 | retain_loss: 0.7948 | avg_entropy: 12.04 | param_change: 0.001149


 52%|█████▏    | 524/1000 [19:14<18:30,  2.33s/it]

loss: -10.57 | unlearn_loss: -11.98 | retain_loss: 1.41 | avg_entropy: 11.98 | param_change: 0.001944


 52%|█████▎    | 525/1000 [19:17<18:29,  2.34s/it]

loss: -11.28 | unlearn_loss: -12.01 | retain_loss: 0.7327 | avg_entropy: 12.01 | param_change: 0.0009494


 53%|█████▎    | 526/1000 [19:19<18:33,  2.35s/it]

loss: -11.03 | unlearn_loss: -12.05 | retain_loss: 1.016 | avg_entropy: 12.05 | param_change: 0.001808


 53%|█████▎    | 527/1000 [19:21<17:51,  2.27s/it]

loss: -10.8 | unlearn_loss: -12.03 | retain_loss: 1.23 | avg_entropy: 12.03 | param_change: 0.00184


 53%|█████▎    | 528/1000 [19:24<18:12,  2.32s/it]

loss: -10.91 | unlearn_loss: -12.03 | retain_loss: 1.115 | avg_entropy: 12.03 | param_change: 0.002882


 53%|█████▎    | 529/1000 [19:26<17:53,  2.28s/it]

loss: -11.19 | unlearn_loss: -12.05 | retain_loss: 0.8598 | avg_entropy: 12.05 | param_change: 0.001204


 53%|█████▎    | 530/1000 [19:28<17:50,  2.28s/it]

loss: -10.65 | unlearn_loss: -11.98 | retain_loss: 1.33 | avg_entropy: 11.98 | param_change: 0.002266


 53%|█████▎    | 531/1000 [19:30<17:55,  2.29s/it]

loss: -11.15 | unlearn_loss: -12.01 | retain_loss: 0.8669 | avg_entropy: 12.01 | param_change: 0.001384


 53%|█████▎    | 532/1000 [19:33<18:39,  2.39s/it]

loss: -11.03 | unlearn_loss: -12.02 | retain_loss: 0.9982 | avg_entropy: 12.02 | param_change: 0.001362


 53%|█████▎    | 533/1000 [19:35<18:09,  2.33s/it]

loss: -10.06 | unlearn_loss: -11.6 | retain_loss: 1.542 | avg_entropy: 11.6 | param_change: 0.003004


 53%|█████▎    | 534/1000 [19:38<18:03,  2.32s/it]

loss: -10.7 | unlearn_loss: -11.98 | retain_loss: 1.28 | avg_entropy: 11.98 | param_change: 0.001461


 54%|█████▎    | 535/1000 [19:40<17:12,  2.22s/it]

loss: -10.66 | unlearn_loss: -11.88 | retain_loss: 1.229 | avg_entropy: 11.88 | param_change: 0.001434


 54%|█████▎    | 536/1000 [19:42<16:53,  2.18s/it]

loss: -10.41 | unlearn_loss: -11.82 | retain_loss: 1.409 | avg_entropy: 11.82 | param_change: 0.002129


 54%|█████▎    | 537/1000 [19:44<16:42,  2.17s/it]

loss: -10.23 | unlearn_loss: -11.99 | retain_loss: 1.756 | avg_entropy: 11.99 | param_change: 0.002388


 54%|█████▍    | 538/1000 [19:46<16:14,  2.11s/it]

loss: -9.823 | unlearn_loss: -12.04 | retain_loss: 2.222 | avg_entropy: 12.04 | param_change: 0.004985


 54%|█████▍    | 539/1000 [19:48<16:35,  2.16s/it]

loss: -10.33 | unlearn_loss: -11.81 | retain_loss: 1.485 | avg_entropy: 11.81 | param_change: 0.002855


 54%|█████▍    | 540/1000 [19:50<16:50,  2.20s/it]

loss: -9.371 | unlearn_loss: -12.02 | retain_loss: 2.648 | avg_entropy: 12.02 | param_change: 0.005512


 54%|█████▍    | 541/1000 [19:52<16:26,  2.15s/it]

loss: -9.939 | unlearn_loss: -11.76 | retain_loss: 1.823 | avg_entropy: 11.76 | param_change: 0.002649


 54%|█████▍    | 542/1000 [19:54<16:25,  2.15s/it]

loss: -10.55 | unlearn_loss: -11.78 | retain_loss: 1.232 | avg_entropy: 11.78 | param_change: 0.0021


 54%|█████▍    | 543/1000 [19:56<16:04,  2.11s/it]

loss: -10.21 | unlearn_loss: -12.02 | retain_loss: 1.816 | avg_entropy: 12.02 | param_change: 0.003992


 54%|█████▍    | 544/1000 [19:58<15:50,  2.08s/it]

loss: -10.7 | unlearn_loss: -11.87 | retain_loss: 1.167 | avg_entropy: 11.87 | param_change: 0.002053


 55%|█████▍    | 545/1000 [20:01<15:47,  2.08s/it]

loss: -8.136 | unlearn_loss: -12.02 | retain_loss: 3.887 | avg_entropy: 12.02 | param_change: 0.003941


 55%|█████▍    | 546/1000 [20:03<15:34,  2.06s/it]

loss: -9.982 | unlearn_loss: -12.05 | retain_loss: 2.066 | avg_entropy: 12.05 | param_change: 0.004801


 55%|█████▍    | 547/1000 [20:05<15:39,  2.07s/it]

loss: -10.06 | unlearn_loss: -12.05 | retain_loss: 1.995 | avg_entropy: 12.05 | param_change: 0.003868


 55%|█████▍    | 548/1000 [20:07<15:26,  2.05s/it]

loss: -9.062 | unlearn_loss: -11.57 | retain_loss: 2.507 | avg_entropy: 11.57 | param_change: 0.003817


 55%|█████▍    | 549/1000 [20:09<15:24,  2.05s/it]

loss: -10.08 | unlearn_loss: -12.04 | retain_loss: 1.954 | avg_entropy: 12.04 | param_change: 0.003921


 55%|█████▌    | 550/1000 [20:11<15:57,  2.13s/it]

loss: -9.436 | unlearn_loss: -12.05 | retain_loss: 2.618 | avg_entropy: 12.05 | param_change: 0.005259


 55%|█████▌    | 551/1000 [20:13<15:46,  2.11s/it]

loss: -10.08 | unlearn_loss: -12.02 | retain_loss: 1.934 | avg_entropy: 12.02 | param_change: 0.005602


 55%|█████▌    | 552/1000 [20:15<15:43,  2.11s/it]

loss: -9.98 | unlearn_loss: -12.04 | retain_loss: 2.061 | avg_entropy: 12.04 | param_change: 0.005573


 55%|█████▌    | 553/1000 [20:17<16:06,  2.16s/it]

loss: -10.64 | unlearn_loss: -11.96 | retain_loss: 1.32 | avg_entropy: 11.96 | param_change: 0.003302


 55%|█████▌    | 554/1000 [20:20<15:58,  2.15s/it]

loss: -8.853 | unlearn_loss: -11.87 | retain_loss: 3.012 | avg_entropy: 11.87 | param_change: 0.005685


 56%|█████▌    | 555/1000 [20:22<15:39,  2.11s/it]

loss: -9.603 | unlearn_loss: -12.04 | retain_loss: 2.438 | avg_entropy: 12.04 | param_change: 0.00434


 56%|█████▌    | 556/1000 [20:24<15:26,  2.09s/it]

loss: -10.81 | unlearn_loss: -12.03 | retain_loss: 1.223 | avg_entropy: 12.03 | param_change: 0.002223


 56%|█████▌    | 557/1000 [20:26<15:13,  2.06s/it]

loss: -10.63 | unlearn_loss: -12.03 | retain_loss: 1.4 | avg_entropy: 12.03 | param_change: 0.002017


 56%|█████▌    | 558/1000 [20:28<15:31,  2.11s/it]

loss: -10.21 | unlearn_loss: -11.75 | retain_loss: 1.54 | avg_entropy: 11.75 | param_change: 0.003034


 56%|█████▌    | 559/1000 [20:30<15:11,  2.07s/it]

loss: -9.808 | unlearn_loss: -11.86 | retain_loss: 2.054 | avg_entropy: 11.86 | param_change: 0.004762


 56%|█████▌    | 560/1000 [20:32<15:02,  2.05s/it]

loss: -10.22 | unlearn_loss: -12.04 | retain_loss: 1.821 | avg_entropy: 12.04 | param_change: 0.002294


 56%|█████▌    | 561/1000 [20:34<15:38,  2.14s/it]

loss: -10.76 | unlearn_loss: -12.06 | retain_loss: 1.304 | avg_entropy: 12.06 | param_change: 0.001765


 56%|█████▌    | 562/1000 [20:36<15:19,  2.10s/it]

loss: -11.16 | unlearn_loss: -12.06 | retain_loss: 0.9068 | avg_entropy: 12.06 | param_change: 0.001313


 56%|█████▋    | 563/1000 [20:38<15:36,  2.14s/it]

loss: -10.94 | unlearn_loss: -11.83 | retain_loss: 0.8931 | avg_entropy: 11.83 | param_change: 0.0009923


 56%|█████▋    | 564/1000 [20:41<16:00,  2.20s/it]

loss: -11.22 | unlearn_loss: -12.07 | retain_loss: 0.8422 | avg_entropy: 12.07 | param_change: 0.001305


 56%|█████▋    | 565/1000 [20:43<15:39,  2.16s/it]

loss: -10.53 | unlearn_loss: -11.69 | retain_loss: 1.165 | avg_entropy: 11.69 | param_change: 0.001992


 57%|█████▋    | 566/1000 [20:45<15:51,  2.19s/it]

loss: -11.03 | unlearn_loss: -12.01 | retain_loss: 0.9779 | avg_entropy: 12.01 | param_change: 0.001792


 57%|█████▋    | 567/1000 [20:48<16:18,  2.26s/it]

loss: -11.28 | unlearn_loss: -12.05 | retain_loss: 0.7661 | avg_entropy: 12.05 | param_change: 0.001589


 57%|█████▋    | 568/1000 [20:50<15:56,  2.22s/it]

loss: -11.38 | unlearn_loss: -12.04 | retain_loss: 0.6651 | avg_entropy: 12.04 | param_change: 0.0008205


 57%|█████▋    | 569/1000 [20:52<16:24,  2.28s/it]

loss: -11.29 | unlearn_loss: -12.03 | retain_loss: 0.7466 | avg_entropy: 12.03 | param_change: 0.001071


 57%|█████▋    | 570/1000 [20:54<16:08,  2.25s/it]

loss: -11.07 | unlearn_loss: -12.04 | retain_loss: 0.9722 | avg_entropy: 12.04 | param_change: 0.001625


 57%|█████▋    | 571/1000 [20:57<16:16,  2.28s/it]

loss: -10.94 | unlearn_loss: -12.04 | retain_loss: 1.095 | avg_entropy: 12.04 | param_change: 0.002127


 57%|█████▋    | 572/1000 [20:59<16:24,  2.30s/it]

loss: -10.63 | unlearn_loss: -11.87 | retain_loss: 1.235 | avg_entropy: 11.87 | param_change: 0.002231


 57%|█████▋    | 573/1000 [21:01<16:23,  2.30s/it]

loss: -11.32 | unlearn_loss: -12.06 | retain_loss: 0.7384 | avg_entropy: 12.06 | param_change: 0.001085


 57%|█████▋    | 574/1000 [21:04<16:11,  2.28s/it]

loss: -11.3 | unlearn_loss: -12.03 | retain_loss: 0.7325 | avg_entropy: 12.03 | param_change: 0.001718


 57%|█████▊    | 575/1000 [21:06<15:57,  2.25s/it]

loss: -11.14 | unlearn_loss: -12.07 | retain_loss: 0.9283 | avg_entropy: 12.07 | param_change: 0.002007


 58%|█████▊    | 576/1000 [21:08<16:05,  2.28s/it]

loss: -11 | unlearn_loss: -12.06 | retain_loss: 1.066 | avg_entropy: 12.06 | param_change: 0.003569


 58%|█████▊    | 577/1000 [21:11<16:28,  2.34s/it]

loss: -10.97 | unlearn_loss: -12.05 | retain_loss: 1.081 | avg_entropy: 12.05 | param_change: 0.002525


 58%|█████▊    | 578/1000 [21:13<16:22,  2.33s/it]

loss: -11.49 | unlearn_loss: -12.07 | retain_loss: 0.5757 | avg_entropy: 12.07 | param_change: 0.0008373


 58%|█████▊    | 579/1000 [21:15<16:08,  2.30s/it]

loss: -11.2 | unlearn_loss: -11.98 | retain_loss: 0.7884 | avg_entropy: 11.98 | param_change: 0.002333


 58%|█████▊    | 580/1000 [21:17<15:50,  2.26s/it]

loss: -10.66 | unlearn_loss: -11.89 | retain_loss: 1.228 | avg_entropy: 11.89 | param_change: 0.002922


 58%|█████▊    | 581/1000 [21:20<16:16,  2.33s/it]

loss: -11.42 | unlearn_loss: -12.07 | retain_loss: 0.6467 | avg_entropy: 12.07 | param_change: 0.0009047


 58%|█████▊    | 582/1000 [21:22<16:40,  2.39s/it]

loss: -11.09 | unlearn_loss: -11.98 | retain_loss: 0.8891 | avg_entropy: 11.98 | param_change: 0.001319


 58%|█████▊    | 583/1000 [21:25<16:56,  2.44s/it]

loss: -11.36 | unlearn_loss: -12.07 | retain_loss: 0.7063 | avg_entropy: 12.07 | param_change: 0.001122


 58%|█████▊    | 584/1000 [21:27<16:01,  2.31s/it]

loss: -11.19 | unlearn_loss: -12.05 | retain_loss: 0.8643 | avg_entropy: 12.05 | param_change: 0.00136


 58%|█████▊    | 585/1000 [21:29<15:54,  2.30s/it]

loss: -11.36 | unlearn_loss: -12.03 | retain_loss: 0.6742 | avg_entropy: 12.03 | param_change: 0.001117


 59%|█████▊    | 586/1000 [21:31<15:22,  2.23s/it]

loss: -10.36 | unlearn_loss: -11.88 | retain_loss: 1.524 | avg_entropy: 11.88 | param_change: 0.002482


 59%|█████▊    | 587/1000 [21:33<14:49,  2.15s/it]

loss: -10.64 | unlearn_loss: -11.84 | retain_loss: 1.192 | avg_entropy: 11.84 | param_change: 0.003893


 59%|█████▉    | 588/1000 [21:35<14:42,  2.14s/it]

loss: -10.85 | unlearn_loss: -12.03 | retain_loss: 1.178 | avg_entropy: 12.03 | param_change: 0.003547


 59%|█████▉    | 589/1000 [21:37<14:35,  2.13s/it]

loss: -10.93 | unlearn_loss: -12.07 | retain_loss: 1.146 | avg_entropy: 12.07 | param_change: 0.003173


 59%|█████▉    | 590/1000 [21:39<14:26,  2.11s/it]

loss: -10.9 | unlearn_loss: -11.73 | retain_loss: 0.8296 | avg_entropy: 11.73 | param_change: 0.002007


 59%|█████▉    | 591/1000 [21:42<14:20,  2.10s/it]

loss: -11.2 | unlearn_loss: -12.04 | retain_loss: 0.8381 | avg_entropy: 12.04 | param_change: 0.002255


 59%|█████▉    | 592/1000 [21:44<14:14,  2.09s/it]

loss: -11.21 | unlearn_loss: -12.06 | retain_loss: 0.8574 | avg_entropy: 12.06 | param_change: 0.001986


 59%|█████▉    | 593/1000 [21:46<13:58,  2.06s/it]

loss: -11.15 | unlearn_loss: -12.05 | retain_loss: 0.8987 | avg_entropy: 12.05 | param_change: 0.002112


 59%|█████▉    | 594/1000 [21:48<14:03,  2.08s/it]

loss: -11.16 | unlearn_loss: -11.98 | retain_loss: 0.8255 | avg_entropy: 11.98 | param_change: 0.001779


 60%|█████▉    | 595/1000 [21:50<13:58,  2.07s/it]

loss: -10.71 | unlearn_loss: -12 | retain_loss: 1.289 | avg_entropy: 12 | param_change: 0.002598


 60%|█████▉    | 596/1000 [21:52<13:58,  2.08s/it]

loss: -10.53 | unlearn_loss: -11.92 | retain_loss: 1.389 | avg_entropy: 11.92 | param_change: 0.002127


 60%|█████▉    | 597/1000 [21:54<14:46,  2.20s/it]

loss: -11.35 | unlearn_loss: -12.05 | retain_loss: 0.6996 | avg_entropy: 12.05 | param_change: 0.001


 60%|█████▉    | 598/1000 [21:56<14:37,  2.18s/it]

loss: -8.628 | unlearn_loss: -12.05 | retain_loss: 3.425 | avg_entropy: 12.05 | param_change: 0.004538


 60%|█████▉    | 599/1000 [21:59<14:22,  2.15s/it]

loss: -11.01 | unlearn_loss: -12.03 | retain_loss: 1.017 | avg_entropy: 12.03 | param_change: 0.00147


 60%|██████    | 600/1000 [22:01<14:07,  2.12s/it]

loss: -9.926 | unlearn_loss: -11.98 | retain_loss: 2.049 | avg_entropy: 11.98 | param_change: 0.003378


 60%|██████    | 601/1000 [22:03<13:50,  2.08s/it]

loss: -10.38 | unlearn_loss: -11.9 | retain_loss: 1.519 | avg_entropy: 11.9 | param_change: 0.003835


 60%|██████    | 602/1000 [22:05<14:08,  2.13s/it]

loss: -11.01 | unlearn_loss: -11.9 | retain_loss: 0.8993 | avg_entropy: 11.9 | param_change: 0.00177


 60%|██████    | 603/1000 [22:07<14:35,  2.20s/it]

loss: -9.689 | unlearn_loss: -10.96 | retain_loss: 1.275 | avg_entropy: 10.96 | param_change: 0.005883


 60%|██████    | 604/1000 [22:10<15:28,  2.34s/it]

loss: -11.01 | unlearn_loss: -11.91 | retain_loss: 0.9037 | avg_entropy: 11.91 | param_change: 0.001466


 60%|██████    | 605/1000 [22:12<14:59,  2.28s/it]

loss: -11.03 | unlearn_loss: -11.99 | retain_loss: 0.9631 | avg_entropy: 11.99 | param_change: 0.003184


 61%|██████    | 606/1000 [22:14<15:08,  2.31s/it]

loss: -11.23 | unlearn_loss: -11.91 | retain_loss: 0.6722 | avg_entropy: 11.91 | param_change: 0.001647


 61%|██████    | 607/1000 [22:17<15:36,  2.38s/it]

loss: -11.01 | unlearn_loss: -11.95 | retain_loss: 0.9422 | avg_entropy: 11.95 | param_change: 0.002427


 61%|██████    | 608/1000 [22:19<15:28,  2.37s/it]

loss: -11.41 | unlearn_loss: -12.05 | retain_loss: 0.6388 | avg_entropy: 12.05 | param_change: 0.001321


 61%|██████    | 609/1000 [22:22<15:38,  2.40s/it]

loss: -11.22 | unlearn_loss: -12 | retain_loss: 0.7714 | avg_entropy: 12 | param_change: 0.0009948


 61%|██████    | 610/1000 [22:24<15:01,  2.31s/it]

loss: -11.25 | unlearn_loss: -12.04 | retain_loss: 0.7946 | avg_entropy: 12.04 | param_change: 0.001882


 61%|██████    | 611/1000 [22:26<14:56,  2.30s/it]

loss: -11.4 | unlearn_loss: -12.06 | retain_loss: 0.6558 | avg_entropy: 12.06 | param_change: 0.001047


 61%|██████    | 612/1000 [22:28<14:28,  2.24s/it]

loss: -11.07 | unlearn_loss: -12.04 | retain_loss: 0.9719 | avg_entropy: 12.04 | param_change: 0.002083


 61%|██████▏   | 613/1000 [22:30<14:21,  2.23s/it]

loss: -9.848 | unlearn_loss: -11.85 | retain_loss: 2.002 | avg_entropy: 11.85 | param_change: 0.005085


 61%|██████▏   | 614/1000 [22:32<14:01,  2.18s/it]

loss: -11.14 | unlearn_loss: -11.99 | retain_loss: 0.8499 | avg_entropy: 11.99 | param_change: 0.001237


 62%|██████▏   | 615/1000 [22:35<14:16,  2.22s/it]

loss: -10.89 | unlearn_loss: -11.72 | retain_loss: 0.8309 | avg_entropy: 11.72 | param_change: 0.001336


 62%|██████▏   | 616/1000 [22:38<15:18,  2.39s/it]

loss: -11.1 | unlearn_loss: -11.74 | retain_loss: 0.6371 | avg_entropy: 11.74 | param_change: 0.001152


 62%|██████▏   | 617/1000 [22:40<15:03,  2.36s/it]

loss: -11.11 | unlearn_loss: -12 | retain_loss: 0.8887 | avg_entropy: 12 | param_change: 0.001503


 62%|██████▏   | 618/1000 [22:42<14:46,  2.32s/it]

loss: -11.27 | unlearn_loss: -11.92 | retain_loss: 0.6521 | avg_entropy: 11.92 | param_change: 0.001369


 62%|██████▏   | 619/1000 [22:44<14:30,  2.29s/it]

loss: -11.13 | unlearn_loss: -12.06 | retain_loss: 0.9345 | avg_entropy: 12.06 | param_change: 0.002125


 62%|██████▏   | 620/1000 [22:47<14:33,  2.30s/it]

loss: -9.83 | unlearn_loss: -11.64 | retain_loss: 1.809 | avg_entropy: 11.64 | param_change: 0.003906


 62%|██████▏   | 621/1000 [22:49<14:11,  2.25s/it]

loss: -10.43 | unlearn_loss: -12.05 | retain_loss: 1.619 | avg_entropy: 12.05 | param_change: 0.003457


 62%|██████▏   | 622/1000 [22:51<13:41,  2.17s/it]

loss: -10.33 | unlearn_loss: -11.81 | retain_loss: 1.482 | avg_entropy: 11.81 | param_change: 0.002913


 62%|██████▏   | 623/1000 [22:53<13:46,  2.19s/it]

loss: -9.464 | unlearn_loss: -12.02 | retain_loss: 2.557 | avg_entropy: 12.02 | param_change: 0.004137


 62%|██████▏   | 624/1000 [22:55<13:22,  2.13s/it]

loss: -10.95 | unlearn_loss: -11.89 | retain_loss: 0.9349 | avg_entropy: 11.89 | param_change: 0.00193


 62%|██████▎   | 625/1000 [22:57<13:28,  2.16s/it]

loss: -11.13 | unlearn_loss: -12.02 | retain_loss: 0.8938 | avg_entropy: 12.02 | param_change: 0.001551


 63%|██████▎   | 626/1000 [23:00<13:44,  2.20s/it]

loss: -10.78 | unlearn_loss: -12.06 | retain_loss: 1.281 | avg_entropy: 12.06 | param_change: 0.001931


 63%|██████▎   | 627/1000 [23:02<13:58,  2.25s/it]

loss: -10.51 | unlearn_loss: -12.03 | retain_loss: 1.518 | avg_entropy: 12.03 | param_change: 0.002434


 63%|██████▎   | 628/1000 [23:04<14:17,  2.31s/it]

loss: -11.16 | unlearn_loss: -12.03 | retain_loss: 0.8757 | avg_entropy: 12.03 | param_change: 0.001472


 63%|██████▎   | 629/1000 [23:06<13:57,  2.26s/it]

loss: -10.99 | unlearn_loss: -12.03 | retain_loss: 1.045 | avg_entropy: 12.03 | param_change: 0.001521


 63%|██████▎   | 630/1000 [23:09<13:56,  2.26s/it]

loss: -10.82 | unlearn_loss: -11.84 | retain_loss: 1.025 | avg_entropy: 11.84 | param_change: 0.001592


 63%|██████▎   | 631/1000 [23:11<13:45,  2.24s/it]

loss: -10.65 | unlearn_loss: -11.85 | retain_loss: 1.194 | avg_entropy: 11.85 | param_change: 0.001884


 63%|██████▎   | 632/1000 [23:13<13:15,  2.16s/it]

loss: -11.3 | unlearn_loss: -12.05 | retain_loss: 0.7513 | avg_entropy: 12.05 | param_change: 0.001107


 63%|██████▎   | 633/1000 [23:15<13:20,  2.18s/it]

loss: -10.56 | unlearn_loss: -12.06 | retain_loss: 1.503 | avg_entropy: 12.06 | param_change: 0.002785


 63%|██████▎   | 634/1000 [23:17<13:24,  2.20s/it]

loss: -11.02 | unlearn_loss: -12.04 | retain_loss: 1.023 | avg_entropy: 12.04 | param_change: 0.001772


 64%|██████▎   | 635/1000 [23:19<13:02,  2.14s/it]

loss: -10.84 | unlearn_loss: -12.06 | retain_loss: 1.222 | avg_entropy: 12.06 | param_change: 0.003445


 64%|██████▎   | 636/1000 [23:21<12:42,  2.09s/it]

loss: -10.73 | unlearn_loss: -12.07 | retain_loss: 1.343 | avg_entropy: 12.07 | param_change: 0.002525


 64%|██████▎   | 637/1000 [23:23<12:39,  2.09s/it]

loss: -10.67 | unlearn_loss: -12.07 | retain_loss: 1.4 | avg_entropy: 12.07 | param_change: 0.002661


 64%|██████▍   | 638/1000 [23:25<12:26,  2.06s/it]

loss: -10.7 | unlearn_loss: -12.04 | retain_loss: 1.337 | avg_entropy: 12.04 | param_change: 0.002972


 64%|██████▍   | 639/1000 [23:28<12:28,  2.07s/it]

loss: -10.92 | unlearn_loss: -11.96 | retain_loss: 1.042 | avg_entropy: 11.96 | param_change: 0.002402


 64%|██████▍   | 640/1000 [23:30<12:18,  2.05s/it]

loss: -10.45 | unlearn_loss: -12.06 | retain_loss: 1.613 | avg_entropy: 12.06 | param_change: 0.002687


 64%|██████▍   | 641/1000 [23:32<12:14,  2.05s/it]

loss: -8.59 | unlearn_loss: -11.83 | retain_loss: 3.238 | avg_entropy: 11.83 | param_change: 0.006905


 64%|██████▍   | 642/1000 [23:34<12:12,  2.05s/it]

loss: -9.45 | unlearn_loss: -12.06 | retain_loss: 2.611 | avg_entropy: 12.06 | param_change: 0.01032


 64%|██████▍   | 643/1000 [23:36<12:09,  2.04s/it]

loss: -10.26 | unlearn_loss: -12.06 | retain_loss: 1.802 | avg_entropy: 12.06 | param_change: 0.003711


 64%|██████▍   | 644/1000 [23:38<12:14,  2.06s/it]

loss: -9.505 | unlearn_loss: -12.04 | retain_loss: 2.54 | avg_entropy: 12.04 | param_change: 0.008168


 64%|██████▍   | 645/1000 [23:40<12:01,  2.03s/it]

loss: 1.126 | unlearn_loss: -12.06 | retain_loss: 13.18 | avg_entropy: 12.06 | param_change: 0.02968


 65%|██████▍   | 646/1000 [23:42<11:48,  2.00s/it]

loss: -8.255 | unlearn_loss: -11.73 | retain_loss: 3.472 | avg_entropy: 11.73 | param_change: 0.009515


 65%|██████▍   | 647/1000 [23:44<11:37,  1.98s/it]

loss: -9.06 | unlearn_loss: -11.9 | retain_loss: 2.84 | avg_entropy: 11.9 | param_change: 0.007177


 65%|██████▍   | 648/1000 [23:46<11:42,  1.99s/it]

loss: -10.08 | unlearn_loss: -11.9 | retain_loss: 1.818 | avg_entropy: 11.9 | param_change: 0.003714


 65%|██████▍   | 649/1000 [23:48<11:36,  1.98s/it]

loss: -9.741 | unlearn_loss: -12.06 | retain_loss: 2.319 | avg_entropy: 12.06 | param_change: 0.003661


 65%|██████▌   | 650/1000 [23:50<11:34,  1.98s/it]

loss: -9.601 | unlearn_loss: -11.84 | retain_loss: 2.235 | avg_entropy: 11.84 | param_change: 0.004779


 65%|██████▌   | 651/1000 [23:51<11:21,  1.95s/it]

loss: -7.261 | unlearn_loss: -11.85 | retain_loss: 4.591 | avg_entropy: 11.85 | param_change: 0.007981


 65%|██████▌   | 652/1000 [23:53<11:19,  1.95s/it]

loss: -7.868 | unlearn_loss: -12 | retain_loss: 4.129 | avg_entropy: 12 | param_change: 0.008559


 65%|██████▌   | 653/1000 [23:55<11:19,  1.96s/it]

loss: -7.866 | unlearn_loss: -12.06 | retain_loss: 4.198 | avg_entropy: 12.06 | param_change: 0.00842


 65%|██████▌   | 654/1000 [23:57<11:21,  1.97s/it]

loss: -9.438 | unlearn_loss: -12.07 | retain_loss: 2.628 | avg_entropy: 12.07 | param_change: 0.006327


 66%|██████▌   | 655/1000 [23:59<11:29,  2.00s/it]

loss: -8.359 | unlearn_loss: -10.64 | retain_loss: 2.281 | avg_entropy: 10.64 | param_change: 0.005734


 66%|██████▌   | 656/1000 [24:01<11:17,  1.97s/it]

loss: -8.218 | unlearn_loss: -12.06 | retain_loss: 3.839 | avg_entropy: 12.06 | param_change: 0.006796


 66%|██████▌   | 657/1000 [24:03<11:12,  1.96s/it]

loss: -10.08 | unlearn_loss: -12.06 | retain_loss: 1.978 | avg_entropy: 12.06 | param_change: 0.002991


 66%|██████▌   | 658/1000 [24:05<11:12,  1.97s/it]

loss: -9.311 | unlearn_loss: -11.86 | retain_loss: 2.554 | avg_entropy: 11.86 | param_change: 0.005589


 66%|██████▌   | 659/1000 [24:07<11:12,  1.97s/it]

loss: -9.755 | unlearn_loss: -12.06 | retain_loss: 2.308 | avg_entropy: 12.06 | param_change: 0.003975


 66%|██████▌   | 660/1000 [24:09<11:16,  1.99s/it]

loss: -5.908 | unlearn_loss: -11.81 | retain_loss: 5.897 | avg_entropy: 11.81 | param_change: 0.00943


 66%|██████▌   | 661/1000 [24:11<11:18,  2.00s/it]

loss: -9.584 | unlearn_loss: -11.89 | retain_loss: 2.309 | avg_entropy: 11.89 | param_change: 0.004293


 66%|██████▌   | 662/1000 [24:13<11:16,  2.00s/it]

loss: -9.879 | unlearn_loss: -12.05 | retain_loss: 2.168 | avg_entropy: 12.05 | param_change: 0.003279


 66%|██████▋   | 663/1000 [24:15<11:16,  2.01s/it]

loss: -9.48 | unlearn_loss: -12.05 | retain_loss: 2.572 | avg_entropy: 12.05 | param_change: 0.004019


 66%|██████▋   | 664/1000 [24:17<11:23,  2.03s/it]

loss: 6.998 | unlearn_loss: -12.04 | retain_loss: 19.03 | avg_entropy: 12.04 | param_change: 0.01274


 66%|██████▋   | 665/1000 [24:19<11:18,  2.03s/it]

loss: 2.069 | unlearn_loss: -12.05 | retain_loss: 14.12 | avg_entropy: 12.05 | param_change: 0.01007


 67%|██████▋   | 666/1000 [24:21<11:14,  2.02s/it]

loss: -7.901 | unlearn_loss: -11.87 | retain_loss: 3.967 | avg_entropy: 11.87 | param_change: 0.007582


 67%|██████▋   | 667/1000 [24:23<11:06,  2.00s/it]

loss: -9.795 | unlearn_loss: -11.86 | retain_loss: 2.066 | avg_entropy: 11.86 | param_change: 0.00376


 67%|██████▋   | 668/1000 [24:25<11:03,  2.00s/it]

loss: -9.431 | unlearn_loss: -11.69 | retain_loss: 2.257 | avg_entropy: 11.69 | param_change: 0.004196


 67%|██████▋   | 669/1000 [24:27<10:57,  1.99s/it]

loss: -6.826 | unlearn_loss: -12.06 | retain_loss: 5.239 | avg_entropy: 12.06 | param_change: 0.005282


 67%|██████▋   | 670/1000 [24:29<10:55,  1.99s/it]

loss: -7.676 | unlearn_loss: -12.04 | retain_loss: 4.365 | avg_entropy: 12.04 | param_change: 0.006568


 67%|██████▋   | 671/1000 [24:31<10:59,  2.01s/it]

loss: -7.065 | unlearn_loss: -11.9 | retain_loss: 4.834 | avg_entropy: 11.9 | param_change: 0.007051


 67%|██████▋   | 672/1000 [24:33<10:56,  2.00s/it]

loss: -9.164 | unlearn_loss: -11.93 | retain_loss: 2.761 | avg_entropy: 11.93 | param_change: 0.003697


 67%|██████▋   | 673/1000 [24:35<10:49,  1.99s/it]

loss: 3.408 | unlearn_loss: -11.93 | retain_loss: 15.34 | avg_entropy: 11.93 | param_change: 0.008612


 67%|██████▋   | 674/1000 [24:37<10:45,  1.98s/it]

loss: -7.655 | unlearn_loss: -12.08 | retain_loss: 4.425 | avg_entropy: 12.08 | param_change: 0.006289


 68%|██████▊   | 675/1000 [24:39<10:42,  1.98s/it]

loss: -8.845 | unlearn_loss: -12.08 | retain_loss: 3.234 | avg_entropy: 12.08 | param_change: 0.005218


 68%|██████▊   | 676/1000 [24:41<10:43,  1.99s/it]

loss: -9.612 | unlearn_loss: -12 | retain_loss: 2.385 | avg_entropy: 12 | param_change: 0.003861


 68%|██████▊   | 677/1000 [24:43<10:38,  1.98s/it]

loss: -0.4896 | unlearn_loss: -11.89 | retain_loss: 11.4 | avg_entropy: 11.89 | param_change: 0.005936


 68%|██████▊   | 678/1000 [24:45<10:39,  1.98s/it]

loss: -9.566 | unlearn_loss: -11.89 | retain_loss: 2.326 | avg_entropy: 11.89 | param_change: 0.003623


 68%|██████▊   | 679/1000 [24:47<10:39,  1.99s/it]

loss: -7.19 | unlearn_loss: -12.01 | retain_loss: 4.821 | avg_entropy: 12.01 | param_change: 0.006367


 68%|██████▊   | 680/1000 [24:49<10:41,  2.01s/it]

loss: -8.396 | unlearn_loss: -12.05 | retain_loss: 3.652 | avg_entropy: 12.05 | param_change: 0.005007


 68%|██████▊   | 681/1000 [24:51<10:46,  2.03s/it]

loss: -4.171 | unlearn_loss: -12.06 | retain_loss: 7.894 | avg_entropy: 12.06 | param_change: 0.01055


 68%|██████▊   | 682/1000 [24:53<10:38,  2.01s/it]

loss: -4.944 | unlearn_loss: -11.75 | retain_loss: 6.805 | avg_entropy: 11.75 | param_change: 0.01468


 68%|██████▊   | 683/1000 [24:55<10:34,  2.00s/it]

loss: -7.68 | unlearn_loss: -12.01 | retain_loss: 4.327 | avg_entropy: 12.01 | param_change: 0.006173


 68%|██████▊   | 684/1000 [24:57<10:33,  2.01s/it]

loss: -5.559 | unlearn_loss: -11.83 | retain_loss: 6.273 | avg_entropy: 11.83 | param_change: 0.008131


 68%|██████▊   | 685/1000 [24:59<10:33,  2.01s/it]

loss: -7.499 | unlearn_loss: -12.04 | retain_loss: 4.543 | avg_entropy: 12.04 | param_change: 0.00704


 69%|██████▊   | 686/1000 [25:01<10:35,  2.02s/it]

loss: -5.508 | unlearn_loss: -12.07 | retain_loss: 6.567 | avg_entropy: 12.07 | param_change: 0.005931


 69%|██████▊   | 687/1000 [25:03<10:31,  2.02s/it]

loss: -0.7905 | unlearn_loss: -12.08 | retain_loss: 11.29 | avg_entropy: 12.08 | param_change: 0.008369


 69%|██████▉   | 688/1000 [25:05<10:24,  2.00s/it]

loss: -7.146 | unlearn_loss: -11.78 | retain_loss: 4.635 | avg_entropy: 11.78 | param_change: 0.006069


 69%|██████▉   | 689/1000 [25:07<10:26,  2.01s/it]

loss: -5.364 | unlearn_loss: -11.85 | retain_loss: 6.486 | avg_entropy: 11.85 | param_change: 0.006301


 69%|██████▉   | 690/1000 [25:09<10:24,  2.02s/it]

loss: -6.938 | unlearn_loss: -12.05 | retain_loss: 5.115 | avg_entropy: 12.05 | param_change: 0.006463


 69%|██████▉   | 691/1000 [25:11<10:22,  2.01s/it]

loss: -1.545 | unlearn_loss: -11.8 | retain_loss: 10.25 | avg_entropy: 11.8 | param_change: 0.007831


 69%|██████▉   | 692/1000 [25:13<10:18,  2.01s/it]

loss: -6.408 | unlearn_loss: -12.08 | retain_loss: 5.67 | avg_entropy: 12.08 | param_change: 0.004555


 69%|██████▉   | 693/1000 [25:15<10:22,  2.03s/it]

loss: -5.176 | unlearn_loss: -12.06 | retain_loss: 6.882 | avg_entropy: 12.06 | param_change: 0.005023


 69%|██████▉   | 694/1000 [25:17<10:19,  2.03s/it]

loss: -9.573 | unlearn_loss: -12.07 | retain_loss: 2.497 | avg_entropy: 12.07 | param_change: 0.002713


 70%|██████▉   | 695/1000 [25:20<10:27,  2.06s/it]

loss: -9.935 | unlearn_loss: -12.07 | retain_loss: 2.13 | avg_entropy: 12.07 | param_change: 0.002654


 70%|██████▉   | 696/1000 [25:22<10:25,  2.06s/it]

loss: -10.19 | unlearn_loss: -12.08 | retain_loss: 1.891 | avg_entropy: 12.08 | param_change: 0.002199


 70%|██████▉   | 697/1000 [25:24<10:38,  2.11s/it]

loss: -9.997 | unlearn_loss: -11.7 | retain_loss: 1.703 | avg_entropy: 11.7 | param_change: 0.001285


 70%|██████▉   | 698/1000 [25:26<10:30,  2.09s/it]

loss: -10.41 | unlearn_loss: -12.07 | retain_loss: 1.663 | avg_entropy: 12.07 | param_change: 0.002074


 70%|██████▉   | 699/1000 [25:28<11:00,  2.19s/it]

loss: -9.343 | unlearn_loss: -12.08 | retain_loss: 2.736 | avg_entropy: 12.08 | param_change: 0.002819


 70%|███████   | 700/1000 [25:30<10:45,  2.15s/it]

loss: -10.48 | unlearn_loss: -12.09 | retain_loss: 1.605 | avg_entropy: 12.09 | param_change: 0.001961


 70%|███████   | 701/1000 [25:33<10:38,  2.14s/it]

loss: -11.03 | unlearn_loss: -12.07 | retain_loss: 1.041 | avg_entropy: 12.07 | param_change: 0.001227


 70%|███████   | 702/1000 [25:35<10:45,  2.17s/it]

loss: -10.12 | unlearn_loss: -11.73 | retain_loss: 1.61 | avg_entropy: 11.73 | param_change: 0.002621


 70%|███████   | 703/1000 [25:37<10:40,  2.16s/it]

loss: -4.132 | unlearn_loss: -12.06 | retain_loss: 7.928 | avg_entropy: 12.06 | param_change: 0.00522


 70%|███████   | 704/1000 [25:39<10:23,  2.11s/it]

loss: -10.24 | unlearn_loss: -12.07 | retain_loss: 1.829 | avg_entropy: 12.07 | param_change: 0.002852


 70%|███████   | 705/1000 [25:41<10:12,  2.08s/it]

loss: -9.767 | unlearn_loss: -11.93 | retain_loss: 2.163 | avg_entropy: 11.93 | param_change: 0.003035


 71%|███████   | 706/1000 [25:43<09:56,  2.03s/it]

loss: -10.84 | unlearn_loss: -12.09 | retain_loss: 1.25 | avg_entropy: 12.09 | param_change: 0.001933


 71%|███████   | 707/1000 [25:45<10:05,  2.07s/it]

loss: -10.23 | unlearn_loss: -12.09 | retain_loss: 1.863 | avg_entropy: 12.09 | param_change: 0.004032


 71%|███████   | 708/1000 [25:47<09:57,  2.05s/it]

loss: -10.66 | unlearn_loss: -12.09 | retain_loss: 1.436 | avg_entropy: 12.09 | param_change: 0.003241


 71%|███████   | 709/1000 [25:49<09:56,  2.05s/it]

loss: -9.503 | unlearn_loss: -12.09 | retain_loss: 2.587 | avg_entropy: 12.09 | param_change: 0.006557


 71%|███████   | 710/1000 [25:51<09:43,  2.01s/it]

loss: -8.017 | unlearn_loss: -12 | retain_loss: 3.985 | avg_entropy: 12 | param_change: 0.004211


 71%|███████   | 711/1000 [25:53<09:38,  2.00s/it]

loss: -10.38 | unlearn_loss: -11.91 | retain_loss: 1.527 | avg_entropy: 11.91 | param_change: 0.002968


 71%|███████   | 712/1000 [25:55<09:34,  2.00s/it]

loss: -10.12 | unlearn_loss: -12.04 | retain_loss: 1.915 | avg_entropy: 12.04 | param_change: 0.003193


 71%|███████▏  | 713/1000 [25:57<09:33,  2.00s/it]

loss: -10.69 | unlearn_loss: -12.09 | retain_loss: 1.395 | avg_entropy: 12.09 | param_change: 0.002545


 71%|███████▏  | 714/1000 [25:59<09:37,  2.02s/it]

loss: -8.844 | unlearn_loss: -12.08 | retain_loss: 3.234 | avg_entropy: 12.08 | param_change: 0.002732


 72%|███████▏  | 715/1000 [26:01<09:32,  2.01s/it]

loss: -9.904 | unlearn_loss: -12.08 | retain_loss: 2.174 | avg_entropy: 12.08 | param_change: 0.0035


 72%|███████▏  | 716/1000 [26:03<09:35,  2.03s/it]

loss: -8.926 | unlearn_loss: -11.42 | retain_loss: 2.491 | avg_entropy: 11.42 | param_change: 0.00458


 72%|███████▏  | 717/1000 [26:05<09:26,  2.00s/it]

loss: -10.33 | unlearn_loss: -11.83 | retain_loss: 1.498 | avg_entropy: 11.83 | param_change: 0.00193


 72%|███████▏  | 718/1000 [26:07<09:35,  2.04s/it]

loss: -10.1 | unlearn_loss: -12.1 | retain_loss: 1.992 | avg_entropy: 12.1 | param_change: 0.002481


 72%|███████▏  | 719/1000 [26:09<09:39,  2.06s/it]

loss: -9.262 | unlearn_loss: -12.1 | retain_loss: 2.835 | avg_entropy: 12.1 | param_change: 0.003397


 72%|███████▏  | 720/1000 [26:11<09:31,  2.04s/it]

loss: -9.067 | unlearn_loss: -11.59 | retain_loss: 2.528 | avg_entropy: 11.59 | param_change: 0.002206


 72%|███████▏  | 721/1000 [26:13<09:47,  2.11s/it]

loss: -7.798 | unlearn_loss: -11.72 | retain_loss: 3.92 | avg_entropy: 11.72 | param_change: 0.003468


 72%|███████▏  | 722/1000 [26:15<09:35,  2.07s/it]

loss: -9.911 | unlearn_loss: -12.07 | retain_loss: 2.155 | avg_entropy: 12.07 | param_change: 0.00202


 72%|███████▏  | 723/1000 [26:18<09:47,  2.12s/it]

loss: -10.54 | unlearn_loss: -12.01 | retain_loss: 1.474 | avg_entropy: 12.01 | param_change: 0.001381


 72%|███████▏  | 724/1000 [26:20<09:41,  2.11s/it]

loss: -10.46 | unlearn_loss: -11.81 | retain_loss: 1.343 | avg_entropy: 11.81 | param_change: 0.001657


 72%|███████▎  | 725/1000 [26:22<10:09,  2.22s/it]

loss: -10.67 | unlearn_loss: -12.06 | retain_loss: 1.385 | avg_entropy: 12.06 | param_change: 0.001254


 73%|███████▎  | 726/1000 [26:25<10:39,  2.33s/it]

loss: -10.59 | unlearn_loss: -11.74 | retain_loss: 1.146 | avg_entropy: 11.74 | param_change: 0.001241


 73%|███████▎  | 727/1000 [26:27<10:26,  2.30s/it]

loss: -10.95 | unlearn_loss: -12.09 | retain_loss: 1.137 | avg_entropy: 12.09 | param_change: 0.001442


 73%|███████▎  | 728/1000 [26:29<10:20,  2.28s/it]

loss: -10.85 | unlearn_loss: -12.01 | retain_loss: 1.158 | avg_entropy: 12.01 | param_change: 0.001343


 73%|███████▎  | 729/1000 [26:32<10:19,  2.28s/it]

loss: -10.03 | unlearn_loss: -11.9 | retain_loss: 1.865 | avg_entropy: 11.9 | param_change: 0.001898


 73%|███████▎  | 730/1000 [26:34<10:11,  2.27s/it]

loss: -11.2 | unlearn_loss: -12.08 | retain_loss: 0.8801 | avg_entropy: 12.08 | param_change: 0.0009604


 73%|███████▎  | 731/1000 [26:36<10:20,  2.31s/it]

loss: -11.08 | unlearn_loss: -12.09 | retain_loss: 1.003 | avg_entropy: 12.09 | param_change: 0.0009198


 73%|███████▎  | 732/1000 [26:39<10:32,  2.36s/it]

loss: -10.19 | unlearn_loss: -12.03 | retain_loss: 1.837 | avg_entropy: 12.03 | param_change: 0.002004


 73%|███████▎  | 733/1000 [26:41<10:25,  2.34s/it]

loss: -10.82 | unlearn_loss: -12.06 | retain_loss: 1.236 | avg_entropy: 12.06 | param_change: 0.001646


 73%|███████▎  | 734/1000 [26:43<09:57,  2.25s/it]

loss: -10.58 | unlearn_loss: -12.07 | retain_loss: 1.49 | avg_entropy: 12.07 | param_change: 0.002705


 74%|███████▎  | 735/1000 [26:45<09:58,  2.26s/it]

loss: -9.981 | unlearn_loss: -11.9 | retain_loss: 1.922 | avg_entropy: 11.9 | param_change: 0.002646


 74%|███████▎  | 736/1000 [26:48<10:09,  2.31s/it]

loss: -10.43 | unlearn_loss: -12.09 | retain_loss: 1.656 | avg_entropy: 12.09 | param_change: 0.002103


 74%|███████▎  | 737/1000 [26:50<10:01,  2.29s/it]

loss: -10.19 | unlearn_loss: -12.07 | retain_loss: 1.886 | avg_entropy: 12.07 | param_change: 0.003222


 74%|███████▍  | 738/1000 [26:52<09:41,  2.22s/it]

loss: -10.5 | unlearn_loss: -11.82 | retain_loss: 1.318 | avg_entropy: 11.82 | param_change: 0.001888


 74%|███████▍  | 739/1000 [26:54<09:23,  2.16s/it]

loss: -9.819 | unlearn_loss: -11.96 | retain_loss: 2.137 | avg_entropy: 11.96 | param_change: 0.003064


 74%|███████▍  | 740/1000 [26:56<09:17,  2.14s/it]

loss: -10.81 | unlearn_loss: -12.08 | retain_loss: 1.267 | avg_entropy: 12.08 | param_change: 0.001469


 74%|███████▍  | 741/1000 [26:58<09:18,  2.16s/it]

loss: -10.91 | unlearn_loss: -12.08 | retain_loss: 1.171 | avg_entropy: 12.08 | param_change: 0.0014


 74%|███████▍  | 742/1000 [27:01<09:21,  2.18s/it]

loss: -10.53 | unlearn_loss: -12.09 | retain_loss: 1.562 | avg_entropy: 12.09 | param_change: 0.00178


 74%|███████▍  | 743/1000 [27:03<09:32,  2.23s/it]

loss: -10.44 | unlearn_loss: -12.05 | retain_loss: 1.611 | avg_entropy: 12.05 | param_change: 0.002014


 74%|███████▍  | 744/1000 [27:06<10:06,  2.37s/it]

loss: -10.93 | unlearn_loss: -12.09 | retain_loss: 1.157 | avg_entropy: 12.09 | param_change: 0.001256


 74%|███████▍  | 745/1000 [27:08<09:41,  2.28s/it]

loss: -10.66 | unlearn_loss: -11.92 | retain_loss: 1.265 | avg_entropy: 11.92 | param_change: 0.001269


 75%|███████▍  | 746/1000 [27:10<09:34,  2.26s/it]

loss: -10.9 | unlearn_loss: -12.09 | retain_loss: 1.182 | avg_entropy: 12.09 | param_change: 0.00154


 75%|███████▍  | 747/1000 [27:12<09:25,  2.24s/it]

loss: -10.55 | unlearn_loss: -12.06 | retain_loss: 1.51 | avg_entropy: 12.06 | param_change: 0.002062


 75%|███████▍  | 748/1000 [27:14<09:33,  2.27s/it]

loss: -10.35 | unlearn_loss: -12.08 | retain_loss: 1.727 | avg_entropy: 12.08 | param_change: 0.0019


 75%|███████▍  | 749/1000 [27:16<09:10,  2.19s/it]

loss: -10.99 | unlearn_loss: -11.85 | retain_loss: 0.8633 | avg_entropy: 11.85 | param_change: 0.0008703


 75%|███████▌  | 750/1000 [27:19<09:42,  2.33s/it]

loss: -11.26 | unlearn_loss: -12.07 | retain_loss: 0.8117 | avg_entropy: 12.07 | param_change: 0.001135


 75%|███████▌  | 751/1000 [27:21<09:32,  2.30s/it]

loss: -10.59 | unlearn_loss: -12.07 | retain_loss: 1.485 | avg_entropy: 12.07 | param_change: 0.001828


 75%|███████▌  | 752/1000 [27:24<09:22,  2.27s/it]

loss: -10.9 | unlearn_loss: -12.08 | retain_loss: 1.182 | avg_entropy: 12.08 | param_change: 0.001642


 75%|███████▌  | 753/1000 [27:26<09:12,  2.24s/it]

loss: -8.427 | unlearn_loss: -9.461 | retain_loss: 1.033 | avg_entropy: 9.461 | param_change: 0.00137


 75%|███████▌  | 754/1000 [27:28<09:17,  2.27s/it]

loss: -10.51 | unlearn_loss: -11.91 | retain_loss: 1.399 | avg_entropy: 11.91 | param_change: 0.001421


 76%|███████▌  | 755/1000 [27:30<09:22,  2.30s/it]

loss: -10.22 | unlearn_loss: -11.02 | retain_loss: 0.7939 | avg_entropy: 11.02 | param_change: 0.001161


 76%|███████▌  | 756/1000 [27:32<08:58,  2.21s/it]

loss: -10.59 | unlearn_loss: -12.07 | retain_loss: 1.478 | avg_entropy: 12.07 | param_change: 0.002332


 76%|███████▌  | 757/1000 [27:34<08:42,  2.15s/it]

loss: -8.858 | unlearn_loss: -11.91 | retain_loss: 3.054 | avg_entropy: 11.91 | param_change: 0.002783


 76%|███████▌  | 758/1000 [27:36<08:28,  2.10s/it]

loss: -10.99 | unlearn_loss: -11.95 | retain_loss: 0.9614 | avg_entropy: 11.95 | param_change: 0.001293


 76%|███████▌  | 759/1000 [27:39<09:17,  2.31s/it]

loss: -10.78 | unlearn_loss: -11.72 | retain_loss: 0.9386 | avg_entropy: 11.72 | param_change: 0.001442


 76%|███████▌  | 760/1000 [27:42<09:20,  2.33s/it]

loss: -10.5 | unlearn_loss: -11.93 | retain_loss: 1.433 | avg_entropy: 11.93 | param_change: 0.002041


 76%|███████▌  | 761/1000 [27:44<09:46,  2.45s/it]

loss: -11.2 | unlearn_loss: -11.94 | retain_loss: 0.7337 | avg_entropy: 11.94 | param_change: 0.000829


 76%|███████▌  | 762/1000 [27:47<09:44,  2.45s/it]

loss: -10.42 | unlearn_loss: -11.99 | retain_loss: 1.563 | avg_entropy: 11.99 | param_change: 0.003168


 76%|███████▋  | 763/1000 [27:50<10:09,  2.57s/it]

loss: -10.62 | unlearn_loss: -12.08 | retain_loss: 1.454 | avg_entropy: 12.08 | param_change: 0.002706


 76%|███████▋  | 764/1000 [27:52<10:00,  2.55s/it]

loss: -11.24 | unlearn_loss: -11.97 | retain_loss: 0.7363 | avg_entropy: 11.97 | param_change: 0.0009013


 76%|███████▋  | 765/1000 [27:54<09:41,  2.47s/it]

loss: -11.22 | unlearn_loss: -12.02 | retain_loss: 0.8002 | avg_entropy: 12.02 | param_change: 0.001113


 77%|███████▋  | 766/1000 [27:57<09:58,  2.56s/it]

loss: -10.99 | unlearn_loss: -11.88 | retain_loss: 0.8815 | avg_entropy: 11.88 | param_change: 0.001204


 77%|███████▋  | 767/1000 [27:59<09:38,  2.48s/it]

loss: -8.42 | unlearn_loss: -12.09 | retain_loss: 3.674 | avg_entropy: 12.09 | param_change: 0.005736


 77%|███████▋  | 768/1000 [28:02<09:20,  2.41s/it]

loss: -11.2 | unlearn_loss: -12.1 | retain_loss: 0.8989 | avg_entropy: 12.1 | param_change: 0.001844


 77%|███████▋  | 769/1000 [28:04<09:01,  2.34s/it]

loss: -10.97 | unlearn_loss: -11.99 | retain_loss: 1.02 | avg_entropy: 11.99 | param_change: 0.001569


 77%|███████▋  | 770/1000 [28:06<08:44,  2.28s/it]

loss: -10.72 | unlearn_loss: -12.09 | retain_loss: 1.364 | avg_entropy: 12.09 | param_change: 0.001519


 77%|███████▋  | 771/1000 [28:08<08:42,  2.28s/it]

loss: -10.8 | unlearn_loss: -12.09 | retain_loss: 1.289 | avg_entropy: 12.09 | param_change: 0.001847


 77%|███████▋  | 772/1000 [28:11<08:45,  2.31s/it]

loss: -8.766 | unlearn_loss: -11.96 | retain_loss: 3.197 | avg_entropy: 11.96 | param_change: 0.004642


 77%|███████▋  | 773/1000 [28:13<08:32,  2.26s/it]

loss: -10.45 | unlearn_loss: -12.1 | retain_loss: 1.648 | avg_entropy: 12.1 | param_change: 0.002446


 77%|███████▋  | 774/1000 [28:15<08:33,  2.27s/it]

loss: -10.8 | unlearn_loss: -12.1 | retain_loss: 1.292 | avg_entropy: 12.1 | param_change: 0.002255


 78%|███████▊  | 775/1000 [28:17<08:33,  2.28s/it]

loss: -10.39 | unlearn_loss: -12.1 | retain_loss: 1.705 | avg_entropy: 12.1 | param_change: 0.003236


 78%|███████▊  | 776/1000 [28:20<08:29,  2.27s/it]

loss: -11.03 | unlearn_loss: -12.1 | retain_loss: 1.073 | avg_entropy: 12.1 | param_change: 0.001375


 78%|███████▊  | 777/1000 [28:22<08:12,  2.21s/it]

loss: -10.09 | unlearn_loss: -12.1 | retain_loss: 2.006 | avg_entropy: 12.1 | param_change: 0.005169


 78%|███████▊  | 778/1000 [28:24<08:14,  2.23s/it]

loss: -10.67 | unlearn_loss: -11.5 | retain_loss: 0.829 | avg_entropy: 11.5 | param_change: 0.001628


 78%|███████▊  | 779/1000 [28:26<08:19,  2.26s/it]

loss: -11.26 | unlearn_loss: -12.1 | retain_loss: 0.8377 | avg_entropy: 12.1 | param_change: 0.001396


 78%|███████▊  | 780/1000 [28:29<08:17,  2.26s/it]

loss: -10.16 | unlearn_loss: -12 | retain_loss: 1.836 | avg_entropy: 12 | param_change: 0.002692


 78%|███████▊  | 781/1000 [28:31<08:20,  2.29s/it]

loss: -10.45 | unlearn_loss: -12.09 | retain_loss: 1.646 | avg_entropy: 12.09 | param_change: 0.002384


 78%|███████▊  | 782/1000 [28:33<08:00,  2.20s/it]

loss: -11.03 | unlearn_loss: -11.94 | retain_loss: 0.9088 | avg_entropy: 11.94 | param_change: 0.001483


 78%|███████▊  | 783/1000 [28:35<07:51,  2.17s/it]

loss: -10.62 | unlearn_loss: -12.06 | retain_loss: 1.446 | avg_entropy: 12.06 | param_change: 0.001785


 78%|███████▊  | 784/1000 [28:37<07:58,  2.21s/it]

loss: -10.78 | unlearn_loss: -11.95 | retain_loss: 1.163 | avg_entropy: 11.95 | param_change: 0.00171


 78%|███████▊  | 785/1000 [28:40<08:10,  2.28s/it]

loss: -11.16 | unlearn_loss: -12.09 | retain_loss: 0.9276 | avg_entropy: 12.09 | param_change: 0.001783


 79%|███████▊  | 786/1000 [28:42<07:57,  2.23s/it]

loss: -11.15 | unlearn_loss: -12.02 | retain_loss: 0.8694 | avg_entropy: 12.02 | param_change: 0.001357


 79%|███████▊  | 787/1000 [28:44<07:47,  2.20s/it]

loss: -10.41 | unlearn_loss: -11.83 | retain_loss: 1.417 | avg_entropy: 11.83 | param_change: 0.001681


 79%|███████▉  | 788/1000 [28:47<08:22,  2.37s/it]

loss: -10.83 | unlearn_loss: -12.1 | retain_loss: 1.274 | avg_entropy: 12.1 | param_change: 0.001434


 79%|███████▉  | 789/1000 [28:49<08:33,  2.43s/it]

loss: -10.73 | unlearn_loss: -11.92 | retain_loss: 1.191 | avg_entropy: 11.92 | param_change: 0.001365


 79%|███████▉  | 790/1000 [28:52<08:49,  2.52s/it]

loss: -10.8 | unlearn_loss: -12.1 | retain_loss: 1.294 | avg_entropy: 12.1 | param_change: 0.001936


 79%|███████▉  | 791/1000 [28:55<08:49,  2.53s/it]

loss: -11 | unlearn_loss: -12.1 | retain_loss: 1.097 | avg_entropy: 12.1 | param_change: 0.001294


 79%|███████▉  | 792/1000 [28:57<08:41,  2.51s/it]

loss: -11.23 | unlearn_loss: -12.1 | retain_loss: 0.877 | avg_entropy: 12.1 | param_change: 0.001986


 79%|███████▉  | 793/1000 [28:59<08:18,  2.41s/it]

loss: -10.59 | unlearn_loss: -11.91 | retain_loss: 1.326 | avg_entropy: 11.91 | param_change: 0.002078


 79%|███████▉  | 794/1000 [29:02<08:26,  2.46s/it]

loss: -10.68 | unlearn_loss: -11.87 | retain_loss: 1.19 | avg_entropy: 11.87 | param_change: 0.00219


 80%|███████▉  | 795/1000 [29:04<08:24,  2.46s/it]

loss: -11.15 | unlearn_loss: -12.07 | retain_loss: 0.9212 | avg_entropy: 12.07 | param_change: 0.001116


 80%|███████▉  | 796/1000 [29:07<08:01,  2.36s/it]

loss: -11.02 | unlearn_loss: -12.04 | retain_loss: 1.015 | avg_entropy: 12.04 | param_change: 0.001126


 80%|███████▉  | 797/1000 [29:09<08:03,  2.38s/it]

loss: -11.12 | unlearn_loss: -11.89 | retain_loss: 0.7754 | avg_entropy: 11.89 | param_change: 0.00131


 80%|███████▉  | 798/1000 [29:11<07:51,  2.33s/it]

loss: -10.8 | unlearn_loss: -12.1 | retain_loss: 1.302 | avg_entropy: 12.1 | param_change: 0.002057


 80%|███████▉  | 799/1000 [29:13<07:44,  2.31s/it]

loss: -9.682 | unlearn_loss: -10.71 | retain_loss: 1.033 | avg_entropy: 10.71 | param_change: 0.003122


 80%|████████  | 800/1000 [29:15<07:21,  2.21s/it]

loss: -11.19 | unlearn_loss: -12.04 | retain_loss: 0.8523 | avg_entropy: 12.04 | param_change: 0.001125


 80%|████████  | 801/1000 [29:18<07:23,  2.23s/it]

loss: -11.38 | unlearn_loss: -12.09 | retain_loss: 0.7127 | avg_entropy: 12.09 | param_change: 0.001347


 80%|████████  | 802/1000 [29:20<07:20,  2.22s/it]

loss: -10.75 | unlearn_loss: -11.73 | retain_loss: 0.9742 | avg_entropy: 11.73 | param_change: 0.001535


 80%|████████  | 803/1000 [29:22<07:17,  2.22s/it]

loss: -10.95 | unlearn_loss: -11.95 | retain_loss: 0.9924 | avg_entropy: 11.95 | param_change: 0.002003


 80%|████████  | 804/1000 [29:24<07:15,  2.22s/it]

loss: -10.46 | unlearn_loss: -12.06 | retain_loss: 1.599 | avg_entropy: 12.06 | param_change: 0.002822


 80%|████████  | 805/1000 [29:26<07:09,  2.20s/it]

loss: -10.91 | unlearn_loss: -11.99 | retain_loss: 1.076 | avg_entropy: 11.99 | param_change: 0.001664


 81%|████████  | 806/1000 [29:29<06:57,  2.15s/it]

loss: -10.95 | unlearn_loss: -12.09 | retain_loss: 1.148 | avg_entropy: 12.09 | param_change: 0.001404


 81%|████████  | 807/1000 [29:31<06:49,  2.12s/it]

loss: -10.26 | unlearn_loss: -11.73 | retain_loss: 1.472 | avg_entropy: 11.73 | param_change: 0.002971


 81%|████████  | 808/1000 [29:33<06:49,  2.13s/it]

loss: -10.51 | unlearn_loss: -11.84 | retain_loss: 1.337 | avg_entropy: 11.84 | param_change: 0.002289


 81%|████████  | 809/1000 [29:35<06:41,  2.10s/it]

loss: -10.71 | unlearn_loss: -11.94 | retain_loss: 1.23 | avg_entropy: 11.94 | param_change: 0.002329


 81%|████████  | 810/1000 [29:37<06:44,  2.13s/it]

loss: -7.544 | unlearn_loss: -12.06 | retain_loss: 4.518 | avg_entropy: 12.06 | param_change: 0.004153


 81%|████████  | 811/1000 [29:39<06:42,  2.13s/it]

loss: -8.906 | unlearn_loss: -11.78 | retain_loss: 2.871 | avg_entropy: 11.78 | param_change: 0.002757


 81%|████████  | 812/1000 [29:41<06:48,  2.17s/it]

loss: -9.988 | unlearn_loss: -12.01 | retain_loss: 2.027 | avg_entropy: 12.01 | param_change: 0.0029


 81%|████████▏ | 813/1000 [29:43<06:36,  2.12s/it]

loss: -10.78 | unlearn_loss: -12.07 | retain_loss: 1.282 | avg_entropy: 12.07 | param_change: 0.001441


 81%|████████▏ | 814/1000 [29:45<06:31,  2.10s/it]

loss: -9.825 | unlearn_loss: -11.87 | retain_loss: 2.05 | avg_entropy: 11.87 | param_change: 0.00285


 82%|████████▏ | 815/1000 [29:48<06:39,  2.16s/it]

loss: -10.24 | unlearn_loss: -12.07 | retain_loss: 1.822 | avg_entropy: 12.07 | param_change: 0.002924


 82%|████████▏ | 816/1000 [29:50<06:43,  2.19s/it]

loss: -11.14 | unlearn_loss: -12.07 | retain_loss: 0.9273 | avg_entropy: 12.07 | param_change: 0.001257


 82%|████████▏ | 817/1000 [29:52<06:41,  2.20s/it]

loss: -10.36 | unlearn_loss: -12.07 | retain_loss: 1.711 | avg_entropy: 12.07 | param_change: 0.002306


 82%|████████▏ | 818/1000 [29:54<06:37,  2.18s/it]

loss: -10.96 | unlearn_loss: -11.85 | retain_loss: 0.8913 | avg_entropy: 11.85 | param_change: 0.001104


 82%|████████▏ | 819/1000 [29:57<06:36,  2.19s/it]

loss: -10.71 | unlearn_loss: -12.05 | retain_loss: 1.342 | avg_entropy: 12.05 | param_change: 0.001535


 82%|████████▏ | 820/1000 [29:59<06:29,  2.16s/it]

loss: -10.28 | unlearn_loss: -12.1 | retain_loss: 1.82 | avg_entropy: 12.1 | param_change: 0.002417


 82%|████████▏ | 821/1000 [30:01<06:26,  2.16s/it]

loss: -10.33 | unlearn_loss: -12.04 | retain_loss: 1.705 | avg_entropy: 12.04 | param_change: 0.002248


 82%|████████▏ | 822/1000 [30:03<06:26,  2.17s/it]

loss: -10.96 | unlearn_loss: -12.07 | retain_loss: 1.119 | avg_entropy: 12.07 | param_change: 0.002203


 82%|████████▏ | 823/1000 [30:05<06:30,  2.21s/it]

loss: -10.38 | unlearn_loss: -12.05 | retain_loss: 1.668 | avg_entropy: 12.05 | param_change: 0.002011


 82%|████████▏ | 824/1000 [30:07<06:22,  2.17s/it]

loss: -10.38 | unlearn_loss: -11.92 | retain_loss: 1.535 | avg_entropy: 11.92 | param_change: 0.002098


 82%|████████▎ | 825/1000 [30:10<06:24,  2.20s/it]

loss: -10.49 | unlearn_loss: -11.86 | retain_loss: 1.366 | avg_entropy: 11.86 | param_change: 0.001796


 83%|████████▎ | 826/1000 [30:12<06:17,  2.17s/it]

loss: -10.94 | unlearn_loss: -12.06 | retain_loss: 1.111 | avg_entropy: 12.06 | param_change: 0.001609


 83%|████████▎ | 827/1000 [30:14<06:14,  2.17s/it]

loss: -10.33 | unlearn_loss: -11.89 | retain_loss: 1.553 | avg_entropy: 11.89 | param_change: 0.002547


 83%|████████▎ | 828/1000 [30:16<06:09,  2.15s/it]

loss: -10.15 | unlearn_loss: -11.89 | retain_loss: 1.743 | avg_entropy: 11.89 | param_change: 0.002868


 83%|████████▎ | 829/1000 [30:18<06:15,  2.19s/it]

loss: -10.22 | unlearn_loss: -12.07 | retain_loss: 1.849 | avg_entropy: 12.07 | param_change: 0.003719


 83%|████████▎ | 830/1000 [30:20<06:06,  2.16s/it]

loss: -10.92 | unlearn_loss: -12.07 | retain_loss: 1.158 | avg_entropy: 12.07 | param_change: 0.001926


 83%|████████▎ | 831/1000 [30:23<06:25,  2.28s/it]

loss: -10.43 | unlearn_loss: -12.04 | retain_loss: 1.609 | avg_entropy: 12.04 | param_change: 0.002315


 83%|████████▎ | 832/1000 [30:25<06:21,  2.27s/it]

loss: -10.33 | unlearn_loss: -11.89 | retain_loss: 1.558 | avg_entropy: 11.89 | param_change: 0.001988


 83%|████████▎ | 833/1000 [30:27<06:10,  2.22s/it]

loss: -10.86 | unlearn_loss: -12.07 | retain_loss: 1.204 | avg_entropy: 12.07 | param_change: 0.001535


 83%|████████▎ | 834/1000 [30:30<06:10,  2.23s/it]

loss: -10.98 | unlearn_loss: -12.04 | retain_loss: 1.056 | avg_entropy: 12.04 | param_change: 0.001547


 84%|████████▎ | 835/1000 [30:32<06:07,  2.22s/it]

loss: -10.76 | unlearn_loss: -12.1 | retain_loss: 1.339 | avg_entropy: 12.1 | param_change: 0.002264


 84%|████████▎ | 836/1000 [30:34<06:04,  2.22s/it]

loss: -10.67 | unlearn_loss: -12.05 | retain_loss: 1.374 | avg_entropy: 12.05 | param_change: 0.00229


 84%|████████▎ | 837/1000 [30:36<06:03,  2.23s/it]

loss: -11.24 | unlearn_loss: -12.1 | retain_loss: 0.8602 | avg_entropy: 12.1 | param_change: 0.00145


 84%|████████▍ | 838/1000 [30:38<05:52,  2.18s/it]

loss: -9.791 | unlearn_loss: -11.95 | retain_loss: 2.159 | avg_entropy: 11.95 | param_change: 0.002337


 84%|████████▍ | 839/1000 [30:40<05:39,  2.11s/it]

loss: -10.81 | unlearn_loss: -12.1 | retain_loss: 1.286 | avg_entropy: 12.1 | param_change: 0.002118


 84%|████████▍ | 840/1000 [30:42<05:45,  2.16s/it]

loss: -10.88 | unlearn_loss: -11.89 | retain_loss: 1.011 | avg_entropy: 11.89 | param_change: 0.001108


 84%|████████▍ | 841/1000 [30:45<05:56,  2.24s/it]

loss: -11.03 | unlearn_loss: -12.07 | retain_loss: 1.036 | avg_entropy: 12.07 | param_change: 0.001374


 84%|████████▍ | 842/1000 [30:47<06:01,  2.29s/it]

loss: -10.91 | unlearn_loss: -12.09 | retain_loss: 1.178 | avg_entropy: 12.09 | param_change: 0.001404


 84%|████████▍ | 843/1000 [30:50<05:55,  2.26s/it]

loss: -11.01 | unlearn_loss: -12.09 | retain_loss: 1.077 | avg_entropy: 12.09 | param_change: 0.001627


 84%|████████▍ | 844/1000 [30:52<05:49,  2.24s/it]

loss: -10.72 | unlearn_loss: -12.08 | retain_loss: 1.362 | avg_entropy: 12.08 | param_change: 0.001807


 84%|████████▍ | 845/1000 [30:54<05:56,  2.30s/it]

loss: -10.82 | unlearn_loss: -12.08 | retain_loss: 1.264 | avg_entropy: 12.08 | param_change: 0.001535


 85%|████████▍ | 846/1000 [30:56<05:49,  2.27s/it]

loss: -8.935 | unlearn_loss: -11.95 | retain_loss: 3.019 | avg_entropy: 11.95 | param_change: 0.003948


 85%|████████▍ | 847/1000 [30:58<05:33,  2.18s/it]

loss: -7.047 | unlearn_loss: -12.09 | retain_loss: 5.043 | avg_entropy: 12.09 | param_change: 0.00529


 85%|████████▍ | 848/1000 [31:00<05:23,  2.13s/it]

loss: -6.686 | unlearn_loss: -11.8 | retain_loss: 5.115 | avg_entropy: 11.8 | param_change: 0.004844


 85%|████████▍ | 849/1000 [31:02<05:16,  2.10s/it]

loss: -9.147 | unlearn_loss: -12.04 | retain_loss: 2.897 | avg_entropy: 12.04 | param_change: 0.003099


 85%|████████▌ | 850/1000 [31:04<05:13,  2.09s/it]

loss: -9.409 | unlearn_loss: -12.02 | retain_loss: 2.61 | avg_entropy: 12.02 | param_change: 0.003027


 85%|████████▌ | 851/1000 [31:06<05:08,  2.07s/it]

loss: -11.08 | unlearn_loss: -12.06 | retain_loss: 0.977 | avg_entropy: 12.06 | param_change: 0.001402


 85%|████████▌ | 852/1000 [31:09<05:06,  2.07s/it]

loss: -10.85 | unlearn_loss: -11.81 | retain_loss: 0.9639 | avg_entropy: 11.81 | param_change: 0.001252


 85%|████████▌ | 853/1000 [31:11<05:09,  2.11s/it]

loss: -10.51 | unlearn_loss: -12.09 | retain_loss: 1.578 | avg_entropy: 12.09 | param_change: 0.002425


 85%|████████▌ | 854/1000 [31:13<05:06,  2.10s/it]

loss: -10.57 | unlearn_loss: -12.1 | retain_loss: 1.53 | avg_entropy: 12.1 | param_change: 0.00168


 86%|████████▌ | 855/1000 [31:15<05:10,  2.14s/it]

loss: -11.02 | unlearn_loss: -11.95 | retain_loss: 0.9341 | avg_entropy: 11.95 | param_change: 0.001317


 86%|████████▌ | 856/1000 [31:17<05:05,  2.12s/it]

loss: -10.75 | unlearn_loss: -12.06 | retain_loss: 1.309 | avg_entropy: 12.06 | param_change: 0.001541


 86%|████████▌ | 857/1000 [31:19<05:05,  2.14s/it]

loss: -10.85 | unlearn_loss: -12.07 | retain_loss: 1.223 | avg_entropy: 12.07 | param_change: 0.001714


 86%|████████▌ | 858/1000 [31:21<05:00,  2.11s/it]

loss: -10.86 | unlearn_loss: -12.09 | retain_loss: 1.234 | avg_entropy: 12.09 | param_change: 0.001724


 86%|████████▌ | 859/1000 [31:23<04:52,  2.08s/it]

loss: -10.79 | unlearn_loss: -12.08 | retain_loss: 1.291 | avg_entropy: 12.08 | param_change: 0.001774


 86%|████████▌ | 860/1000 [31:25<04:51,  2.08s/it]

loss: -10.59 | unlearn_loss: -12.05 | retain_loss: 1.462 | avg_entropy: 12.05 | param_change: 0.001919


 86%|████████▌ | 861/1000 [31:27<04:47,  2.07s/it]

loss: -10.61 | unlearn_loss: -12.1 | retain_loss: 1.482 | avg_entropy: 12.1 | param_change: 0.001923


 86%|████████▌ | 862/1000 [31:30<04:50,  2.11s/it]

loss: -9.974 | unlearn_loss: -12.1 | retain_loss: 2.127 | avg_entropy: 12.1 | param_change: 0.00313


 86%|████████▋ | 863/1000 [31:32<04:56,  2.16s/it]

loss: -9.893 | unlearn_loss: -12.08 | retain_loss: 2.184 | avg_entropy: 12.08 | param_change: 0.003099


 86%|████████▋ | 864/1000 [31:34<04:57,  2.19s/it]

loss: -10.1 | unlearn_loss: -11.86 | retain_loss: 1.76 | avg_entropy: 11.86 | param_change: 0.002119


 86%|████████▋ | 865/1000 [31:36<04:58,  2.21s/it]

loss: -10.79 | unlearn_loss: -11.93 | retain_loss: 1.135 | avg_entropy: 11.93 | param_change: 0.001674


 87%|████████▋ | 866/1000 [31:39<05:00,  2.24s/it]

loss: -10.97 | unlearn_loss: -11.87 | retain_loss: 0.9011 | avg_entropy: 11.87 | param_change: 0.001214


 87%|████████▋ | 867/1000 [31:41<04:57,  2.23s/it]

loss: -10.89 | unlearn_loss: -11.9 | retain_loss: 1.006 | avg_entropy: 11.9 | param_change: 0.00147


 87%|████████▋ | 868/1000 [31:44<05:14,  2.38s/it]

loss: -11.13 | unlearn_loss: -12.07 | retain_loss: 0.934 | avg_entropy: 12.07 | param_change: 0.001208


 87%|████████▋ | 869/1000 [31:46<05:12,  2.39s/it]

loss: -11.23 | unlearn_loss: -12.09 | retain_loss: 0.8612 | avg_entropy: 12.09 | param_change: 0.001121


 87%|████████▋ | 870/1000 [31:49<05:12,  2.41s/it]

loss: -10.97 | unlearn_loss: -12.08 | retain_loss: 1.108 | avg_entropy: 12.08 | param_change: 0.001077


 87%|████████▋ | 871/1000 [31:51<05:21,  2.49s/it]

loss: -10.73 | unlearn_loss: -12.08 | retain_loss: 1.357 | avg_entropy: 12.08 | param_change: 0.001542


 87%|████████▋ | 872/1000 [31:54<05:21,  2.51s/it]

loss: -11.33 | unlearn_loss: -12.09 | retain_loss: 0.764 | avg_entropy: 12.09 | param_change: 0.0008917


 87%|████████▋ | 873/1000 [31:56<05:12,  2.46s/it]

loss: -10.69 | unlearn_loss: -12.07 | retain_loss: 1.387 | avg_entropy: 12.07 | param_change: 0.001425


 87%|████████▋ | 874/1000 [31:59<05:05,  2.42s/it]

loss: -11.35 | unlearn_loss: -12.1 | retain_loss: 0.7584 | avg_entropy: 12.1 | param_change: 0.0008685


 88%|████████▊ | 875/1000 [32:01<05:00,  2.40s/it]

loss: -10.54 | unlearn_loss: -11.87 | retain_loss: 1.33 | avg_entropy: 11.87 | param_change: 0.001447


 88%|████████▊ | 876/1000 [32:03<04:59,  2.42s/it]

loss: -5.813 | unlearn_loss: -11.85 | retain_loss: 6.036 | avg_entropy: 11.85 | param_change: 0.006262


 88%|████████▊ | 877/1000 [32:05<04:40,  2.28s/it]

loss: -8.903 | unlearn_loss: -12.02 | retain_loss: 3.118 | avg_entropy: 12.02 | param_change: 0.003764


 88%|████████▊ | 878/1000 [32:08<04:39,  2.29s/it]

loss: -10.14 | unlearn_loss: -12.08 | retain_loss: 1.94 | avg_entropy: 12.08 | param_change: 0.00222


 88%|████████▊ | 879/1000 [32:10<04:42,  2.33s/it]

loss: -11.1 | unlearn_loss: -11.9 | retain_loss: 0.7991 | avg_entropy: 11.9 | param_change: 0.001053


 88%|████████▊ | 880/1000 [32:12<04:40,  2.33s/it]

loss: -10.45 | unlearn_loss: -12.11 | retain_loss: 1.657 | avg_entropy: 12.11 | param_change: 0.003153


 88%|████████▊ | 881/1000 [32:15<04:43,  2.38s/it]

loss: -10.6 | unlearn_loss: -12.09 | retain_loss: 1.493 | avg_entropy: 12.09 | param_change: 0.00242


 88%|████████▊ | 882/1000 [32:17<04:44,  2.41s/it]

loss: -10.84 | unlearn_loss: -12 | retain_loss: 1.159 | avg_entropy: 12 | param_change: 0.001508


 88%|████████▊ | 883/1000 [32:20<04:40,  2.40s/it]

loss: -10.59 | unlearn_loss: -12.08 | retain_loss: 1.495 | avg_entropy: 12.08 | param_change: 0.001354


 88%|████████▊ | 884/1000 [32:22<04:49,  2.49s/it]

loss: -10.83 | unlearn_loss: -11.95 | retain_loss: 1.121 | avg_entropy: 11.95 | param_change: 0.001566


 88%|████████▊ | 885/1000 [32:25<04:34,  2.38s/it]

loss: -10.31 | unlearn_loss: -12.09 | retain_loss: 1.779 | avg_entropy: 12.09 | param_change: 0.002396


 89%|████████▊ | 886/1000 [32:27<04:33,  2.40s/it]

loss: -10.65 | unlearn_loss: -12.11 | retain_loss: 1.461 | avg_entropy: 12.11 | param_change: 0.001559


 89%|████████▊ | 887/1000 [32:29<04:35,  2.44s/it]

loss: -10.92 | unlearn_loss: -11.87 | retain_loss: 0.9485 | avg_entropy: 11.87 | param_change: 0.0009076


 89%|████████▉ | 888/1000 [32:32<04:27,  2.39s/it]

loss: -10.79 | unlearn_loss: -11.95 | retain_loss: 1.165 | avg_entropy: 11.95 | param_change: 0.001221


 89%|████████▉ | 889/1000 [32:34<04:32,  2.45s/it]

loss: -11.05 | unlearn_loss: -12.05 | retain_loss: 0.9985 | avg_entropy: 12.05 | param_change: 0.001087


 89%|████████▉ | 890/1000 [32:37<04:25,  2.41s/it]

loss: -11.03 | unlearn_loss: -12.09 | retain_loss: 1.064 | avg_entropy: 12.09 | param_change: 0.001511


 89%|████████▉ | 891/1000 [32:40<04:37,  2.55s/it]

loss: -10.7 | unlearn_loss: -11.68 | retain_loss: 0.9792 | avg_entropy: 11.68 | param_change: 0.0009227


 89%|████████▉ | 892/1000 [32:42<04:25,  2.45s/it]

loss: -10.47 | unlearn_loss: -12.09 | retain_loss: 1.617 | avg_entropy: 12.09 | param_change: 0.001495


 89%|████████▉ | 893/1000 [32:44<04:18,  2.42s/it]

loss: -10.83 | unlearn_loss: -11.79 | retain_loss: 0.9567 | avg_entropy: 11.79 | param_change: 0.0009676


 89%|████████▉ | 894/1000 [32:46<04:13,  2.39s/it]

loss: -10.51 | unlearn_loss: -12.04 | retain_loss: 1.535 | avg_entropy: 12.04 | param_change: 0.001874


 90%|████████▉ | 895/1000 [32:48<03:59,  2.28s/it]

loss: -10.27 | unlearn_loss: -12.11 | retain_loss: 1.839 | avg_entropy: 12.11 | param_change: 0.002362


 90%|████████▉ | 896/1000 [32:51<03:59,  2.30s/it]

loss: -11 | unlearn_loss: -12.07 | retain_loss: 1.077 | avg_entropy: 12.07 | param_change: 0.001279


 90%|████████▉ | 897/1000 [32:53<03:58,  2.32s/it]

loss: -10.87 | unlearn_loss: -12.09 | retain_loss: 1.214 | avg_entropy: 12.09 | param_change: 0.001653


 90%|████████▉ | 898/1000 [32:56<04:07,  2.42s/it]

loss: -8.772 | unlearn_loss: -12.05 | retain_loss: 3.281 | avg_entropy: 12.05 | param_change: 0.006318


 90%|████████▉ | 899/1000 [32:58<03:56,  2.34s/it]

loss: -10.82 | unlearn_loss: -11.95 | retain_loss: 1.13 | avg_entropy: 11.95 | param_change: 0.001209


 90%|█████████ | 900/1000 [33:01<04:00,  2.41s/it]

loss: -10.45 | unlearn_loss: -11.83 | retain_loss: 1.378 | avg_entropy: 11.83 | param_change: 0.001831


 90%|█████████ | 901/1000 [33:03<04:07,  2.50s/it]

loss: -10.22 | unlearn_loss: -11.77 | retain_loss: 1.553 | avg_entropy: 11.77 | param_change: 0.001892


 90%|█████████ | 902/1000 [33:05<03:53,  2.39s/it]

loss: -10.53 | unlearn_loss: -11.93 | retain_loss: 1.408 | avg_entropy: 11.93 | param_change: 0.001444


 90%|█████████ | 903/1000 [33:08<03:46,  2.34s/it]

loss: -9.81 | unlearn_loss: -12.08 | retain_loss: 2.269 | avg_entropy: 12.08 | param_change: 0.001869


 90%|█████████ | 904/1000 [33:10<03:50,  2.41s/it]

loss: -9.874 | unlearn_loss: -11.83 | retain_loss: 1.955 | avg_entropy: 11.83 | param_change: 0.003031


 90%|█████████ | 905/1000 [33:12<03:39,  2.31s/it]

loss: -10.29 | unlearn_loss: -12.1 | retain_loss: 1.809 | avg_entropy: 12.1 | param_change: 0.002201


 91%|█████████ | 906/1000 [33:14<03:30,  2.24s/it]

loss: -10.97 | unlearn_loss: -11.95 | retain_loss: 0.9793 | avg_entropy: 11.95 | param_change: 0.001461


 91%|█████████ | 907/1000 [33:16<03:23,  2.19s/it]

loss: -10.07 | unlearn_loss: -12.08 | retain_loss: 2.006 | avg_entropy: 12.08 | param_change: 0.002424


 91%|█████████ | 908/1000 [33:19<03:20,  2.18s/it]

loss: -10.94 | unlearn_loss: -11.91 | retain_loss: 0.9704 | avg_entropy: 11.91 | param_change: 0.001387


 91%|█████████ | 909/1000 [33:21<03:14,  2.13s/it]

loss: -6.446 | unlearn_loss: -7.637 | retain_loss: 1.19 | avg_entropy: 7.637 | param_change: 0.001754


 91%|█████████ | 910/1000 [33:23<03:16,  2.18s/it]

loss: -11.29 | unlearn_loss: -12.1 | retain_loss: 0.8015 | avg_entropy: 12.1 | param_change: 0.001125


 91%|█████████ | 911/1000 [33:25<03:14,  2.18s/it]

loss: -10.87 | unlearn_loss: -12.08 | retain_loss: 1.213 | avg_entropy: 12.08 | param_change: 0.002242


 91%|█████████ | 912/1000 [33:27<03:14,  2.21s/it]

loss: -10.53 | unlearn_loss: -11.69 | retain_loss: 1.166 | avg_entropy: 11.69 | param_change: 0.0015


 91%|█████████▏| 913/1000 [33:30<03:10,  2.20s/it]

loss: -11.01 | unlearn_loss: -12.09 | retain_loss: 1.074 | avg_entropy: 12.09 | param_change: 0.00121


 91%|█████████▏| 914/1000 [33:32<03:07,  2.19s/it]

loss: -11.15 | unlearn_loss: -12.08 | retain_loss: 0.9304 | avg_entropy: 12.08 | param_change: 0.001638


 92%|█████████▏| 915/1000 [33:34<03:02,  2.15s/it]

loss: -10.8 | unlearn_loss: -12.04 | retain_loss: 1.233 | avg_entropy: 12.04 | param_change: 0.001731


 92%|█████████▏| 916/1000 [33:36<02:59,  2.13s/it]

loss: -10.85 | unlearn_loss: -11.95 | retain_loss: 1.102 | avg_entropy: 11.95 | param_change: 0.001351


 92%|█████████▏| 917/1000 [33:38<03:00,  2.18s/it]

loss: -10.88 | unlearn_loss: -12.1 | retain_loss: 1.22 | avg_entropy: 12.1 | param_change: 0.00189


 92%|█████████▏| 918/1000 [33:40<02:55,  2.14s/it]

loss: -11.2 | unlearn_loss: -12.11 | retain_loss: 0.9078 | avg_entropy: 12.11 | param_change: 0.001587


 92%|█████████▏| 919/1000 [33:42<02:50,  2.10s/it]

loss: -10.88 | unlearn_loss: -12.06 | retain_loss: 1.187 | avg_entropy: 12.06 | param_change: 0.001221


 92%|█████████▏| 920/1000 [33:45<02:54,  2.18s/it]

loss: -10.9 | unlearn_loss: -11.91 | retain_loss: 1.011 | avg_entropy: 11.91 | param_change: 0.001706


 92%|█████████▏| 921/1000 [33:47<02:54,  2.20s/it]

loss: -11.37 | unlearn_loss: -12.08 | retain_loss: 0.7109 | avg_entropy: 12.08 | param_change: 0.001036


 92%|█████████▏| 922/1000 [33:49<02:48,  2.17s/it]

loss: -10.89 | unlearn_loss: -12.08 | retain_loss: 1.198 | avg_entropy: 12.08 | param_change: 0.002078


 92%|█████████▏| 923/1000 [33:51<02:44,  2.14s/it]

loss: -11.16 | unlearn_loss: -11.88 | retain_loss: 0.7209 | avg_entropy: 11.88 | param_change: 0.0009219


 92%|█████████▏| 924/1000 [33:53<02:42,  2.13s/it]

loss: -11.03 | unlearn_loss: -12.08 | retain_loss: 1.054 | avg_entropy: 12.08 | param_change: 0.001554


 92%|█████████▎| 925/1000 [33:55<02:40,  2.13s/it]

loss: -11.14 | unlearn_loss: -11.9 | retain_loss: 0.7592 | avg_entropy: 11.9 | param_change: 0.001193


 93%|█████████▎| 926/1000 [33:57<02:38,  2.15s/it]

loss: -11.43 | unlearn_loss: -12.09 | retain_loss: 0.6585 | avg_entropy: 12.09 | param_change: 0.001028


 93%|█████████▎| 927/1000 [33:59<02:34,  2.12s/it]

loss: -9.984 | unlearn_loss: -11.71 | retain_loss: 1.728 | avg_entropy: 11.71 | param_change: 0.002783


 93%|█████████▎| 928/1000 [34:01<02:30,  2.10s/it]

loss: -11.08 | unlearn_loss: -12.08 | retain_loss: 0.9949 | avg_entropy: 12.08 | param_change: 0.001139


 93%|█████████▎| 929/1000 [34:04<02:32,  2.14s/it]

loss: -9.279 | unlearn_loss: -12.09 | retain_loss: 2.808 | avg_entropy: 12.09 | param_change: 0.002674


 93%|█████████▎| 930/1000 [34:06<02:26,  2.10s/it]

loss: -9.664 | unlearn_loss: -11.96 | retain_loss: 2.299 | avg_entropy: 11.96 | param_change: 0.009056


 93%|█████████▎| 931/1000 [34:08<02:22,  2.07s/it]

loss: -10.79 | unlearn_loss: -12.08 | retain_loss: 1.286 | avg_entropy: 12.08 | param_change: 0.002704


 93%|█████████▎| 932/1000 [34:10<02:18,  2.04s/it]

loss: -9.277 | unlearn_loss: -11.93 | retain_loss: 2.651 | avg_entropy: 11.93 | param_change: 0.004771


 93%|█████████▎| 933/1000 [34:12<02:18,  2.07s/it]

loss: -10.68 | unlearn_loss: -12.1 | retain_loss: 1.415 | avg_entropy: 12.1 | param_change: 0.003441


 93%|█████████▎| 934/1000 [34:14<02:19,  2.11s/it]

loss: -10.22 | unlearn_loss: -12.09 | retain_loss: 1.869 | avg_entropy: 12.09 | param_change: 0.004354


 94%|█████████▎| 935/1000 [34:16<02:15,  2.08s/it]

loss: -10.08 | unlearn_loss: -12.09 | retain_loss: 2.017 | avg_entropy: 12.09 | param_change: 0.006286


 94%|█████████▎| 936/1000 [34:18<02:11,  2.05s/it]

loss: -9.87 | unlearn_loss: -12.1 | retain_loss: 2.235 | avg_entropy: 12.1 | param_change: 0.0058


 94%|█████████▎| 937/1000 [34:20<02:09,  2.05s/it]

loss: -10.33 | unlearn_loss: -12.09 | retain_loss: 1.765 | avg_entropy: 12.09 | param_change: 0.003232


 94%|█████████▍| 938/1000 [34:22<02:10,  2.11s/it]

loss: -10.62 | unlearn_loss: -12.1 | retain_loss: 1.486 | avg_entropy: 12.1 | param_change: 0.003088


 94%|█████████▍| 939/1000 [34:24<02:08,  2.11s/it]

loss: -10.25 | unlearn_loss: -12.1 | retain_loss: 1.841 | avg_entropy: 12.1 | param_change: 0.003109


 94%|█████████▍| 940/1000 [34:27<02:05,  2.09s/it]

loss: -10.53 | unlearn_loss: -11.98 | retain_loss: 1.453 | avg_entropy: 11.98 | param_change: 0.00241


 94%|█████████▍| 941/1000 [34:29<02:03,  2.09s/it]

loss: -9.995 | unlearn_loss: -12.12 | retain_loss: 2.121 | avg_entropy: 12.12 | param_change: 0.003041


 94%|█████████▍| 942/1000 [34:31<02:01,  2.09s/it]

loss: -9.592 | unlearn_loss: -12.11 | retain_loss: 2.522 | avg_entropy: 12.11 | param_change: 0.00355


 94%|█████████▍| 943/1000 [34:33<02:00,  2.12s/it]

loss: -9.748 | unlearn_loss: -12.09 | retain_loss: 2.339 | avg_entropy: 12.09 | param_change: 0.0033


 94%|█████████▍| 944/1000 [34:35<01:57,  2.09s/it]

loss: -10.05 | unlearn_loss: -12.05 | retain_loss: 2.001 | avg_entropy: 12.05 | param_change: 0.003437


 94%|█████████▍| 945/1000 [34:37<01:56,  2.12s/it]

loss: -10.4 | unlearn_loss: -12.11 | retain_loss: 1.71 | avg_entropy: 12.11 | param_change: 0.003661


 95%|█████████▍| 946/1000 [34:39<01:52,  2.08s/it]

loss: -8.811 | unlearn_loss: -11.95 | retain_loss: 3.142 | avg_entropy: 11.95 | param_change: 0.006495


 95%|█████████▍| 947/1000 [34:41<01:48,  2.06s/it]

loss: -10.76 | unlearn_loss: -12.11 | retain_loss: 1.354 | avg_entropy: 12.11 | param_change: 0.002302


 95%|█████████▍| 948/1000 [34:43<01:46,  2.05s/it]

loss: -9.583 | unlearn_loss: -12.1 | retain_loss: 2.514 | avg_entropy: 12.1 | param_change: 0.004149


 95%|█████████▍| 949/1000 [34:45<01:43,  2.04s/it]

loss: -10.2 | unlearn_loss: -11.98 | retain_loss: 1.778 | avg_entropy: 11.98 | param_change: 0.002842


 95%|█████████▌| 950/1000 [34:47<01:40,  2.02s/it]

loss: -10.51 | unlearn_loss: -11.96 | retain_loss: 1.456 | avg_entropy: 11.96 | param_change: 0.002162


 95%|█████████▌| 951/1000 [34:49<01:38,  2.02s/it]

loss: -10.16 | unlearn_loss: -12.12 | retain_loss: 1.961 | avg_entropy: 12.12 | param_change: 0.003151


 95%|█████████▌| 952/1000 [34:51<01:37,  2.04s/it]

loss: -11.03 | unlearn_loss: -12.1 | retain_loss: 1.071 | avg_entropy: 12.1 | param_change: 0.001252


 95%|█████████▌| 953/1000 [34:53<01:38,  2.09s/it]

loss: -10.92 | unlearn_loss: -11.98 | retain_loss: 1.06 | avg_entropy: 11.98 | param_change: 0.001446


 95%|█████████▌| 954/1000 [34:55<01:34,  2.06s/it]

loss: -10.7 | unlearn_loss: -12.12 | retain_loss: 1.415 | avg_entropy: 12.12 | param_change: 0.002127


 96%|█████████▌| 955/1000 [34:57<01:33,  2.07s/it]

loss: -10.28 | unlearn_loss: -11.95 | retain_loss: 1.667 | avg_entropy: 11.95 | param_change: 0.002103


 96%|█████████▌| 956/1000 [35:00<01:31,  2.07s/it]

loss: -10.7 | unlearn_loss: -12.12 | retain_loss: 1.426 | avg_entropy: 12.12 | param_change: 0.002087


 96%|█████████▌| 957/1000 [35:02<01:29,  2.09s/it]

loss: -10.68 | unlearn_loss: -11.95 | retain_loss: 1.271 | avg_entropy: 11.95 | param_change: 0.001743


 96%|█████████▌| 958/1000 [35:04<01:26,  2.06s/it]

loss: -10.41 | unlearn_loss: -12.12 | retain_loss: 1.706 | avg_entropy: 12.12 | param_change: 0.002399


 96%|█████████▌| 959/1000 [35:06<01:23,  2.04s/it]

loss: -11.11 | unlearn_loss: -12.1 | retain_loss: 0.9852 | avg_entropy: 12.1 | param_change: 0.001382


 96%|█████████▌| 960/1000 [35:08<01:21,  2.05s/it]

loss: -9.722 | unlearn_loss: -12.12 | retain_loss: 2.402 | avg_entropy: 12.12 | param_change: 0.003344


 96%|█████████▌| 961/1000 [35:10<01:20,  2.07s/it]

loss: -10.44 | unlearn_loss: -11.98 | retain_loss: 1.542 | avg_entropy: 11.98 | param_change: 0.002059


 96%|█████████▌| 962/1000 [35:12<01:18,  2.06s/it]

loss: -10.54 | unlearn_loss: -12.1 | retain_loss: 1.565 | avg_entropy: 12.1 | param_change: 0.002134


 96%|█████████▋| 963/1000 [35:14<01:15,  2.04s/it]

loss: -10.93 | unlearn_loss: -12.1 | retain_loss: 1.17 | avg_entropy: 12.1 | param_change: 0.001783


 96%|█████████▋| 964/1000 [35:16<01:14,  2.06s/it]

loss: -10.78 | unlearn_loss: -11.76 | retain_loss: 0.9825 | avg_entropy: 11.76 | param_change: 0.001214


 96%|█████████▋| 965/1000 [35:18<01:11,  2.05s/it]

loss: -10.79 | unlearn_loss: -12.1 | retain_loss: 1.316 | avg_entropy: 12.1 | param_change: 0.00193


 97%|█████████▋| 966/1000 [35:20<01:10,  2.07s/it]

loss: -10.98 | unlearn_loss: -12.09 | retain_loss: 1.106 | avg_entropy: 12.09 | param_change: 0.001618


 97%|█████████▋| 967/1000 [35:22<01:07,  2.05s/it]

loss: -10.46 | unlearn_loss: -11.96 | retain_loss: 1.5 | avg_entropy: 11.96 | param_change: 0.002082


 97%|█████████▋| 968/1000 [35:24<01:05,  2.05s/it]

loss: -9.871 | unlearn_loss: -12.06 | retain_loss: 2.193 | avg_entropy: 12.06 | param_change: 0.003537


 97%|█████████▋| 969/1000 [35:26<01:03,  2.03s/it]

loss: -10.74 | unlearn_loss: -11.96 | retain_loss: 1.22 | avg_entropy: 11.96 | param_change: 0.001436


 97%|█████████▋| 970/1000 [35:28<01:02,  2.10s/it]

loss: -10.91 | unlearn_loss: -12.12 | retain_loss: 1.208 | avg_entropy: 12.12 | param_change: 0.001509


 97%|█████████▋| 971/1000 [35:31<01:02,  2.14s/it]

loss: -10 | unlearn_loss: -12.12 | retain_loss: 2.114 | avg_entropy: 12.12 | param_change: 0.002714


 97%|█████████▋| 972/1000 [35:33<00:59,  2.14s/it]

loss: -11.02 | unlearn_loss: -12.11 | retain_loss: 1.089 | avg_entropy: 12.11 | param_change: 0.001685


 97%|█████████▋| 973/1000 [35:35<00:57,  2.11s/it]

loss: -10.52 | unlearn_loss: -11.9 | retain_loss: 1.378 | avg_entropy: 11.9 | param_change: 0.001968


 97%|█████████▋| 974/1000 [35:37<00:54,  2.10s/it]

loss: -10.54 | unlearn_loss: -11.9 | retain_loss: 1.356 | avg_entropy: 11.9 | param_change: 0.002014


 98%|█████████▊| 975/1000 [35:39<00:52,  2.10s/it]

loss: -9.837 | unlearn_loss: -12.1 | retain_loss: 2.266 | avg_entropy: 12.1 | param_change: 0.002837


 98%|█████████▊| 976/1000 [35:41<00:50,  2.09s/it]

loss: -10.79 | unlearn_loss: -12.1 | retain_loss: 1.314 | avg_entropy: 12.1 | param_change: 0.002047


 98%|█████████▊| 977/1000 [35:43<00:48,  2.10s/it]

loss: -11.21 | unlearn_loss: -12.08 | retain_loss: 0.8614 | avg_entropy: 12.08 | param_change: 0.001215


 98%|█████████▊| 978/1000 [35:45<00:46,  2.12s/it]

loss: -10.59 | unlearn_loss: -11.91 | retain_loss: 1.326 | avg_entropy: 11.91 | param_change: 0.002979


 98%|█████████▊| 979/1000 [35:47<00:43,  2.08s/it]

loss: -10.51 | unlearn_loss: -11.96 | retain_loss: 1.447 | avg_entropy: 11.96 | param_change: 0.002011


 98%|█████████▊| 980/1000 [35:50<00:41,  2.10s/it]

loss: -11.03 | unlearn_loss: -12.12 | retain_loss: 1.092 | avg_entropy: 12.12 | param_change: 0.00234


 98%|█████████▊| 981/1000 [35:52<00:40,  2.11s/it]

loss: -9.733 | unlearn_loss: -11.93 | retain_loss: 2.194 | avg_entropy: 11.93 | param_change: 0.00267


 98%|█████████▊| 982/1000 [35:54<00:37,  2.07s/it]

loss: -10.92 | unlearn_loss: -11.95 | retain_loss: 1.028 | avg_entropy: 11.95 | param_change: 0.001916


 98%|█████████▊| 983/1000 [35:56<00:34,  2.05s/it]

loss: -11.32 | unlearn_loss: -12.1 | retain_loss: 0.7799 | avg_entropy: 12.1 | param_change: 0.001739


 98%|█████████▊| 984/1000 [35:58<00:32,  2.04s/it]

loss: -11.12 | unlearn_loss: -11.92 | retain_loss: 0.8 | avg_entropy: 11.92 | param_change: 0.001455


 98%|█████████▊| 985/1000 [36:00<00:30,  2.01s/it]

loss: -11.17 | unlearn_loss: -12.12 | retain_loss: 0.95 | avg_entropy: 12.12 | param_change: 0.001707


 99%|█████████▊| 986/1000 [36:02<00:28,  2.05s/it]

loss: -11.14 | unlearn_loss: -12.12 | retain_loss: 0.9748 | avg_entropy: 12.12 | param_change: 0.002159


 99%|█████████▊| 987/1000 [36:04<00:26,  2.03s/it]

loss: -10.68 | unlearn_loss: -11.95 | retain_loss: 1.27 | avg_entropy: 11.95 | param_change: 0.002947


 99%|█████████▉| 988/1000 [36:06<00:24,  2.01s/it]

loss: -11.42 | unlearn_loss: -12.12 | retain_loss: 0.6997 | avg_entropy: 12.12 | param_change: 0.001203


 99%|█████████▉| 989/1000 [36:08<00:22,  2.00s/it]

loss: -11.2 | unlearn_loss: -12.12 | retain_loss: 0.9182 | avg_entropy: 12.12 | param_change: 0.002255


 99%|█████████▉| 990/1000 [36:10<00:20,  2.03s/it]

loss: -11.08 | unlearn_loss: -12.06 | retain_loss: 0.978 | avg_entropy: 12.06 | param_change: 0.002127


 99%|█████████▉| 991/1000 [36:12<00:18,  2.04s/it]

loss: -11.11 | unlearn_loss: -12.12 | retain_loss: 1.011 | avg_entropy: 12.12 | param_change: 0.001806


 99%|█████████▉| 992/1000 [36:14<00:16,  2.05s/it]

loss: -11.45 | unlearn_loss: -12.12 | retain_loss: 0.6687 | avg_entropy: 12.12 | param_change: 0.001256


 99%|█████████▉| 993/1000 [36:16<00:14,  2.01s/it]

loss: -10.83 | unlearn_loss: -12.13 | retain_loss: 1.294 | avg_entropy: 12.13 | param_change: 0.00203


 99%|█████████▉| 994/1000 [36:18<00:11,  1.99s/it]

loss: -10.72 | unlearn_loss: -11.98 | retain_loss: 1.261 | avg_entropy: 11.98 | param_change: 0.002578


100%|█████████▉| 995/1000 [36:20<00:10,  2.02s/it]

loss: -10.12 | unlearn_loss: -12.13 | retain_loss: 2.005 | avg_entropy: 12.13 | param_change: 0.003773


100%|█████████▉| 996/1000 [36:22<00:08,  2.02s/it]

loss: -9.884 | unlearn_loss: -11.41 | retain_loss: 1.528 | avg_entropy: 11.41 | param_change: 0.002914


100%|█████████▉| 997/1000 [36:24<00:06,  2.07s/it]

loss: -11.15 | unlearn_loss: -12.12 | retain_loss: 0.9773 | avg_entropy: 12.12 | param_change: 0.001706


100%|█████████▉| 998/1000 [36:26<00:04,  2.13s/it]

loss: -10.67 | unlearn_loss: -11.9 | retain_loss: 1.226 | avg_entropy: 11.9 | param_change: 0.001533


100%|█████████▉| 999/1000 [36:29<00:02,  2.19s/it]

loss: -11.06 | unlearn_loss: -12.1 | retain_loss: 1.042 | avg_entropy: 12.1 | param_change: 0.001374


100%|██████████| 1000/1000 [36:31<00:00,  2.19s/it]


Saved model to ./bio/maxentropy-bio-only


In [10]:
# DISABLED CELL: every statement below is commented out, so executing this
# cell does nothing. If re-enabled, the commented code would:
#   1. run the garbage collector and release cached CUDA memory,
#   2. build `args` via get_args_notebook for the bio forget corpus only, with
#      wikitext as the retain corpus and output_dir "./bio/graddiff-bio-only",
#   3. seed torch (CPU and CUDA) and numpy from args.seed,
#   4. load the model/tokenizer and the forget/retain batches, and
#   5. call run_grad_diff on the flattened batch lists.
# It depends on names defined outside this cell: get_args_notebook,
# load_model, get_data, run_grad_diff, torch and np.
# NOTE(review): confirm `np` (numpy) is imported before re-enabling.
# NOTE(review): lr=1e-5 and max_length=512 are hard-coded in the
# run_grad_diff call while get_data receives args.min_len/args.max_len;
# confirm the two length limits are meant to be independent.

# # Clear memory first
# import gc
# gc.collect()
# torch.cuda.empty_cache()

# args = get_args_notebook(
#     output_dir="./bio/graddiff-bio-only",
#     forget_corpora=["bio-forget-corpus"],  # Only bio
#     retain_corpora=["wikitext"],
#     batch_size=2,
#     max_num_batches=80
# )

# SEED = args.seed
# torch.cuda.manual_seed(SEED)
# torch.cuda.manual_seed_all(SEED)
# torch.manual_seed(SEED)
# np.random.seed(SEED)

# model, tokenizer = load_model(args.model_name_or_path)
# forget_data_list, retain_data_list = get_data(
#   args.forget_corpora,
#   args.retain_corpora,
#   args.min_len,
#   args.max_len,
#   args.batch_size,
# )

# # Flatten the nested list structure from get_data
# # get_data returns [[batches for corpus1], [batches for corpus2]]
# # run_grad_diff expects a flat list of batches
# forget_flat = [batch for corpus in forget_data_list for batch in corpus]
# retain_flat = [batch for corpus in retain_data_list for batch in corpus]

# run_grad_diff(
#   model=model,
#   tokenizer=tokenizer,
#   forget_data_list=forget_flat,
#   retain_data_list=retain_flat,
#   lr=1e-5,
#   num_epochs=1,
#   alpha=args.alpha[0],  # Use first alpha value
#   max_length=512,
#   output_dir=args.output_dir,
# )

In [11]:
# DISABLED CELL: every statement below is commented out, so executing this
# cell does nothing. If re-enabled, the commented code would:
#   1. run the garbage collector and release cached CUDA memory,
#   2. build `args` via get_args_notebook for the bio forget corpus only, with
#      wikitext as the retain corpus, batch_size=1 and
#      output_dir "./bio/npo-bio-only",
#   3. seed torch (CPU and CUDA) and numpy from args.seed,
#   4. load a single model/tokenizer and the forget/retain batches, and
#   5. call run_npo (beta=0.1) on the flattened batch lists.
# It depends on names defined outside this cell: get_args_notebook,
# load_model, get_data, run_npo, torch and np.
# NOTE(review): confirm `np` (numpy) is imported before re-enabling.
# NOTE(review): the comment below says the reference losses are precomputed,
# but this cell does not show where; verify against run_npo.
# NOTE(review): lr=1e-5 and max_length=512 are hard-coded in the run_npo call
# while get_data receives args.min_len/args.max_len; confirm this is intended.

# # Clear memory first
# import gc
# gc.collect()
# torch.cuda.empty_cache()

# args = get_args_notebook(
#     output_dir="./bio/npo-bio-only",
#     forget_corpora=["bio-forget-corpus"],
#     retain_corpora=["wikitext"],
#     batch_size=1,  # CRITICAL: Reduce to 1 for NPO (was 2)
#     max_num_batches=80
# )

# SEED = args.seed
# torch.cuda.manual_seed(SEED)
# torch.cuda.manual_seed_all(SEED)
# torch.manual_seed(SEED)
# np.random.seed(SEED)

# # Load only ONE model - reference losses precomputed
# model, tokenizer = load_model(args.model_name_or_path)

# forget_data_list, retain_data_list = get_data(
#     args.forget_corpora,
#     args.retain_corpora,
#     args.min_len,
#     args.max_len,
#     args.batch_size,
# )

# # Flatten the nested list structure from get_data
# forget_flat = [batch for corpus in forget_data_list for batch in corpus]
# retain_flat = [batch for corpus in retain_data_list for batch in corpus]

# run_npo(
#     model=model,
#     tokenizer=tokenizer,
#     forget_data_list=forget_flat,
#     retain_data_list=retain_flat,
#     lr=1e-5,
#     num_epochs=1,
#     beta=0.1,  # NPO beta parameter
#     alpha=args.alpha[0],  # Use first alpha value for retain loss
#     max_length=512,
#     output_dir=args.output_dir,
# )