In [1]:
# Simplified imports for single GPU
from logging import getLogger
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from memory_layers import HashingMemory, MemoryLayerMonitorAndCheckpoint, load_and_process_dataset, ModelEvaluator

logger = getLogger()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load Qwen0.5 Instruct
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", dtype=torch.float16).to(device)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", dtype=torch.float16)

# Qwen0.5 specs: 896 hidden_dim, 24 layers
hidden_dim = 896
layers_to_replace = [6, 12, 18]  # Which FFN layers to replace

# Replace FFNs with Memory Layers
for layer_idx in layers_to_replace:
    layer = model.model.layers[layer_idx]
    
    # Create memory layer
    memory_layer = HashingMemory(
        input_dim=hidden_dim,
        output_dim=hidden_dim,
        mem_n_keys=128,          # Memory size = 512¬≤ = 262k entries
        mem_heads=4,
        mem_knn=16,
        mem_k_dim=256,
        mem_v_dim=-1,            # Auto: uses output_dim
        swilu_projection=True,
        value_fixed_lr=0.001,
        mem_share_values=False,  # Don't share across layers for fine-tuning
    )
    
    # Initialize the memory layer
    memory_layer.reset_parameters()
    # Ensure memory layer matches model dtype (float16)
    memory_layer.to(device)
    
    # Replace the FFN (MLP) with memory layer
    original_mlp = layer.mlp
    layer.mlp = memory_layer
    
    print(f"Replaced layer {layer_idx} FFN with memory layer")

# FREEZE EVERYTHING EXCEPT MEMORY LAYERS
for name, param in model.named_parameters():
    if 'mlp' in name and any(f'layers.{idx}.' in name for idx in layers_to_replace):
        # This is a memory layer parameter - keep trainable
        param.requires_grad = True
        print(f"‚úì Trainable: {name}")
    else:
        # Freeze all other parameters
        param.requires_grad = False

# Verify what's trainable
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"\nTrainable: {trainable_params:,} / {total_params:,} ({100*trainable_params/total_params:.2f}%)")

Replaced layer 6 FFN with memory layer
Replaced layer 12 FFN with memory layer
Replaced layer 18 FFN with memory layer
‚úì Trainable: model.layers.6.mlp.keys
‚úì Trainable: model.layers.6.mlp.values.weight
‚úì Trainable: model.layers.6.mlp.value_proj.weight
‚úì Trainable: model.layers.6.mlp.value_proj.bias
‚úì Trainable: model.layers.6.mlp.swilu_projection.weight
‚úì Trainable: model.layers.6.mlp.swilu_projection.bias
‚úì Trainable: model.layers.6.mlp.query_proj.query_mlps.0.weight
‚úì Trainable: model.layers.6.mlp.query_proj.query_mlps.0.bias
‚úì Trainable: model.layers.12.mlp.keys
‚úì Trainable: model.layers.12.mlp.values.weight
‚úì Trainable: model.layers.12.mlp.value_proj.weight
‚úì Trainable: model.layers.12.mlp.value_proj.bias
‚úì Trainable: model.layers.12.mlp.swilu_projection.weight
‚úì Trainable: model.layers.12.mlp.swilu_projection.bias
‚úì Trainable: model.layers.12.mlp.query_proj.query_mlps.0.weight
‚úì Trainable: model.layers.12.mlp.query_proj.query_mlps.0.bias
‚úì Trainab

In [3]:
# Load and process dataset
tokenized = load_and_process_dataset(tokenizer, sample_size=10000)

Filtered dataset size: 7669
Tokenized dataset: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7669
})


In [4]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from memory_layers import ModelEvaluator

# Training arguments optimized for memory layers only
training_args = TrainingArguments(
    output_dir="./qwen_memory_finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=5e-4,  # Higher LR since only training memory
    warmup_steps=100,
    lr_scheduler_type="cosine",
    logging_steps=10,
    logging_first_step=True,  # Log immediately
    logging_dir="./logs",
    save_steps=500,
    eval_strategy="steps",
    eval_steps=100,   
    # Performance
    fp16=True,
    gradient_checkpointing=False,  # Not needed with frozen base
    dataloader_num_workers=2,
    
    # Monitoring
    report_to="tensorboard",  # or "wandb" if you have it
    # load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_strategy="no",
    
    # Memory optimization
    optim="adamw_torch_fused",  # Faster optimizer
    max_grad_norm=1.0,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Evaluator
evaluator = ModelEvaluator(model, tokenizer, device=device)

# Initialize callback
memory_monitor = MemoryLayerMonitorAndCheckpoint(
    model=model,
    layers_to_check=layers_to_replace,
    save_every=500,
    keep_last=2,
    monitor_every=50,
    evaluator=evaluator,
    eval_every=50,     # Run evaluation every 100 steps
    eval_samples=20     # Small sample size for speed during training
)

# Create trainer with callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    eval_dataset=tokenized.select(range(1000)),  # Use 1k for validation
    data_collator=data_collator,
    callbacks=[memory_monitor],  # Add our custom monitor
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
print("\nüöÄ Starting training...")
print(f"Total steps: {len(tokenized) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")

# Train! 
trainer.train()

print("\n‚úÖ Training complete!")


üöÄ Starting training...
Total steps: 1437


Step,Training Loss,Validation Loss
100,2.0113,1.947128
200,1.8453,1.891734
300,1.9153,1.85564
400,1.8464,1.823111
500,1.7144,1.788283
600,1.8017,1.745393
700,1.8962,1.716421
800,1.869,1.689151
900,1.763,1.666582
1000,1.6231,1.637899


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 50

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0001, std=0.0361
    Values: mean=-0.0000, std=0.0334
  Changes since start:
    Keys:   0.000883 ‚úÖ
    Values: 0.000788 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0361
    Values: mean=+0.0000, std=0.0334
  Changes since start:
    Keys:   0.000818 ‚úÖ
    Values: 0.000778 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0361
    Values: mean=-0.0000, std=0.0334
  Changes since start:
    Keys:   0.000802 ‚úÖ
    Values: 0.000746 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 50
Evaluating on TriviaQA (validation, 20 samples)...


  0%|          | 0/20 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.37s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  8.98it/s]

HellaSwag Accuracy: 20.00%
Step 50 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 100

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0001, std=0.0360
    Values: mean=-0.0000, std=0.0335
  Changes since start:
    Keys:   0.002326 ‚úÖ
    Values: 0.001477 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0361
    Values: mean=+0.0000, std=0.0335
  Changes since start:
    Keys:   0.002094 ‚úÖ
    Values: 0.001545 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0362
    Values: mean=-0.0000, std=0.0335
  Changes since start:
    Keys:   0.002230 ‚úÖ
    Values: 0.001748 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 100
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.43s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.13it/s]

HellaSwag Accuracy: 25.00%
Step 100 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 150

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0001, std=0.0361
    Values: mean=-0.0000, std=0.0335
  Changes since start:
    Keys:   0.003030 ‚úÖ
    Values: 0.001746 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0362
    Values: mean=+0.0000, std=0.0335
  Changes since start:
    Keys:   0.002975 ‚úÖ
    Values: 0.001850 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0364
    Values: mean=-0.0000, std=0.0336
  Changes since start:
    Keys:   0.003274 ‚úÖ
    Values: 0.002324 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 150
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.43s/it]


GSM8K Accuracy: 5.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.14it/s]

HellaSwag Accuracy: 15.00%
Step 150 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0500
  hellaswag: 0.1500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 200

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0362
    Values: mean=-0.0000, std=0.0335
  Changes since start:
    Keys:   0.003554 ‚úÖ
    Values: 0.001961 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0363
    Values: mean=+0.0000, std=0.0336
  Changes since start:
    Keys:   0.003590 ‚úÖ
    Values: 0.002048 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0366
    Values: mean=-0.0000, std=0.0336
  Changes since start:
    Keys:   0.004029 ‚úÖ
    Values: 0.002730 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 200
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:06<00:00,  6.34s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.07it/s]

HellaSwag Accuracy: 25.00%
Step 200 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 250

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0363
    Values: mean=-0.0000, std=0.0336
  Changes since start:
    Keys:   0.004005 ‚úÖ
    Values: 0.002140 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0364
    Values: mean=+0.0000, std=0.0336
  Changes since start:
    Keys:   0.004135 ‚úÖ
    Values: 0.002187 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0367
    Values: mean=-0.0000, std=0.0337
  Changes since start:
    Keys:   0.004666 ‚úÖ
    Values: 0.003033 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 250
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.40s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.10it/s]

HellaSwag Accuracy: 20.00%
Step 250 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 300

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0364
    Values: mean=-0.0000, std=0.0336
  Changes since start:
    Keys:   0.004338 ‚úÖ
    Values: 0.002281 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0365
    Values: mean=+0.0001, std=0.0336
  Changes since start:
    Keys:   0.004587 ‚úÖ
    Values: 0.002328 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0369
    Values: mean=-0.0000, std=0.0337
  Changes since start:
    Keys:   0.005166 ‚úÖ
    Values: 0.003262 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 300
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.26it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.37s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.08it/s]

HellaSwag Accuracy: 15.00%
Step 300 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.1500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 350

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0365
    Values: mean=-0.0000, std=0.0336
  Changes since start:
    Keys:   0.004651 ‚úÖ
    Values: 0.002418 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0366
    Values: mean=+0.0001, std=0.0336
  Changes since start:
    Keys:   0.004930 ‚úÖ
    Values: 0.002432 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0370
    Values: mean=-0.0000, std=0.0338
  Changes since start:
    Keys:   0.005609 ‚úÖ
    Values: 0.003453 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 350
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:09<00:00,  6.46s/it]


GSM8K Accuracy: 5.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.07it/s]

HellaSwag Accuracy: 20.00%
Step 350 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0500
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 400

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0366
    Values: mean=-0.0000, std=0.0337
  Changes since start:
    Keys:   0.004930 ‚úÖ
    Values: 0.002550 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0367
    Values: mean=+0.0001, std=0.0337
  Changes since start:
    Keys:   0.005237 ‚úÖ
    Values: 0.002534 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0372
    Values: mean=-0.0000, std=0.0338
  Changes since start:
    Keys:   0.005955 ‚úÖ
    Values: 0.003631 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 400
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.44s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.10it/s]

HellaSwag Accuracy: 20.00%
Step 400 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 450

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0367
    Values: mean=-0.0000, std=0.0337
  Changes since start:
    Keys:   0.005211 ‚úÖ
    Values: 0.002694 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0368
    Values: mean=+0.0001, std=0.0337
  Changes since start:
    Keys:   0.005514 ‚úÖ
    Values: 0.002630 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0373
    Values: mean=-0.0000, std=0.0339
  Changes since start:
    Keys:   0.006306 ‚úÖ
    Values: 0.003794 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 450
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 10.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:09<00:00,  6.46s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  8.71it/s]

HellaSwag Accuracy: 20.00%
Step 450 Evaluation Results:
  trivia_qa: 0.1000
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 500

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0368
    Values: mean=-0.0000, std=0.0337
  Changes since start:
    Keys:   0.005515 ‚úÖ
    Values: 0.002843 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0369
    Values: mean=+0.0001, std=0.0337
  Changes since start:
    Keys:   0.005759 ‚úÖ
    Values: 0.002721 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0375
    Values: mean=-0.0000, std=0.0339
  Changes since start:
    Keys:   0.006622 ‚úÖ
    Values: 0.003963 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 500
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:06<00:00,  6.32s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.21it/s]


HellaSwag Accuracy: 20.00%
Step 500 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl


üíæ Saving checkpoint at step 500...
  ‚úÖ Checkpoint saved: ./checkpoints/step-500


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 550

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0370
    Values: mean=-0.0000, std=0.0338
  Changes since start:
    Keys:   0.005834 ‚úÖ
    Values: 0.002966 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0371
    Values: mean=+0.0001, std=0.0337
  Changes since start:
    Keys:   0.006073 ‚úÖ
    Values: 0.002794 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0378
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007018 ‚úÖ
    Values: 0.004118 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 550
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.26it/s]


TriviaQA Accuracy: 20.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.45s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.00it/s]

HellaSwag Accuracy: 20.00%
Step 550 Evaluation Results:
  trivia_qa: 0.2000
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 600

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0372
    Values: mean=-0.0000, std=0.0338
  Changes since start:
    Keys:   0.006065 ‚úÖ
    Values: 0.003081 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0373
    Values: mean=+0.0001, std=0.0338
  Changes since start:
    Keys:   0.006323 ‚úÖ
    Values: 0.002862 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0381
    Values: mean=-0.0000, std=0.0341
  Changes since start:
    Keys:   0.007343 ‚úÖ
    Values: 0.004268 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 600
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.23it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.45s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.09it/s]

HellaSwag Accuracy: 20.00%
Step 600 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 650

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0374
    Values: mean=-0.0000, std=0.0338
  Changes since start:
    Keys:   0.006289 ‚úÖ
    Values: 0.003187 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0375
    Values: mean=+0.0001, std=0.0338
  Changes since start:
    Keys:   0.006554 ‚úÖ
    Values: 0.002929 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0384
    Values: mean=-0.0000, std=0.0341
  Changes since start:
    Keys:   0.007646 ‚úÖ
    Values: 0.004401 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 650
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:09<00:00,  6.45s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.02it/s]

HellaSwag Accuracy: 20.00%
Step 650 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2000
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 700

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0376
    Values: mean=-0.0000, std=0.0339
  Changes since start:
    Keys:   0.006473 ‚úÖ
    Values: 0.003288 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0377
    Values: mean=+0.0001, std=0.0338
  Changes since start:
    Keys:   0.006751 ‚úÖ
    Values: 0.002983 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0387
    Values: mean=-0.0000, std=0.0342
  Changes since start:
    Keys:   0.007888 ‚úÖ
    Values: 0.004513 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 700
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.40s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.05it/s]

HellaSwag Accuracy: 15.00%
Step 700 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.1500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 750

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0377
    Values: mean=-0.0000, std=0.0339
  Changes since start:
    Keys:   0.006625 ‚úÖ
    Values: 0.003360 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0378
    Values: mean=+0.0001, std=0.0338
  Changes since start:
    Keys:   0.006931 ‚úÖ
    Values: 0.003034 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0389
    Values: mean=-0.0000, std=0.0342
  Changes since start:
    Keys:   0.008123 ‚úÖ
    Values: 0.004617 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 750
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.40s/it]


GSM8K Accuracy: 5.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.12it/s]

HellaSwag Accuracy: 25.00%
Step 750 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0500
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 800

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0378
    Values: mean=-0.0000, std=0.0339
  Changes since start:
    Keys:   0.006764 ‚úÖ
    Values: 0.003429 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0379
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007074 ‚úÖ
    Values: 0.003078 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0000, std=0.0391
    Values: mean=-0.0000, std=0.0342
  Changes since start:
    Keys:   0.008294 ‚úÖ
    Values: 0.004717 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 800
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.37s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.14it/s]

HellaSwag Accuracy: 25.00%
Step 800 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 850

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0379
    Values: mean=-0.0000, std=0.0339
  Changes since start:
    Keys:   0.006880 ‚úÖ
    Values: 0.003493 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0380
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007179 ‚úÖ
    Values: 0.003115 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0392
    Values: mean=-0.0000, std=0.0343
  Changes since start:
    Keys:   0.008436 ‚úÖ
    Values: 0.004799 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 850
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 10.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.43s/it]


GSM8K Accuracy: 5.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.14it/s]

HellaSwag Accuracy: 25.00%
Step 850 Evaluation Results:
  trivia_qa: 0.1000
  gsm8k: 0.0500
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 900

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0380
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.006968 ‚úÖ
    Values: 0.003549 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0381
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007275 ‚úÖ
    Values: 0.003143 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0394
    Values: mean=-0.0000, std=0.0343
  Changes since start:
    Keys:   0.008551 ‚úÖ
    Values: 0.004860 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 900
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.40s/it]


GSM8K Accuracy: 10.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.03it/s]

HellaSwag Accuracy: 25.00%
Step 900 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.1000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 950

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0381
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007042 ‚úÖ
    Values: 0.003595 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0382
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007342 ‚úÖ
    Values: 0.003165 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0395
    Values: mean=-0.0000, std=0.0343
  Changes since start:
    Keys:   0.008654 ‚úÖ
    Values: 0.004917 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 950
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:15<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.42s/it]


GSM8K Accuracy: 10.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.05it/s]

HellaSwag Accuracy: 25.00%
Step 950 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.1000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1000

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0382
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007161 ‚úÖ
    Values: 0.003635 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0382
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007461 ‚úÖ
    Values: 0.003189 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0396
    Values: mean=-0.0000, std=0.0343
  Changes since start:
    Keys:   0.008795 ‚úÖ
    Values: 0.004975 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1000
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.23it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.37s/it]


GSM8K Accuracy: 5.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.08it/s]


HellaSwag Accuracy: 25.00%
Step 1000 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0500
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl


üíæ Saving checkpoint at step 1000...
  ‚úÖ Checkpoint saved: ./checkpoints/step-1000


`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1050

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0383
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007286 ‚úÖ
    Values: 0.003667 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0383
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007588 ‚úÖ
    Values: 0.003207 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0397
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.008942 ‚úÖ
    Values: 0.005024 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1050
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.40s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.05it/s]

HellaSwag Accuracy: 25.00%
Step 1050 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1100

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0384
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007371 ‚úÖ
    Values: 0.003692 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0384
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007672 ‚úÖ
    Values: 0.003220 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0398
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.009046 ‚úÖ
    Values: 0.005061 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1100
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.39s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.14it/s]

HellaSwag Accuracy: 25.00%
Step 1100 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1150

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0384
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007426 ‚úÖ
    Values: 0.003710 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0384
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007733 ‚úÖ
    Values: 0.003229 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0399
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.009117 ‚úÖ
    Values: 0.005085 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1150
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.44s/it]


GSM8K Accuracy: 5.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.09it/s]

HellaSwag Accuracy: 25.00%
Step 1150 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0500
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1200

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0384
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007464 ‚úÖ
    Values: 0.003722 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0385
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007775 ‚úÖ
    Values: 0.003236 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0400
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.009169 ‚úÖ
    Values: 0.005103 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1200
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.23it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.40s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  8.98it/s]

HellaSwag Accuracy: 25.00%
Step 1200 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1250

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0385
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007489 ‚úÖ
    Values: 0.003731 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0385
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007803 ‚úÖ
    Values: 0.003240 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0400
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.009201 ‚úÖ
    Values: 0.005115 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1250
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.23it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:09<00:00,  6.46s/it]


GSM8K Accuracy: 5.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.10it/s]

HellaSwag Accuracy: 25.00%
Step 1250 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0500
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1300

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0385
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007506 ‚úÖ
    Values: 0.003736 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0385
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007819 ‚úÖ
    Values: 0.003243 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0400
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.009220 ‚úÖ
    Values: 0.005122 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1300
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.24it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:07<00:00,  6.38s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.09it/s]

HellaSwag Accuracy: 25.00%
Step 1300 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1350

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0385
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007513 ‚úÖ
    Values: 0.003738 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0385
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007826 ‚úÖ
    Values: 0.003244 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0401
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.009229 ‚úÖ
    Values: 0.005125 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1350
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.25it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [02:08<00:00,  6.44s/it]


GSM8K Accuracy: 0.00%
Evaluating on HellaSwag (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:02<00:00,  9.13it/s]

HellaSwag Accuracy: 25.00%
Step 1350 Evaluation Results:
  trivia_qa: 0.1500
  gsm8k: 0.0000
  hellaswag: 0.2500
  ‚úÖ Results saved to training_eval_results.jsonl




`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'trivia_qa' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üîç MEMORY LAYER HEALTH CHECK - Step 1400

üìä Layer 6 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0385
    Values: mean=-0.0000, std=0.0340
  Changes since start:
    Keys:   0.007515 ‚úÖ
    Values: 0.003739 ‚úÖ

üìä Layer 12 Memory:
  Parameters:
    Keys:   mean=-0.0000, std=0.0385
    Values: mean=+0.0001, std=0.0339
  Changes since start:
    Keys:   0.007828 ‚úÖ
    Values: 0.003244 ‚úÖ

üìä Layer 18 Memory:
  Parameters:
    Keys:   mean=+0.0001, std=0.0401
    Values: mean=-0.0000, std=0.0344
  Changes since start:
    Keys:   0.009232 ‚úÖ
    Values: 0.005126 ‚úÖ

‚úÖ All memory layers healthy!


üìä RUNNING BENCHMARK EVALUATION - Step 1400
Evaluating on TriviaQA (validation, 20 samples)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:16<00:00,  1.23it/s]


TriviaQA Accuracy: 15.00%
Evaluating on GSM8K (test, 20 samples)...


 35%|‚ñà‚ñà‚ñà‚ñå      | 7/20 [00:44<01:23,  6.41s/it]

In [None]:
# 1. Check actual loss history
import pandas as pd

# Extract all losses from trainer state
losses = [(log.get('step', i), log.get('loss'), log.get('eval_loss')) 
          for i, log in enumerate(trainer.state.log_history)]

df = pd.DataFrame(losses, columns=['step', 'train_loss', 'eval_loss'])
print(df)

# 2. Check if model is producing valid outputs
model.eval()
test_input = tokenizer("Hello, how are you?", return_tensors="pt").to(device)


    step  train_loss  eval_loss
0      1      2.0335        NaN
1     10      2.1182        NaN
2     20      2.0718        NaN
3     30      2.1551        NaN
4     40      7.7784        NaN
5     50    254.0208        NaN
6     60    280.1049        NaN
7     70    246.2023        NaN
8     80      0.0000        NaN
9     90      0.0000        NaN
10   100      0.0000        NaN
11   100         NaN        NaN
12   110      0.0000        NaN
13   120      0.0000        NaN
14   130      0.0000        NaN
15   140      0.0000        NaN


In [None]:
with torch.no_grad():
    outputs = model(**test_input)
    logits = outputs.logits
    
    print(f"\nLogits stats:")
    print(f"  Mean: {logits.mean().item()}")
    print(f"  Std: {logits.std().item()}")
    print(f"  Min: {logits.min().item()}")
    print(f"  Max: {logits.max().item()}")
    print(f"  Any NaN: {torch.isnan(logits).any().item()}")
    print(f"  Any Inf: {torch.isinf(logits).any().item()}")

# 3. Generate some text to see if model is working
generated = model.generate(**test_input, max_new_tokens=20)
print(f"\nGenerated text: {tokenizer.decode(generated[0])}")


Logits stats:
  Mean: nan
  Std: nan
  Min: nan
  Max: nan
  Any NaN: True
  Any Inf: False


AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


/pytorch/aten/src/ATen/native/cuda/TensorCompare.cu:112: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.


In [None]:
# Get all logged history
log_history = trainer.state.log_history

# Extract just the loss entries
losses = [log['loss'] for log in log_history if 'loss' in log]
steps = [log['step'] for log in log_history if 'loss' in log]

# Plot
plt.plot(steps, losses)
plt.show()

In [None]:
from safetensors.torch import load_file

device = "cuda" if torch.cuda.is_available() else "cpu"
hidden_dim = 896
layers_to_replace = [6, 12, 18]

# Reload model for testing
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    dtype=torch.float16,
).to(device)

# Add memory layers
for idx in layers_to_replace:
    # Initialize and cast to correct device/dtype
    mem_layer = HashingMemory(
        input_dim=hidden_dim, output_dim=hidden_dim, mem_n_keys=128, mem_heads=4,
        mem_knn=16, mem_k_dim=256, mem_v_dim=-1, swilu_projection=True,
        value_fixed_lr=0.001, mem_share_values=False
    )
    # Important: Cast to model's dtype (float16) to avoid "Half and Float" errors
    model.model.layers[idx].mlp = mem_layer.to(device, dtype=model.dtype)

# Load weights
try:
    state_dict = load_file("./qwen_memory_final/model.safetensors")
except:
    state_dict = torch.load("./qwen_memory_final/pytorch_model.bin", 
                           weights_only=False)

model.load_state_dict(state_dict, strict=False)
print("\n‚úÖ Model loaded successfully!")

# Test generation
def test_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Try some prompts
# test_prompts = [
#     "Explain quantum computing in simple terms:",
#     "Write a Python function to sort a list:",
#     "What are the health benefits of exercise?",
# ]

# for prompt in test_prompts:
#     print(f"\n{'='*80}")
#     print(f"Prompt: {prompt}")
#     print(f"{'='*80}")
#     response = test_model(prompt)
#     print(response)


‚úÖ Model loaded successfully!


In [None]:
# Load original Qwen model for comparison
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    torch_dtype=torch.float16,
)
base_model.to(device)

def compare_models(prompt):
    # Your fine-tuned model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        # Fine-tuned
        ft_outputs = model.generate(**inputs, max_new_tokens=100)
        ft_response = tokenizer.decode(ft_outputs[0], skip_special_tokens=True)
        
        # Base
        base_outputs = base_model.generate(**inputs, max_new_tokens=100)
        base_response = tokenizer.decode(base_outputs[0], skip_special_tokens=True)
    
    print(f"\n{'='*80}")
    print(f"Prompt: {prompt}")
    print(f"{'='*80}")
    print(f"\nüî∑ BASE MODEL:")
    print(base_response)
    print(f"\nüî∂ FINE-TUNED (with memory layers):")
    print(ft_response)
    print(f"{'='*80}\n")

# Test
compare_models("Explain machine learning:")