In [1]:
from hydra import initialize, compose
from omegaconf import OmegaConf
import torch
import torch.nn as nn
from accelerate import Accelerator
from colorama import Fore, Style
from datasets import load_dataset
from dotenv import dotenv_values
from torch.utils.data import DataLoader
# from tqdm.auto import tqdm
from transformers import AutoTokenizer

from hercules import MemoryLlama, log_config

ModuleNotFoundError: No module named 'hydra'

In [2]:
initialize(version_base="1.3", config_path="hercules/config")
cfg = compose("pre_training")



In [3]:
OmegaConf.set_struct(cfg, False)
cfg_dict = OmegaConf.to_container(cfg, resolve=True)
log_config(cfg_dict)
cfg.memory_llama["token"] = dotenv_values(".env")["HF_TOKEN"]

accelerator = Accelerator()
device = accelerator.device

model = MemoryLlama(neural_memory_config=cfg.lmm, **cfg.memory_llama)
tokenizer = AutoTokenizer.from_pretrained(cfg.memory_llama.llama_hf_path)
tokenizer.pad_token = tokenizer.eos_token

model.to(device)
print(f"{Style.BRIGHT}{Fore.RED}Using device: {device}")

Config:
Hyperparameters:{
    "lmm": {
        "attention_window_size": 32,
        "hidden_dim": 512,
        "learning_rate": 0.0004,
        "max_adaptive_lr": 0.01,
        "meta_memory_dim": 16,
        "n_chunks": 4,
        "n_hidden_layers": 1,
        "num_attention_heads": 4,
        "output_dim": 128
    },
    "memory_llama": {
        "device_map": "auto",
        "freeze_llama_layers": true,
        "llama_hf_path": "meta-llama/Llama-3.2-1B",
        "memory_layer_ids": -1,
        "quantize": true
    },
    "train": {
        "batch_size": 8,
        "epochs": 1,
        "outer_learning_rate": 0.0003,
        "sample_size": 512
    }
}
Memory Llama:"
Trainable parameters: 1.984e+07
Frozen parameters: 1.236e+09
Using device: cpu


In [4]:
torch.__version__

'2.6.0+cpu'

In [5]:
torch.cuda.get_device_name(0)

AssertionError: Torch not compiled with CUDA enabled

In [None]:
train_ds = load_dataset("RMT-team/babilong-train-5k-samples", "2k", split="qa1")
train_loader = DataLoader(train_ds, batch_size=1)
total_loss = 0

model, train_loader = accelerator.prepare(model, train_loader)

for epoch in tqdm(range(cfg.train.epochs)):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{cfg.train.epochs}")

    for batch in train_loader:
        inputs = batch["input"]
        questions = batch["question"]
        targets = batch["target"]

        prompts = tokenizer(
            [f"{i}, Question: {q}, Answer:" for i, q in zip(inputs, questions)],
            padding="max_length",
            return_tensors="pt",
        )
        labels = tokenizer(targets, return_tensors="pt")

        input_ids = prompts["input_ids"].to(device)
        attention_mask = prompts["attention_mask"].to(device)

        # outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # batch_inputs = {"input_ids": batch, "labels": batch}

        # outputs = model(**batch_inputs)
        # loss = outputs.loss

        # total_loss += loss.item()
        # progress_bar.set_postfix({"loss": loss.item()})  # noqa: F821
        break

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/1:   0%|          | 0/5000 [00:00<?, ?it/s]