In [1]:
# Disabled bootstrap commands: wipe the working directory (including dotfiles)
# and clone the project into it. Destructive -- keep commented out unless a
# fresh checkout is really wanted.
# !rm -rf ./* ./.*
# !git clone https://github.com/Kuduxaaa/ava-llm .
# Delete the `checkpoints` directory (if present) before this run.
!rm -rf checkpoints

In [2]:
import torch
import json
import traceback
import numpy as np

from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer

from ava import AvaConfig, AvaForCausalLM
from ava.data.datasets import AvaDataset
from ava.training.trainer import train_model
from ava.utils import collate_fn


░░      ░░░  ░░░░  ░░░      ░░
▒  ▒▒▒▒  ▒▒  ▒▒▒▒  ▒▒  ▒▒▒▒  ▒
▓  ▓▓▓▓  ▓▓▓  ▓▓  ▓▓▓  ▓▓▓▓  ▓
█        ████    ████        █
█  ████  █████  █████  ████  █



In [3]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# GPT-2 tokenizer, extended with a '[PAD]' padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Model configuration using the '100m' preset.
config = AvaConfig().apply_for('100m')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Align the model config with the tokenizer: vocabulary size (which includes
# the added padding token) and the special-token ids.
config.vocab_size = len(tokenizer)
config.pad_token_id = tokenizer.pad_token_id
config.eos_token_id = tokenizer.eos_token_id

# Fall back to EOS only when the tokenizer defines no BOS token at all.
# A plain `a or b` would also discard a perfectly valid BOS id of 0.
config.bos_token_id = (
    tokenizer.bos_token_id
    if tokenizer.bos_token_id is not None
    else tokenizer.eos_token_id
)

print(f'Tokenizer vocabulary size: {len(tokenizer)}')
print(f'Config vocabulary size: {config.vocab_size}')

Tokenizer vocabulary size: 50258
Config vocabulary size: 50258


In [5]:
# Read the conversation dump from disk.
with open('/content/data/oasst1_en_conv.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Only the first 50 entries are used in this run.
data_small = data[:50]

# Keep the first element of every entry that is a non-empty list;
# entries of any other shape are dropped.
valid_data = [
    entry[0]
    for entry in data_small
    if isinstance(entry, list) and entry
]

print(f'Found {len(valid_data)}/{len(data_small)} valid conversations')

Found 50/50 valid conversations


In [6]:
# Fixed seed so the train/validation split and the training batch order are
# reproducible across kernel restarts.
SEED = 42
rng = np.random.default_rng(SEED)

# Shuffle through an index permutation instead of in place: `valid_data` is
# left untouched, so re-running this cell always produces the same split.
order = rng.permutation(len(valid_data))
shuffled_data = [valid_data[i] for i in order]

# 90% of the examples for training, the remainder for validation.
split_idx = int(len(shuffled_data) * 0.9)
train_data = shuffled_data[:split_idx]
val_data = shuffled_data[split_idx:]

max_seq_length = 256
train_dataset = AvaDataset(train_data, tokenizer, max_length=max_seq_length)
val_dataset = AvaDataset(val_data, tokenizer, max_length=max_seq_length)

print(f'Training dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')

# Fail early rather than handing an empty dataset to the training loop.
if len(train_dataset) == 0 or len(val_dataset) == 0:
    raise ValueError('Dataset is empty after processing. Check data format and filtering.')

batch_size = 2
train_loader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle    = True,
    collate_fn = collate_fn,
    # Seeded generator makes the shuffled batch order repeatable.
    generator  = torch.Generator().manual_seed(SEED)
)

# Validation batches are served in dataset order (no shuffling).
val_loader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    collate_fn = collate_fn
)

Training dataset size: 45
Validation dataset size: 5


In [7]:
# Pull one batch from the training loader and report the shape of each tensor
# the collate function produced.
sample_batch = next(iter(train_loader))

print('Sample batch shapes:')
for field in ('input_ids', 'attention_mask', 'labels'):
    print(f'{field}: {sample_batch[field].shape}')

Sample batch shapes:
input_ids: torch.Size([2, 256])
attention_mask: torch.Size([2, 256])
labels: torch.Size([2, 256])


In [8]:
# Sanity check on the sampled batch: its largest token id must be smaller than
# the tokenizer's vocabulary size.
tokenizer_size = len(tokenizer)
max_token_id = sample_batch['input_ids'].max().item()

print(f'Maximum token ID in batch: {max_token_id}')
print(f'Tokenizer vocabulary size: {tokenizer_size}')

if max_token_id >= tokenizer_size:
    raise ValueError(f'Maximum token ID {max_token_id} is out of range for vocabulary size {tokenizer_size}')

Maximum token ID in batch: 50257
Tokenizer vocabulary size: 50258


In [9]:
# Build the model from the prepared config and move it to the chosen device.
model = AvaForCausalLM(config)
model = model.to(device)

# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

In [None]:
# Train for one epoch and save the weights. The save is reached only when
# train_model returns without raising.
try:
    train_kwargs = dict(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        num_epochs=1,
        device=device,
    )
    train_model(**train_kwargs)

    torch.save(model.state_dict(), 'ava_model_trained.pt')

# A manual interrupt only prints a short note.
except KeyboardInterrupt:
    print('🙄 As you wish, Sir!')

# Any other failure is reported with its full traceback; the cell itself
# does not re-raise.
except Exception as err:
    print(f'❌ Training error: {err}')
    traceback.print_exc()

✨ Starting training...
🍀 Epoch 1/1 | Batch 0/23 | Loss: 9.4651 | Time: 11.19s
🍀 Epoch 1/1 completed in 176.29s | Average Loss: 8.9878
💾 Checkpoint saved to checkpoints/ava_model_epoch_1.pt


In [None]:
# Show the attention-related dimensions stored on the config.
print(
    f'Config: hidden_size={config.hidden_size}, '
    f'num_attention_heads={config.num_attention_heads}, '
    f'head_dim={config.head_dim}'
)

In [None]:
# Smoke test: sample a continuation for a short chat-style prompt.
input_text = 'User: What is AI?\nAssistant:'
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Generate in evaluation mode and without autograd bookkeeping, then restore
# whatever mode the model was in so later cells are unaffected.
# NOTE(review): assumes AvaForCausalLM is a torch.nn.Module (it is used with
# .to(), .parameters() and .state_dict() above) -- confirm.
was_training = model.training
model.eval()

try:
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=100,
            temperature=0.7,
            top_p=0.9
        )

    print(tokenizer.decode(output[0]))
except Exception as e:
    # Report the failure with its traceback instead of aborting the notebook.
    print(f'❌ Generation error: {e}')
    traceback.print_exc()
finally:
    model.train(was_training)