# In [None]:
# Import necessary modules
import torch
from pathlib import Path
from litgpt import Config
from litgpt.data import TextFiles
from litgpt.pretrain import setup
from litgpt.args import EvalArgs, TrainArgs

# Define your custom model configuration
class CustomConfig(Config):
    """``Config`` preset for the small custom model used in this script.

    The preset values below are forwarded to ``Config.__init__``. Any keyword
    argument supplied by the caller is forwarded as well and takes precedence
    over the preset with the same name, e.g. ``CustomConfig(n_layer=12)``.
    """

    def __init__(self, **kwargs):
        # Build the presets as a dict first so caller-supplied values can
        # replace them. Passing both the literal keywords and **kwargs in one
        # call raises "got multiple values for keyword argument" as soon as a
        # caller tries to override a preset.
        settings = {
            "name": "custom_model",
            "n_layer": 6,
            "n_head": 6,
            "n_embd": 384,
            "block_size": 1024,
            "vocab_size": 25005,  # Adjust this based on your tokenizer
        }
        settings.update(kwargs)
        super().__init__(**settings)


# Set up the data module: plain-text files for training and validation.
data_module = TextFiles(
    train_data_path=Path("/content/drive/MyDrive/litgpt/input/train"),
    val_data_path=Path("/content/drive/MyDrive/litgpt/input/val"),  # Optional, can be None
    seed=42,
)

# Set up training arguments.
# NOTE: `max_steps` is intentionally not set. The litgpt pretraining script
# rejects `max_steps` (and `epochs`) during argument validation and raises a
# ValueError, so the length of the run is controlled by `max_tokens` alone.
# To target a given number of optimizer steps, set roughly
# max_tokens = steps * global_batch_size * block_size
# (e.g. 19073 * 32 * 1024 ~= 6.25e8) -- TODO confirm against your litgpt version.
train_args = TrainArgs(
    save_interval=1000,
    log_interval=1,
    global_batch_size=32,
    micro_batch_size=4,
    max_tokens=int(1e9),  # Adjust based on your dataset size and available compute
    max_norm=1.0,
    min_lr=4e-5,
    lr_warmup_steps=700,
    tie_embeddings=False,
)

# Set up evaluation arguments
eval_args = EvalArgs(interval=1000, max_iters=100)

# Run the pretraining setup.
# NOTE(review): some litgpt releases treat `model_name` and `model_config` as
# mutually exclusive -- if setup() complains, keep only `model_config`.
# NOTE(review): `tokenizer_dir` below is a placeholder and must be replaced
# with the directory that actually holds your tokenizer files.
setup(
    model_name="custom_model",
    model_config=CustomConfig(),
    data=data_module,
    train=train_args,
    eval=eval_args,
    out_dir=Path("./out/pretrain_custom"),
    precision="bf16-mixed",  # Adjust based on your hardware capabilities
    devices="auto",
    tokenizer_dir=Path("/path/to/your/tokenizer/directory"),  # If you have a custom tokenizer
)