# Test multi-node, multi-GPU training with DeepSpeed and Ray
#
# This notebook requires access to a running Ray cluster with GPUs available on each node. Make sure the cluster is started and that the notebook kernel is connected to it before executing any code.
#
# Execute the cells sequentially to launch the training job using Ray and DeepSpeed.
#
# Key configuration files:
# - `config/train_config.yaml` – overall training parameters.
# - `config/ds_config.json` – DeepSpeed ZeRO strategy settings.

In [None]:
import torch
import pytorch_lightning as pl
from pytorch_lightning.strategies import DeepSpeedStrategy
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from ray.train.lightning import RayLightningTrainer
from ray.train import ScalingConfig


# LightningModule for training your Hugging Face LLM
class LLMModel(pl.LightningModule):
    """Lightning wrapper around a Hugging Face causal language model.

    The wrapped network is loaded with ``AutoModelForCausalLM.from_pretrained``
    and trained with AdamW at a fixed learning rate.

    Args:
        model_name: Identifier handed to ``from_pretrained``.
        learning_rate: Learning rate handed to AdamW.
    """

    def __init__(self, model_name, learning_rate=1e-5):
        super().__init__()
        # Records the constructor arguments; configure_optimizers reads
        # learning_rate back through self.hparams.
        self.save_hyperparameters()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the loss it reports.

        The input ids are reused as the labels for the language-modeling
        objective.
        """
        # NOTE(review): labels are the raw input_ids, so any padded positions
        # also count toward the loss -- consider masking them (confirm the
        # intended objective before changing).
        result = self.model(
            labels=input_ids,
            attention_mask=attention_mask,
            input_ids=input_ids,
        )
        return result.loss

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log it as ``train_loss``."""
        # The batch mapping is unpacked straight into forward(), so its keys
        # must match forward's parameter names.
        step_loss = self(**batch)
        self.log("train_loss", step_loss, prog_bar=True, on_step=True)
        return step_loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all module parameters."""
        lr = self.hparams.learning_rate
        optimizer = torch.optim.AdamW(params=self.parameters(), lr=lr)
        return optimizer


# Dataset preparation
def prepare_dataset(tokenizer, seq_length=512, batch_size=2):
    """Build a shuffled training DataLoader over tokenized WikiText-2.

    Args:
        tokenizer: Hugging Face tokenizer used to encode the ``text`` column.
            If it has no padding token, its EOS token is assigned as the
            padding token (this mutates the tokenizer in place).
        seq_length: Every sample is padded or truncated to this many tokens.
        batch_size: Number of samples per batch produced by the DataLoader.

    Returns:
        A ``DataLoader`` whose batches expose the ``input_ids`` and
        ``attention_mask`` columns as torch tensors.

    Raises:
        ValueError: If the tokenizer has neither a padding token nor an EOS
            token to fall back on.
    """
    # Tokenizers such as GPT-2's ship without a pad token, and
    # padding="max_length" fails without one; reuse EOS as padding.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is None:
            raise ValueError(
                "Tokenizer has no pad_token and no eos_token to use as padding."
            )
        tokenizer.pad_token = tokenizer.eos_token

    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

    # Blank lines would become samples made entirely of padding; drop them.
    dataset = dataset.filter(lambda example: bool(example["text"].strip()))

    def tokenize_fn(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=seq_length,
        )

    tokenized_dataset = dataset.map(tokenize_fn, batched=True)
    # Only expose the tensors the model's forward() accepts.
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

    return DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True)


# Training function (Ray calls this internally)
def train_func(config):
    """Per-worker training loop: fine-tune a causal LM with Lightning + DeepSpeed.

    Args:
        config: Optional mapping of overrides. Recognized keys:
            ``model_name`` (default ``"gpt2"``), ``ds_config`` (path to, or
            dict with, the DeepSpeed configuration; default
            ``"ds_config.json"``) and ``max_epochs`` (default ``3``).
            ``None`` or an empty mapping selects all defaults.
    """
    # Imported locally so this function carries its own Ray dependencies.
    # NOTE(review): Ray's Lightning utilities must resolve to the same
    # Lightning package this file imports (`pytorch_lightning`) -- confirm in
    # environments where `lightning` is installed as well.
    from ray.train.lightning import (
        RayDeepSpeedStrategy,
        RayLightningEnvironment,
        prepare_trainer,
    )

    config = config or {}
    model_name = config.get("model_name", "gpt2")  # replace with your desired LLM
    ds_config = config.get("ds_config", "ds_config.json")
    max_epochs = config.get("max_epochs", 3)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_loader = prepare_dataset(tokenizer)

    model = LLMModel(model_name=model_name)

    # DeepSpeed behavior (ZeRO stage, offload, ...) comes entirely from
    # ds_config. NOTE(review): a relative path is resolved against the
    # worker's working directory -- confirm the file is reachable there.
    strategy = RayDeepSpeedStrategy(config=ds_config)

    # Ray Train starts one process per worker and assigns its GPU, so
    # Lightning must not count devices or launch processes itself:
    # devices="auto" plus RayLightningEnvironment defers to Ray.
    trainer = pl.Trainer(
        strategy=strategy,
        accelerator="gpu",
        devices="auto",
        plugins=[RayLightningEnvironment()],
        precision=16,
        max_epochs=max_epochs,
    )
    # Validates that the trainer is configured for running under Ray Train.
    trainer = prepare_trainer(trainer)

    trainer.fit(model, train_loader)


# Main entry-point (Ray handles the distributed training orchestration)
if __name__ == "__main__":
    # TorchTrainer is Ray Train's entry point for running a per-worker
    # training function. It receives the function at construction time and
    # fit() takes no arguments.
    # NOTE(review): the RayLightningTrainer constructor/fit(train_func) usage
    # this replaces does not match a Ray Train API I could identify --
    # confirm against the installed Ray version.
    from ray.train.torch import TorchTrainer

    scaling_config = ScalingConfig(
        num_workers=8,  # total number of GPUs across your nodes
        use_gpu=True,
        resources_per_worker={"CPU": 8, "GPU": 1},
    )

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={},  # forwarded to train_func as its `config`
        scaling_config=scaling_config,
        run_config=None,
    )

    # Blocks until every worker has finished (or one has failed).
    result = trainer.fit()
