<a href="https://colab.research.google.com/github/Saoudyahya/DeepSpeed-test-model/blob/main/DeepSpeed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import deepspeed
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler

class SimpleDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length',
                                  max_length=max_length, return_tensors='pt')

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

def get_args():
    parser = argparse.ArgumentParser(description='DeepSpeed Training Example')

    # Model and data arguments
    parser.add_argument('--model_name', type=str, default='gpt2', help='Pretrained model name')
    parser.add_argument('--train_batch_size', type=int, default=8, help='Training batch size')
    parser.add_argument('--max_seq_length', type=int, default=512, help='Maximum sequence length')
    parser.add_argument('--num_train_epochs', type=int, default=3, help='Number of training epochs')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help='Gradient accumulation steps')
    parser.add_argument('--learning_rate', type=float, default=5e-5, help='Learning rate')
    parser.add_argument('--output_dir', type=str, default='./outputs', help='Output directory')

    # DeepSpeed specific arguments
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()
    return args

def prepare_sample_data(tokenizer, num_samples=100):
    """Generate some dummy data for training."""
    samples = [
        "DeepSpeed is a deep learning optimization library for PyTorch.",
        "It enables training large models with billions of parameters.",
        "DeepSpeed offers optimization techniques like ZeRO, pipeline parallelism, and tensor slicing."
    ] * (num_samples // 3 + 1)
    return samples[:num_samples]

def train():
    args = get_args()

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForCausalLM.from_pretrained(args.model_name)

    # Prepare dataset
    train_texts = prepare_sample_data(tokenizer)
    train_dataset = SimpleDataset(train_texts, tokenizer, args.max_seq_length)
    train_loader = DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)

    # Optimizer
    optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)

    # Learning rate scheduler
    num_training_steps = len(train_loader) * args.num_train_epochs
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=num_training_steps
    )

    # DeepSpeed configuration
    ds_config = {
        "train_batch_size": args.train_batch_size * args.gradient_accumulation_steps,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": args.learning_rate,
                "weight_decay": 0.0,
                "torch_adam": True,
                "adam_w_mode": True
            }
        },
        "scheduler": {
            "type": "WarmupLR",
            "params": {
                "warmup_min_lr": 0,
                "warmup_max_lr": args.learning_rate,
                "warmup_num_steps": 100
            }
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True
        },
        "zero_optimization": {
            "stage": 2,
            "contiguous_gradients": True,
            "overlap_comm": True,
            "reduce_scatter": True,
            "reduce_bucket_size": 5e8,
            "allgather_bucket_size": 5e8
        }
    }

    # Initialize DeepSpeed
    model_engine, optimizer, _, lr_scheduler = deepspeed.initialize(
        args=args,
        model=model,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        config=ds_config
    )

    # Training loop
    model_engine.train()
    for epoch in range(args.num_train_epochs):
        print(f"Starting epoch {epoch+1}/{args.num_train_epochs}")

        for step, batch in enumerate(train_loader):
            # Move batch to device
            batch = {k: v.to(model_engine.device) for k, v in batch.items()}

            # Forward pass
            outputs = model_engine(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            loss = outputs.loss

            # Backward pass
            model_engine.backward(loss)

            # Update parameters
            model_engine.step()

            # Print metrics
            if step % 10 == 0:
                print(f"Epoch: {epoch+1}/{args.num_train_epochs}, Step: {step}, Loss: {loss.item():.4f}")

    # Save the model
    os.makedirs(args.output_dir, exist_ok=True)
    model_engine.save_checkpoint(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    print(f"Training complete. Model saved to {args.output_dir}")

if __name__ == "__main__":
    train()

ModuleNotFoundError: No module named 'deepspeed'

In [2]:
!pip install torch transformers datasets accelerate deepspeed

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting deepspeed
  Downloading deepspeed-0.16.4.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Do

In [3]:
# DeepSpeed Training in Google Colab
# First, install the necessary packages

# Run these installation commands in a cell:
'''
!pip install torch transformers datasets accelerate deepspeed
'''

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler
from datasets import load_dataset
import deepspeed
import argparse
from tqdm.auto import tqdm

# Set up DeepSpeed configuration
def get_deepspeed_config():
    """
    Returns a basic DeepSpeed configuration
    """
    return {
        "train_batch_size": 16,
        "gradient_accumulation_steps": 1,
        "optimizer": {
            "type": "AdamW",
            "params": {
                "lr": 5e-5,
                "betas": [0.9, 0.999],
                "eps": 1e-8,
                "weight_decay": 0.01
            }
        },
        "scheduler": {
            "type": "WarmupLR",
            "params": {
                "warmup_min_lr": 0,
                "warmup_max_lr": 5e-5,
                "warmup_num_steps": 100
            }
        },
        "zero_optimization": {
            "stage": 2,
            "offload_optimizer": {
                "device": "cpu",
                "pin_memory": True
            },
            "contiguous_gradients": True,
            "overlap_comm": True
        },
        "fp16": {
            "enabled": True
        }
    }

# Save DeepSpeed config to a file
with open('ds_config.json', 'w') as f:
    import json
    json.dump(get_deepspeed_config(), f, indent=4)

# Define a simple classifier model
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleClassifier, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.layers(x)

# Set up a dataset - here using the IMDB dataset as an example
def prepare_dataset():
    # Load IMDB dataset from Hugging Face datasets
    dataset = load_dataset("imdb")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    tokenized_datasets.set_format("torch")

    return tokenized_datasets, tokenizer

# Main training function
def main():
    parser = argparse.ArgumentParser()
    # Add arguments needed for DeepSpeed
    parser = deepspeed.add_config_arguments(parser)
    args = parser.parse_args([])  # In Colab, we'll pass args programmatically

    # Load and prepare data
    tokenized_datasets, tokenizer = prepare_dataset()
    train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=8, shuffle=True)
    eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8)

    # Load model - Using a pre-trained BERT model for sequence classification
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )

    # Initialize DeepSpeed
    model_engine, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model,
        model_parameters=model.parameters(),
        config=args.deepspeed_config
    )

    # Training loop
    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)

    # Progress bar
    progress_bar = tqdm(range(num_training_steps))

    model_engine.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move batch to device
            batch = {k: v.to(model_engine.device) for k, v in batch.items()}

            # Forward pass
            outputs = model_engine(**batch)
            loss = outputs.loss

            # Backward pass
            model_engine.backward(loss)
            model_engine.step()

            progress_bar.update(1)
            progress_bar.set_description(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    # Evaluation
    model_engine.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.to(model_engine.device) for k, v in batch.items()}
            outputs = model_engine(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    accuracy = correct / total
    print(f"Evaluation Accuracy: {accuracy:.4f}")

    # Save the model
    model_engine.save_checkpoint("./saved_model")
    print("Model saved to ./saved_model")

# Run the training
if __name__ == "__main__":
    # For Colab, we need to set some environment variables for DeepSpeed
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    os.environ["RANK"] = "0"
    os.environ["LOCAL_RANK"] = "0"
    os.environ["WORLD_SIZE"] = "1"

    # Setting explicit DeepSpeed config path
    import sys
    sys.argv.extend(["--deepspeed", "--deepspeed_config", "ds_config.json"])

    main()

[2025-03-06 01:51:54,187] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cpu (auto detect)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2025-03-06 01:54:38,554] [INFO] [logging.py:128:log_dist] [Rank -1] DeepSpeed info: version=0.16.4, git-hash=unknown, git-branch=unknown
[2025-03-06 01:54:38,555] [INFO] [comm.py:658:init_distributed] cdb=None
[2025-03-06 01:54:38,557] [INFO] [comm.py:689:init_distributed] Initializing TorchBackend in DeepSpeed with backend gloo


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Using /root/.cache/torch_extensions/py311_cu124 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py311_cu124/deepspeed_shm_comm...
Emitting ninja build file /root/.cache/torch_extensions/py311_cu124/deepspeed_shm_comm/build.ninja...
Building extension module deepspeed_shm_comm...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module deepspeed_shm_comm...


Time to load deepspeed_shm_comm op: 49.70282864570618 seconds
DeepSpeed deepspeed.ops.comm.deepspeed_shm_comm_op built successfully


AssertionError: DeepSpeed requires --deepspeed_config to specify configuration file