# ResNet with Colossal-AI

Build the environment for Colab

In [1]:
# Mount Google Drive at /content/gdrive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount("/content/gdrive")
# List the current working directory to confirm the mount point is present.
!ls

Mounted at /content/gdrive
gdrive	sample_data


In [2]:
import os
# Assignment folder on the mounted Drive.
# NOTE(review): this is an absolute, user-specific path; adjust it for a different Drive layout.
path = "/content/gdrive/My Drive/NUS/CS5260/Assignment/6"
# Make it the working directory so relative paths used later (requirements.txt, ./data) resolve here.
os.chdir(path)
# Last expression of the cell: displays the folder contents.
os.listdir(path)

['requirements.txt', 'data', 'Resnet-ColossalAI.ipynb']

In [3]:
!pip install -r requirements.txt

Collecting colossalai (from -r requirements.txt (line 1))
  Downloading colossalai-0.3.6.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pre-commit (from colossalai->-r requirements.txt (line 1))
  Downloading pre_commit-3.7.0-py2.py3-none-any.whl (204 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.2/204.2 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting fabric (from colossalai->-r requirements.txt (line 1))
  Downloading fabric-3.2.2-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting contexttimer (from colossalai->-r requirements.txt (line 1))
  Downloading contexttimer-0.3.3.tar.gz (4.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ninja (from colossalai->-r require

Build and train the model

In [4]:
import argparse
import os
from pathlib import Path

import torch
import torch.distributed as dist
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.optim import Optimizer
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from tqdm import tqdm

import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
from colossalai.booster.plugin.dp_plugin_base import DPPluginBase
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam

  _register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)


In [5]:
# ==============================
# Prepare Hyperparameters
# ==============================
# Number of passes over the training set; also shown in the progress-bar label of train_epoch.
NUM_EPOCHS = 40
# Base learning rate; it is multiplied by the world size before being handed to the optimizer.
LEARNING_RATE = 1e-3

In [6]:
def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
    """Create the CIFAR-10 training and test dataloaders.

    Args:
        batch_size: Batch size used for both loaders.
        coordinator: Supplies ``priority_execution()``, the context in which
            both datasets are constructed (with ``download=True``).
            Presumably this serialises the download across processes;
            confirm against the DistCoordinator documentation.
        plugin: Its ``prepare_dataloader`` method builds the loaders.

    Returns:
        A ``(train_dataloader, test_dataloader)`` tuple. The training loader
        shuffles and drops the last incomplete batch; the test loader does
        neither.
    """
    # Dataset location: the DATA environment variable, falling back to ./data.
    dataset_root = os.environ.get("DATA", "./data")

    # Training images are padded, randomly flipped and randomly cropped to 32
    # before conversion to tensors; test images are only converted.
    augment = transforms.Compose(
        [
            transforms.Pad(4),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32),
            transforms.ToTensor(),
        ]
    )
    to_tensor = transforms.ToTensor()

    def _cifar10(train, transform):
        # Build one CIFAR-10 split rooted at dataset_root.
        return torchvision.datasets.CIFAR10(root=dataset_root, train=train, transform=transform, download=True)

    with coordinator.priority_execution():
        cifar_train = _cifar10(True, augment)
        cifar_test = _cifar10(False, to_tensor)

    train_loader = plugin.prepare_dataloader(cifar_train, batch_size=batch_size, shuffle=True, drop_last=True)
    test_loader = plugin.prepare_dataloader(cifar_test, batch_size=batch_size, shuffle=False, drop_last=False)
    return train_loader, test_loader

In [7]:
@torch.no_grad()
def evaluate(model: nn.Module, test_dataloader: DataLoader, coordinator: DistCoordinator) -> float:
    """Compute the top-1 accuracy of ``model`` over ``test_dataloader``.

    Per-process counts of correct and seen samples are summed across all
    processes with ``dist.all_reduce`` before the ratio is taken, so every
    process returns the same value.

    Args:
        model: Network to evaluate. It is switched to eval mode and left there.
        test_dataloader: Yields ``(images, labels)`` batches.
        coordinator: Used only to restrict the printed summary to the master process.

    Returns:
        Fraction of correctly classified samples in ``[0, 1]``; ``0.0`` when
        the dataloader yields no samples.
    """
    model.eval()
    # Use the accelerator's device for the batches as well as the counters,
    # rather than hard-coding CUDA for one and not the other.
    device = get_accelerator().get_current_device()
    correct = torch.zeros(1, dtype=torch.int64, device=device)
    total = torch.zeros(1, dtype=torch.int64, device=device)
    for images, labels in test_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        # Accumulate on-device; calling .item() here would force a host sync every batch.
        correct += (predicted == labels).sum()
    dist.all_reduce(correct)
    dist.all_reduce(total)
    num_samples = total.item()
    # Guard against an empty test set instead of raising ZeroDivisionError.
    accuracy = correct.item() / num_samples if num_samples else 0.0
    if coordinator.is_master():
        print(f"Accuracy of the model on the test images: {accuracy * 100:.2f} %")
    return accuracy


def train_epoch(
    epoch: int,
    model: nn.Module,
    optimizer: Optimizer,
    criterion: nn.Module,
    train_dataloader: DataLoader,
    booster: Booster,
    coordinator: DistCoordinator,
):
    """Run one pass over ``train_dataloader``, stepping the optimizer after every batch.

    Args:
        epoch: Zero-based epoch index; shown one-based in the progress-bar label.
        model: Network to train. It is switched to train mode.
        optimizer: Optimizer stepped and zeroed after each batch.
        criterion: Loss applied to ``(outputs, labels)``.
        train_dataloader: Yields ``(images, labels)`` batches.
        booster: Its ``backward(loss, optimizer)`` performs the backward pass.
        coordinator: Used to show the progress bar on the master process only.
    """
    model.train()
    # Resolve the device through the accelerator abstraction instead of hard-coding CUDA.
    device = get_accelerator().get_current_device()
    with tqdm(train_dataloader, desc=f"Epoch [{epoch + 1}/{NUM_EPOCHS}]", disable=not coordinator.is_master()) as pbar:
        for images, labels in pbar:
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass goes through the booster, then the parameter update.
            booster.backward(loss, optimizer)
            optimizer.step()
            optimizer.zero_grad()

            # Show the most recent batch loss on the progress bar.
            pbar.set_postfix({"loss": loss.item()})

In [8]:
# Single-process rendezvous settings so Colossal-AI can be initialised from a
# notebook. setdefault keeps any value a launcher has already exported
# instead of overwriting it.
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("LOCAL_RANK", "0")
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")

# ==============================
# Launch Distributed Environment
# ==============================
# Guarded so that re-running this cell does not initialise the process group twice.
if not dist.is_initialized():
    colossalai.launch_from_torch(config={})
coordinator = DistCoordinator()

# Scale the base learning rate by the number of processes. A separate name is
# used so that re-running this cell does not compound the scaling on
# LEARNING_RATE itself.
scaled_lr = LEARNING_RATE * coordinator.world_size

# ==============================
# Instantiate Plugin and Booster
# ==============================
booster_kwargs = {}
plugin = TorchDDPPlugin()

booster = Booster(plugin=plugin, **booster_kwargs)

# ==============================
# Prepare Dataloader
# ==============================
train_dataloader, test_dataloader = build_dataloader(100, coordinator, plugin)

# ====================================
# Prepare model, optimizer, criterion
# ====================================
# ResNet-18 with a 10-class output layer
model = torchvision.models.resnet18(num_classes=10)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
# Alternative optimizer: HybridAdam(model.parameters(), lr=scaled_lr)
optimizer = torch.optim.Adam(model.parameters(), lr=scaled_lr)

# lr scheduler
# NOTE(review): with NUM_EPOCHS = 40 only the milestone at 20 changes the rate
# while training is still running; the later milestones are never reached.
lr_scheduler = MultiStepLR(optimizer, milestones=[20, 40, 60, 80], gamma=1 / 3)

# ==============================
# Boost with ColossalAI
# ==============================
model, optimizer, criterion, _, lr_scheduler = booster.boost(
    model, optimizer, criterion=criterion, lr_scheduler=lr_scheduler
)

# ==============================
# Train model
# ==============================
start_epoch = 0
for epoch in range(start_epoch, NUM_EPOCHS):
    train_epoch(epoch, model, optimizer, criterion, train_dataloader, booster, coordinator)
    lr_scheduler.step()

# Evaluate once after the final epoch.
accuracy = evaluate(model, test_dataloader, coordinator)
print(f"Test Accuracy: {accuracy *100:.2f} %")




Files already downloaded and verified
Files already downloaded and verified


Epoch [1/40]: 100%|██████████| 500/500 [00:35<00:00, 14.25it/s, loss=1.42]
Epoch [2/40]: 100%|██████████| 500/500 [00:34<00:00, 14.56it/s, loss=1.19]
Epoch [3/40]: 100%|██████████| 500/500 [00:34<00:00, 14.40it/s, loss=1.04]
Epoch [4/40]: 100%|██████████| 500/500 [00:34<00:00, 14.44it/s, loss=0.977]
Epoch [5/40]: 100%|██████████| 500/500 [00:34<00:00, 14.70it/s, loss=0.928]
Epoch [6/40]: 100%|██████████| 500/500 [00:34<00:00, 14.43it/s, loss=0.848]
Epoch [7/40]: 100%|██████████| 500/500 [00:33<00:00, 14.72it/s, loss=0.828]
Epoch [8/40]: 100%|██████████| 500/500 [00:34<00:00, 14.58it/s, loss=0.724]
Epoch [9/40]: 100%|██████████| 500/500 [00:34<00:00, 14.59it/s, loss=0.73]
Epoch [10/40]: 100%|██████████| 500/500 [00:33<00:00, 14.85it/s, loss=0.691]
Epoch [11/40]: 100%|██████████| 500/500 [00:34<00:00, 14.54it/s, loss=0.78]
Epoch [12/40]: 100%|██████████| 500/500 [00:33<00:00, 14.78it/s, loss=0.724]
Epoch [13/40]: 100%|██████████| 500/500 [00:34<00:00, 14.60it/s, loss=0.58]
Epoch [14/40]:

Accuracy of the model on the test images: 83.42 %
Test Accuracy: 83.42 %
