In [2]:
import os
import torch
import torch.nn as nn
import tempfile
import torch.distributed as dist
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
for key, value in os.environ.items():
    print(key, value)

COMMAND_MODE unix2003
CONDA_DEFAULT_ENV Taitanic
CONDA_EXE /Users/cuiyaodong/opt/miniconda3/bin/conda
CONDA_PREFIX /Users/cuiyaodong/opt/miniconda3/envs/Taitanic
CONDA_PROMPT_MODIFIER (Taitanic) 
CONDA_PYTHON_EXE /Users/cuiyaodong/opt/miniconda3/bin/python
CONDA_SHLVL 2
HOME /Users/cuiyaodong
HOMEBREW_CELLAR /opt/homebrew/Cellar
HOMEBREW_PREFIX /opt/homebrew
HOMEBREW_REPOSITORY /opt/homebrew
INFOPATH /opt/homebrew/share/info:
LESS -R
LOGNAME cuiyaodong
LSCOLORS Gxfxcxdxbxegedabagacad
LaunchInstanceID 49B93CDB-0AC1-4019-960B-45FE648FF010
MANPATH /opt/homebrew/share/man::
MallocNanoZone 0
OLDPWD /
ORIGINAL_XDG_CURRENT_DESKTOP undefined
PAGER cat
PATH /Users/cuiyaodong/opt/miniconda3/envs/Taitanic/bin:/Users/cuiyaodong/opt/miniconda3/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Library/TeX/texbin:/Library/Apple/usr/bin
PWD /
SECURITYSESSIONID 186b2
SHELL /bin/zsh
SHLVL 2
SSH_AUTH_SOCK /private/tmp/com.apple.launc

## 基本使用

In [6]:
# 设置环境变量并初始化进程组
def set_up(rank, world_size):
    # 设置环境变量 
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12345'
    
    # 初始化进程组
    dist.init_process_group(backend='gloo',
                            rank=rank,
                            world_size=world_size)
# 设置训练结束后的进程终止函数
def clean_up():
    dist.destroy_process_group()

In [7]:
# 定义ToyModel的模型
class ToyModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(input_size, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, output_size)
        
    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))

In [9]:
# 定义训练函数，此处为多机多进程的训练
def basic_demo(rank, world_size):
    print('Currently is running on rank:', rank)
    set_up(rank, world_size)
    
    # 定义模型并将它放置于对应的GPU上，并用DDP包装为ddp模型
    model = ToyModel(10, 10).to(rank)
    ddp_model = DDP(model, device_ids=[rank])
       
    # 定义损失函数和优化器
    loss_fun = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # 定义训练的过程
    optimizer.zero_grad()
    data, label = torch.randn(20, 10).to(rank), torch.randn(20, 10).to(rank)
    loss = loss_fun(ddp_model(data), label)
    loss.backward()
    optimizer.step()
    
    clean_up()
    
# 采用mp.spawn进行进程的创建，并设置nprocs，join为True时默认等待所有进程结束后才继续执行
def run_basic(demo_fn, world_size):
    mp.spawn(fn=demo_fn,
            nprocs=world_size,
            args=(world_size,),
            join=True)

In [14]:
torch.cuda.device_count()

0

In [3]:
torch.cuda.is_available()

False

In [6]:
torch.cuda.device_count()

0