# Train文件

## 第1步 导入所需代码库

In [1]:
import json
import numpy as np
from pathlib import Path
import time
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from data_potential import CloudsDataset, ActiveLearningSampler
from model import RandLANet
from datetime import datetime
import os
from utils.metrics import compute_metrics
from train import evaluate

## 第2步 配置深度学习训练参数配

### 2.1 配置train参数
一个epoch里面训练的总点数为：batch_size * num_points_per_step

In [2]:
train_dict = {
    "epoch": 2,  # 训练总轮数
    "batch_size": 16,  # 训练的批次大小，即包含的点云份数
    "train_steps": 16,  # 这一个batch在一个epoch里训练多少次
    "val_steps": 4,  # 这一个batch在一个epoch里验证多少次，可以平滑验证结果
    "num_points_per_step": 10000,  # 每一份点云有多少个点
    "num_workers": 1, # 设置多少线程读取内存里的点云组织成batch
    "lr": 0.01,  # learning rate学习率
    "gpu": 0,  # 设置使用的gpu编号，默认0号gpu
    "checkpoints_dir": 'outputs/checkpoints',  # 网络模型输出目录
    "loggers_dir": 'outputs/loggers',  # 日志输出目录
    "save_freq": 5,  # 网络模型保存个数，只保存精度前五的网络模型，避免频繁输出开销
    "scheduler_gamma": 0.95,  # 调度器对学习率的衰减率
    "load_model_path": None,  # 可以加载之前训练的模型进来继续训练，输入模型的完整路径
}

### 2.2 配置network参数字典

In [3]:
network_dict = {
    "num_neighbors": 16,  # 在进行位置编码时搜索的邻域点个数
    "decimation": 4,  # 在encoding阶段下采样的比例，每次下采样采样点云个数除以4
}

### 2.3 配置data参数字典

In [4]:
data_dict = {
    "data_root": "E:/OpenGF",  # 原始数据根目录
    "num_classes": 2,  # 点云标注的类别数
    "grid_size": 1.0,  # 降采样格网尺寸，1.0代表1m * 1m的范围内取一个点
    "HasIntensity": True,  # 数据集是否有intensity属性，有也可以填False，表示不使用，默认使用
    "HasRGB": False  # 数据集是否有RGB属性，有也可以填False，表示不使用
}

### 2.4 合并字典

In [5]:
merged_dict = network_dict | train_dict | data_dict

## 第3步 创建logger和checkpoint文件夹

In [6]:
t0 = time.time()
now = datetime.now()
logs_dir = merged_dict["loggers_dir"]
logs_dir = os.path.join(logs_dir, f"{now.year}-{now.month}-{now.day}")
os.makedirs(logs_dir, exist_ok=True)
checkpoint_dir = os.path.join(merged_dict["checkpoints_dir"], f"{now.year}-{now.month}-{now.day}")
os.makedirs(checkpoint_dir, exist_ok=True)

## 第4步 载入数据集信息
打开记录数据集信息的metadata.json，获取class_weights信息

In [7]:
path = os.path.join(merged_dict["data_root"], f'processed-{merged_dict["grid_size"]:.2f}', 'metadata.json')
with open(path, 'rb') as f:
    data_raw = json.load(f)
data = torch.tensor(data_raw['class_weights'])
print(data)

tensor([96267835, 32387065])


## 第5步 设置训练设备
默认用GPU训练，在train.yaml文件下的gpu参数设置对应的显卡编号，默认为0号显卡

In [8]:
device = torch.device('cuda')
torch.cuda.set_device(int(merged_dict["gpu"]))
print("device:", device)

device: cuda


## 第6步 创建Train/Validation Dataloder
1. 创建dataset_train变量，将train数据导入内存
2. 创建train_sampler变量，对dataset_train里的数据进行采样
    - cfg.train.batch_size决定了一个batch有几份点云
    - cfg.train.train_steps决定了一个epoch有多少个batch
    - 设置transform=True，对train数据进行旋转、缩放、点云抖动，丰富样本
3. 创建train_loader变量
    - cfg.train.num_workers决定启用多少个子进程加载数据，与CPU核数有关
    - pin_memory=True,将数据预先加载到锁页内存（Pinned Memory） 中，避免数据被交换到硬盘（虚拟内存），从而显著提升从CPU内存到GPU显存的数据传输速度
    - drop_last=True,当数据集样本数无法被batch_size整除时，丢弃最后一个不完整的批次（样本数< batch_size），保证每个批次大小一致
4. 创建validation dataloader同上

In [9]:
dataset_train = CloudsDataset(merged_dict, split='train', transform=True)
train_sampler = ActiveLearningSampler(
    config=merged_dict,
    dataset=dataset_train,
    batch_size=merged_dict["batch_size"],
    step_size=merged_dict["train_steps"],
    split='train',
    transform=True
)
train_loader = DataLoader(
    train_sampler,
    batch_size=merged_dict["batch_size"],
    num_workers=merged_dict["num_workers"],
    pin_memory=True,
    drop_last=True
)
print(train_loader)
dataset_val = CloudsDataset(merged_dict, split='val')
val_sampler = ActiveLearningSampler(
    config=merged_dict,
    dataset=dataset_val,
    batch_size=merged_dict["batch_size"],
    step_size=merged_dict["val_steps"],
    split='val'
)
val_loader = DataLoader(
    val_sampler,
    batch_size=9,  # cfg.train.batch_size
    num_workers=merged_dict["num_workers"],
    pin_memory=True,
    drop_last=True
)

<torch.utils.data.dataloader.DataLoader object at 0x00000188250F3670>


## 第7步 初始化RandLANet model
- d_in 为输入特征维度，3:xyz 4:xyz+intensity 6:xyz+rgb 7:xyz+rgb+intensity
- num_classes 点云类别数
- num_neighbors 局部特征聚合模块邻近点搜索数
- decimation encoder部分下采样比例，decimation=4代表，每次取1/4
- device 决定是在GPU还是CPU，默认GPU

In [10]:
num_classes = merged_dict["num_classes"]
d_in = 3
if merged_dict["HasRGB"]:
    d_in += 3
if merged_dict["HasIntensity"]:
    d_in += 1
print("num of class:", d_in)
model = RandLANet(
    d_in,
    num_classes,
    num_neighbors=merged_dict["num_neighbors"],
    decimation=merged_dict["decimation"],
    device=device
).to(device)

num of class: 4


## 第8步  计算样本权重
以OpenGF为例，具有地面点和非地面点两类标签，势必导致样本不均衡，为了给予小样本足够关注，计算loss时赋予其较大权重

In [11]:
print(f"模型设备: {next(model.parameters()).device}")  # 输出应为 cuda:x

print('Computing weights...', end='\t')

frequency = data / torch.sum(data)
weights = 1.0 / torch.sqrt(frequency)
weights = weights.to(torch.float).to(device)
print(weights)

模型设备: cuda:0
Computing weights...	tensor([1.1560, 1.9931], device='cuda:0')


## 第9步 声明损失函数、优化器、调度器
- 损失函数用交叉熵损失函数CrossEntropyLoss
    - weights为样本权重
- 优化器为Adam
    - cfg.train.lr 为训练学习率，初始默认0.01
- 调度器为ExponentialLR，每轮按照固定比例衰减学习率
    -  cfg.train.scheduler_gamma 学习率衰减比例，初始为0.95

In [12]:
criterion = nn.CrossEntropyLoss(weight=weights).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=merged_dict["lr"])
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, merged_dict["scheduler_gamma"])

## 第10步 加载预训练模型
cfg.train.load中设置预训练模型路径

In [13]:
if merged_dict["load_model_path"]:
    load_path = Path(merged_dict["load_model_path"])
    path = max(list(load_path.glob('*.pth')))
    print(f'Loading {path}...')
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
print(merged_dict["load_model_path"])

None


## 第11步 遍历epoch训练
统计每轮的loss、accuracy、iou
### 11.1 遍历Dataloder中的batch
1. 一个batch一个batch的送入网络计算预测结果，与点云原始标签计算loss
2. 根据loss沿梯度方向反向传播
3. 收集每个batch的accuracy、iou、loss计算结果
### 11.2 统计一个epoch的训练和验证accuracy和iou
evaluate与train的训练过程相似，但无需梯度和反向传播，将train每个epoch导入的model进行验证，统计在验证数据集的accuracy和iou
### 11.3 记录每个epoch的信息写入log
### 11.4 记录每个epoch的网络模型，将验证集accuracy和miou最高的5个保存
为防止训练中断没记录网络模型，每一个epoch都会排序出精度前5的模型，有新模型就写入硬盘，删除被替代的模型

In [14]:
print(f"当前设备: {torch.cuda.current_device()} → {torch.cuda.get_device_name()}")
top_checkpoints = []
epochs = int(merged_dict["epoch"])
for epoch in range(1, epochs + 1):
    print(f'====== EPOCH {epoch:d}/{epochs:d} ======')
    t0 = time.time()
    # Train
    model.train()
    # metrics
    losses = []
    accuracies = []
    ious = []
    '''11.1 遍历Dataloder中的batch'''
    print(train_loader)
    for batch_idx, points in enumerate(train_loader):
        points = {k: v.to(device, non_blocking=True) for k, v in points.items()}
        feature = points['xyz']
        labels = points['class'].long()

        if merged_dict["HasRGB"]:
            rgb = points['rgb']
            feature = torch.cat((feature, rgb), dim=2)
        if merged_dict["HasIntensity"]:
            intensity = points['intensity'].unsqueeze(-1)
            feature = torch.cat((feature, intensity), dim=2)

        optimizer.zero_grad()

        scores = model(feature)
        loss = criterion(scores, labels)
        loss.backward()
        optimizer.step()

        losses.append(loss.cpu().item())
        if batch_idx % 10 == 0:  # 每10batch计算一次指标
            acc, miou = compute_metrics(scores, labels, num_classes)
            accuracies.append(acc)
            ious.append(miou)

    scheduler.step()
    '''11.2 统计一个epoch的训练和验证accuracy和iou'''
    accs = np.nanmean(np.array(accuracies), axis=0)
    ious = np.nanmean(np.array(ious), axis=0)
    val_loss, val_accs, val_ious = evaluate(
        model,
        val_loader,
        criterion,
        device,
        num_classes,
        merged_dict
    )

    loss_dict = {
        'Training loss': np.mean(losses),
        'Validation loss': val_loss
    }
    acc_dicts = {
        'Training accuracy': accs,
        'Validation accuracy': val_accs
    }
    iou_dicts = {
        'Training accuracy': ious,
        'Validation accuracy': val_ious
    }

    t1 = time.time()
    d = t1 - t0

    print('Accuracy     ', '   OA', sep=' | ')
    print('Training:    ', *[f'{accs:.3f}' if not np.isnan(accs) else '  nan'], sep=' | ')
    print('Validation:  ', *[f'{val_accs:.3f}' if not np.isnan(accs) else '  nan'], sep=' | ')
    print('----------------------')
    print('MIoU         ', ' mIoU', sep=' | ')
    print('Training:    ', *[f'{ious:.3f}' if not np.isnan(ious) else '  nan'], sep=' | ')
    print('Validation:  ', *[f'{val_ious:.3f}' if not np.isnan(ious) else '  nan'], sep=' | ')
    for k, v in loss_dict.items():
        print(f'{k}: {v:.7f}', end='\n')
    print('Time elapsed:', '{:.0f} s'.format(d) if d < 60 else '{:.0f} min {:02.0f} s'.format(*divmod(d, 60)))
    print('')
    '''11.3 记录每个epoch的信息写入log'''
    with SummaryWriter(logs_dir) as writer:
        # send results to tensorboard
        writer.add_scalars('Loss', loss_dict, epoch)

        writer.add_scalars('Per-class accuracy/Overall', acc_dicts, epoch)
        writer.add_scalars('Per-class IoU/Mean IoU', iou_dicts, epoch)

    '''11.4 记录每个epoch的网络模型，将验证集accuracy和miou最高的5个保存'''
    checkpoint_path = os.path.join(checkpoint_dir,
                                       f'checkpoint_{epoch:02d}_ACC_{val_accs:.2f}_MIOU_{val_ious:.2f}.pth')

    # 更新Top-5 Checkpoints
    if len(top_checkpoints) < 5 or val_ious > min(top_checkpoints, key=lambda x: x[0])[0]:
        path = checkpoint_path
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'mIoU': val_ious
        }, path)

        top_checkpoints.append((val_ious, path))
        top_checkpoints.sort(reverse=True, key=lambda x: x[0])
        # 清理旧checkpoint
        while len(top_checkpoints) > 5:
            _, old_path = top_checkpoints.pop()
            os.remove(old_path)

        top_checkpoints = top_checkpoints[:5]

当前设备: 0 → NVIDIA GeForce RTX 3080
<torch.utils.data.dataloader.DataLoader object at 0x00000188250F3670>




Accuracy      |    OA
Training:     | 0.649
Validation:   | 0.689
----------------------
MIoU          |  mIoU
Training:     | 0.448
Validation:   | 0.478
Training loss: 0.6104987
Validation loss: 0.7476123
Time elapsed: 35 s

<torch.utils.data.dataloader.DataLoader object at 0x00000188250F3670>
Accuracy      |    OA
Training:     | 0.800
Validation:   | 0.798
----------------------
MIoU          |  mIoU
Training:     | 0.618
Validation:   | 0.572
Training loss: 0.5092427
Validation loss: 0.4919806
Time elapsed: 28 s



## 第12步 打印所花时间

In [15]:
t1 = time.time()
d = t1 - t0
print('Done. Time elapsed:', '{:.0f} s.'.format(d) if d < 60 else '{:.0f} min {:.0f} s.'.format(*divmod(d, 60)))

Done. Time elapsed: 28 s.
