In [1]:
from torch.utils.data import Dataset
import os
from data_prep.BridgeDataLoader import LWBridgeDataset
import torch
import datetime
from pathlib import Path
from tqdm import tqdm
import numpy as np
import importlib
import models.pointnet2_sem_seg_msg as MODEL
from tool_utils.tool_utils import *
import provider
from torch.utils.tensorboard import SummaryWriter
import time

In [2]:
# Directory the notebook kernel was started from.
rootpath = os.getcwd()

# Root folder of the bridge point-cloud dataset.  The historical location is
# kept as the default so existing runs behave the same; set the
# BRIDGE_DATA_ROOT environment variable to point at the data on another machine
# instead of editing this cell.
datapath = os.environ.get(
    'BRIDGE_DATA_ROOT',
    '/home/lycoricy/work/Pointcloud-bridge/data/bridge-5cls-fukushima',
)



## Defining the component label

In [3]:
# Component names, ordered by their integer label.
classes = ['abutment', 'girder', 'deck', 'parapet', 'noise']

# name -> label, e.g. {'abutment': 0, 'girder': 1, 'deck': 2, 'parapet': 3, 'noise': 4}
class2label = dict(zip(classes, range(len(classes))))
seg_classes = class2label

# label -> name, e.g. {0: 'abutment', 1: 'girder', 2: 'deck', 3: 'parapet', 4: 'noise'}
seg_label_to_cat = dict(enumerate(seg_classes))

# Three-component colour per component name (presumably RGB in 0-255 -- confirm).
class2color = {
    'abutment': [229, 158, 221],
    'girder': [0, 11, 195],
    'deck': [173, 219, 225],
    'parapet': [230, 0, 0],
    'noise': [0, 169, 58],
}
# Same colours keyed by integer label.
label2color = {label: class2color[name] for label, name in enumerate(classes)}

## Defining the parameters


In [4]:
# Configuration

class Config:
    # Hyper-parameters and run options for semantic-segmentation training.
    model = 'pointnet2_sem_seg_msg'  # name passed to the logging setup
    batch_size = 24                  # samples per mini-batch
    epoch = 128                      # total number of training epochs
    learning_rate = 0.001            # initial learning rate
    gpu = '0'                        # value written to CUDA_VISIBLE_DEVICES
    optimizer = 'Adam'               # 'Adam' selects Adam; anything else selects SGD
    log_dir = None                   # experiment folder name; None -> timestamp
    decay_rate = 1e-4                # weight decay given to Adam
    npoint = 4096                    # points sampled per block
    step_size = 10                   # epochs between learning-rate decays
    lr_decay = 0.7                   # multiplicative learning-rate decay factor

config = Config()

NUM_CLASSES = 5  # Adjust based on your dataset
# Derived from Config so the batch size and point count are declared only once
# and the data loaders cannot drift from the training configuration.
NUM_POINT = config.npoint
BATCH_SIZE = config.batch_size
ROOT_DIR = '.'  # Adjust this to your project root directory

In [5]:
# Create directories
def create_directories():
    """Create the folder tree for one experiment run.

    The run folder is ``<ROOT_DIR>/log/sem_seg/<name>``, where ``<name>`` is
    ``config.log_dir`` when that is set and a ``YYYY-MM-DD_HH-MM`` timestamp
    otherwise.  ``checkpoints`` and ``logs`` sub-folders are created inside it.
    Folders that already exist are reused.

    Returns:
        Tuple ``(experiment_dir, checkpoints_dir, log_dir)`` of ``Path`` objects.
    """
    run_name = config.log_dir or datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
    experiment_dir = Path(ROOT_DIR).joinpath('log', 'sem_seg', run_name)
    experiment_dir.mkdir(parents=True, exist_ok=True)

    created = []
    for folder in ('checkpoints', 'logs'):
        sub_dir = experiment_dir / folder
        sub_dir.mkdir(exist_ok=True)
        created.append(sub_dir)

    checkpoints_dir, log_dir = created
    return experiment_dir, checkpoints_dir, log_dir

## load datasets

In [6]:
# Load datasets
def load_datasets(root):
    """Build the train/test datasets and their data loaders.

    Args:
        root: Dataset root directory, forwarded to ``LWBridgeDataset`` as
            ``data_root``.

    Returns:
        Tuple ``(TRAIN_DATASET, TEST_DATASET, trainDataLoader, testDataLoader)``.
    """
    def _make_dataset(split):
        # Both splits share every setting except the split name.
        return LWBridgeDataset(split=split, data_root=root, num_point=NUM_POINT,
                               block_size=1.0, sample_rate=1.0, num_class=NUM_CLASSES)

    print("Loading training data...")
    TRAIN_DATASET = _make_dataset('train')
    print("Loading test data...")
    TEST_DATASET = _make_dataset('test')

    # NOTE(review): drop_last=True is applied to the test loader as well, so an
    # incomplete final test batch is never evaluated -- confirm this is intended.
    shared_options = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True, drop_last=True)
    trainDataLoader = torch.utils.data.DataLoader(TRAIN_DATASET, shuffle=True, **shared_options)
    testDataLoader = torch.utils.data.DataLoader(TEST_DATASET, shuffle=False, **shared_options)

    return TRAIN_DATASET, TEST_DATASET, trainDataLoader, testDataLoader

In [7]:
# Initialize models
def initialize_model(num_classes, experiment_dir):
    """Build the network and loss, resuming from ``best_model.pth`` when possible.

    Checkpoints written by this notebook are dictionaries with the keys
    ``'epoch'``, ``'model_state_dict'`` and ``'optimizer_state_dict'``.  Such a
    dictionary is unpacked here: the weights are loaded and training resumes at
    the epoch after the one stored.  A file that holds a bare state dict is
    still accepted and starts at epoch 0.  The optimizer state is not restored
    because the optimizer is created by the caller.

    Args:
        num_classes: Number of segmentation classes passed to ``MODEL.get_model``.
        experiment_dir: Run folder; the checkpoint is looked up at
            ``<experiment_dir>/checkpoints/best_model.pth``.

    Returns:
        Tuple ``(classifier, criterion, start_epoch)``.  ``start_epoch`` is 0
        when no usable checkpoint was found.
    """
    classifier = MODEL.get_model(num_classes).cuda()
    criterion = MODEL.get_loss().cuda()

    checkpoint_path = Path(experiment_dir) / 'checkpoints' / 'best_model.pth'
    start_epoch = 0

    try:
        # weights_only=True restricts unpickling to tensors and plain
        # containers; map_location puts the tensors straight on the GPU.
        checkpoint = torch.load(
            str(checkpoint_path),
            weights_only=True,
            map_location=torch.device('cuda'),
        )

        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            # Full training checkpoint: unwrap the weights and continue after
            # the epoch that produced them.
            state_dict = checkpoint['model_state_dict']
            start_epoch = int(checkpoint.get('epoch', -1)) + 1
        else:
            # Bare state dict: no epoch information is available.
            state_dict = checkpoint

        classifier.load_state_dict(state_dict)
        print('Use pretrained model')
    except Exception as e:
        # Best effort: a missing or unreadable checkpoint is reported and
        # training starts from freshly initialised weights.
        print(f'Error loading model: {e}')
        print('Starting training from scratch...')
        start_epoch = 0
        classifier.apply(weights_init)

    return classifier, criterion, start_epoch

def weights_init(m):
    """Initialise one module in place; meant for ``model.apply(weights_init)``.

    Modules whose class name contains ``Conv2d`` or ``Linear`` get
    Xavier-normal weights and a zero bias.  A layer without a bias
    (``bias=False``) has only its weights initialised.  All other modules are
    left untouched.

    Args:
        m: The ``torch.nn.Module`` to initialise.
    """
    classname = m.__class__.__name__
    if classname.find('Conv2d') != -1 or classname.find('Linear') != -1:
        # Refer to torch.nn.init through the imported ``torch`` package so the
        # function does not depend on a separate ``nn`` alias being in scope.
        torch.nn.init.xavier_normal_(m.weight.data)
        if getattr(m, 'bias', None) is not None:
            torch.nn.init.constant_(m.bias.data, 0.0)

In [8]:
# Training function
def train(classifier, criterion, optimizer, trainDataLoader, logger):
    """Run one training epoch.

    Args:
        classifier: Segmentation network, called as ``classifier(points)`` with
            a channels-first batch; must return ``(seg_pred, trans_feat)``.
        criterion: Loss callable invoked as
            ``criterion(seg_pred, target, points_raw, seg_pred_flat, target_flat)``.
        optimizer: Optimizer that updates ``classifier``.
        trainDataLoader: Iterable of ``(points, target)`` batches.
        logger: Not used here; kept so existing callers keep working.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the loss averaged over batches and the
        fraction of points whose arg-max prediction equals the label.
    """
    classifier.train()
    total_correct = 0
    total_seen = 0
    loss_sum = 0.0

    for points, target in tqdm(trainDataLoader, total=len(trainDataLoader), smoothing=0.9):
        # Copies taken before augmentation; the structure-oriented loss (SOL)
        # receives these alongside the predictions.
        points_raw = points.float().cuda()  # presumably [B, N, C] -- confirm
        target_SOL = target.long().cuda()   # presumably [B, N] -- confirm

        # Augmentation through provider.rotate_point_cloud_z on the first three
        # channels (presumably xyz -- confirm).  The numpy view shares memory
        # with the loader's tensor, so the assignment below writes in place.
        points = points.data.numpy()
        points[:, :, :3] = provider.rotate_point_cloud_z(points[:, :, :3])
        points = torch.Tensor(points)
        points, target = points.float().cuda(), target.long().cuda()

        # The classifier takes the channel axis before the point axis.
        points = points.transpose(2, 1)

        optimizer.zero_grad()
        seg_pred, trans_feat = classifier(points)
        # Un-flattened predictions, kept for the SOL term.
        seg_pred_SOL = seg_pred

        # Flatten to one row per point; the class count is read from the
        # prediction itself rather than from a module-level constant.
        seg_pred = seg_pred.contiguous().view(-1, seg_pred.size(-1))
        target = target.view(-1, 1)[:, 0]

        loss = criterion(seg_pred_SOL, target_SOL, points_raw, seg_pred, target)
        loss.backward()
        optimizer.step()

        # Arg-max on the device: only a scalar count is moved to the host.
        pred_choice = seg_pred.detach().max(1)[1]
        total_correct += (pred_choice == target).sum().item()
        # Count the labels actually seen so the accuracy stays right for any
        # batch size, not only BATCH_SIZE * NUM_POINT.
        total_seen += target.numel()
        loss_sum += loss.item()

    # max(..., 1) keeps an empty loader from dividing by zero.
    num_batches = max(len(trainDataLoader), 1)
    return loss_sum / num_batches, total_correct / float(max(total_seen, 1))

In [9]:
# Evaluation function
def evaluate(classifier, criterion, testDataLoader, num_classes):
    """Evaluate the model on a data loader without computing gradients.

    Args:
        classifier: Segmentation network, called as ``classifier(points)`` with
            a channels-first batch; must return ``(seg_pred, trans_feat)``.
        criterion: Loss callable invoked as
            ``criterion(seg_pred, target, points_raw, seg_pred_flat, target_flat)``.
        testDataLoader: Iterable of ``(points, target)`` batches.
        num_classes: Number of classes; used for flattening the predictions
            and for the per-class IoU counters.

    Returns:
        Tuple ``(mean_loss, accuracy, mIoU)``.  ``mIoU`` is a plain Python
        ``float`` so that it can be stored in a checkpoint and read back with
        ``torch.load(..., weights_only=True)``.
    """
    # Set once: nothing inside the loop switches the mode back.
    classifier.eval()
    total_correct = 0
    total_seen = 0
    loss_sum = 0.0
    total_correct_class = np.zeros(num_classes, dtype=np.int64)
    total_iou_deno_class = np.zeros(num_classes, dtype=np.int64)

    with torch.no_grad():
        for points, target in tqdm(testDataLoader, total=len(testDataLoader), smoothing=0.9):
            # Separate copies for the structure-oriented loss (SOL).
            points_raw = points.float().cuda()  # presumably [B, N, C] -- confirm
            target_SOL = target.long().cuda()   # presumably [B, N] -- confirm

            points, target = points.float().cuda(), target.long().cuda()
            # The classifier takes the channel axis before the point axis.
            points = points.transpose(2, 1)
            seg_pred, trans_feat = classifier(points)

            # Un-flattened predictions, kept for the SOL term.
            seg_pred_SOL = seg_pred

            pred_val = seg_pred.contiguous().cpu().data.numpy()
            # Use the num_classes argument, not the module-level constant.
            seg_pred = seg_pred.contiguous().view(-1, num_classes)
            batch_label = target.cpu().data.numpy()
            target = target.view(-1, 1)[:, 0]

            loss = criterion(seg_pred_SOL, target_SOL, points_raw, seg_pred, target)
            loss_sum += loss.item()

            pred_val = np.argmax(pred_val, 2)
            total_correct += int(np.sum(pred_val == batch_label))
            # Count the labels actually seen so the accuracy stays right for
            # any batch size.
            total_seen += batch_label.size

            for l in range(num_classes):
                total_correct_class[l] += np.sum((pred_val == l) & (batch_label == l))
                total_iou_deno_class[l] += np.sum((pred_val == l) | (batch_label == l))

    # A class that appears in neither predictions nor labels contributes an
    # IoU of 0 to the mean (0 / 1e-6).
    per_class_iou = total_correct_class / (total_iou_deno_class.astype(np.float32) + 1e-6)
    mIoU = float(np.mean(per_class_iou))

    # max(..., 1) keeps an empty loader from dividing by zero.
    num_batches = max(len(testDataLoader), 1)
    return loss_sum / num_batches, total_correct / float(max(total_seen, 1)), mIoU

# Main Structure

In [10]:
# TensorBoard event files for this notebook are written to this folder.
# View them with: tensorboard --logdir=Partsize-identical/log/Tensorboard
log_dir_TB = './log/Tensorboard'
writer = SummaryWriter(log_dir=log_dir_TB)


In [11]:
# Create the run folders and the logger for this experiment.
experiment_dir, checkpoints_dir, log_dir = create_directories()
logger = setup_logging(log_dir, config.model)
logger.info("Starting the training process...")

# TensorBoard: the `writer` created in the previous cell is reused.  Opening a
# second SummaryWriter on the same folder would start another event file and
# leave the first writer unclosed.

# Set GPU
# NOTE(review): CUDA_VISIBLE_DEVICES only takes effect if it is set before CUDA
# is first initialised in this kernel -- confirm nothing touches the GPU earlier.
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

# Load datasets
TRAIN_DATASET, TEST_DATASET, trainDataLoader, testDataLoader = load_datasets(datapath)

logger.info(f"Number of training data: {len(TRAIN_DATASET)}")
logger.info(f"Number of test data: {len(TEST_DATASET)}")

Loading training data...
labelweights of train: [0.12851092 0.29946876 0.19900215 0.0422972  0.33072102]
Totally 48120 samples in train set.
Loading test data...
labelweights of test: [0.08364363 0.28175014 0.09126764 0.03026254 0.51307607]
Totally 1428 samples in test set.


In [12]:
# Build the network and loss; start_epoch comes from initialize_model.
classifier, criterion, start_epoch = initialize_model(NUM_CLASSES, experiment_dir)

# Optimizer: Adam when configured, SGD with momentum for any other value.
optimizer = (
    torch.optim.Adam(
        classifier.parameters(),
        lr=config.learning_rate,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=config.decay_rate,
    )
    if config.optimizer == 'Adam'
    else torch.optim.SGD(classifier.parameters(), lr=config.learning_rate, momentum=0.9)
)

# Plateau-based learning-rate scheduler.
# NOTE(review): the training cell also assigns the learning rate by hand every
# epoch -- check which of the two policies is meant to be in control.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=10
)

# Best evaluation mIoU seen so far; updated by the training cell.
best_iou = 0

Error loading model: [Errno 2] No such file or directory: 'log/sem_seg/2024-10-03_21-07/checkpoints/best_model.pth'
Starting training from scratch...


In [13]:
# Print the model structure (the module tree of `classifier`).
print(classifier)

get_model(
  (sa1): PointNetSetAbstractionMsg(
    (conv_blocks): ModuleList(
      (0): ModuleList(
        (0): Conv2d(12, 16, kernel_size=(1, 1), stride=(1, 1))
        (1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
        (2): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1))
      )
      (1): ModuleList(
        (0): Conv2d(12, 32, kernel_size=(1, 1), stride=(1, 1))
        (1): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
        (2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1))
      )
    )
    (bn_blocks): ModuleList(
      (0): ModuleList(
        (0-1): 2 x BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): ModuleList(
        (0-1): 2 x BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
 

In [14]:
# Layer-by-layer summary of the classifier for an input of size (9, 4096).
# NOTE(review): this import sits outside the notebook's import cell, so
# torchsummary is a dependency that the first cell does not reveal.
from torchsummary import summary
summary(classifier, input_size=(9, 4096)) 

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 16, 1024]             208
       BatchNorm2d-2         [-1, 16, 16, 1024]              32
            Conv2d-3         [-1, 16, 16, 1024]             272
       BatchNorm2d-4         [-1, 16, 16, 1024]              32
            Conv2d-5         [-1, 32, 16, 1024]             544
       BatchNorm2d-6         [-1, 32, 16, 1024]              64
            Conv2d-7         [-1, 32, 32, 1024]             416
       BatchNorm2d-8         [-1, 32, 32, 1024]              64
            Conv2d-9         [-1, 32, 32, 1024]           1,056
      BatchNorm2d-10         [-1, 32, 32, 1024]              64
           Conv2d-11         [-1, 64, 32, 1024]           2,112
      BatchNorm2d-12         [-1, 64, 32, 1024]             128
PointNetSetAbstractionMsg-13  [[-1, 3, 1024], [-1, 96, 1024]]               0
           Conv2d-14     

In [None]:
total_epochs = config.epoch
try:
    # initial=start_epoch makes the bar start where a resumed run starts, so it
    # still finishes at total_epochs.
    with tqdm(total=total_epochs, initial=start_epoch, desc="Training Progress") as pbar:
        # Training loop
        for epoch in range(start_epoch, config.epoch):
            epoch_start_time = time.time()

            logger.info(f'Epoch {epoch+1}/{config.epoch}')

            # Step decay: multiply by lr_decay every step_size epochs, with a
            # floor of 1e-5.  This per-epoch assignment is the learning-rate
            # policy in effect; the former `scheduler.step(eval_loss)` call was
            # dropped because whatever it set was overwritten here at the start
            # of the next epoch.
            lr = max(config.learning_rate * (config.lr_decay ** (epoch // config.step_size)), 1e-5)
            logger.info(f'Learning rate: {lr}')
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # Train
            train_loss, train_acc = train(classifier, criterion, optimizer, trainDataLoader, logger)
            logger.info(f'Train - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')

            # Record training metrics
            writer.add_scalar('Train/Loss', train_loss, epoch)
            writer.add_scalar('Train/Accuracy', train_acc, epoch)
            writer.add_scalar('Train/LearningRate', lr, epoch)

            # Rolling checkpoint every fifth epoch.
            if epoch % 5 == 0:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': classifier.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                }, str(checkpoints_dir / 'model.pth'))

            # Evaluate
            eval_loss, eval_acc, mIoU = evaluate(classifier, criterion, testDataLoader, NUM_CLASSES)
            # Plain float: a numpy scalar stored in the checkpoint cannot be
            # read back with torch.load(..., weights_only=True).
            mIoU = float(mIoU)
            logger.info(f'Eval - Loss: {eval_loss:.4f}, Accuracy: {eval_acc:.4f}, mIoU: {mIoU:.4f}')

            # Wall-clock time of this epoch (training plus evaluation).
            epoch_time = time.time() - epoch_start_time

            # Update the progress bar
            pbar.update(1)
            pbar.set_postfix({
                'Epoch': f'{epoch+1}/{total_epochs}',
                'Train Loss': f'{train_loss:.4f}',
                'Train Acc': f'{train_acc:.4f}',
                'Eval Loss': f'{eval_loss:.4f}',
                'Eval Acc': f'{eval_acc:.4f}',
                'mIoU': f'{mIoU:.4f}',
                'Time': f'{epoch_time:.2f}s'
            })

            # Record evaluation metrics
            writer.add_scalar('Eval/Loss', eval_loss, epoch)
            writer.add_scalar('Eval/Accuracy', eval_acc, epoch)
            writer.add_scalar('Eval/mIoU', mIoU, epoch)

            # Keep the weights with the best evaluation mIoU seen so far.
            if mIoU >= best_iou:
                best_iou = mIoU
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': classifier.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'mIoU': mIoU,
                }, str(checkpoints_dir / 'best_model.pth'))

            logger.info(f'Best mIoU: {best_iou:.4f}')
finally:
    # Close the SummaryWriter even when training is interrupted or fails, so
    # buffered events are written out.
    writer.close()

Training Progress:   0%|          | 0/128 [00:00<?, ?it/s]
  0%|          | 0/2005 [00:00<?, ?it/s][A
  0%|          | 1/2005 [00:02<1:28:16,  2.64s/it][A
  0%|          | 2/2005 [00:03<25:42,  1.30it/s]  [A
  0%|          | 3/2005 [00:03<22:39,  1.47it/s][A
  0%|          | 4/2005 [00:04<17:46,  1.88it/s][A
  0%|          | 5/2005 [00:05<23:49,  1.40it/s][A
  0%|          | 6/2005 [00:05<21:15,  1.57it/s][A
  0%|          | 7/2005 [00:06<21:06,  1.58it/s][A
  0%|          | 8/2005 [00:07<27:03,  1.23it/s][A
  0%|          | 9/2005 [00:07<23:12,  1.43it/s][A
  0%|          | 10/2005 [00:08<23:10,  1.43it/s][A
  1%|          | 11/2005 [00:09<16:41,  1.99it/s][A
  1%|          | 12/2005 [00:09<16:22,  2.03it/s][A
  1%|          | 13/2005 [00:10<27:03,  1.23it/s][A
  1%|          | 14/2005 [00:10<19:08,  1.73it/s][A
  1%|          | 15/2005 [00:11<16:59,  1.95it/s][A
  1%|          | 19/2005 [00:11<03:39,  9.06it/s][A
  1%|          | 20/2005 [00:12<15:02,  2.20it/s][A
 

In [46]:
def main():
    """Run the whole training pipeline as a single call.

    Creates the run folders and logger, loads the datasets, builds the model
    and optimizer, then trains for ``config.epoch`` epochs with a step-decayed
    learning rate.  A rolling checkpoint is written every fifth epoch and the
    weights with the best evaluation mIoU are written to ``best_model.pth``.
    This entry point does not log to TensorBoard.
    """
    experiment_dir, checkpoints_dir, log_dir = create_directories()
    logger = setup_logging(log_dir, config.model)
    logger.info("Starting the training process...")

    # Set GPU
    # NOTE(review): CUDA_VISIBLE_DEVICES only takes effect if it is set before
    # CUDA is first initialised in this process -- confirm when calling main()
    # from a kernel that has already used the GPU.
    os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu

    # Load datasets
    TRAIN_DATASET, TEST_DATASET, trainDataLoader, testDataLoader = load_datasets(datapath)

    logger.info(f"Number of training data: {len(TRAIN_DATASET)}")
    logger.info(f"Number of test data: {len(TEST_DATASET)}")

    # Initialize model
    classifier, criterion, start_epoch = initialize_model(NUM_CLASSES, experiment_dir)

    # Optimizer: Adam when configured, SGD with momentum for any other value.
    if config.optimizer == 'Adam':
        optimizer = torch.optim.Adam(
            classifier.parameters(),
            lr=config.learning_rate,
            betas=(0.9, 0.999),
            eps=1e-08,
            weight_decay=config.decay_rate
        )
    else:
        optimizer = torch.optim.SGD(classifier.parameters(), lr=config.learning_rate, momentum=0.9)

    best_iou = 0

    # Training loop
    for epoch in range(start_epoch, config.epoch):
        logger.info(f'Epoch {epoch+1}/{config.epoch}')

        # Step decay: multiply by lr_decay every step_size epochs, floor 1e-5.
        lr = max(config.learning_rate * (config.lr_decay ** (epoch // config.step_size)), 1e-5)
        logger.info(f'Learning rate: {lr}')
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        train_loss, train_acc = train(classifier, criterion, optimizer, trainDataLoader, logger)
        logger.info(f'Train - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')

        # Rolling checkpoint every fifth epoch.
        if epoch % 5 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, str(checkpoints_dir / 'model.pth'))

        eval_loss, eval_acc, mIoU = evaluate(classifier, criterion, testDataLoader, NUM_CLASSES)
        # Plain float: a numpy scalar stored in the checkpoint cannot be read
        # back with torch.load(..., weights_only=True).
        mIoU = float(mIoU)
        logger.info(f'Eval - Loss: {eval_loss:.4f}, Accuracy: {eval_acc:.4f}, mIoU: {mIoU:.4f}')

        # Keep the weights with the best evaluation mIoU seen so far.
        if mIoU >= best_iou:
            best_iou = mIoU
            torch.save({
                'epoch': epoch,
                'model_state_dict': classifier.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'mIoU': mIoU,
            }, str(checkpoints_dir / 'best_model.pth'))

        logger.info(f'Best mIoU: {best_iou:.4f}')

In [47]:
# Script-style entry point: runs the full pipeline through main().
# NOTE(review): inside a notebook kernel __name__ is '__main__', so executing
# this cell always starts a complete training run.
if __name__ == '__main__':
    main()

Loading training data...
labelweights of train: [0.12851092 0.29946876 0.19900215 0.0422972  0.33072102]
Totally 48120 samples in train set.
Loading test data...
labelweights of test: [0.08364363 0.28175014 0.09126764 0.03026254 0.51307607]
Totally 1428 samples in test set.
No existing models, starting training from scratch...


  checkpoint = torch.load(str(experiment_dir / 'checkpoints/best_model.pth'))


NameError: name 'weights_init' is not defined