In [1]:
#default_exp train

# Train the network


In [2]:
import sys
sys.path.append("/home/qhs67/git/bachelorthesis_sven_thaele/code/")

In [3]:
#export
import logging
import torch
from torch import autograd
from torch.utils.tensorboard import SummaryWriter
import datetime

from pointpillars.utils.io import read_config, save_network_checkpoint, save_network
from pointpillars.data.dataset import VelTrainDataset, collate_fn, OverfitSampler
from pointpillars.modules.pointpillars import PointPillars, init_weights
from pointpillars.loss import PointPillarsLoss


In [4]:
#export
# time to identify the run on folders and checkpoints etc
ident_time = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")

In [5]:
#export
# logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
log_formatter = logging.Formatter(log_format)

log_handler = logging.FileHandler("/home/qhs67/git/bachelorthesis_sven_thaele/code/pointpillars.log", mode='w')
log_handler.setFormatter(log_formatter)
logger.addHandler(log_handler)

In [6]:
#export
# tensorboard writer
run_folder = "/home/qhs67/git/bachelorthesis_sven_thaele/code/runs/{}/".format(ident_time)

writer = SummaryWriter(run_folder)

In [7]:
tens = torch.rand((10,10))
tens.__str__()


'tensor([[0.4479, 0.6738, 0.7426, 0.3690, 0.6762, 0.7121, 0.0316, 0.1435, 0.6628,\n         0.7707],\n        [0.7733, 0.5175, 0.4289, 0.0644, 0.7136, 0.4066, 0.0236, 0.3815, 0.0774,\n         0.3601],\n        [0.2550, 0.4316, 0.0791, 0.1322, 0.6743, 0.6415, 0.4585, 0.1059, 0.0959,\n         0.1547],\n        [0.8992, 0.5098, 0.9447, 0.5250, 0.8934, 0.7204, 0.5493, 0.9082, 0.6937,\n         0.3766],\n        [0.7474, 0.9609, 0.7570, 0.6120, 0.3702, 0.1869, 0.2984, 0.2591, 0.7336,\n         0.2029],\n        [0.7082, 0.6808, 0.8025, 0.9992, 0.7846, 0.9560, 0.4358, 0.2757, 0.0104,\n         0.4979],\n        [0.3730, 0.1065, 0.4406, 0.1950, 0.0024, 0.4755, 0.5494, 0.6395, 0.7281,\n         0.9876],\n        [0.5658, 0.1350, 0.2913, 0.0264, 0.8711, 0.2766, 0.2909, 0.9710, 0.9580,\n         0.2983],\n        [0.0583, 0.7612, 0.7287, 0.1307, 0.4188, 0.3796, 0.3439, 0.0260, 0.6932,\n         0.2852],\n        [0.8570, 0.3224, 0.0574, 0.7412, 0.3175, 0.3010, 0.2462, 0.2142, 0.3928,\n        

In [8]:
#export
def _train_setup():
    """

    """
    batch_size = 3
    init_lr = 2 * 10**-4
    #init_lr = 1 * 10**-4

    logger.info("Start network training..")
    torch.cuda.empty_cache()
    torch.multiprocessing.set_start_method('spawn')

    # TODO: move to config file
    conf = read_config()
    vel_folder = "/home/qhs67/git/bachelorthesis_sven_thaele/code/data/kitti/training/velodyne/training"
    label_folder ="/home/qhs67/git/bachelorthesis_sven_thaele/code/data/kitti/training/label_2/training"

    ds_train = VelTrainDataset(vel_folder, label_folder)
    """dl_train = torch.utils.data.DataLoader(ds_train,
                                           batch_size=batch_size,
                                           num_workers=1,
                                           collate_fn=collate_fn,
                                           shuffle=True)"""

    sampler = OverfitSampler(ds_train, batch_size, nb_samples=20, shuffle=True)
    for i in sampler:
        print(i)

    dl_train = torch.utils.data.DataLoader(ds_train,
                                           batch_size=batch_size,
                                           sampler=sampler,
                                           num_workers=0,
                                           collate_fn=collate_fn)
    
    # modules
    pointpillars = PointPillars(conf)
    loss_func = PointPillarsLoss()
    pointpillars.train()
    loss_func.train()

    # TODO: also init bias?
    #pointpillars.apply(init_weights)
    # move to gpu
    pointpillars.cuda()
    loss_func.cuda()

    optimizer = torch.optim.Adam(pointpillars.parameters(), lr=init_lr)
    #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 15, gamma=0.8, last_epoch=-1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 2000, gamma=0.8, last_epoch=-1)

    return pointpillars, loss_func, optimizer, scheduler, dl_train

In [9]:
#export
def _train_step(batch: torch.Tensor,
               pointpillars: torch.nn.Module,
               loss_func: torch.nn.Module,
               optimizer: torch.optim.Adam,
               epoch: int,
               i: int) -> torch.nn.Module:

    """
    Performs a training step
    """

    pil_batch, ind_batch, label_batch, label_mask = batch

    # -> forward pass through network
    preds = pointpillars(pil_batch, ind_batch, label_batch, label_mask)

    loss = loss_func(preds, writer, epoch, i)
    loss.backward()
    optimizer.step()

    del pil_batch, ind_batch, label_batch, preds

    return loss

In [10]:
#export
def validate(network: torch.nn.Module, loss_func: torch.nn.Module, epoch, nbr_val_batches: int = 300):
    """
    Validates the network on the loss function on the validation dataset
    """
    batch_size = 3

    validation_folder = "/home/qhs67/git/bachelorthesis_sven_thaele/code/data/kitti/training/velodyne/validation"
    label_folder ="/home/qhs67/git/bachelorthesis_sven_thaele/code/data/kitti/training/label_2/validation"

    with torch.no_grad():
        ds_val = VelTrainDataset(validation_folder, label_folder)
        dl_val = torch.utils.data.DataLoader(ds_val,
                                           batch_size=batch_size,
                                           num_workers=2,
                                           collate_fn=collate_fn,
                                           shuffle=True)

        running_val_loss = 0

        for i, batch in enumerate(dl_val):
            # stop after given number of data
            if i >= nbr_val_batches:
                break

            pil_batch, ind_batch, label_batch, label_mask = batch
            preds = network(pil_batch, ind_batch, label_batch, label_mask)
            loss = loss_func(preds)

            running_val_loss += loss.item()

            print("Val Epoch: {}, Batch {} with Loss {} and running Loss {}.".format(epoch, i, loss.item(), running_val_loss/(i+1)))

            #writer.add_scalar("Epoch {}/Validation Loss".format(epoch), running_val_loss/(i+1), i)
            #writer.flush()

        writer.add_scalar("Epochs/Validation Loss", running_val_loss/nbr_val_batches, epoch)
        writer.flush()


In [11]:
#export
def train(val: bool = False, save_nw: bool = False):
    """

    :param val: bool if validation should be used
    :param save_nw: bool if network state should be saved after training
    """
    n_epochs = 10000


    pointpillars, loss_func, optimizer, scheduler, dl_train = _train_setup()

    try:
        for epoch in range(n_epochs):
            running_loss = 0
            for i, batch in enumerate(dl_train):

                
                optimizer.zero_grad()
                loss = _train_step(batch, pointpillars, loss_func, optimizer, epoch, i)
                running_loss += loss.item()

                logger.debug("Epoch: {}, Batch {} with Loss {} and running Loss {}.".format(epoch, i, loss.item(), running_loss/(i+1)))
                print("Epoch: {}, Batch {} with Loss {} and running Loss {}.".format(epoch, i, loss.item(), running_loss/(i+1)))

                torch.cuda.empty_cache()
                writer.add_scalar("Epoch {}/Running Loss".format(epoch), running_loss/(i+1), i)
                writer.flush()

            # after epoch
            scheduler.step()
            writer.add_scalar("Epochs/Running Loss", running_loss/len(dl_train), epoch)
            writer.add_scalar("Epochs/Learning Rate", scheduler.get_last_lr()[0], epoch)
            writer.flush()

            if val:
                validate(pointpillars, loss_func, epoch, nbr_val_batches=10)

        # after training
        print('Finished Training')

    except Exception as e:
        print(e)
        logger.exception("An exception occured")
        save_network_checkpoint(pointpillars, optimizer, scheduler, loss, ident_time, epoch)
        exit()

    if save_nw:
        # save network to location from config
        save_network(pointpillars, ident_time, n_epochs)


    writer.close()

In [12]:
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_cached())
train(val=False, save_nw=True)

0
0
509
1330
1789
519
3472
2845
593
3030
3394
1104
2868
1210
2179
1772
1749
694
307
2068
697
1877




Sizes of tensors must match except in dimension 1. Got 8 and 1 (The offending index is 0)


UnboundLocalError: local variable 'loss' referenced before assignment

In [None]:
#export
if __name__ == '__main__':
    train(val=False, save_nw=True)



