## Required Imports

In [1]:
from utils.util import mkdir,seed_all
from omegaconf import OmegaConf
from cprint import *
from datasets.shape_net.shape_net_vox import ShapeNetVox
from models.dummy_classifier import DummyClassifier
import torch
from tqdm.notebook import tqdm
from torch.utils.tensorboard import SummaryWriter
from utils.visualizations import save_voxels
from PIL import Image
import os
import numpy as np
import datetime
%load_ext autoreload
%load_ext tensorboard
%autoreload 2


## Setup Expirement Folder

In [2]:
configs_path = "./configs/global_configs.yaml"
global_configs = OmegaConf.load(configs_path)

In [3]:
import time
seed_all(111)
training_config = global_configs["training"]
today = time.strftime("%Y-%m-%d")
cprint.ok(training_config)
description = training_config["description"]  # Describe Experiment params here
logs_dir = training_config["logs_dir"]
mkdir(logs_dir)
experiment_dir = f"{logs_dir}/{training_config['name']}/{training_config['experiment_id']}"
mkdir(experiment_dir)
loss_log_title = "Loss Log " + today

with open(f"{experiment_dir}/description.txt", "w") as file1:
    file1.write(description)

with open(f"{experiment_dir}/global_configs.txt", "w") as file1:
    file1.write(str(training_config))

with open(f"{experiment_dir}/loss_log.txt", "w") as file1:
    file1.write(loss_log_title)
    file1.write("\n")


mkdir(f"{experiment_dir}/checkpoints")
mkdir(f"{experiment_dir}/tb")
mkdir(f"{experiment_dir}/visuals")

[94m{'name': 'dummy_classification', 'description': 'Initial overfitting Experiment on shape net', 'extra_notes': 'none', 'experiment_id': 3, 'logs_dir': 'logs', 'is_train': True, 'device': 'cuda:0', 'batch_size': 8, 'n_epochs': 20, 'append_loss_every': 1, 'print_every': 1, 'validate_every': 5, 'save_every': 5, 'save_every_nepochs': 10, 'start_epoch': 0, 'start_iteration': 0, 'visualize_every': 5, 'load_ckpt': False, 'ckpt_path': 'test_path'}[0m
[94m- logs directory found[0m


[93m- Creating new directory logs/dummy_classification/3[0m
[93m- Creating new directory logs/dummy_classification/3/checkpoints[0m
[93m- Creating new directory logs/dummy_classification/3/tb[0m
[93m- Creating new directory logs/dummy_classification/3/visuals[0m


## Load Dataset & Dataloaders


In [4]:
# Change overfit param here & cat here
global_dataset_config = global_configs["dataset"]
local_dataset_config = global_dataset_config["shape_net_vox"]
dataset = ShapeNetVox(global_dataset_config, local_dataset_config)
print('length: ', len(dataset))
dataset[0]
# train_ds, valid_ds, test_ds = torch.utils.data.random_split(
#     dataset, [1,1,1])

train_ds,valid_ds, test_ds = dataset, dataset, dataset

train_dataloader = torch.utils.data.DataLoader(
    train_ds,   # Datasets return data one sample at a time; Dataloaders use them and aggregate samples into batches
    batch_size=training_config['batch_size'],   # The size of batches is defined here
    shuffle=True,    # Shuffling the order of samples is useful during training to prevent that the network learns to depend on the order of the input data
    num_workers=6,   # Data is usually loaded in parallel by num_workers
    pin_memory=True,  # This is an implementation detail to speed up data uploading to the GPU
    # worker_init_fn=train_dataset.worker_init_fn  TODO: Uncomment this line if you are using shapenet_zip on Google Colab
)

validation_dataloader = torch.utils.data.DataLoader(
    valid_ds,   # Datasets return data one sample at a time; Dataloaders use them and aggregate samples into batches
    batch_size=training_config['batch_size'],   # The size of batches is defined here
    shuffle=True,    # Shuffling the order of samples is useful during training to prevent that the network learns to depend on the order of the input data
    num_workers=6,   # Data is usually loaded in parallel by num_workers
    pin_memory=True,  # This is an implementation detail to speed up data uploading to the GPU
    # worker_init_fn=train_dataset.worker_init_fn  TODO: Uncomment this line if you are using shapenet_zip on Google Colab
)

length:  2


## Loading Model

In [5]:
model_configs = global_configs["model"]["dummy_classifier"]
model = DummyClassifier(model_configs)
device = torch.device('cpu')
if torch.cuda.is_available() and training_config['device'].startswith('cuda'):
    device = torch.device(training_config['device'])
    cprint.ok('Using device:', training_config['device'])
else:
    cprint.warn('Using CPU')

model.to(device)

if(training_config["load_ckpt"]):
    model.load_ckpt(training_config['ckpt_path'])

if(torch.cuda.is_available()):
    torch.cuda.mem_get_info()

[93mUsing CPU[0m


# Training

### Variables Setup

In [6]:
train_loss_running = 0.
best_loss_val = np.inf
model.train()
start_iteration = training_config["start_iteration"]
tb_dir = f"{experiment_dir}/tb"
writer = SummaryWriter(log_dir=tb_dir)
model_checkpoint_path = f"{experiment_dir}/checkpoints"
loss_log_name = f"{experiment_dir}/loss_log.txt"
visuals_path = f"{experiment_dir}/visuals"
last_loss = 0.

## Train Logic

In [7]:
def train_one_epoch(epoch, writer):
    global best_loss_val
    global last_loss
    global start_iteration
    train_loss_running = 0.
    iteration_count = 0
    for batch_idx, batch in tqdm(enumerate(train_dataloader)):
         iteration = epoch * len(train_dataloader) + batch_idx 
         if(iteration<= start_iteration):
            continue
         ShapeNetVox.move_batch_to_device(batch, device)
         model.step(batch)
         metrics = model.get_metrics()
         loss = metrics["loss"]
         train_loss_running += loss
         iteration_count += 1

         if iteration % training_config["append_loss_every"] == (training_config["append_loss_every"] - 1) or (epoch==0 and iteration==0):
            message = '(epoch: %d, iters: %d, loss: %.6f)' % (epoch, iteration, loss.item())
            with open(loss_log_name, "a") as log_file:
                log_file.write('%s\n' % message)
            print(loss)

         if iteration % training_config["visualize_every"] == (training_config["visualize_every"] - 1):
            # Do visualizations here
            cprint.ok("visuals here")
        
         if iteration % training_config['print_every'] == (training_config['print_every'] - 1) or (epoch==0 and iteration==0):
            avg_train_loss = train_loss_running / iteration_count
            cprint.warn(f'[{epoch:03d}/{batch_idx:05d}] train_loss: {avg_train_loss:.6f}')
            writer.add_scalar("Train/Loss", avg_train_loss, iteration)
            last_loss = avg_train_loss
            train_loss_running = 0.
            iteration_count = 0
        
         if iteration % training_config['save_every'] == (training_config['save_every'] - 1):
            model.save(model_checkpoint_path, "latest")

         if iteration % training_config['validate_every'] == (training_config['validate_every'] - 1) or (epoch == 0 and iteration == 0):
            cprint.ok("Running Validation")
            model.eval()
            loss_val = 0.
            index_batch = 0
            for batch_idx, batch_val in tqdm(enumerate(validation_dataloader)):
                ShapeNetVox.move_batch_to_device(batch_val, device)
                with torch.no_grad():
                    model.inference(batch_val)
                    metrics = model.get_metrics()
                    loss_val += metrics["loss"]
                    index_batch += 1
            avg_loss_val = loss_val / (index_batch)

            #Do visualizations here
            if avg_loss_val < best_loss_val:
                model.save(model_checkpoint_path, "best")
                best_loss_val = avg_loss_val
            
            cprint.warn(f'[{epoch:03d}/{batch_idx:05d}] val_loss: {avg_loss_val:.6f} | best_loss_val: {best_loss_val:.6f}')
            writer.add_scalar("Validation/Loss", avg_loss_val, iteration)
            writer.add_scalars('Validation/LossComparison',
                   { 'Training' : last_loss, 'Validation' : avg_loss_val },
                    iteration)
            writer.flush()
         return last_loss

## Run Training

In [8]:
start_epoch = training_config["start_epoch"]
for epoch in tqdm(range(training_config['n_epochs'])):
    if epoch < start_epoch:
        continue
    avg_loss = train_one_epoch(epoch, writer)
    # if(epoch % config["save_every_nepochs"]==0):
    model.save(model_checkpoint_path, epoch)
    model.update_lr()
    writer.close()

  0%|          | 0/20 [00:00<?, ?it/s]

0it [00:00, ?it/s]

logs/dummy_classification/3/checkpoints/epoch-0.ckpt created


0it [00:00, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x00000171CA3AB910>
Traceback (most recent call last):
  File "c:\Python310\lib\multiprocessing\connection.py", line 137, in __del__
    self._close()
  File "c:\Python310\lib\multiprocessing\connection.py", line 282, in _close
    _CloseHandle(self._handle)
OSError: [WinError 6] The handle is invalid


NameError: name 'iou' is not defined