# Setup

In [1]:
%pip install positional-encodings[pytorch]

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pathlib
import os

os.chdir(pathlib.Path().absolute() / "..")
os.getcwd()

'c:\\Users\\connor\\programming\\MineGen'

In [3]:
from data import build_dataset
from layers import TransformerLayer, ResidualBlock, NestedTransformer

from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import DataLoader
from lightning import LightningModule
from einops.layers.torch import Rearrange
from einops import rearrange
from tqdm import tqdm
import torch
import math
import numpy as np
import lightning.pytorch as pl

  from .autonotebook import tqdm as notebook_tqdm


# Modelling

## Smaller schematic volumes

In [4]:
class ScaledModel(LightningModule):
    """Takes a schematic, partitions it into patches, 
    and applies a NestedTransformer to each patch.
    Does this dynamically with a variable patch size.
    """

    def __init__(self, input_size, patch_size=8, seq_len=16) -> None:
        super().__init__()

        if isinstance(patch_size, int):
            patch_size = (patch_size, patch_size, patch_size)

        self.patch_size = patch_size
        self.seq_len = seq_len
        self.C_embed = 16

        # Dynamically calculate the number of transformers needed
        num_hierarchies = [
            math.ceil(math.log2(input_dim / patch_dim))
            for input_dim, patch_dim in zip(input_size, patch_size)
        ]

        # Use the maximum number of hierarchies needed among all dimensions
        num_hierarchies = max(num_hierarchies) - 2
        print(f"Using {num_hierarchies} hierarchies")

        embed_dim = [self.C_embed * 2 ** i for i in range(num_hierarchies)]
        num_heads = [4 * 2 ** i for i in range(num_hierarchies)]
        num_layers = [2 * 2 ** i for i in range(num_hierarchies)]
        print(f"Embedding dimensions: {embed_dim}")
        print(f"Number of heads: {num_heads}")
        print(f"Number of layers: {num_layers}")


        # Create the in_channels list
        in_channels_list = [embed_dim[0]] + embed_dim[:-1]

        # Initialize the hierarchical transformers
        self.downscale = nn.ModuleList([
            NestedTransformer(2, e_dim, n_heads, n_layers, in_channels=in_channels)
            for e_dim, n_heads, n_layers, in_channels in zip(
                embed_dim, num_heads, num_layers, in_channels_list
            )
        ])

        # self.conv = nn.LazyConv3d(self.C_embed * 4 ** (num_hierarchies+1), 2)
        self.conv = nn.LazyConv3d(512, 1)

        self.upscale = nn.ModuleList([
            nn.LazyConvTranspose3d(e_dim, 2, stride=2)
            for e_dim in reversed(embed_dim)
        ])


        self.embedding = nn.Sequential(
            nn.Embedding(512, self.C_embed),
            Rearrange("b d h w c -> b c d h w")
        )

    
    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = 0

        # Split the input into patches
        x = rearrange(x, "b (d p1) (h p2) (w p3) -> (b d h w) p1 p2 p3", p1=self.patch_size[0], p2=self.patch_size[1], p3=self.patch_size[2])
        x = x.split(self.seq_len)

        # Iterate over each patch
        for sample in x: 
            seq = self.embedding(sample.abs().long())

            # Downscale
            for transformer in self.downscale:
                seq = transformer(seq)
            
            # seq = self.conv(seq)
            
            # Upscale
            for conv in self.upscale:
                # Pass previous through residual block

                # Add to previous

                # Convolute
                seq = conv(seq)
            
            seq = self.conv(seq)

            loss += F.cross_entropy(seq, sample.abs().long())
            self.log("train_loss", loss)
            print(torch.cuda.mem_get_info())



        return loss
            

    
    def configure_optimizers(self):
        optimiser = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimiser

In [5]:
class lolModel(LightningModule):
    """Takes a schematic, partitions it into patches, 
    and applies a NestedTransformer to each patch.
    """

    def __init__(self) -> None:
        super().__init__()

        self.patch_size = (8, 8, 8)
        self.seq_len = 16
        self.C_embed = 16

        # Initialize the hierarchical transformers
        self.downscale = nn.Sequential(
            NestedTransformer(2, 16, 4, 2),
            NestedTransformer(2, 32, 8, 4, in_channels=16),
            # Rearrange("b c d h w -> b d h w c"),
            # nn.LayerNorm(32),
            nn.Conv3d(32, 64, 2)
        )

        self.conv = nn.LazyConv3d(512, 1)

        self.upscale = nn.Sequential(
            nn.LazyConvTranspose3d(64, 2, stride=2),
            nn.LazyConvTranspose3d(32, 2, stride=2),
            nn.LazyConvTranspose3d(16, 2, stride=2),
        )

        self.embedding = nn.Sequential(
            nn.Embedding(256, self.C_embed),
            Rearrange("b d h w c -> b c d h w")
        )

    def training_step(self, batch, batch_idx):
        x, y = batch

        # Split the input into patches
        x = rearrange(x, "b (d p1) (h p2) (w p3) -> (b d h w) p1 p2 p3", p1=self.patch_size[0], p2=self.patch_size[1], p3=self.patch_size[2])
        x = x.split(self.seq_len)
        # Iterate over each patch

        loss = 0
        
        for sample in x:
            seq = self.embedding(sample.abs().long())
            # Downscale
            seq = self.downscale(seq)
            # Upscale
            seq = self.upscale(seq)

            seq = self.conv(seq)

            loss += F.cross_entropy(seq, sample.abs().long())
            self.log("train_loss", loss)

            print(torch.cuda.mem_get_info())
        
        del x, y, seq, sample

        return loss

    def configure_optimizers(self):
        optimiser = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimiser


## Training

In [6]:
loader = torch.utils.data.Subset(build_dataset(None, threshold=32), range(0, 10))
loader = DataLoader(loader, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)
trainer = pl.Trainer(max_epochs=1, profiler="simple",  accelerator="gpu", log_every_n_steps=1, precision=16, benchmark=True)

model = lolModel()
trainer.fit(model, loader)

  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params
-----------------------------------------
0 | downscale | Sequential | 35.8 K
1 | conv      | LazyConv3d | 0     
2 | upscale   | Sequential | 0     
3 | embedding | Sequential | 4.1 K 
-----------------------------------------
39.9 K    Trainable params
0         Non-trainable params
39.9 K    Total params
0.160     Total estimated model params size (MB)


Epoch 0:   0%|          | 0/10 [00:00<?, ?it/s] (5983174656, 8589606912)
(5978980352, 8589606912)
(5974786048, 8589606912)
(5972688896, 8589606912)
(5968494592, 8589606912)
(5964300288, 8589606912)
(5962203136, 8589606912)
(5958008832, 8589606912)
(5955911680, 8589606912)
(5951717376, 8589606912)
(5947523072, 8589606912)
(5945425920, 8589606912)
(5941231616, 8589606912)
(5939134464, 8589606912)
(5934940160, 8589606912)
(5930745856, 8589606912)
(5928648704, 8589606912)
(5924454400, 8589606912)
(5922357248, 8589606912)
(5918162944, 8589606912)
(5916065792, 8589606912)
(5911871488, 8589606912)
(5907677184, 8589606912)
(5905580032, 8589606912)
(5884608512, 8589606912)
(5844762624, 8589606912)
(5823791104, 8589606912)
(5781848064, 8589606912)
(5762973696, 8589606912)
(5721030656, 8589606912)
(5702156288, 8589606912)
(5660213248, 8589606912)
(5639241728, 8589606912)
(5599395840, 8589606912)
(5578424320, 8589606912)
(5538578432, 8589606912)
(5517606912, 8589606912)
(5477761024, 8589606912)
(5

RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED

In [None]:
loader = torch.utils.data.Subset(build_dataset(None, threshold=32), range(0, 10))
loader = DataLoader(loader, batch_size=1, shuffle=True, num_workers=4)
trainer = pl.Trainer(max_epochs=1, profiler="simple",  accelerator="gpu")

model = lolModel((128, 128, 128), patch_size=8, seq_len=16)
trainer.fit(model, loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TypeError: lolModel.__init__() got an unexpected keyword argument 'patch_size'

record hyperparameters and results - potential local optima

In [None]:
dataset = build_dataset(None)
N_SAMPLES = (len(dataset) // 10) * 1
loader = torch.utils.data.Subset(dataset, range(0, N_SAMPLES))
loader = DataLoader(loader, batch_size=1, shuffle=True, num_workers=4)
# logger = pl.loggers.TensorBoardLogger("runs", name="test")
trainer = pl.Trainer(max_epochs=5, profiler="simple")

model = LitModel(embed_dim=[64, 128, 256])
trainer.fit(model=model, train_dataloaders=loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                      | Type                | Params
-------------------------------------------------------------------
0  | hierarchical_transformers | ModuleList          | 594 K 
1  | conv_t_final              | LazyConvTranspose3d | 0     
2  | conv_transpose            | ModuleList          | 0     
3  | positional                | Summer              | 0     
4  | norm                      | LayerNorm           | 512   
5  | conv                      | LazyConv3d          | 0     
6  | projection                | Sequential          | 0     
7  | res_raw                   | Sequential          | 64    
8  | res_final                 | Sequential          | 43.3 K
9  | embed_layer               | Sequential          | 16.4 K
10 | conv_final                | LazyConv3d          | 

Epoch 0:   0%|          | 0/283 [00:00<?, ?it/s] torch.Size([1, 19, 32, 32, 32])


ValueError: Target size (torch.Size([1, 1, 19])) must be the same as input size (torch.Size([1, 19, 32, 32, 32]))

In [None]:
optimiser = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=0.001)
criterion = nn.CrossEntropyLoss()
dataset = build_dataset(None)
loader = torch.utils.data.Subset(dataset, range(0, 1000))
loader = DataLoader(loader, batch_size=1, shuffle=True, num_workers=4)

In [None]:
num_epochs = 100
writer = SummaryWriter()

model.zero_grad(set_to_none=True)
for epoch in range(num_epochs):
    train_loss = 0
    for schem_data, target in (pbar := tqdm(loader)):
        optimiser.zero_grad()
        
        # Move to GPU
        
        schem_data = schem_data.abs().to(device)
        # target = target.to(device)
        
        # Forward pass
        y_hat = model.forward(schem_data)
        # torch variable

        loss = criterion(y_hat, schem_data.long())
        # loss.requires_grad = True
        
        writer.add_scalar('Loss/train', loss.item(), epoch)
        loss.backward()
        optimiser.step()
        train_loss += loss.item()
        
        pbar.set_description(f"Epoch {epoch+1}, Training Loss: {train_loss:.6f}")

writer.flush()

  0%|          | 0/10 [00:15<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB (GPU 0; 8.00 GiB total capacity; 4.69 GiB already allocated; 1.26 GiB free; 4.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF