In [2]:
from google.colab import files
uploaded = files.upload()

Saving test_dataset0202.zip to test_dataset0202.zip


In [9]:
!mkdir datasets

mkdir: cannot create directory ‘datasets’: File exists


In [None]:
!unzip test_dataset0202.zip

In [11]:
!mv test_dataset/ datasets/

In [12]:
!mv decoders.yml datasets/test_dataset

# Formal

In [1]:
import yaml
import os
import yaml
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

In [4]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),  # size: 512x512
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # size: 256x256
            nn.Conv2d(16, 32, kernel_size=3, padding=1),  # size: 256x256
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # size: 128x128
            nn.Conv2d(32, 32, kernel_size=3, padding=1),  # size: 128x128
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # size: 64x64
            nn.Flatten(),
            nn.Linear(32 * 64 * 64, 1024),  # size: 1024
        )

    def forward(self, x):
        x = self.features(x)
        return x

In [5]:
class Decoder(nn.Module):
    def __init__(self, input_size, output_size):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(128, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

In [6]:
class MultiTailDecoder(nn.Module):
    def __init__(self, input_size, classification_sizes=None, regression_size=None):
        super(MultiTailDecoder, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        self.classification_tails = [nn.Linear(128, size) for size in classification_sizes] if classification_sizes else []
        self.regression_tail = nn.Linear(128, regression_size) if regression_size else None

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        classification_outputs = [tail(x) for tail in self.classification_tails] if self.classification_tails else []
        regression_output = self.regression_tail(x) if self.regression_tail else None
        return classification_outputs, regression_output

In [None]:
class ParamAwareMultiTailDecoder(nn.Module):
    def __init__(self, input_size, classification_params=None, regression_params=None):
        super(ParamAwareMultiTailDecoder, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()
        # {param_name: tail}
        self.classification_tails = nn.ModuleDict(
                {
                    param_name: nn.Sequential(
                        nn.Linear(128, size),
                        nn.Softmax(dim=1),
                    )  # TODO: refine decoder structure, e.g. add more layers
                    for param_name, size in classification_params.items()
                }
                if classification_params
                else {}
        )
        self.regression_tail = nn.ModuleDict(
            {
                param_name: nn.Linear(128, size)  # TODO: refine decoder structure, e.g. add more layers
                for param_name, size in regression_params.items()
            }
            if regression_params
            else {}
        )

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        classification_outputs = {
            param_name: tail(x) for param_name, tail in self.classification_tails.items()
        } if self.classification_tails else {}
        regression_output = {
            param_name: tail(x) for param_name, tail in self.regression_tail.items()
        } if self.regression_tail else {}
        return classification_outputs, regression_output

In [7]:
class EncoderDecoderModel(nn.Module):
    def __init__(self, encoder, decoders):
        super(EncoderDecoderModel, self).__init__()
        self.encoder = encoder
        self.decoders = decoders

    def forward(self, x):
        x = self.encoder(x)
        batch_size = x.size(0)  # Get the batch size
        x = x.view(batch_size, -1)  # Flatten the feature tensor, considering the batch size
        decoder_outputs = {decoder_name: decoder(x) for decoder_name, decoder in self.decoders.items()}
        return decoder_outputs  # note that the multi-tail decoder returns a list of outputs

In [8]:
class DAGDataset(torch.utils.data.Dataset):
    def __init__(self, dataset_name: str, datasets_folder: str="./datasets", transform=None):
        self.dataset_name = dataset_name
        self.datasets_folder = datasets_folder
        self.dataset_path = os.path.join(self.datasets_folder, self.dataset_name)
        self.images_folder = os.path.join(self.dataset_path, "images")
        self.params_folder = os.path.join(self.dataset_path, "params")
        self.ranges_file_path = os.path.join(self.dataset_path, "ranges.yml")
        self.ranges = None
        self.decoders = None
        self.transform = transforms.Compose(
            [transforms.Resize((512, 512)), transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
            ) if transform is None else transform
        self.data = self.load_data()

    def load_data(self):
        # Load the ranges from the YAML file
        with open(self.ranges_file_path, 'r') as file:
            self.ranges = yaml.safe_load(file)
        # Load the decoders from the YAML file
        with open(os.path.join(self.dataset_path, "decoders.yml"), 'r') as file:
            self.decoders = yaml.safe_load(file)
        # read images and parameters
        data = []
        for image_name in os.listdir(self.images_folder):
            image_path = os.path.join(self.images_folder, image_name)
            param_path = os.path.join(self.params_folder, os.path.splitext(image_name)[0] + ".yml")
            with open(param_path, 'r') as file:
                param = yaml.safe_load(file)
            # normalize
            param = self.preprocess(param)
            param = self.format_target_to_decoders(param)
            data.append((image_path, param))
        return data

    def format_target_to_decoders(self, target):
        formatted_target = {}
        for decoder_name, decoder_params in self.decoders.items():
            formatted_target[decoder_name] = {
                "classification_targets": {},
                "regression_target": {}
            }
            for param_name in decoder_params:
                param_type = self.ranges[param_name]['type']
                if param_type == 'float' or param_type == 'int' or param_type == 'vector':
                    formatted_target[decoder_name]['regression_target'][param_name] = target[param_name]
                elif param_type == 'states' or param_type == 'bool':
                    formatted_target[decoder_name]['classification_targets'][param_name] = target[param_name]
        return formatted_target

    def preprocess(self, param):
        processed_param = {}
        # for float and vector: normalize with min max
        # for states, bool: convert to one hot
        # for ints: treat as float, but round back to int when saving as param
        for param_name, param_spec in self.ranges.items():
            if param_spec['type'] == 'float' or param_spec['type'] == 'int' or param_spec['type'] == 'vector':
                processed_param[param_name] = self.normalize(param[param_name], param_spec)
            elif param_spec['type'] == 'states' or param_spec['type'] == 'bool':
                processed_param[param_name] = self.one_hot(param[param_name], param_spec)
            else:
                raise ValueError(f"Unsupported parameter type: {param_spec['type']}")
        return processed_param

    def normalize(self, value, param_spec):
        if param_spec['type'] == 'float' or param_spec['type'] == 'int':
            return (value - param_spec['min']) / (param_spec['max'] - param_spec['min'])
        elif param_spec['type'] == 'vector':
            return [(value[i] - param_spec[f'{dim}min']) / (param_spec[f'{dim}max'] - param_spec[f'{dim}min']) for i, dim in enumerate(['x', 'y', 'z'])]
        else:
            raise ValueError(f"Unsupported parameter type: {param_spec['type']}")

    def one_hot(self, value, param_spec):
        if param_spec['type'] == 'states':
            index = param_spec['values'].index(value)
            return [1 if i == index else 0 for i in range(len(param_spec['values']))]
        elif param_spec['type'] == 'bool':
            # make bools onehot too to make it consistent
            return [1, 0] if value else [0, 1]
        else:
            raise ValueError(f"Unsupported parameter type: {param_spec['type']}")


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample, target = self.data[idx]
        sample = Image.open(sample).convert('L')

        if self.transform:
            sample = self.transform(sample)

        # convert target's values to tensor
        for decoder_name, decoder_outputs in target.items():
            # check if is tensor
            for classification_target in decoder_outputs['classification_targets']:
                if not torch.is_tensor(decoder_outputs['classification_targets'][classification_target]):
                    target[decoder_name]['classification_targets'][classification_target] = torch.tensor(decoder_outputs['classification_targets'][classification_target], dtype=torch.float32)
            for regression_target in decoder_outputs['regression_target']:
                if not torch.is_tensor(decoder_outputs['regression_target'][regression_target]):
                    target[decoder_name]['regression_target'][regression_target] = torch.tensor(decoder_outputs['regression_target'][regression_target], dtype=torch.float32)

        return sample, target

In [9]:
dataset = DAGDataset("test_dataset")

In [10]:
dataset.__getitem__(0)

(tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          ...,
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.],
          [1., 1., 1.,  ..., 1., 1., 1.]]]),
 {'Building Mass Decoder': {'classification_targets': [tensor([0., 1.])],
   'regression_target': tensor([0.2754, 0.3033, 0.5714])},
  'Facade Decoder': {'classification_targets': [],
   'regression_target': tensor([0.7500, 0.2500])},
  'Floor Ledge Decoder': {'classification_targets': [tensor([1., 0.])],
   'regression_target': tensor([0.4263, 0.1645, 0.7033, 0.8891])},
  'Roof Decoder': {'classification_targets': [tensor([0., 1., 0.])],
   'regression_target': tensor([0.5063, 0.8231, 0.9543])},
  'Window Ledge Decoder': {'classification_targets': [tensor([1., 0.])],
   'regression_target': tensor([0.3239, 0.3608, 0.2882, 0.3547, 0.5499, 0.1825])},
  'Window Main Decoder': {'classification_targets': [],
   'regression_t

In [11]:
# split into train val and test
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# Create DataLoader
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [12]:
encoder = Encoder()

In [13]:
# Load the ranges from the YAML file
with open('./datasets/test_dataset/ranges.yml', 'r') as file:
    ranges = yaml.safe_load(file)

# Create a mapping between parameter names and output sizes
parameter_output_mapping = {}
for decoder_name, param_specs in ranges.items():
    if param_specs['type'] == 'float':
        parameter_output_mapping[decoder_name] = 1  # 1 for scalar
    elif param_specs['type'] == 'int':
        parameter_output_mapping[decoder_name] = 1  # 1 for scalar
    elif param_specs['type'] == 'vector':
        parameter_output_mapping[decoder_name] = 3  # 3 for x, y, z
    elif param_specs['type'] == 'states':
        parameter_output_mapping[decoder_name] = len(param_specs['values'])
    elif param_specs['type'] == 'bool':
        parameter_output_mapping[decoder_name] = 2  # 2 for binary encoding

In [14]:
# Load the decoders' params from the YAML file
with open('./datasets/test_dataset/decoders.yml', 'r') as file:
    decoders_params = yaml.safe_load(file)

decoders = nn.ModuleDict()
# initialize decoders with corresponding output tails
for decoder_name, param_names in decoders_params.items():
    classification_tails = {}
    regression_tails = {}
    for param_name in param_names:
        spec = ranges[param_name]
        # if type is bool or states, add to classification tails
        # if type is float, int or vector, add to regression tails
        if spec['type'] == 'bool' or spec['type'] == 'states':
            classification_tails[param_name] = parameter_output_mapping[param_name]
        else:
            regression_tails[param_name] = parameter_output_mapping[param_name]
    # add decoder to model
    decoders[decoder_name] = ParamAwareMultiTailDecoder(1024, classification_tails, regression_tails)

# decoders = nn.ModuleDict({
#     param_name: Decoder(4096, output_size)
#     for param_name, output_size in parameter_output_mapping.items()
# })

In [15]:
model = EncoderDecoderModel(encoder, decoders)

In [16]:
model

EncoderDecoderModel(
  (encoder): Encoder(
    (features): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): ReLU(inplace=True)
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (6): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (7): ReLU(inplace=True)
      (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (9): Flatten(start_dim=1, end_dim=-1)
      (10): Linear(in_features=131072, out_features=1024, bias=True)
    )
  )
  (decoders): ModuleDict(
    (Building Mass Decoder): MultiTailDecoder(
      (fc1): Linear(in_features=1024, out_features=256, bias=True)
      (relu1): ReLU()
      (fc2): Linear(in_features=256, out_features=128, bias=True)


In [28]:
# !pip install torchinfo
# from torchinfo import summary

# batch_size = 8
# summary(model, input_size=(batch_size, 1, 512, 512))

# Basically my model is too unique

In [None]:
# Load the switches from the YAML file
with open('./datasets/test_dataset/switches.yml', 'r') as file:
    switches = yaml.safe_load(file)

In [17]:
# Define loss function and optimizer
# for regression, use MSELoss, for classification, use CrossEntropyLoss
class EncDecsLoss(nn.Module):
    def __init__(self, decoders, switches_mapping: dict):
        super(EncDecsLoss, self).__init__()
        self.decoders = decoders
        self.switches_mapping = switches_mapping

    def forward(self, outputs, targets):
        loss = 0.0
        for decoder_name, decoder_output in outputs.items():
            loss += self.decoder_loss(decoder_output, targets[decoder_name])
        return loss

    def classification_loss(self, output, target):
        return nn.CrossEntropyLoss()(output, target)

    def regression_loss(self, output, target):
        return nn.MSELoss()(output, target)

    # def find_param_name_in(target, param_name):
    #     for decoder_name, decoder_outputs in target.items():
    #         for i, classification_target in enumerate(decoder_outputs['classification_targets']):
    #             if param_name ==
    #     return None

    def decoder_loss(self, decoder_output, target):
        classification_outputs = decoder_output[0]  # note that model outputs a tuple of list instead of dict of list
        regression_output = decoder_output[1]
        total_classification_loss = 0.0
        # if classification_outputs:
        for param_name, pred in classification_outputs.items():
            total_classification_loss += self.classification_loss(pred, target["classification_targets"][param_name])
            # TODO: should we add early termination for "Has" labels?
        # if regression_output:
        total_regression_loss = 0.0
        for param_name, pred in regression_output.items():
            regression_loss = self.regression_loss(pred, target["regression_target"][param_name])
            # use gt's 0 1 label to switch off the loss if needed
            switch_param_name = self.switches_mapping["Reversed Mapping"].get(param_name)
            if switch_param_name:
                switch_target = target["classification_targets"][switch_param_name]
                switch_index = torch.argmax(switch_target, dim=1)
                # make regression_loss same shape as switch_index
                regression_loss = torch.stack([regression_loss] * switch_index.size(0))
                regression_loss *= switch_index
                # average again
                regression_loss = torch.mean(regression_loss)
            total_regression_loss += regression_loss
        averaged_classification_loss = total_classification_loss / len(classification_outputs) if len(classification_outputs) > 0 else 0
        averaged_regression_loss = total_regression_loss / len(regression_output) if len(regression_output) > 0 else 0
        loss = averaged_classification_loss + averaged_regression_loss
        return loss

criterion = EncDecsLoss(decoders, switches)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
# Training loop with train and val
epochs = 10
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, targets = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 200 == 199:  # print every 200 mini-batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 2000:.3f}")
            running_loss = 0.0
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(val_loader, 0):
            inputs, targets = data
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
        print(f"Validation loss: {val_loss / len(val_loader)}")

print("Finished Training")

# Save your trained model if needed
torch.save(model.state_dict(), "encDecModel.pth")

  target[decoder_name]['classification_targets'][i] = torch.tensor(classification_target, dtype=torch.float32)
  target[decoder_name]['regression_target'] = torch.tensor(decoder_outputs['regression_target'], dtype=torch.float32)


Validation loss: 6.3280487060546875
Validation loss: 4.342294692993164
Validation loss: 4.482870578765869
Validation loss: 4.400471210479736
Validation loss: 4.300827503204346
Validation loss: 4.273218154907227
Validation loss: 4.354312419891357
Validation loss: 4.022950649261475
Validation loss: 4.459605693817139
Validation loss: 4.317946434020996
Finished Training


In [19]:
# Test the model
model.eval()
test_loss = 0.0
with torch.no_grad():
    for i, data in enumerate(test_loader, 0):
        inputs, targets = data
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        test_loss += loss.item()
    print(f"Test loss: {test_loss / len(test_loader)}")
print("Finished Testing")

Test loss: 2.9995086193084717
Finished Testing
