# Training the Extended Mask2Former on the UAV-SOD Drone Dataset

In [1]:
# Import libraries
import os, json
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from src.data_set_up import SOD_Data
from models.extended_mask2former_model import ExtendedMask2Former


# Locations of the dataset root, the dataset statistics file and the category map
base_dir = "data/uav_sod_data/"
data_info_path = "src/data_info/uav_data_preprocessing.json"
map_path = "src/code_map.json"

### Select the compute device

In [2]:
# Device onto which both the model and the data batches are loaded.
# Automatic MPS selection is kept here for reference but is disabled, so everything runs on the CPU:
# device = "mps" if torch.backends.mps.is_available() else "cpu"
device = "cpu"

### Set up basic static data

- Get the number of classes (plus one for the background)
- Load the mean and standard deviation of the training data
- Build the paths to the train, test, and validation splits

In [3]:
# Load the classes of the UAV-SOD Drone dataset.
# The context manager closes the file even when JSON parsing fails, and the
# handle is no longer bound to `map`, which shadowed the Python builtin.
with open(map_path, encoding="utf-8") as map_file:
    code_map = json.load(map_file)
classes = code_map["UAV_SOD_DRONE"]["CATEGORY_ID_TO_NAME"]

# The number of classes plus the background
number_classes = len(classes) + 1


# Load the mean and standard deviation for the train data
with open(data_info_path, encoding="utf-8") as data_info_file:
    data_info = json.load(data_info_file)
mean = data_info["uav_data"]["mean"]
standard_deviation = data_info["uav_data"]["std"]


# Define train, test and validation path
train_path = os.path.join(base_dir, "train")
test_path = os.path.join(base_dir, "test")
validation_path = os.path.join(base_dir, "validation")

### Dataset - Dataloader
- Collate function
- Data transformations
- DataLoader and Dataset

In [4]:
# Per-split preprocessing pipelines.
# Every split is converted to a tensor and normalized with the loaded mean / std;
# only the training split additionally receives colour jitter.
color_jitter = transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize(mean=mean, std=standard_deviation)

data_transform = {
    "train": transforms.Compose([color_jitter, to_tensor, normalize]),
    "test": transforms.Compose([to_tensor, normalize]),
    "validation": transforms.Compose([to_tensor, normalize]),
}



def collate_batch(samples):
    """Regroup a list of dataset samples into one tuple per field.

    ``[(image_0, target_0), (image_1, target_1)]`` becomes
    ``((image_0, image_1), (target_0, target_1))``.
    """
    return tuple(zip(*samples))


def build_dataset(split_path, transform):
    """Create the SOD_Data dataset for the split stored under ``split_path``."""
    return SOD_Data(f"{split_path}/images", f"{split_path}/annotations", transform)


# One dataset per split, each with its own transform pipeline
train_dataset      = build_dataset(train_path, data_transform["train"])
test_dataset       = build_dataset(test_path, data_transform["test"])
validation_dataset = build_dataset(validation_path, data_transform["validation"])

# Only the training loader shuffles; all loaders yield batches of a single sample
train_loader      = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_batch)
test_loader       = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_batch)
validation_loader = DataLoader(validation_dataset, batch_size=1, shuffle=False, collate_fn=collate_batch)

In [5]:
def validate(model, val_loader, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is switched to evaluation mode (and left in it) and gradients
    are disabled for the whole pass, so no weights are updated.

    Args:
        model: Object exposing ``eval()``, a forward call on a stacked image
            batch, and ``compute_loss(outputs, targets)`` returning a scalar
            tensor.
        val_loader: Iterable yielding ``(images, targets)`` batches, where
            ``images`` is a sequence of tensors that can be stacked and
            ``targets`` is a sequence of dicts whose values are tensors.
        device: Device the images and targets are moved to.

    Returns:
        float: Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    # Count the batches actually seen instead of calling len(val_loader): this
    # also works for loaders without __len__ and avoids a bare ZeroDivisionError.
    num_batches = 0
    with torch.no_grad():
        for images, targets in val_loader:
            images = torch.stack(images).to(device)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            outputs = model(images)
            loss = model.compute_loss(outputs, targets)
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("val_loader yielded no batches; cannot compute a validation loss")
    return total_loss / num_batches

In [6]:
# Model, Optimizer and Training Loop setup
model = ExtendedMask2Former(num_classes=number_classes).to(device)

# Hyperparameters
num_epochs = 1
learning_rate = 0.001
# NOTE(review): batch_size is not passed to any DataLoader in this notebook
# (the loaders are built with batch_size=1) — wire it in or remove it.
batch_size = 2
# StepLR settings: the learning rate is multiplied by lr_gamma every
# lr_step_size scheduler steps. The training loop steps the scheduler once per
# epoch, so with num_epochs < lr_step_size no decay is ever applied.
lr_step_size = 5
lr_gamma = 0.1


optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size, gamma=lr_gamma)

Loaded pretrained weights for efficientnet-b7


In [7]:

def train(model, train_loader, optimizer, device):
    """Run one training epoch and return the mean per-batch loss.

    The model is switched to training mode; for every batch the gradients are
    reset, the loss is back-propagated and the optimizer takes one step.

    Args:
        model: Object exposing ``train()``, a forward call on a stacked image
            batch, and ``compute_loss(outputs, targets)`` returning a scalar
            tensor that supports ``backward()``.
        train_loader: Iterable yielding ``(images, targets)`` batches, where
            ``images`` is a sequence of tensors that can be stacked and
            ``targets`` is a sequence of dicts whose values are tensors.
        optimizer: Optimizer holding the model parameters.
        device: Device the images and targets are moved to.

    Returns:
        float: Sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    running_loss = 0.0
    # Count the batches actually seen instead of calling len(train_loader): this
    # also works for loaders without __len__ and avoids a bare ZeroDivisionError.
    num_batches = 0
    for images, targets in train_loader:
        images = torch.stack(images).to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        outputs = model(images)
        loss = model.compute_loss(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_loader yielded no batches; cannot compute an epoch loss")
    return running_loss / num_batches

# def validate(model, val_loader, device):
#     model.eval()
#     val_loss = 0
#     with torch.no_grad():
#         for images, targets in val_loader:
#             images = torch.stack(images).to(device)
#             targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
#             outputs = model(images)
#             loss = model.compute_loss(outputs, targets)
#             val_loss += loss.item()
#     return val_loss / len(val_loader)

# def test(model, test_loader, device):
#     model.eval()
#     test_loss = 0
#     with torch.no_grad():
#         for images, targets in test_loader:
#             images = torch.stack(images).to(device)
#             targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
#             outputs = model(images)
#             loss = model.compute_loss(outputs, targets)
#             test_loss += loss.item()
#     return test_loss / len(test_loader)


# Run the training epochs; the learning-rate scheduler is stepped once per epoch.
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, device)
    progress_message = f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}'
    print(progress_message)

    # Per-epoch validation is currently disabled:
    # validation_loss = validate(model, validation_loader, device)
    # print(f'Validation Loss: {validation_loss:.4f}')

    scheduler.step()

# # Test the model
# test_loss = test(model, test_loader, device)
# print(f'Test Loss: {test_loss:.4f}')

The predicted bounding boxes are the following: tensor([[[[0.2909, 0.3229, 0.2440,  ..., 0.4697, 0.2492, 0.8057],
          [0.6816, 0.6946, 0.6302,  ..., 0.8351, 0.5562, 0.9536],
          [0.6461, 0.6557, 0.5852,  ..., 0.6913, 0.4583, 0.9304],
          ...,
          [0.6032, 0.6102, 0.7331,  ..., 0.6495, 0.6007, 0.9597],
          [0.3870, 0.4714, 0.5422,  ..., 0.6364, 0.4467, 0.9348],
          [0.9628, 0.8004, 0.8083,  ..., 0.8602, 0.8229, 0.9409]],

         [[0.4224, 0.4650, 0.4521,  ..., 0.3527, 0.3869, 0.3364],
          [0.4323, 0.6119, 0.5669,  ..., 0.4689, 0.5011, 0.2505],
          [0.4625, 0.6071, 0.6905,  ..., 0.6031, 0.5138, 0.2096],
          ...,
          [0.3382, 0.5180, 0.7071,  ..., 0.6228, 0.5109, 0.2079],
          [0.5390, 0.6420, 0.6864,  ..., 0.6680, 0.7315, 0.3809],
          [0.2552, 0.5285, 0.6171,  ..., 0.5703, 0.7086, 0.2987]],

         [[0.4385, 0.4278, 0.5073,  ..., 0.4690, 0.7127, 0.1003],
          [0.6292, 0.5177, 0.5973,  ..., 0.5160, 0.7637, 0.4

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.