# Training the Extended Mask2Former on the UAV-SOD Drone Dataset

In [1]:
# Import libraries
import pandas as pd
import os, json
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from src.data_set_up import SOD_Data
from models.extended_mask2former_model import ExtendedMask2Former
from models.efpn_backbone.anchors import Anchors
from src.helpers import train, validate, test


# Locations of the project files this notebook reads (relative to the working directory)
base_dir = "data/uav_sod_data/"                                      # root of the dataset splits
map_path = "src/code_map.json"                                       # category-id -> name mapping
data_info_path = "src/data_info/uav_data_preprocessing.json"         # stored mean / std of the data

### Select the compute device

In [3]:
# Pick the device that will host both the model and the data:
# the MPS backend when PyTorch reports it as available, the CPU otherwise.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

## Set up basic static data

- Get the number of classes
- Get the mean and standard deviation 
- Create the data paths for the train, test and validation splits

In [4]:
# Load the classes of the UAV-SOD Drone dataset.
# The file is read inside a `with` block so the handle is closed even when
# JSON parsing raises, and it is bound to a descriptive name instead of `map`,
# which would shadow the builtin for the rest of the kernel session.
with open(map_path) as code_map_file:
    code_map = json.load(code_map_file)
classes = code_map["UAV_SOD_DRONE"]["CATEGORY_ID_TO_NAME"]

# The number of classes plus the background
number_classes = len(classes) + 1


# Load the mean and standard deviation stored for the UAV data
# (consumed later by the normalisation step of the data transforms).
with open(data_info_path) as data_info_file:
    data_info = json.load(data_info_file)
mean = data_info["uav_data"]["mean"]
standard_deviation = data_info["uav_data"]["std"]


# Define train, test and validation path
train_path = os.path.join(base_dir, "train")
test_path = os.path.join(base_dir, "test")
validation_path = os.path.join(base_dir, "validation")

## Dataset - Dataloader
- Collate function
- Data transformations
- DataLoader and Dataset

In [5]:
def _build_transform(*augmentations):
    """Chain the given augmentations with tensor conversion and normalisation.

    The augmentations run first, then `ToTensor`, then `Normalize` with the
    `mean` / `standard_deviation` loaded earlier in the notebook.
    """
    return transforms.Compose([
        *augmentations,
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=standard_deviation),
    ])


def _collate_batch(samples):
    """Regroup a list of per-sample tuples into one tuple per field."""
    return tuple(zip(*samples))


# One transform pipeline per split; only the training split gets colour jitter
data_transform = {
    "train": _build_transform(
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)
    ),
    "test": _build_transform(),
    "validation": _build_transform(),
}


# Datasets: every split keeps its images and annotations in sibling folders
train_dataset = SOD_Data(f"{train_path}/images", f"{train_path}/annotations", data_transform["train"])
test_dataset = SOD_Data(f"{test_path}/images", f"{test_path}/annotations", data_transform["test"])
validation_dataset = SOD_Data(
    f"{validation_path}/images", f"{validation_path}/annotations", data_transform["validation"]
)

# DataLoaders: only the training split is shuffled and uses pinned memory
train_loader = DataLoader(
    train_dataset, batch_size=4, shuffle=True, collate_fn=_collate_batch, pin_memory=True
)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=_collate_batch)
validation_loader = DataLoader(
    validation_dataset, batch_size=4, shuffle=False, collate_fn=_collate_batch
)

## Bounding Box Heuristics

To create representative anchors, we compute bounding-box statistics for the training set (mean and standard deviation of the box width and height, plus the observed aspect ratios). Anchors that match these statistics help the model find the bounding boxes faster.


In [6]:
# Ask the training dataset for its bounding-box statistics
# (the result is indexed by the string keys used below).
bbox_stats = train_dataset.analyze_bounding_boxes()

# Mean and standard deviation of the box width and height
mean_width, mean_height = bbox_stats['mean_width'], bbox_stats['mean_height']
std_width, std_height = bbox_stats['std_width'], bbox_stats['std_height']

# Report the distinct aspect ratios in ascending order, then the summary values
unique_aspect_ratios = sorted(set(bbox_stats['aspect_ratios']))
print("Aspect Ratios:", unique_aspect_ratios)
print("Mean Width:", mean_width)
print("Mean Height:", mean_height)
print("Width Std Dev:", std_width)
print("Height Std Dev:", std_height)

Aspect Ratios: [0.0, 0.14285714285714285, 0.15384615384615385, 0.16666666666666666, 0.17647058823529413, 0.18421052631578946, 0.1875, 0.2, 0.2222222222222222, 0.22727272727272727, 0.23076923076923078, 0.23404255319148937, 0.23529411764705882, 0.23809523809523808, 0.24242424242424243, 0.25, 0.2608695652173913, 0.26548672566371684, 0.26666666666666666, 0.2727272727272727, 0.2777777777777778, 0.2786885245901639, 0.28, 0.2830188679245283, 0.2857142857142857, 0.288135593220339, 0.2894736842105263, 0.29411764705882354, 0.2962962962962963, 0.2972972972972973, 0.3, 0.30434782608695654, 0.30952380952380953, 0.3114754098360656, 0.3125, 0.3140495867768595, 0.3142857142857143, 0.3157894736842105, 0.3170731707317073, 0.3181818181818182, 0.3188405797101449, 0.3235294117647059, 0.3246753246753247, 0.328125, 0.3333333333333333, 0.34285714285714286, 0.34615384615384615, 0.34782608695652173, 0.35, 0.35135135135135137, 0.35294117647058826, 0.3548387096774194, 0.35555555555555557, 0.35714285714285715, 0.3

## Generate Anchors

In [7]:
# Anchor configuration chosen from the bounding-box statistics above:
# the feature map shapes, the anchor scales and the aspect ratios.
feature_map_shapes = [(18, 18)]
scales = [32]
aspect_ratios = [0.5, 1.0]

# Generate the anchors and store them as a float32 tensor
generated_anchors = Anchors.generate_anchors(feature_map_shapes, scales, aspect_ratios)
anchors = torch.tensor(generated_anchors, dtype=torch.float32)

print(f"The number of anchors is: {anchors.size(0)}")

The number of anchors is: 648


## Instantiate the ExtendedMask2Former model with all the parameters needed

In [8]:
# Build the ExtendedMask2Former model, sized by the class count and the
# number of generated anchors, then move the model and the anchors to `device`.
model = ExtendedMask2Former(
    num_classes=number_classes,
    num_anchors=anchors.size(0),
    device=device,
)
model = model.to(device)
anchors = anchors.to(device)


# Hyperparameters selection
num_epochs = 1
learning_rate = 1e-3
# NOTE(review): batch_size is not read anywhere in this cell, and the
# DataLoaders were created with batch_size=4 — confirm which value is intended.
batch_size = 1

# Adam optimizer plus a step scheduler that multiplies the learning rate
# by 0.1 after every 5 calls to scheduler.step()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

Loaded pretrained weights for efficientnet-b7


### Train Loop

In [None]:
# Empty table with one column per tracked metric.
# NOTE(review): nothing in this cell writes to metrics_df, and whatever
# train() returns is discarded — confirm whether metrics should be recorded here.
metric_columns = ['epoch', 'train_loss', 'val_loss', 'precision', 'recall', 'AP', 'mAP']
metrics_df = pd.DataFrame(columns=metric_columns)

# Call train() once per epoch, then advance the learning-rate schedule.
# The loop variable `epoch` is not passed to train().
for epoch in range(num_epochs):
    train(model, train_loader, device, anchors, optimizer, number_classes)
    scheduler.step()

### Validation Loop

### Test Loop