In [1]:
%cd '/content/drive/MyDrive/ICASSP_2025/without_finetune'

/content/drive/MyDrive/Aerial_Scene_Recognition/ClassificationAfterFinetune/without_finetune


In [2]:
import torch
import numpy as np
import random

seed = 43
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
import h5py
import numpy as np

with h5py.File('train_val_vision_embeddings.h5', 'r') as f:
    image_embeddings = f['train_val_vision_embeddings'][:]

with h5py.File('train_val_audio_embeddings.h5', 'r') as f:
    audio_embeddings = f['train_val_audio_embeddings'][:]

labels = np.load('train_val_labels_inputs.npy')

In [4]:
image_embeddings = torch.tensor(image_embeddings, dtype=torch.float32)
audio_embeddings = torch.tensor(audio_embeddings, dtype=torch.float32)
labels = torch.tensor(labels, dtype=torch.long)

In [5]:
dataset = torch.utils.data.TensorDataset(image_embeddings, audio_embeddings, labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True) # original batch size = 32, but 16 is better

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class FPN(nn.Module):
    def __init__(self, input_dim, feature_size=256):
        super(FPN, self).__init__()

        self.lateral = nn.Conv1d(in_channels=1, out_channels=1,kernel_size=3,stride=1,padding=1)

        self.smooth1 = nn.Conv1d(in_channels=1, out_channels=1,kernel_size=3,stride=1,padding=1)
        self.smooth2 = nn.Conv1d(in_channels=1, out_channels=1,kernel_size=3,stride=1,padding=1)
        self.smooth3 = nn.Conv1d(in_channels=1, out_channels=1,kernel_size=3,stride=1,padding=1)

    def forward(self, x):
        #print('x.shape:',x.shape)
        p1 = self.lateral(x)
        #print('p1.shape:',p1.shape)
        p2 = nn.functional.avg_pool1d(p1, kernel_size=2, stride=2)
        #print('p2.shape:',p2.shape)
        p3 = nn.functional.avg_pool1d(p2, kernel_size=2,stride=2)
        #print('p3.shape:',p3.shape)

        p2_upsampled = nn.functional.interpolate(p2, size=p1.size(-1), mode='nearest')
        #print('p2_upsampled.shape:',p2_upsampled.shape)
        p3_upsampled = nn.functional.interpolate(p3, size=p1.size(-1), mode='nearest')
        #print('p3_upsampled.shape:',p3_upsampled.shape)

        p_combined = p1 + p2_upsampled + p3_upsampled
        #print('p_combined.shape:',p_combined.shape)

        p_combined = self.smooth1(p_combined)
        p_combined = self.smooth2(p_combined)
        p_combined = self.smooth3(p_combined)

        return p_combined

class DualAttentionWithFPN(nn.Module):
    def __init__(self, input_dim, fpn_feature_size=1024, num_heads=8):
        super(DualAttentionWithFPN, self).__init__()
        self.fpn = FPN(input_dim, feature_size=fpn_feature_size)
        self.image_attn = nn.MultiheadAttention(fpn_feature_size, num_heads)
        self.audio_attn = nn.MultiheadAttention(fpn_feature_size, num_heads)
        self.cross_modal_attn = nn.MultiheadAttention(fpn_feature_size, num_heads)
        self.norm = nn.LayerNorm(fpn_feature_size)

    def forward(self, image_embeddings, audio_embeddings):
        image_features = self.fpn(image_embeddings)
        audio_features = self.fpn(audio_embeddings)

        img_attn_output, _ = self.image_attn(image_features, image_features, image_features)
        img_attn_output = self.norm(img_attn_output + image_features)

        audio_attn_output, _ = self.audio_attn(audio_features, audio_features, audio_features)
        audio_attn_output = self.norm(audio_attn_output + audio_features)

        combined_attn_output, _ = self.cross_modal_attn(img_attn_output, audio_attn_output, audio_attn_output)
        combined_attn_output = self.norm(combined_attn_output + img_attn_output)

        return combined_attn_output

class TransformerClassifierWithFPN(nn.Module):
    def __init__(self, input_dim, num_classes, fpn_feature_size=1024, num_heads=8, num_layers=2, dim_feedforward=2048, dropout=0.1):
        super(TransformerClassifierWithFPN, self).__init__()
        self.dual_attention_fpn = DualAttentionWithFPN(input_dim, fpn_feature_size, num_heads)
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=fpn_feature_size, nhead=num_heads, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=num_layers)
        self.norm1 = nn.LayerNorm(fpn_feature_size)
        self.fc = nn.Linear(fpn_feature_size, num_classes)

    def forward(self, image_embeddings, audio_embeddings):
        combined_embeddings = self.dual_attention_fpn(image_embeddings.unsqueeze(1), audio_embeddings.unsqueeze(1))

        combined_embeddings = self.transformer_encoder(combined_embeddings)

        combined_embeddings = self.norm1(combined_embeddings.mean(dim=1))

        x = self.fc(combined_embeddings)

        return x

input_dim = image_embeddings.shape[1]
num_classes = 13
model = TransformerClassifierWithFPN(input_dim=input_dim, num_classes=num_classes, fpn_feature_size=1024)



In [7]:
import sys

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

num_epochs = 5
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    batch_count = 0
    for image_embeds, audio_embeds, targets in dataloader:
        batch_count += 1
        sys.stdout.write(f"\rBatch {batch_count}/{len(dataloader)}")
        sys.stdout.flush()
        optimizer.zero_grad()
        outputs = model(image_embeds, audio_embeds)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")


Batch 126/126Epoch 1, Average Loss: 2.5098656756537303
Batch 126/126Epoch 2, Average Loss: 2.4186147886609275
Batch 126/126Epoch 3, Average Loss: 2.405965216576107
Batch 126/126Epoch 4, Average Loss: 2.403404854592823
Batch 126/126Epoch 5, Average Loss: 2.4023543123214965


# **TESTING..**

In [8]:
with h5py.File('test_vision_embeddings.h5', 'r') as f:
    test_image_embeddings = f['test_vision_embeddings'][:]

with h5py.File('test_audio_embeddings.h5', 'r') as f:
    test_audio_embeddings = f['test_audio_embeddings'][:]

test_labels = np.load('test_labels_inputs.npy')

In [9]:
# test_image_embeddings = l2_normalize(test_image_embeddings)
# test_audio_embeddings = l2_normalize(test_audio_embeddings)

In [10]:
import torch

test_image_embeddings = torch.tensor(test_image_embeddings, dtype=torch.float32)
test_audio_embeddings = torch.tensor(test_audio_embeddings, dtype=torch.float32)
test_labels = torch.tensor(test_labels, dtype=torch.long)

In [11]:
test_dataset = torch.utils.data.TensorDataset(test_image_embeddings, test_audio_embeddings, test_labels)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

In [12]:
from sklearn.metrics import precision_score, recall_score, f1_score

model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for image_embeds, audio_embeds, labels in test_dataloader:
        outputs = model(image_embeds, audio_embeds)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Precision: 0.0398
Recall: 0.1994
F1 Score: 0.0663


  _warn_prf(average, modifier, msg_start, len(result))
