In [None]:
# Run the notebook from the experiment folder: later cells open their .h5/.npy files by relative name.
# NOTE(review): the path looks like a Colab Google Drive mount — presumably Drive must be mounted first; confirm.
# NOTE(review): the saved output of this cell shows a different directory than the command below, so it is stale.
%cd '/content/drive/MyDrive/ICASSP_2025/without_finetune'

/content/drive/MyDrive/Aerial_Scene_Recognition/ClassificationAfterFinetune/without_finetune


In [None]:
import torch
import numpy as np
import random

# Seed every random number generator the notebook relies on so repeated runs match
seed = 43
for _apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _apply_seed(seed)

# cuDNN settings: turn off benchmark mode and request deterministic algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
import h5py
import numpy as np

# Read the whole train/val vision-embedding dataset from its HDF5 file into memory
with h5py.File('train_val_vision_embeddings.h5', 'r') as vision_file:
    image_embeddings = vision_file['train_val_vision_embeddings'][:]

# Same for the train/val audio embeddings
with h5py.File('train_val_audio_embeddings.h5', 'r') as audio_file:
    audio_embeddings = audio_file['train_val_audio_embeddings'][:]

# Labels are stored separately as a NumPy array
labels = np.load('train_val_labels_inputs.npy')

In [None]:
# Convert the loaded arrays to tensors: float32 for both embedding sets, int64 for the labels
image_embeddings, audio_embeddings = (
    torch.tensor(array, dtype=torch.float32)
    for array in (image_embeddings, audio_embeddings)
)
labels = torch.tensor(labels, dtype=torch.long)

In [None]:
# Bundle image embeddings, audio embeddings and labels so each item is an (image, audio, label) triple
dataset = torch.utils.data.TensorDataset(image_embeddings, audio_embeddings, labels)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=4,  # 4 is the value in use; per the author's earlier note, 32 and 16 were tried before
    shuffle=True,  # reshuffle the samples every epoch
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import math

# Dual Attention Mechanism with Feature Pyramid
class DualAttentionWithFeaturePyramid(nn.Module):
    """Fuse image and audio embeddings with self/cross attention plus a feature pyramid.

    Both modalities are projected to 512 dimensions by the same linear layer
    (``project_512``). Each 512-d stream passes through its own self-attention
    block, then the image stream attends to the audio stream (image = query,
    audio = key/value). The fused 512-d result is concatenated with the 256-d
    and 128-d levels of the *image* pyramid, giving 512 + 256 + 128 = 896
    output features. One ``LayerNorm`` instance is shared by all three
    residual connections.

    Layout: the attention modules are built with PyTorch's default
    ``batch_first=False``, so inputs are read as ``(seq_len, batch, embed_dim)``
    and attention mixes positions along dim 0. Callers must place the batch
    on dim 1.

    Args:
        embed_dim: Size of the incoming image/audio embeddings.
        num_heads: Number of attention heads; must divide 512.
    """

    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        # Linear layers to create the feature pyramid (embed_dim -> 512 -> 256 -> 128)
        self.project_512 = nn.Linear(embed_dim, 512)
        self.project_256 = nn.Linear(512, 256)
        self.project_128 = nn.Linear(256, 128)

        # All attention blocks operate on the 512-d pyramid level
        self.image_attn = nn.MultiheadAttention(512, num_heads)
        self.audio_attn = nn.MultiheadAttention(512, num_heads)
        self.cross_modal_attn = nn.MultiheadAttention(512, num_heads)
        self.norm = nn.LayerNorm(512)

    def forward(self, image_embeddings, audio_embeddings):
        """Return the fused pyramid features.

        Args:
            image_embeddings: Tensor of shape ``(seq_len, batch, embed_dim)``.
            audio_embeddings: Tensor of shape ``(seq_len, batch, embed_dim)``.

        Returns:
            Tensor of shape ``(seq_len, batch, 896)``: fused 512-d attention
            output followed by the 256-d and 128-d image pyramid levels.
        """
        # Feature pyramid for the image embeddings
        img_pyramid_512 = self.project_512(image_embeddings)
        img_pyramid_256 = self.project_256(img_pyramid_512)
        img_pyramid_128 = self.project_128(img_pyramid_256)

        # Only the 512-d audio level is consumed below, so the lower audio
        # levels (previously computed and discarded) are no longer evaluated.
        audio_pyramid_512 = self.project_512(audio_embeddings)

        # Self-attention on image embeddings (512-d) with residual + norm
        img_attn_output, _ = self.image_attn(img_pyramid_512, img_pyramid_512, img_pyramid_512)
        img_attn_output = self.norm(img_attn_output + img_pyramid_512)

        # Self-attention on audio embeddings (512-d) with residual + norm
        audio_attn_output, _ = self.audio_attn(audio_pyramid_512, audio_pyramid_512, audio_pyramid_512)
        audio_attn_output = self.norm(audio_attn_output + audio_pyramid_512)

        # Cross-attention: image queries attend over audio keys/values
        combined_attn_output, _ = self.cross_modal_attn(img_attn_output, audio_attn_output, audio_attn_output)
        combined_attn_output = self.norm(combined_attn_output + img_attn_output)

        # Concatenate fused 512-d features with the 256-d and 128-d image levels
        pyramid_combined = torch.cat([combined_attn_output, img_pyramid_256, img_pyramid_128], dim=-1)

        return pyramid_combined


# Transformer Classifier with Feature Pyramid
class TransformerClassifierWithFeaturePyramid(nn.Module):
    """Classify paired image/audio embeddings.

    Pipeline: dual-attention fusion with a feature pyramid -> transformer
    encoder -> pooling over the sequence axis -> LayerNorm -> linear classifier.

    Every attention/encoder module used here keeps PyTorch's default
    sequence-first layout ``(seq_len, batch, features)``. Each sample is
    therefore presented as a length-1 sequence on dim 0, which keeps samples
    of the same mini-batch independent of each other.

    Args:
        input_dim: Size of each image/audio embedding vector.
        num_classes: Number of output classes.
        num_heads: Attention heads for the fusion block and the encoder.
        num_layers: Number of transformer encoder layers.
        dim_feedforward: Hidden size of the encoder feed-forward sublayer.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, input_dim, num_classes, num_heads=8, num_layers=2, dim_feedforward=2048, dropout=0.4):
        super().__init__()
        self.dual_attention = DualAttentionWithFeaturePyramid(input_dim, num_heads)
        # Width of the concatenated pyramid features produced by the fusion block
        pyramid_dim = 512 + 256 + 128
        # Kept as an attribute so existing state_dict keys do not change;
        # nn.TransformerEncoder works on its own clones of this layer.
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=pyramid_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=num_layers)
        self.norm1 = nn.LayerNorm(pyramid_dim)
        self.fc = nn.Linear(pyramid_dim, num_classes)

    def forward(self, image_embeddings, audio_embeddings):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            image_embeddings: Tensor of shape ``(batch, input_dim)``.
            audio_embeddings: Tensor of shape ``(batch, input_dim)``.
        """
        # Add the sequence axis in FRONT: (batch, dim) -> (1, batch, dim).
        # The previous unsqueeze(1) produced (batch, 1, dim), which the
        # sequence-first modules read as one sequence of `batch` tokens, so
        # samples attended to each other and predictions depended on which
        # samples shared a mini-batch.
        # NOTE(review): with one token per sample the attention weights are
        # trivially 1; previously recorded losses/metrics will change.
        combined_embeddings = self.dual_attention(image_embeddings.unsqueeze(0), audio_embeddings.unsqueeze(0))

        # Pass through transformer encoder (still sequence-first)
        combined_embeddings = self.transformer_encoder(combined_embeddings)

        # Pool over the sequence axis (dim 0), then normalize
        combined_embeddings = self.norm1(combined_embeddings.mean(dim=0))

        # Final classification layer
        x = self.fc(combined_embeddings)

        return x

# Embedding width is read from the image tensor; the audio embeddings are assumed to share it
input_dim = image_embeddings.shape[1]
num_classes = 13  # number of target classes
model = TransformerClassifierWithFeaturePyramid(
    input_dim=input_dim,
    num_classes=num_classes,
)



In [None]:
import sys

# Loss and optimizer for the classifier
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=1e-5)

# Train for a fixed number of epochs, reporting the mean batch loss after each one
num_epochs = 20
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch_count, (image_embeds, audio_embeds, targets) in enumerate(dataloader, start=1):
        # Carriage return keeps the progress counter on a single console line
        sys.stdout.write(f"\rBatch {batch_count}/{len(dataloader)}")
        sys.stdout.flush()

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(image_embeds, audio_embeds)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")


Batch 1003/1003Epoch 1, Average Loss: 2.28280349517035
Batch 1003/1003Epoch 2, Average Loss: 1.878329017167077
Batch 1003/1003Epoch 3, Average Loss: 1.49750339155896
Batch 1003/1003Epoch 4, Average Loss: 1.1704496658692212
Batch 1003/1003Epoch 5, Average Loss: 0.9390005184521347
Batch 1003/1003Epoch 6, Average Loss: 0.7807980781998616
Batch 1003/1003Epoch 7, Average Loss: 0.6796127285693186
Batch 1003/1003Epoch 8, Average Loss: 0.6003643114988879
Batch 1003/1003Epoch 9, Average Loss: 0.5478417207695848
Batch 1003/1003Epoch 10, Average Loss: 0.5024964632587146
Batch 1003/1003Epoch 11, Average Loss: 0.46603669464432174
Batch 1003/1003Epoch 12, Average Loss: 0.43203401896110916
Batch 1003/1003Epoch 13, Average Loss: 0.4148990849751686
Batch 1003/1003Epoch 14, Average Loss: 0.4006680487483324
Batch 1003/1003Epoch 15, Average Loss: 0.37289680306106093
Batch 1003/1003Epoch 16, Average Loss: 0.3609054411704822
Batch 1003/1003Epoch 17, Average Loss: 0.3454882942608085
Batch 1003/1003Epoch 18, 

# **TESTING**

In [None]:
# Read the whole test vision-embedding dataset from its HDF5 file into memory
with h5py.File('test_vision_embeddings.h5', 'r') as vision_file:
    test_image_embeddings = vision_file['test_vision_embeddings'][:]

# Same for the test audio embeddings
with h5py.File('test_audio_embeddings.h5', 'r') as audio_file:
    test_audio_embeddings = audio_file['test_audio_embeddings'][:]

# Test labels are stored separately as a NumPy array
test_labels = np.load('test_labels_inputs.npy')

In [None]:
# test_image_embeddings = l2_normalize(test_image_embeddings)
# test_audio_embeddings = l2_normalize(test_audio_embeddings)

In [None]:
import torch

# Wrap the test arrays as tensors: float32 for both embedding sets, int64 for the labels
test_image_embeddings, test_audio_embeddings = (
    torch.tensor(array, dtype=torch.float32)
    for array in (test_image_embeddings, test_audio_embeddings)
)
test_labels = torch.tensor(test_labels, dtype=torch.long)

In [None]:
# Test items are (image, audio, label) triples, served in their stored order
test_dataset = torch.utils.data.TensorDataset(test_image_embeddings, test_audio_embeddings, test_labels)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,  # keep a fixed order for evaluation
)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Set model to evaluation mode (turns off dropout in the transformer layers)
model.eval()

# Collect predictions and ground-truth labels over the whole test set
all_preds = []
all_labels = []

# No gradient calculation needed during evaluation
with torch.no_grad():
    # The loop variable is `batch_labels`, not `labels`: reusing `labels` here
    # overwrote the training-label tensor defined earlier in the notebook and
    # broke any later re-run of the cell that builds the training dataset.
    for image_embeds, audio_embeds, batch_labels in test_dataloader:
        # Forward pass
        outputs = model(image_embeds, audio_embeds)

        # Predicted class = index of the largest logit
        _, preds = torch.max(outputs, 1)

        # Append predictions and true labels to lists
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# Convert lists to numpy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Support-weighted precision, recall and F1 across classes
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Precision: 0.9343
Recall: 0.9313
F1 Score: 0.9276
