In [126]:
%cd /content/drive/MyDrive/Aerial_Scene_Recognition/

/content/drive/MyDrive/Aerial_Scene_Recognition


In [127]:
# !pip install -r requirements.txt --quiet

In [128]:
import os
# Record the process working directory as it stands when this cell runs (after the
# %cd at the top). The name is not referenced again in the cells shown here.
current_directory = os.getcwd()

In [129]:
import numpy as np
# Load the saved label array and convert it to a plain Python list.
# NOTE(review): presumably entry i is the label for row i of the embedding files
# loaded below - confirm against whatever produced these files.
label_list = np.load('train_val_labels_inputs.npy').tolist()

In [130]:
import h5py

In [131]:
import numpy as np
file_path = 'train_val_vision_embeddings.h5'

# Open the HDF5 file read-only; the `with` block closes it on exit
with h5py.File(file_path, 'r') as h5f:
    # Handle to the dataset stored under this key (not yet a NumPy array)
    dataset = h5f['train_val_vision_embeddings']

    # Copy the dataset into an in-memory NumPy array while the file is still open
    vision_embeddings = np.array(dataset)

print("Embeddings loaded from 'train_val_vision_embeddings.h5'")
print(vision_embeddings.shape)  # the recorded run printed (4012, 1024)

Embeddings loaded from 'train_val_vision_embeddings.h5'
(4012, 1024)


In [132]:
file_path = 'train_val_audio_embeddings.h5'

# Open the HDF5 file read-only; the `with` block closes it on exit
with h5py.File(file_path, 'r') as h5f:
    # Handle to the dataset stored under this key (not yet a NumPy array)
    dataset = h5f['train_val_audio_embeddings']

    # Copy the dataset into an in-memory NumPy array while the file is still open
    audio_embeddings = np.array(dataset)

print("Embeddings loaded from 'train_val_audio_embeddings.h5'")
print(audio_embeddings.shape)  # the recorded run printed (4012, 1024)

Embeddings loaded from 'train_val_audio_embeddings.h5'
(4012, 1024)


In [133]:
def l2_normalize(arr, axis=None):
    """Scale ``arr`` to unit L2 norm.

    Parameters
    ----------
    arr : array-like
        Values to normalise.
    axis : int or None, optional
        Axis along which the norm is taken. With the default ``None`` the
        norm of the whole (flattened) array is used, which is what the
        one-argument form has always done. Pass ``axis=1`` to normalise each
        row of a 2-D array independently.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr``. Wherever the norm is exactly
        zero the values are returned unscaled (i.e. all zeros) instead of the
        NaN/inf that a division by zero would produce.
    """
    values = np.asarray(arr)
    # keepdims=True keeps the norm broadcastable against `values` for any axis.
    norm = np.linalg.norm(values, axis=axis, keepdims=True)
    # Swap zero norms for 1 (in the norm's own dtype) so all-zero input stays
    # all-zero rather than turning into NaN.
    safe_norm = np.where(norm == 0, norm.dtype.type(1), norm)
    normalized_arr = values / safe_norm
    return normalized_arr

In [134]:
# Fuse the two modalities: each sample becomes its L2-normalised vision vector
# followed by its L2-normalised audio vector, and is paired with the label at
# the same index.
sample_count = vision_embeddings.shape[0]
X = [
    np.concatenate((l2_normalize(vision_embeddings[idx]), l2_normalize(audio_embeddings[idx])))
    for idx in range(sample_count)
]
y = [label_list[idx] for idx in range(sample_count)]

In [135]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for validation. The fixed random_state makes the
# shuffle repeatable. NOTE(review): no `stratify` argument is passed, so class
# proportions may differ between the two splits.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=43)

In [136]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then push both
# splits through that same fitted scaler.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_val = sc.transform(X_val)

In [137]:
from sklearn.decomposition import PCA

# Reduce the concatenated features to 1536 principal components, fitting on
# the training split only and reusing the fitted projection for validation.
#
# random_state is pinned because, per the documented svd_solver='auto' policy,
# data larger than 500x500 with n_components below 80% of the smaller dimension
# is handled by the randomized solver. That is expected to apply here (about
# 90% of 4012 rows, 1024 + 1024 features, 1536 components), and without a seed
# the components - and every metric computed downstream - can differ from one
# run to the next.
pca = PCA(n_components=1536, random_state=43)
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)

In [138]:
# Rescale every projected sample to unit L2 norm, in place, for both the
# training and the validation matrix.
for split in (X_train, X_val):
    for row_idx in range(split.shape[0]):
        split[row_idx] = l2_normalize(split[row_idx])

In [139]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel, fitted on the training split.
svm_classifier = SVC(kernel='linear')  # author's note lists 'rbf' as another kernel option
# Left as the last expression so the notebook displays the fitted estimator.
svm_classifier.fit(X_train, y_train)

In [140]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Predict on the validation split with the fitted SVM.
y_pred = svm_classifier.predict(X_val)

# Precision, recall and F1 use average='weighted'; the fourth returned value
# (support) is discarded.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val,
    y_pred,
    average='weighted',
)
accuracy = accuracy_score(y_val, y_pred)

# One metric per line, as percentages with two decimals.
print(
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision * 100:.2f}%",
    f"Recall: {recall * 100:.2f}%",
    f"F1 Score: {f1 * 100:.2f}%",
    sep="\n",
)

Accuracy: 96.52%
Precision: 96.54%
Recall: 96.52%
F1 Score: 96.45%


# **Testing**

In [141]:
# Load the saved test label array and convert it to a plain Python list.
# NOTE(review): presumably entry i is the label for row i of the test embedding
# files loaded below - confirm.
test_label_list = np.load('test_labels_inputs.npy').tolist()

In [142]:
file_path = 'test_vision_embeddings.h5'

# Open the HDF5 file read-only; the `with` block closes it on exit
with h5py.File(file_path, 'r') as h5f:
    # Handle to the dataset stored under this key (not yet a NumPy array)
    dataset = h5f['test_vision_embeddings']

    # Copy the dataset into memory. This rebinds `vision_embeddings`, so the
    # train/val array loaded earlier under the same name is no longer reachable.
    vision_embeddings = np.array(dataset)

print("Embeddings loaded from 'test_vision_embeddings.h5'")
print(vision_embeddings.shape)  # the recorded run printed (1063, 1024)

Embeddings loaded from 'test_vision_embeddings.h5'
(1063, 1024)


In [143]:
file_path = 'test_audio_embeddings.h5'

# Open the HDF5 file read-only; the `with` block closes it on exit
with h5py.File(file_path, 'r') as h5f:
    # Handle to the dataset stored under this key (not yet a NumPy array)
    dataset = h5f['test_audio_embeddings']

    # Copy the dataset into memory. This rebinds `audio_embeddings`, so the
    # train/val array loaded earlier under the same name is no longer reachable.
    audio_embeddings = np.array(dataset)

print("Embeddings loaded from 'test_audio_embeddings.h5'")
print(audio_embeddings.shape)  # the recorded run printed (1063, 1024)

Embeddings loaded from 'test_audio_embeddings.h5'
(1063, 1024)


In [144]:
# Build the test features the same way as the train/val ones: L2-normalised
# vision vector followed by L2-normalised audio vector, paired with the test
# label at the same index.
sample_count = vision_embeddings.shape[0]
X = [
    np.concatenate((l2_normalize(vision_embeddings[idx]), l2_normalize(audio_embeddings[idx])))
    for idx in range(sample_count)
]
y = [test_label_list[idx] for idx in range(sample_count)]

In [145]:
# Apply the already-fitted scaler and PCA projection (no refitting on test
# data), then rescale each projected sample to unit L2 norm in place.
X = pca.transform(sc.transform(X))
for row_idx in range(X.shape[0]):
    X[row_idx] = l2_normalize(X[row_idx])

In [146]:
# Predict on the transformed test features with the fitted SVM.
y_pred = svm_classifier.predict(X)

# Precision, recall and F1 use average='weighted'; the fourth returned value
# (support) is discarded.
precision, recall, f1, _ = precision_recall_fscore_support(
    y,
    y_pred,
    average='weighted',
)
accuracy = accuracy_score(y, y_pred)

# One metric per line, as percentages with two decimals.
print(
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision * 100:.2f}%",
    f"Recall: {recall * 100:.2f}%",
    f"F1 Score: {f1 * 100:.2f}%",
    sep="\n",
)

Accuracy: 85.23%
Precision: 85.29%
Recall: 85.23%
F1 Score: 84.72%
