In [1146]:
# Switch the kernel's working directory to the project folder so the relative
# paths used by the later cells (e.g. 'train_val_labels_inputs.npy') resolve.
# NOTE(review): absolute, machine-specific path (presumably a mounted Google Drive in Colab) - confirm before running elsewhere.
%cd /content/drive/MyDrive/Aerial_Scene_Recognition/

/content/drive/MyDrive/Aerial_Scene_Recognition


In [1147]:
# ! pip install pytorch-lightning --quiet

In [1148]:
from pytorch_lightning import seed_everything
# Seed the global random number generators once, up front. 43 is the same value
# passed as random_state to train_test_split further down.
# The call returns the seed, which is why the cell displays 43.
seed_everything(43, workers=True)

INFO:lightning_fabric.utilities.seed:Seed set to 43


43

In [1149]:
import os
# Record the working directory at this point in the run.
# NOTE(review): current_directory is not referenced by any later cell visible here - confirm it is still needed.
current_directory = os.getcwd()

In [1150]:
import numpy as np
# Labels for the train/validation pool, read from disk and converted to a plain Python list.
train_val_labels = np.load('train_val_labels_inputs.npy')
label_list = train_val_labels.tolist()

In [1151]:
import h5py

In [1152]:
import numpy as np
# HDF5 file holding the vision embeddings of the train/validation pool.
file_path = 'ClassificationAfterFinetune/train_val_vision_embeddings.h5'

# The dataset key inside the file repeats the relative path without the extension.
with h5py.File(file_path, 'r') as h5f:
    dataset = h5f['ClassificationAfterFinetune/train_val_vision_embeddings']
    # Read the full dataset into an in-memory ndarray while the file is still open.
    vision_embeddings = dataset[...]

print("Embeddings loaded from 'train_val_vision_embeddings.h5'")
print(vision_embeddings.shape)  # (n_samples, embedding_dim)

Embeddings loaded from 'train_val_vision_embeddings.h5'
(4012, 1024)


In [1153]:
# HDF5 file holding the audio embeddings of the train/validation pool.
file_path = 'ClassificationAfterFinetune/train_val_audio_embeddings.h5'

# The dataset key inside the file repeats the relative path without the extension.
with h5py.File(file_path, 'r') as h5f:
    dataset = h5f['ClassificationAfterFinetune/train_val_audio_embeddings']
    # Read the full dataset into an in-memory ndarray while the file is still open.
    audio_embeddings = dataset[...]

print("Embeddings loaded from 'train_val_audio_embeddings.h5'")
print(audio_embeddings.shape)  # (n_samples, embedding_dim)

Embeddings loaded from 'train_val_audio_embeddings.h5'
(4012, 1024)


In [1154]:
# Pair the two modalities sample by sample: X[i] has shape (2, embedding_dim),
# with X[i][0] the vision embedding and X[i][1] the audio embedding of sample i.
# (They are stacked, not concatenated, so the modalities can be separated again
# after the train/validation split.)
n_samples = vision_embeddings.shape[0]

# Row i of both matrices and label i must describe the same sample; fail loudly
# on a mismatch instead of pairing unrelated rows or hitting a bare IndexError.
if audio_embeddings.shape != vision_embeddings.shape:
    raise ValueError(
        f"Vision embeddings {vision_embeddings.shape} and audio embeddings "
        f"{audio_embeddings.shape} must have identical shapes"
    )
if len(label_list) < n_samples:
    raise ValueError(
        f"Only {len(label_list)} labels available for {n_samples} embeddings"
    )

# One vectorized stack replaces the per-row Python loop.
X = np.stack((vision_embeddings, audio_embeddings), axis=1)
# Keep exactly one label per embedding row, as the original loop did.
y = label_list[:n_samples]

In [1155]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the pool for validation; random_state pins the shuffle.
split_parts = train_test_split(X, y, test_size=0.1, random_state=43)
X_train, X_val, y_train, y_val = split_parts

In [1156]:
# Make sure all four splits are NumPy arrays; later cells rely on .shape and array indexing.
X_train, X_val, y_train, y_val = (
    np.array(split_part) for split_part in (X_train, X_val, y_train, y_val)
)

In [1157]:
# Each X_train entry holds both modalities along its first axis
# (index 0 = vision, index 1 = audio); pull them apart into per-modality lists.
n_train_rows = X_train.shape[0]
vision_emb_train_only = [X_train[row][0] for row in range(n_train_rows)]
audio_emb_train_only = [X_train[row][1] for row in range(n_train_rows)]

In [1158]:
# Same separation for the validation split (index 0 = vision, index 1 = audio).
n_val_rows = X_val.shape[0]
vision_emb_val_only = [X_val[row][0] for row in range(n_val_rows)]
audio_emb_val_only = [X_val[row][1] for row in range(n_val_rows)]

In [1159]:
# Turn the per-modality lists into 2-D arrays (one row per sample).
vision_emb_train_only, audio_emb_train_only = (
    np.array(vision_emb_train_only),
    np.array(audio_emb_train_only),
)
vision_emb_val_only, audio_emb_val_only = (
    np.array(vision_emb_val_only),
    np.array(audio_emb_val_only),
)

In [1160]:
# With no axis given, np.var pools every entry of the array into a single variance.
# Displayed as a reference scale for the vision VarianceThreshold cutoff chosen in the next cell.
np.var(vision_emb_train_only)

0.00097654416

In [1161]:
# With no axis given, np.var pools every entry of the array into a single variance.
# Displayed as a reference scale for the audio VarianceThreshold cutoff chosen in the next cell.
np.var(audio_emb_train_only)

0.3052579

In [1162]:
from sklearn.feature_selection import VarianceThreshold
#####################################################################################################################
# Per-modality variance cutoffs: features whose training-set variance does not
# exceed the cutoff are removed. The two modalities live on very different
# scales (see the pooled variances displayed above), hence separate values.
vision_variance_cutoff = 0.00099
audio_variance_cutoff = 0.4

var_thr_vision = VarianceThreshold(threshold=vision_variance_cutoff)
var_thr_audio = VarianceThreshold(threshold=audio_variance_cutoff)

In [1163]:
# Learn which features to keep from the training split only, then apply the
# same column mask to both training and validation data (no leakage from val).
var_thr_vision.fit(vision_emb_train_only)
vision_emb_train_only = var_thr_vision.transform(vision_emb_train_only)
vision_emb_val_only = var_thr_vision.transform(vision_emb_val_only)

var_thr_audio.fit(audio_emb_train_only)
audio_emb_train_only = var_thr_audio.transform(audio_emb_train_only)
audio_emb_val_only = var_thr_audio.transform(audio_emb_val_only)

In [1164]:
def l2_normalize(x, axis=None):
    """Scale ``x`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    x : array_like
        Vector (or array) to normalize.
    axis : int or None, optional
        Axis along which the norm is taken. The default ``None`` uses the norm
        of the whole array, which for a 1-D vector is its ordinary length.
        Pass ``axis=1`` to normalize every row of a 2-D matrix independently.

    Returns
    -------
    numpy.ndarray
        ``x`` divided by its L2 norm. Slices whose norm is zero are returned
        unchanged (all zeros) instead of producing NaN from a 0/0 division.
    """
    x = np.asarray(x)
    norm = np.linalg.norm(x, axis=axis, keepdims=True)
    # Replace zero norms by 1 (in the norm's own dtype) so an all-zero vector
    # stays all-zero rather than turning into NaNs that poison later steps.
    safe_norm = np.where(norm > 0, norm, norm.dtype.type(1))
    return x / safe_norm

In [1165]:
# Fuse the modalities per sample: L2-normalize the vision part and the audio
# part separately, then join them into one feature vector.
n_train_rows = X_train.shape[0]
X_train = np.array([
    np.concatenate((
        l2_normalize(vision_emb_train_only[row]),
        l2_normalize(audio_emb_train_only[row]),
    ))
    for row in range(n_train_rows)
])

# Identical treatment for the validation split.
n_val_rows = X_val.shape[0]
X_val = np.array([
    np.concatenate((
        l2_normalize(vision_emb_val_only[row]),
        l2_normalize(audio_emb_val_only[row]),
    ))
    for row in range(n_val_rows)
])

In [1166]:
from sklearn.preprocessing import StandardScaler
# Estimate the per-feature mean/std on the training split only, then apply
# the same standardization to training and validation features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [1167]:
# Width of the fused (vision + audio) feature matrix after variance filtering.
n_features_after_selection = X_train.shape[1]
print("After variance threshold, number of features = ",n_features_after_selection)

After variance threshold, number of features =  434


In [1168]:
from sklearn.decomposition import PCA
#####################################################################################################################
# Keep 20% as many principal components as there are input features.
PCA_KEEP_FRACTION = 0.2
n_pca_components = int(X_train.shape[1] * PCA_KEEP_FRACTION)

# With svd_solver left at 'auto', scikit-learn may choose the randomized SVD for
# a matrix of this size. Pin random_state (same seed as the train/val split) so
# the projection does not depend on the global RNG state or on how many times
# cells were re-run.
pca = PCA(n_components=n_pca_components, random_state=43)

# Fit on the training split only; validation data is just projected.
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)

In [1169]:
from sklearn.svm import SVC
# RBF-kernel SVM with every other hyperparameter left at the scikit-learn
# default, trained on the PCA-projected training features.
svm_classifier = SVC(kernel='rbf')  # RBF kernel
# Last expression of the cell, so the notebook displays the fitted estimator.
svm_classifier.fit(X_train, y_train)

In [1170]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Score the held-out validation split.
y_pred = svm_classifier.predict(X_val)

# Class-frequency-weighted averages; the trailing support entry is not needed.
weighted_scores = precision_recall_fscore_support(y_val, y_pred, average='weighted')
precision, recall, f1 = weighted_scores[:3]
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

Accuracy: 97.51%
Precision: 97.64%
Recall: 97.51%
F1 Score: 97.49%


# **Testing**

In [1171]:
# Labels for the held-out test set, read from disk and converted to a plain Python list.
test_labels = np.load('test_labels_inputs.npy')
test_label_list = test_labels.tolist()

In [1172]:
# HDF5 file holding the vision embeddings of the test set.
file_path = 'ClassificationAfterFinetune/test_vision_embeddings.h5'

# Unlike the train/val files, the dataset key here carries no directory prefix.
with h5py.File(file_path, 'r') as h5f:
    dataset = h5f['test_vision_embeddings']
    # Read the full dataset into an in-memory ndarray while the file is still open.
    # Note: this rebinds vision_embeddings, replacing the train/val array.
    vision_embeddings = dataset[...]

print("Embeddings loaded from 'test_vision_embeddings.h5'")
print(vision_embeddings.shape)

Embeddings loaded from 'test_vision_embeddings.h5'
(1063, 1024)


In [1173]:
# HDF5 file holding the audio embeddings of the test set.
file_path = 'ClassificationAfterFinetune/test_audio_embeddings.h5'

# Unlike the train/val files, the dataset key here carries no directory prefix.
with h5py.File(file_path, 'r') as h5f:
    dataset = h5f['test_audio_embeddings']
    # Read the full dataset into an in-memory ndarray while the file is still open.
    # Note: this rebinds audio_embeddings, replacing the train/val array.
    audio_embeddings = dataset[...]

print("Embeddings loaded from 'test_audio_embeddings.h5'")
print(audio_embeddings.shape)

Embeddings loaded from 'test_audio_embeddings.h5'
(1063, 1024)


In [1174]:
# Apply the feature masks learned on the training split to the test embeddings.
# The names are rebound to the reduced arrays, so run this cell only once per load.
vision_embeddings, audio_embeddings = (
    var_thr_vision.transform(vision_embeddings),
    var_thr_audio.transform(audio_embeddings),
)

In [1175]:
# Fuse the test modalities exactly like the training data: L2-normalize each
# modality per sample, then join the two parts into one feature vector.
n_test_rows = vision_embeddings.shape[0]
X = [
    np.concatenate((
        l2_normalize(vision_embeddings[row]),
        l2_normalize(audio_embeddings[row]),
    ))
    for row in range(n_test_rows)
]
# One label per test sample, taken in the same order as the embeddings.
y = [test_label_list[row] for row in range(n_test_rows)]

In [1176]:
# Standardize the test features with the scaler fitted on the training split (no refit).
X= scaler.transform(X)

In [1177]:
# Project the test features onto the principal components fitted on the training split (no refit).
X = pca.transform(X)

In [1178]:
# Score the held-out test set with the classifier trained above.
y_pred = svm_classifier.predict(X)

# Class-frequency-weighted averages; the trailing support entry is not needed.
weighted_scores = precision_recall_fscore_support(y, y_pred, average='weighted')
precision, recall, f1 = weighted_scores[:3]
accuracy = accuracy_score(y, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

Accuracy: 86.64%
Precision: 86.73%
Recall: 86.64%
F1 Score: 86.09%
