## Imports

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc, roc_auc_score, classification_report, confusion_matrix
import re
from collections import defaultdict
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from xgboost import XGBClassifier
from sklearn.preprocessing import label_binarize
import os
from tqdm import tqdm
import shap
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from collections import Counter
import zipfile
import gdown
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from datasets import TestDataset
from utils import apply_smote, find_knn_per_class


ImportError: cannot import name 'find_knn_per_class' from 'utils' (C:\Users\Admin\Documents\Classification_of_Black_Sea_Harbour_Porpoise_Acoustic_Signals\utils.py)

## Data loading and preprocessing

In [7]:
# Load the labeled test samples through the project's TestDataset wrapper.
dataset = TestDataset("data_files/new click trains (Bulgaria)")
test_x, test_y, test_meta = dataset.get_labeled()

print(f"Test samples: {len(test_x)}")

# np.load on an .npz archive returns a lazily-read NpzFile that holds the file
# open; read every array inside a context manager so the handle is released.
# Indexing materialises each array in memory, so they stay valid after close.
# NOTE(review): allow_pickle=True unpickles object arrays, which can execute
# arbitrary code - only load this archive from a trusted source.
with np.load("data_files/full_acoustic_dataset.npz", allow_pickle=True) as data:
    labeled_x = data['labeled_x']
    labeled_y = data['labeled_y']
    unlabeled_x = data['unlabeled_x']
    unlabeled_meta = data['unlabeled_meta']
    labeled_meta = data['labeled_meta']

# Class balance over the labeled pool and the test labels combined; the header
# says so explicitly because this is not the test set alone.
labeled = np.concatenate((labeled_y, test_y), axis=0)
distribution = Counter(labeled)
total = len(labeled)
print("\nLabeled + Test Distribution:")
for label, count in distribution.items():
    pct = (count / total) * 100
    print(f"Class {label}: {count} ({pct:.2f}%)")

Test samples: 64

Test Distribution:
Class 0: 195 (34.51%)
Class 1: 370 (65.49%)


## Upsampling

In [None]:
# Per-class target size: each class requests (target - current labeled count)
# extra samples from find_knn_per_class.
n_to_add_per_class = 3000

all_new_x, all_new_y = [], []
for cls in np.unique(labeled_y):
    # Number of labeled samples that already belong to this class.
    n_existing = np.count_nonzero(labeled_y == cls)

    # The helper also hands back the unlabeled pool and its metadata, which
    # replace the current ones before the next class is processed.
    # NOTE(review): presumably the returned pool excludes the samples just
    # selected - confirm against utils.find_knn_per_class.
    results, new_x, new_y, unlabeled_x, unlabeled_meta = find_knn_per_class(
        labeled_x,
        labeled_y,
        unlabeled_x,
        unlabeled_meta,
        target_class=cls,
        n_to_add=n_to_add_per_class - n_existing,
        distance_threshold=2000,
    )
    all_new_x.append(new_x)
    all_new_y.append(new_y)
    print(f"Class {cls}: added {len(new_y)} new samples.")

# Stack the per-class additions into a single pseudo-labeled set.
pseudo_x = np.vstack(all_new_x)
pseudo_y = np.concatenate(all_new_y)
print("Final up-sampled size:", pseudo_x.shape)
print("Remaining unlabeled:", unlabeled_x.shape)

# Merge the original labeled data with the pseudo-labeled additions, then
# pass the result through the project's apply_smote helper.
X_resampled = np.concatenate([labeled_x, pseudo_x])
y_resampled = np.concatenate([labeled_y, pseudo_y])
print("Original class distribution:", Counter(y_resampled))

X_resampled, y_resampled = apply_smote(X_resampled, y_resampled)
print("Resampled class distribution:", Counter(y_resampled))

## Data preprocessing

In [None]:
def _print_distribution(title, labels):
    """Print the count and percentage of each class found in ``labels``.

    Args:
        title: Split name shown in the header line, e.g. ``"Train"``.
        labels: 1-D sequence of hashable class labels.
    """
    counts = Counter(labels)
    total = len(labels)
    print(f"\n{title} Distribution:")
    for label, count in counts.items():
        pct = (count / total) * 100
        print(f"Class {label}: {count} ({pct:.2f}%)")


# NOTE: this cell overwrites test_x / test_y / test_meta in place, so running
# it twice without reloading the data scales and subsamples the test set again.

# Standardise features using statistics of the resampled training pool and
# apply the same transform to the test features.
# NOTE(review): the scaler is fitted before the train/validation split, so the
# validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_all = scaler.fit_transform(X_resampled)
test_x = scaler.transform(test_x)

# Deterministic shuffle of the pool before splitting.
indices = np.arange(X_all.shape[0])
rng = np.random.default_rng(seed=42)
rng.shuffle(indices)

X_all = X_all[indices]
y_all = y_resampled[indices]

train_x, val_x, train_y, val_y = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

# Balance the test set by down-sampling to the size of the smaller class.
class0_indices = np.where(test_y == 0)[0]
class1_indices = np.where(test_y == 1)[0]
n_per_class = min(len(class0_indices), len(class1_indices))

np.random.seed(42)
# Class 1 is drawn first so that, whenever class 1 is at least as large as
# class 0, the random sequence matches the previous class-1-only sampling.
selected_class1_indices = np.random.choice(class1_indices, size=n_per_class, replace=False)
selected_class0_indices = class0_indices
if len(class0_indices) > n_per_class:
    # Class 0 is the majority: sampling without replacement from the smaller
    # class 1 would raise ValueError, so down-sample class 0 instead.
    selected_class0_indices = np.random.choice(class0_indices, size=n_per_class, replace=False)

balanced_indices = np.concatenate([selected_class0_indices, selected_class1_indices])
np.random.shuffle(balanced_indices)

# Integer-array indexing; assumes test_meta is a NumPy array - TODO confirm.
test_x = test_x[balanced_indices]
test_y = test_y[balanced_indices]
test_meta = test_meta[balanced_indices]

_print_distribution("Train", train_y)
_print_distribution("Validation", val_y)
_print_distribution("Test", test_y)
    

# Model training

## Random Forest

In [None]:
# Random Forest hyper-parameters, gathered in one place for readability.
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "class_weight": 'balanced',
    "bootstrap": True,
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(train_x, train_y)

# Class probabilities and hard predictions on the balanced test set.
probs = rf_model.predict_proba(test_x)
preds = rf_model.predict(test_x)

print("Evaluation:")
rf_accuracy = accuracy_score(test_y, preds)
print(f"Accuracy: {rf_accuracy:.4f}")
rf_f1_macro = f1_score(test_y, preds, average='macro')
print(f"F1 Score (macro): {rf_f1_macro:.4f}")
rf_f1_weighted = f1_score(test_y, preds, average='weighted')
print(f"F1 Score (weighted): {rf_f1_weighted:.4f}")

print("\nPer-Class Classification Report:")
rf_report = classification_report(test_y, preds, digits=4)
print(rf_report)

print("Confusion Matrix:")
rf_confusion = confusion_matrix(test_y, preds)
print(rf_confusion)

## XGBoost

In [None]:
# XGBoost hyper-parameters, gathered in one place for readability.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "learning_rate": 0.005,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "eval_metric": 'logloss',
}
xgb_model = XGBClassifier(**xgb_params)

xgb_model.fit(train_x, train_y)

# Class probabilities and hard predictions on the balanced test set.
probs = xgb_model.predict_proba(test_x)
preds = xgb_model.predict(test_x)

print("Evaluation:")
xgb_accuracy = accuracy_score(test_y, preds)
print(f"Accuracy: {xgb_accuracy:.4f}")
xgb_f1_macro = f1_score(test_y, preds, average='macro')
print(f"F1 Score (macro): {xgb_f1_macro:.4f}")
xgb_f1_weighted = f1_score(test_y, preds, average='weighted')
print(f"F1 Score (weighted): {xgb_f1_weighted:.4f}")

print("\nPer-Class Classification Report:")
xgb_report = classification_report(test_y, preds, digits=4)
print(xgb_report)

print("Confusion Matrix:")
xgb_confusion = confusion_matrix(test_y, preds)
print(xgb_confusion)