## Reqirements

In [1]:
!pip install -r requirements.txt




[notice] A new release of pip is available: 24.0 -> 25.1
[notice] To update, run: C:\Users\Admin\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


## Imports

In [2]:
import joblib
import torch
import numpy as np
from models import predict_ae, predict_proba_ae
from datasets import AcousticDataset
from sklearn.ensemble import IsolationForest
from collections import Counter, defaultdict
from models import AutoencoderClassifier

## Data and model loading

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = joblib.load("save/scaler.pkl")
train_dataset = AcousticDataset(
    "Train_details/B Balchik 2020 11 14 FPOD_6288 file0 PART 79d 23h 19m train details")

unlabeled_x, unlabeled_meta = train_dataset.get_unlabeled()
model = AutoencoderClassifier(unlabeled_x.shape[1], dropout_rate=0.2).to(device)
model.load_state_dict(torch.load("save/final_ae_model.pth"))
model.eval()


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL models.AutoencoderClassifier was not an allowed global by default. Please use `torch.serialization.add_safe_globals([models.AutoencoderClassifier])` or the `torch.serialization.safe_globals([models.AutoencoderClassifier])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

## Filtering

Filters the unlabeled data using the Isolation Forest, to keep only the entries which are promising to belong to one of the classes.

In [None]:
# Define feature columns to be used for filtering unlabeled data.
FEATURE_COLUMNS = [
    'ClksThisMin', 'medianKHz', 'avSPL', 'avPkAt',
    'AvPRF', 'avEndF', 'tWUTrisk', 'nActualClx', 'nRisingIPIs',
    'TrDur_us', 'nICIrising', 'MinICI_us', 'midpointICI', 'MaxICI_us',
    'ClkNofMinICI', 'ClkNofMaxICI', 'NofClstrs', 'avClstrNx8', 'avPkIPI',
    'BeforeIPIratio', 'PreIPIratio', 'Post1IPIratio', 'Post2IPIratio', 'EndIPIratio'
]

def filter_unlabeled_with_isolation_forest(unlabeled_features, unlabeled_meta, contamination=0.08, random_state=42):
    """
    Filters out likely outliers from unlabeled data using Isolation Forest.

    Args:
        unlabeled_features (np.ndarray): Unlabeled feature matrix (samples x features).
        unlabeled_meta (np.ndarray): Corresponding metadata for each sample.
        contamination (float): Expected proportion of outliers in the data.
        random_state (int): Seed for reproducibility.

    Returns:
        Tuple[np.ndarray, np.ndarray]: Filtered feature array and corresponding metadata.
    """
    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
    outlier_flags = iso_forest.fit_predict(unlabeled_features)

    # Keep only samples predicted as outliers (-1)
    is_outlier = outlier_flags == -1
    filtered_features = unlabeled_features[is_outlier]
    filtered_meta = unlabeled_meta[is_outlier]

    return filtered_features, filtered_meta

filtered_unlabeled, filtered_meta = filter_unlabeled_with_isolation_forest(unlabeled_x, unlabeled_meta)

## Classification

Applies the trained classification.

In [None]:
# === Predict on full unlabeled set ===
filtered_unlabeled = scaler.transform(filtered_unlabeled)
unlabeled_probs = predict_proba_ae(model, filtered_unlabeled, device=device)
unlabeled_preds = predict_ae(model, filtered_unlabeled, device=device)

confidence_threshold = 0.95

# Generate final predictions and mask for confident samples
final_unlabeled_preds = []
final_unlabeled_meta = []
for pred, prob, meta in zip(unlabeled_preds, unlabeled_probs, filtered_meta):
    if max(prob) >= confidence_threshold:
        final_unlabeled_preds.append(pred)
        final_unlabeled_meta.append(meta)
    else:
        final_unlabeled_preds.append(2)  # Assign to class 2 (noise)
        final_unlabeled_meta.append(meta)

final_unlabeled_preds = np.array(final_unlabeled_preds)
final_unlabeled_meta = np.array(final_unlabeled_meta)

# === Print stats ===
distribution = Counter(final_unlabeled_preds)
total = len(final_unlabeled_preds)
print("\nFinal Unlabeled Predictions Distribution:")
for label, count in distribution.items():
    pct = (count / total) * 100
    print(f"Class {label}: {count} ({pct:.2f}%)")
print(len(final_unlabeled_preds), len(final_unlabeled_meta))

## Postprocessing

Applies the time-based grouping for entries of class 1.

In [None]:
def postprocess_class1_sequences(unlabeled_x, unlabeled_y, unlabeled_meta, time_gap_threshold_us=15_000_000):
    """
    Postprocess predicted class 1 (communication) sequences by checking temporal continuity.

    This function groups signals by location and ensures that only sequences of class 1 instances
    with short temporal gaps are kept as class 1. Isolated class 1 instances are relabeled as class 2 (noise).

    Args:
        unlabeled_x (np.ndarray): Unlabeled feature array (samples x features).
        unlabeled_y (np.ndarray): Predicted class labels (0, 1, or 2) for the samples.
        unlabeled_meta (np.ndarray): Metadata for each sample, expected to include
                                     (Location, Minute, Microseconds).
        time_gap_threshold_us (int): Maximum time gap (in microseconds) allowed between
                                     consecutive class 1 samples to consider them part of a sequence.

    Returns:
        np.ndarray: Updated labels where short/isolated class 1 samples are reassigned to class 2.
    """
    # List of feature column names and the index of the 'TrDur_us' (duration) column
    feature_column_names = [
        'ClksThisMin', 'medianKHz', 'avSPL', 'avPkAt',
        'AvPRF', 'avEndF', 'tWUTrisk', 'nActualClx', 'nRisingIPIs',
        'TrDur_us', 'nICIrising', 'MinICI_us', 'midpointICI', 'MaxICI_us',
        'ClkNofMinICI', 'ClkNofMaxICI', 'NofClstrs', 'avClstrNx8', 'avPkIPI',
        'BeforeIPIratio', 'PreIPIratio', 'Post1IPIratio', 'Post2IPIratio', 'EndIPIratio'
    ]
    trdur_idx = feature_column_names.index('TrDur_us')

    # Group samples by location, mapping each to a list of (start_time, end_time, index)
    data_by_location = defaultdict(list)
    for idx, meta in enumerate(unlabeled_meta):
        location, minute_str, micro_str = meta
        start_us = np.int64(int(minute_str)) * 60 * 1_000_000 + np.int64(int(micro_str))
        duration_us = np.int64(unlabeled_x[idx, trdur_idx])
        end_us = start_us + duration_us
        data_by_location[location].append((start_us, end_us, idx))

    # Sort each location's entries by time
    for loc in data_by_location:
        data_by_location[loc].sort()

    # Copy labels to modify safely
    new_labels = unlabeled_y.copy()

    # Process each location independently
    for loc, entries in data_by_location.items():
        # Get only the entries currently labeled as class 1
        class1_entries = [(s, e, i) for (s, e, i) in entries if unlabeled_y[i] == 1]
        class1_entries.sort()

        current_sequence = []
        for j in range(len(class1_entries)):
            s, e, idx = class1_entries[j]
            if not current_sequence:
                current_sequence.append((s, e, idx))
            else:
                prev_s, prev_e, _ = current_sequence[-1]
                # If current start is close enough to previous end, extend the sequence
                if s - prev_e <= time_gap_threshold_us:
                    current_sequence.append((s, e, idx))
                else:
                    # End the current sequence and relabel accordingly
                    if len(current_sequence) >= 2:
                        for _, _, seq_idx in current_sequence:
                            new_labels[seq_idx] = 1  # Keep as class 1
                    else:
                        for _, _, seq_idx in current_sequence:
                            new_labels[seq_idx] = 2  # Reassign to noise
                    current_sequence = [(s, e, idx)]

        # Process any remaining sequence at the end
        if current_sequence:
            if len(current_sequence) >= 2:
                for _, _, seq_idx in current_sequence:
                    new_labels[seq_idx] = 1
            else:
                for _, _, seq_idx in current_sequence:
                    new_labels[seq_idx] = 2

    return new_labels

# Example usage
new_labels = postprocess_class1_sequences(
    filtered_unlabeled,         # feature array
    final_unlabeled_preds,      # predicted labels (from model)
    filtered_meta               # corresponding metadata
)


In [None]:
distribution = Counter(new_labels)
total = len(new_labels)
print("\nFinal Unlabeled Predictions Distribution:")
for label, count in distribution.items():
    pct = (count / total) * 100
    print(f"Class {label}: {count} ({pct:.2f}%)")