# Behavior Classification using Explainable Active Learning Model

## 1. Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

import os
from dotenv import load_dotenv
# load_dotenv() comes from python-dotenv; it is expected to populate the
# process environment from a local .env file (see the python-dotenv docs).
load_dotenv()

# Directory holding the DLC dataset; None when the variable is not defined.
DLC_DATASET_PATH = os.environ.get('DLC_DATASET_PATH')

In [None]:
# --- Recording / preprocessing configuration ---
FRAME_RATE = 30  # presumably frames per second of the source video (confirm)
SAMPLE_RATE = 10
MULTI_ANIMAL = False
LIKELIHOOD_THRESHOLD = 0.6  # likelihood cut-off handed to adp_filt

# Tracked body parts
BODY_PARTS = [
    "tailbase",
    "earR",
    "earL",
    "msBase",
    "msTop",
    "centroid",
    "cleft",
    "cright",
]

# Label columns produced by the rule-based labeling
RULE_BASED_LABELS = [
    "shuttles_label_naive1",
    "shuttles_label_naive2",
    "shuttles_label_naive3",
    "shuttles_label_hardcoded",
    "freezing_label",
]

# All label columns: the rule-based ones plus the catch-all "other"
labels = [*RULE_BASED_LABELS, "other"]

In [None]:
# Fail early with a clear message: without this check an unset variable
# surfaces as an opaque TypeError from concatenating None with a string.
if not DLC_DATASET_PATH:
    raise RuntimeError(
        "DLC_DATASET_PATH is not set; define it in the environment or in a .env file"
    )

# The CSV carries a two-level (multi-level) header; the first column is the index
df_pose_labels = pd.read_csv(
    os.path.join(DLC_DATASET_PATH, "labelled_DLC.csv"),
    header=[0, 1],
    index_col=0,
)

# Flatten the multi-level header into single strings ("<level0>_<level1>").
# When the second level is empty pandas names it "Unnamed: ...", so everything
# from "_Unnamed" onwards is cut off; names without that marker are unchanged
# because split() returns the whole string when the separator is absent.
df_pose_labels.columns = [
    "_".join(col).strip().split("_Unnamed")[0]
    for col in df_pose_labels.columns.values
]

# Remove columns that are entirely NaN, then rows containing any NaN
df_pose_labels = df_pose_labels.dropna(axis=1, how='all').dropna(axis=0, how='any')

print("Shape of df_pose_labels:", df_pose_labels.shape)
df_pose_labels.head()

In [None]:
# Keep only the rule-based label columns
df_labels = df_pose_labels.loc[:, RULE_BASED_LABELS]

print("Shape of df_labels:", df_labels.shape)
df_labels.head()

In [None]:
# A row is unlabeled when none of the rule-based label columns is set,
# i.e. the labels of that row sum to zero
unlabeled_data = df_labels.sum(axis=1) == 0

# Unlabeled rows go into the extra "other" column: 1 for unlabeled, 0 otherwise
df_labels_other = df_labels.assign(other=unlabeled_data.astype("int64"))

print("Shape of df_labels_other:", df_labels_other.shape)
df_labels_other.head()

In [None]:
# Pose data only: everything except the rule-based label columns
df_pose = df_pose_labels.drop(RULE_BASED_LABELS, axis="columns")

print("Shape of df_pose:", df_pose.shape)
df_pose.head()

In [None]:
# Put the pose columns and the label columns (including "other") side by side
_frames_to_join = [df_pose, df_labels_other]
df_pose_labels_other = pd.concat(_frames_to_join, axis="columns")

print("Shape of df_pose_labels_other:", df_pose_labels_other.shape)
df_pose_labels_other.head()

## 2. Preprocessing using A-SOiD Pipeline

### 2.1 Filter by Likelihood

Smooths out unreliable keypoint coordinates based on confidence values (likelihoods). This is a "hold last good value" filter to avoid jittery or missing points.

The `adp_filt` function below is a rewrite of the `adp_filt` function in A-SOiD (likelihood adaptive filtering).

In [None]:
# likelihood adaptive filtering
def adp_filt(df_pose_labels_other, idx_coord, idx_llh, llh_value, labels):
    """
    Likelihood-adaptive "hold last good value" filter for pose coordinates.

    For every body part, each row from the second row on whose likelihood is
    below ``llh_value`` gets its x, y coordinates replaced by those of the most
    recent earlier row that was not corrected (row 0 at the latest). The first
    row has no predecessor, so it is never corrected and never counted in the
    statistics. Label columns are copied back unchanged after filtering.

    The input DataFrame is left untouched; the filtering is done on a copy.

    Parameters:
        df_pose_labels_other (pd.DataFrame): The input DataFrame containing x, y, likelihood, and pose labels.
        idx_coord (list): Positional indices of the coordinate columns, alternating
            x, y per body part (even positions are x, odd positions are y).
        idx_llh (list): Positional indices of the likelihood columns, in the same
            body-part order as ``idx_coord``.
        llh_value (float): Threshold for likelihood filtering.
        labels (list): Names of the label columns to retain unchanged in the output.

    Returns:
        pd.DataFrame: Copy of the input with invalid x, y coordinates corrected.
        dict: Statistics with the keys "llh_below_threshold", "total_llh",
            "llh_below_threshold_ratio" (0.0 when there are no likelihood
            values) and "body_part_stats" (count per x-column name).

    Raises:
        ValueError: If ``idx_llh`` has more columns than ``idx_coord`` has
            complete x/y pairs.
    """
    x_cols = list(idx_coord[::2])
    y_cols = list(idx_coord[1::2])
    llh_cols = list(idx_llh)
    n_parts = len(llh_cols)
    n_pairs = min(len(x_cols), len(y_cols))
    if n_parts > n_pairs:
        raise ValueError(
            f"idx_llh has {n_parts} likelihood columns but idx_coord only "
            f"provides {n_pairs} complete x/y pairs"
        )

    column_names = df_pose_labels_other.columns
    original_labels = df_pose_labels_other[labels].copy()  # Preserve original labels
    # Work on a copy so the caller's DataFrame is not modified in place
    filtered = df_pose_labels_other.copy()

    data_llh = np.array(filtered.iloc[:, llh_cols])
    total_llh = data_llh.size  # Total number of likelihood values

    # True where a value must be replaced; the first row is always kept
    # because there is no previous row to copy from
    low = data_llh < llh_value
    low[:1, :] = False

    # Statistics: counts per body part (keyed by x-column name) and overall
    per_part = low.sum(axis=0)
    llh_below_threshold = int(per_part.sum())
    body_part_stats = {column_names[col]: 0 for col in x_cols}
    for col, n_low in zip(x_cols, per_part):
        body_part_stats[column_names[col]] += int(n_low)

    if llh_below_threshold:
        # For each cell, find the index of the row to take the value from:
        # its own row when kept, otherwise the last kept row above it.
        # Marking replaced rows as 0 and taking a running maximum yields
        # exactly that index (row 0 is always kept).
        row_ids = np.arange(low.shape[0])[:, np.newaxis]
        last_good = np.maximum.accumulate(np.where(low, 0, row_ids), axis=0)

        data_x = np.array(filtered.iloc[:, x_cols[:n_parts]])
        data_y = np.array(filtered.iloc[:, y_cols[:n_parts]])
        data_x = np.take_along_axis(data_x, last_good, axis=0)
        data_y = np.take_along_axis(data_y, last_good, axis=0)

        # Write the corrected coordinates back column by column
        for k, (x_col, y_col) in enumerate(zip(x_cols[:n_parts], y_cols[:n_parts])):
            filtered.iloc[:, x_col] = data_x[:, k]
            filtered.iloc[:, y_col] = data_y[:, k]

    # Restore the original labels (guards against idx_coord overlapping a label column)
    filtered[labels] = original_labels

    # Avoid dividing by zero when there are no likelihood values at all
    llh_ratio = llh_below_threshold / total_llh if total_llh else 0.0

    stats = {
        "llh_below_threshold": llh_below_threshold,
        "total_llh": total_llh,
        "llh_below_threshold_ratio": llh_ratio,
        "body_part_stats": body_part_stats,
    }

    return filtered, stats

In [None]:
# Label columns; they are neither coordinates nor likelihoods
labels = RULE_BASED_LABELS + ["other"]

# Positional indices of the likelihood columns (names ending in "_likelihood")
idx_llh = [
    pos
    for pos, name in enumerate(df_pose_labels_other.columns)
    if name.endswith("_likelihood")
]

# Positional indices of the x and y coordinate columns: every column that is
# neither a label nor a likelihood
idx_coord = [
    pos
    for pos, name in enumerate(df_pose_labels_other.columns)
    if name not in labels and not name.endswith("_likelihood")
]

# Apply the likelihood filter using the configured threshold
filt_df_pose_labels_other, stats = adp_filt(
    df_pose_labels_other,
    idx_coord,
    idx_llh,
    LIKELIHOOD_THRESHOLD,
    labels,
)

print("Filtered DataFrame's shape:", filt_df_pose_labels_other.shape, sep="\n")
print("\nStatistics:", stats, sep="\n")

In [None]:
# Filtered pose data only: strip every label column, including "other"
filt_df_pose = filt_df_pose_labels_other.drop([*RULE_BASED_LABELS, "other"], axis="columns")

print("Filtered DataFrame's shape without labels:", filt_df_pose.shape, sep="\n")
filt_df_pose.head()

In [None]:
# Label columns (rule-based plus "other") taken from the filtered frame
filt_df_labels_other = filt_df_pose_labels_other.loc[:, [*RULE_BASED_LABELS, "other"]]

print("Filtered DataFrame's shape with labels:", filt_df_labels_other.shape, sep="\n")
filt_df_labels_other.head()

## 3. Feature Engineering

## 4. Rule-based Labeling (skipped for now)

## 5. Semi-supervised metric learning → low-dimensional vector embedding space

## 6. Clustering (active learning loop skipped for now)

## 7. Train a decision tree classification model on engineered features to predict cluster IDs. This way we can get explainable feature importance (Explainability Model, interpretable)

## 8. Train a classification model (maybe also decision tree) on embedded data to classify behaviors (Performance Model, accurate but abstract)