In [1]:
import itertools
import json, ast
from sklearn.base import clone, ClassifierMixin, BaseEstimator
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, StratifiedShuffleSplit
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import pandas as pd
import joblib
import os, math
import gc
import warnings
from collections import defaultdict
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Hardware check
import torch
print("="*60)
print("HARDWARE CHECK")
print("="*60)
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
print("="*60)

HARDWARE CHECK
GPU Available: True
GPU Name: Tesla T4


In [2]:
class CFG:
    # mode = "validate"
    mode = "submit"

    model_save_dir = "/kaggle/working"
    # model_save_dir = "D:/UET/ML/mouse_behavior/social-action-recognition-in-mice"

    train_csv_path = "/kaggle/input/MABe-mouse-behavior-detection/train.csv" 
    test_csv_path = "/kaggle/input/MABe-mouse-behavior-detection/test.csv"
    train_annotation_path = "/kaggle/input/MABe-mouse-behavior-detection/train_annotation"
    train_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/train_tracking"
    test_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/test_tracking"

    # train_csv_path = "D:/UET/ML/mouse_behavior/data/train.csv" 
    # test_csv_path = "D:/UET/ML/mouse_behavior/data/test.csv"
    # train_annotation_path = "D:/UET/ML/mouse_behavior/data/train_annotation"
    # train_tracking_path = "D:/UET/ML/mouse_behavior/data/train_tracking"
    # test_tracking_path = "D:/UET/ML/mouse_behavior/data/test_tracking"

    drop_body_parts =  [
        'headpiece_bottombackleft', 'headpiece_bottombackright', 'headpiece_bottomfrontleft', 'headpiece_bottomfrontright', 
        'headpiece_topbackleft', 'headpiece_topbackright', 'headpiece_topfrontleft', 'headpiece_topfrontright', 
        'spine_1', 'spine_2', 'tail_middle_1', 'tail_middle_2', 'tail_midpoint'
    ]

    # Threshold range: typically 0.20-0.40, with 0.27 as a good starting point
    # Higher thresholds = fewer false positives, more false negatives
    # Lower thresholds = more false positives, fewer false negatives
    action_thresholds = {
        "default": 0.27,           # Global fallback threshold
        "single_default": 0.26,    # Default for single mouse behaviors- lowered to improve recall
        "pair_default": 0.28,      # Default for pair behaviors - slightly higher to reduce false positives
        "single": {
            "rear": 0.30,          # Higher threshold - distinctive behavior, reduce false positives
            "groom": 0.28,         # Slightly higher - common behavior, needs good confidence
            "sniff": 0.25,         # Lower threshold - subtle behavior, improve recall
            "dig": 0.29,           # Higher threshold - distinctive behavior
            "eat": 0.27,           # Standard threshold - balanced precision/recall
            "drink": 0.27,         # Standard threshold - balanced precision/recall
            "sleep": 0.24,         # Lower threshold - rare but important, improve recall
        },
        "pair": {
            "attack": 0.24,        # Lower threshold - rare but critical behavior, maximize recall
            "mount": 0.28,         # Higher threshold - distinctive behavior, reduce false positives
            "sniff": 0.26,         # Lower threshold - subtle social behavior, improve recall
            "groom": 0.27,         # Standard threshold - balanced precision/recall
            "chase": 0.25,         # Lower threshold - important social behavior, improve recall
            "follow": 0.26,        # Lower threshold - subtle behavior, improve recall
            "approach": 0.27,      # Standard threshold - balanced precision/recall
        }
    }

In [3]:
train_csv = pd.read_csv(CFG.train_csv_path) 
test_csv = pd.read_csv(CFG.test_csv_path)

In [4]:
mask_lab = train_csv["lab_id"].str.startswith("MABe22")
mask_behavior = train_csv["behaviors_labeled"].isna() | (train_csv["behaviors_labeled"].str.strip() == "")
mask_drop = mask_lab | mask_behavior

train = train_csv[~mask_drop]
body_parts_list = list(np.unique(train.body_parts_tracked))

## Data Generator

In [5]:
def generate_mouse_data(datasubset, mode, traintest_directory=None, generate_single=True, generate_pair=True):
    """
    Yields:
        (mode, X, meta, y)
        mode: "single" hoặc "pair"
        X: raw features DataFrame
        meta: metadata DataFrame
        y: labels (đối với train mode) hoặc action list (đối với test mode)
    """

    if traintest_directory is None:
        traintest_directory = f"/kaggle/input/MABe-mouse-behavior-detection/{mode}_tracking"
        # traintest_directory = f"D:/UET/ML/mouse_behavior/data/{mode}_tracking"

    for idx, row in datasubset.iterrows():
        lab_id = row.lab_id
        video_id = row.video_id
        pix_per_cm = row.pix_per_cm_approx
        fps = row.frames_per_second

        # Bỏ qua MABe22 labs hoặc missing behaviors
        if lab_id.startswith("MABe22"):
            continue
        if mode == "train" and (pd.isna(row.behaviors_labeled) or str(row.behaviors_labeled).strip() == ""):
            continue

        path = f"{traintest_directory}/{lab_id}/{video_id}.parquet"

        # Load tracking
        vid = pd.read_parquet(path)

        # Bỏ bớt bodyparts
        if len(np.unique(vid.bodypart)) > 5:
            vid = vid[~vid.bodypart.isin(CFG.drop_body_parts)]

        pvid = vid.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"]
        )
        pvid = pvid.reorder_levels([1, 2, 0], axis=1).T.sort_index().T
        pvid /= pix_per_cm

        del vid
        gc.collect()

        mouse_ids = pvid.columns.get_level_values(0).unique().tolist()

        # Tìm behaviors tracked trong CSV file
        vid_behaviors = json.loads(row.behaviors_labeled)
        vid_behaviors = sorted(list({b.replace("'", "") for b in vid_behaviors}))
        vid_behaviors = [b.split(',') for b in vid_behaviors]
        vid_behaviors = pd.DataFrame(vid_behaviors, columns=["agent", "target", "action"])

        # Load annotation (đối với training mode)
        if mode == "train":
            try: 
                annot = pd.read_parquet(path.replace("train_tracking", "train_annotation"))
            except FileNotFoundError:
                continue
        else:
            annot = None


        # Build data cho single mouse 
        if generate_single:
            vid_behaviors_subset = vid_behaviors.query("target == 'self'")

            for mouse_id_str in vid_behaviors_subset.agent.unique():
                try:
                    mouse_id = int(mouse_id_str.replace("mouse", ""))
                
                    if mouse_id not in mouse_ids:
                        continue

                    vid_agent_actions = np.unique(vid_behaviors_subset.query("agent == @mouse_id_str").action)

                    # Single mouse raw features - toạ độ bodyparts
                    single_mouse = pvid.loc[:, mouse_id]
                    assert len(single_mouse) == len(pvid)
                
                    # Single mouse meta data
                    meta = pd.DataFrame({
                        "video_id": video_id,
                        "agent_id": mouse_id_str,
                        "target_id": "self",
                        "video_frame": single_mouse.index,
                        "frames_per_second": fps
                    })

                    # Single mouse labels
                    if mode == "train":
                        labels = pd.DataFrame(0.0, index=single_mouse.index, columns=vid_agent_actions)
                    
                        annot_subset = annot.query("(agent_id == @mouse_id) & (target_id == @mouse_id)")

                        for i in range(len(annot_subset)):
                            annot_row = annot_subset.iloc[i]
                            labels.loc[annot_row["start_frame"]:annot_row["stop_frame"], annot_row.action] = 1.0
                        yield "single", single_mouse, meta, labels
                    else:
                        yield "single", single_mouse, meta, vid_agent_actions

                except (KeyError, ValueError):
                    pass

        # Build data cho mouse pair
        if generate_pair:
            vid_behaviors_subset = vid_behaviors.query("target != 'self'")

            if len(vid_behaviors_subset) > 0:
                for agent, target in itertools.permutations(np.unique(pvid.columns.get_level_values("mouse_id")), 2):
                    agent_str = f"mouse{agent}"
                    target_str = f"mouse{target}"

                    vid_agent_actions = np.unique(vid_behaviors_subset.query("(agent == @agent_str) & (target == @target_str)").action)

                    if len(vid_agent_actions) == 0:
                        continue

                    # Mouse pair raw features - toạ độ bodyparts của cặp chuột
                    mouse_pair = pd.concat([pvid[agent], pvid[target]], axis=1, keys=["A", "B"])  # Raw coordinates
                    assert len(mouse_pair) == len(pvid)

                    # Mouse pair meta data
                    meta = pd.DataFrame({
                        "video_id": video_id,
                        "agent_id": agent_str,
                        "target_id": target_str,
                        "video_frame": pvid.index,
                        "frames_per_second": fps
                    })

                    # Mouse pair labels
                    if mode == "train":
                        labels = pd.DataFrame(0.0, index=pvid.index, columns=vid_agent_actions)
                    
                        annot_subset = annot.query("(agent_id == @agent) & (target_id == @target)")

                        for i in range(len(annot_subset)):
                            annot_row = annot_subset.iloc[i]
                            labels.loc[annot_row["start_frame"]:annot_row["stop_frame"], annot_row.action] = 1.0
                        yield "pair", mouse_pair, meta, labels
                    else:
                        yield "pair", mouse_pair, meta, vid_agent_actions

## Feature Engineering

In [6]:
def _fps_from_meta(meta_df, fallback_lookup, default_fps=30.0):
    """Get FPS with proper fallback chain"""
    if "frames_per_second" in meta_df.columns:
        fps_val = meta_df["frames_per_second"].iloc[0]
        if pd.notnull(fps_val) and fps_val > 0:
            return float(fps_val)
    
    vid = meta_df["video_id"].iloc[0]
    if vid in fallback_lookup:
        return float(fallback_lookup[vid])
    
    return default_fps

def _scale(n_frames_at_30fps, fps, ref=30.0):
    """Scale window size by FPS"""
    return max(1, int(round(n_frames_at_30fps * float(fps) / ref)))


In [7]:
def calculate_centers(df):
    """
    Đảm bảo "body_center" tồn tại với mọi chuột hoặc bodyparts combination.
    Xử lý cả cột 2-level (bodypart, coord) (từng chuột) và 3-level (mouse_id, bodypart, coord) (cặp chuột).
    
    Fallback logic:
    1. Nếu nose và tail_base tồn tại → midpoint(nose, tail_base)
    2. Else if head và tail_base tồn tại → midpoint(head, tail_base)
    3. Else if chỉ tail_base tồn tại → use tail_base
    4. Else → không tính được body_center
    """
    cols = df.columns

    # Cột 2-level (bodypart, coord)
    if cols.nlevels == 2:
        if ("body_center", "x") not in df.columns or ("body_center", "y") not in df.columns:
            if ("nose", "x") in df.columns and ("tail_base", "x") in df.columns:
                df[("body_center", "x")] = (df[("nose", "x")] + df[("tail_base", "x")]) / 2
                df[("body_center", "y")] = (df[("nose", "y")] + df[("tail_base", "y")]) / 2
            elif ("head", "x") in df.columns and ("tail_base", "x") in df.columns:
                df[("body_center", "x")] = (df[("head", "x")] + df[("tail_base", "x")]) / 2
                df[("body_center", "y")] = (df[("head", "y")] + df[("tail_base", "y")]) / 2
            elif ("tail_base", "x") in df.columns:
                df[("body_center", "x")] = df[("tail_base", "x")]
                df[("body_center", "y")] = df[("tail_base", "y")]
            else:
                # no valid bodyparts → fill NaN
                df[("body_center", "x")] = np.nan
                df[("body_center", "y")] = np.nan

    # Cột 3-level (mouse_id, bodypart, coord)
    elif cols.nlevels == 3:
        mice = sorted(list(set(c[0] for c in cols)))

        for m in mice:
            has_body_center = ((m, "body_center", "x") in cols) and ((m, "body_center", "y") in cols)
            if not has_body_center:
                if ((m, "nose", "x") in cols) and ((m, "tail_base", "x") in cols):
                    df[(m, "body_center", "x")] = (df[(m, "nose", "x")] + df[(m, "tail_base", "x")]) / 2
                    df[(m, "body_center", "y")] = (df[(m, "nose", "y")] + df[(m, "tail_base", "y")]) / 2
                elif ((m, "head", "x") in cols) and ((m, "tail_base", "x") in cols):
                    df[(m, "body_center", "x")] = (df[(m, "head", "x")] + df[(m, "tail_base", "x")]) / 2
                    df[(m, "body_center", "y")] = (df[(m, "head", "y")] + df[(m, "tail_base", "y")]) / 2
                elif ((m, "tail_base", "x") in cols):
                    df[(m, "body_center", "x")] = df[(m, "tail_base", "x")]
                    df[(m, "body_center", "y")] = df[(m, "tail_base", "y")]
                else:
                    df[(m, "body_center", "x")] = np.nan
                    df[(m, "body_center", "y")] = np.nan
    return df

def calculate_speed_lag(df, part, fps, lag=10, mouse=None):
    cols = df.columns
    if mouse is not None:
        x = df[(mouse, part, "x")]
        y = df[(mouse, part, "y")]
    else:
        x = df[(part, "x")]
        y = df[(part, "y")]

    if x.isna().all() or y.isna().all():
        # all missing → return zeros
        return pd.Series(0, index=df.index)

    dx = x.diff(lag)
    dy = y.diff(lag)
    speed = np.sqrt(dx**2 + dy**2) * fps
    return speed.fillna(0)

# Tính các thống kê của 1 đại lượng theo nhiều cửa sổ thời gian
def calculate_window_stats(df, metric, name, fps, scales=[30, 90]):
    """
    Thêm rolling statistics cho bất kỳ series nào.
    
    metric : pd.Series (ví dụ speed, distance, curvature...)
    fps    : frames_per_second
    scales : list window sizes quy đổi theo 30fps → mặc định [30, 90] = short và long term
    """
    res = pd.DataFrame(index=df.index)
    for scale in scales:
        ws = _scale(scale, fps)
        roll = metric.rolling(ws, min_periods=max(1, ws//4))

        res[f"{name}_mean_{scale}"] = roll.mean()
        res[f"{name}_std_{scale}"]  = roll.std()
        res[f"{name}_min_{scale}"]  = roll.min()
        res[f"{name}_max_{scale}"]  = roll.max()

    return res

# Tính onset - offset features: Onset = thay đổi từ {lag} frame trước -> frame hiện tại. Offset = thay đổi từ frame hiện tại -> {lag} frame tương lai
def add_onset_offset(metric: pd.Series, name: str, lag_list=[3, 5]):
    out = {}
    for lag in lag_list:
        out[f"{name}_onset_lag{lag}"]  = metric - metric.shift(lag)
        out[f"{name}_offset_lag{lag}"] = metric.shift(-lag) - metric
    return pd.DataFrame(out)

In [8]:
def build_single_features(single_mouse_df, body_parts_tracked, meta_fps):
    single_mouse_df = calculate_centers(single_mouse_df)

    # Get actual bodypart columns
    available_body_parts = single_mouse_df.columns.get_level_values(0).unique()
    
    # === Shape and Position Features ===
    # Euclidean distances giữa các cặp bodyparts
    X = pd.DataFrame({
        f"{p1}+{p2}": np.sqrt(
            (single_mouse_df[(p1, "x")] - single_mouse_df[(p2, "x")])**2 + 
            (single_mouse_df[(p1, "y")] - single_mouse_df[(p2, "y")])**2
        )
        for p1, p2 in itertools.combinations(body_parts_tracked, 2)
        if p1 in available_body_parts and p2 in available_body_parts
    })
    
    expected_cols = [f"{p1}+{p2}" for p1, p2 in itertools.combinations(body_parts_tracked, 2)]
    X = X.reindex(columns=expected_cols, copy=False)
    
    # Elongation (chỉ khi required bodyparts tồn tại)
    if "nose" in available_body_parts and "tail_base" in available_body_parts and "ear_left" in available_body_parts and "ear_right" in available_body_parts:
        X["elong"] = X["nose+tail_base"] / (X["ear_left+ear_right"] + 1e-6)
    else:
        X["elong"] = 0.0
    
    # Body angle (chỉ khi nose, tail_base, body_center tồn tại)
    if all(bp in available_body_parts for bp in ["nose", "tail_base", "body_center"]):
        v1_x = single_mouse_df[("nose","x")] - single_mouse_df[("body_center","x")]
        v1_y = single_mouse_df[("nose","y")] - single_mouse_df[("body_center","y")]
        v2_x = single_mouse_df[("tail_base","x")] - single_mouse_df[("body_center","x")]
        v2_y = single_mouse_df[("tail_base","y")] - single_mouse_df[("body_center","y")]
        X["body_angle"] = (v1_x*v2_x + v1_y*v2_y) / (np.sqrt(v1_x**2+v1_y**2) * np.sqrt(v2_x**2+v2_y**2) + 1e-6)
    else:
        X["body_angle"] = 0.0
    
    # === Movement Features: speed/accelerate/energy ===
    if "body_center" in available_body_parts:
        speed = np.sqrt(
            single_mouse_df[("body_center", "x")].diff()**2 +
            single_mouse_df[("body_center", "y")].diff()**2
        )
        X["speed"] = speed
        X["accelerate"] = X["speed"].diff().fillna(0)
        X["energy"] = (speed**2).rolling(window=5).sum().fillna(0) # intensity of movement
    else:
        X["speed"] = 0.0
        X["accelerate"] = 0.0
        X["energy"] = 0.0
    
    # Đối với available bodyparts cụ thể
    for p in ["body_center"]:
        if p in available_body_parts:
            # Speed lag features
            lag_speed = calculate_speed_lag(single_mouse_df, p, fps=meta_fps)
            X[f"speed_{p}_lag_10"] = lag_speed
            X = pd.concat([X, add_onset_offset(lag_speed, f"speed_{p}")], axis=1)
            
            # Rolling stats
            speed = np.sqrt(single_mouse_df[(p, "x")].diff()**2 + single_mouse_df[(p, "y")].diff()**2) * float(meta_fps)
            res = calculate_window_stats(single_mouse_df, speed, f"speed_{p}", fps=meta_fps)
            X = pd.concat([X, res], axis=1)

            # Curvature
            lag = _scale(10, meta_fps)
            shifted_x = single_mouse_df[(p,"x")].shift(lag)
            shifted_y = single_mouse_df[(p,"y")].shift(lag)
            curv = np.sqrt((shifted_x - single_mouse_df[(p,"x")])**2 + 
                                                    (shifted_y - single_mouse_df[(p,"y")])**2)
            X[f"curvature_{p}_lag_{lag}"] = curv
            X = pd.concat([X, add_onset_offset(curv, f"curvature_{p}")], axis=1)

                
        else:
            # Dummy columns
            X[f"speed_{p}_lag_10"] = 0.0
            for lag in [3, 5]:
                X[f"speed{p}_onset_lag{lag}"]  = 0.0
                X[f"speed_{p}_offset_lag{lag}"] = 0.0
            
            
            for scale in [30, 90]:
                X[f"speed_{p}_mean_{scale}"] = 0.0
                X[f"speed_{p}_std_{scale}"] = 0.0
                X[f"speed_{p}_min_{scale}"] = 0.0
                X[f"speed_{p}_max_{scale}"] = 0.0
                
            lag = _scale(10, meta_fps)
            X[f"curvature_{p}_lag_{lag}"] = 0.0

            for lag in [3, 5]:
                X[f"curvature_{p}_onset_lag{lag}"]  = 0.0
                X[f"curvature_{p}_offset_lag{lag}"] = 0.0
    
    return X.astype(np.float32, copy=False).fillna(0)


def build_pair_features(mouse_pair_df, body_parts_tracked, meta_fps):
    mouse_pair_df = calculate_centers(mouse_pair_df)
    
    # Get bodyparts for both mice
    avail_A = mouse_pair_df["A"].columns.get_level_values(0).unique()
    avail_B = mouse_pair_df["B"].columns.get_level_values(0).unique()
    
    # Pairwise distances
    X = pd.DataFrame({
        f"A_{p1}+B_{p2}": np.sqrt(
            (mouse_pair_df[("A", p1, "x")] - mouse_pair_df[("B", p2, "x")])**2 +
            (mouse_pair_df[("A", p1, "y")] - mouse_pair_df[("B", p2, "y")])**2
        )
        for p1, p2 in itertools.product(body_parts_tracked, repeat=2)
        if p1 in avail_A and p2 in avail_B
    })
    
    expected_cols = [f"A_{p1}+B_{p2}" for p1, p2 in itertools.product(body_parts_tracked, repeat=2)]
    X = X.reindex(columns=expected_cols, copy=False)
    
    # Relative orientation (chỉ khi nose and tail_base tồn tại ở cả 2 chuột)
    if all(bp in avail_A for bp in ["nose","tail_base"]) and all(bp in avail_B for bp in ["nose","tail_base"]):
        vec_A_x = mouse_pair_df[("A", "nose", "x")] - mouse_pair_df[("A", "tail_base", "x")]
        vec_A_y = mouse_pair_df[("A", "nose", "y")] - mouse_pair_df[("A", "tail_base", "y")]
        vec_B_x = mouse_pair_df[("B", "nose", "x")] - mouse_pair_df[("B", "tail_base", "x")]
        vec_B_y = mouse_pair_df[("B", "nose", "y")] - mouse_pair_df[("B", "tail_base", "y")]
        X["relative_orientation"] = (vec_A_x*vec_B_x + vec_A_y*vec_B_y) / (
            np.sqrt(vec_A_x**2 + vec_A_y**2) * np.sqrt(vec_B_x**2 + vec_B_y**2) + 1e-6
        )
    elif all(bp in avail_A for bp in ["head","tail_base"]) and all(bp in avail_B for bp in ["head","tail_base"]):
        vec_A_x = mouse_pair_df[("A", "head", "x")] - mouse_pair_df[("A", "tail_base", "x")]
        vec_A_y = mouse_pair_df[("A", "head", "y")] - mouse_pair_df[("A", "tail_base", "y")]
        vec_B_x = mouse_pair_df[("B", "head", "x")] - mouse_pair_df[("B", "tail_base", "x")]
        vec_B_y = mouse_pair_df[("B", "head", "y")] - mouse_pair_df[("B", "tail_base", "y")]
        X["relative_orientation"] = (vec_A_x*vec_B_x + vec_A_y*vec_B_y) / (
            np.sqrt(vec_A_x**2 + vec_A_y**2) * np.sqrt(vec_B_x**2 + vec_B_y**2) + 1e-6
        )
    else:
        X["relative_orientation"] = 0.0
    
    # Center distance và approach
    if "body_center" in avail_A and "body_center" in avail_B:
        dist_center = np.sqrt(
            (mouse_pair_df[("A", "body_center", "x")] - mouse_pair_df[("B", "body_center", "x")])**2 +
            (mouse_pair_df[("A", "body_center", "y")] - mouse_pair_df[("B", "body_center", "y")])**2
        )
        X = pd.concat([X, add_onset_offset(dist_center, "dist_center")], axis=1)
        
        approach = dist_center.diff().fillna(0)
        X["approach_A"] = approach
        X["approach_B"] = approach
        X = pd.concat([X, add_onset_offset(approach, "approach")], axis=1)
        
        # Relative center distance stats
        res = calculate_window_stats(mouse_pair_df, dist_center**2, "center_distance", fps=meta_fps)
        X = pd.concat([X, res], axis=1)
        
        # Relative speed
        speed_A = calculate_speed_lag(mouse_pair_df, "body_center", meta_fps, mouse="A")
        speed_B = calculate_speed_lag(mouse_pair_df, "body_center", meta_fps, mouse="B")
        X["speed_A_lag_10"] = speed_A
        X["speed_B_lag_10"] = speed_B
        rel = (speed_A - speed_B).abs()
        X["relative_speed_A_B_lag_10"] = rel
        X = pd.concat([X, add_onset_offset(rel, "relative_speed")], axis=1)

        # Ngưỡng khoảng cách
        thresholds = {
            "very_close": 20,
            "close": 40,
            "medium": 60
        }
        X["very_close"] = (dist_center < thresholds["very_close"]).astype(float)
        X["close"] = ((dist_center >= thresholds["very_close"]) & (dist_center < thresholds["close"])).astype(float)
        X["medium"] = ((dist_center >= thresholds["close"]) & (dist_center < thresholds["medium"])).astype(float)
        X["far"] = (dist_center >= thresholds["medium"]).astype(float)

    else:
        # Dummy columns
        for lag in [3, 5]:
            X[f"dist_center_onset_lag{lag}"]  = 0.0
            X[f"dist_center_offset_lag{lag}"] = 0.0

        X["approach_A"] = 0.0
        X["approach_B"] = 0.0
        for lag in [3, 5]:
            X[f"approach_onset_lag{lag}"]  = 0.0
            X[f"approach_offset_lag{lag}"] = 0.0
        
        for scale in [30, 90]:
            X[f"center_distance_mean_{scale}"] = 0.0
            X[f"center_distance_std_{scale}"] = 0.0
            X[f"center_distance_min_{scale}"] = 0.0
            X[f"center_distance_max_{scale}"] = 0.0
        
        X["speed_A_lag_10"] = 0.0
        X["speed_B_lag_10"] = 0.0
        X["relative_speed_A_B_lag_10"] = 0.0
        for lag in [3, 5]:
            X[f"relative_speed_onset_lag{lag}"]  = 0.0
            X[f"relative_speed_offset_lag{lag}"] = 0.0

        X["very_close"] = 0.0
        X["close"] = 0.0
        X["medium"] = 0.0
        X["far"] = 0.0   
    
    return X.astype(np.float32, copy=False).fillna(0)

## Training Pipeline

In [9]:
class StratifiedSubsetClassifier(ClassifierMixin, BaseEstimator):
    """
    Wrapper class để subsamples data trước khi train
    """
    def __init__(self, estimator, n_samples=None):
        self.estimator = estimator
        self.n_samples = n_samples

    def _to_numpy(self, X):
        try:
            return X.to_numpy(np.float32, copy=False)
        except AttributeError:
            return np.asarray(X, dtype=np.float32)

    def fit(self, X, y):
        X_np = self._to_numpy(X)
        y = np.asarray(y).ravel()

        # Handle edge case: labels có thể là {0, 2} thay vì {0, 1}
        uniq = np.unique(y[~pd.isna(y)])
        if set(uniq.tolist()) == {0, 2}:
            y = (y > 0).astype(np.int8)

        # Nếu X < n_samples thì không cần Stratified Shuffle Split
        if self.n_samples is None or len(X_np) <= int(self.n_samples):
            self.estimator.fit(X_np, y)
        else:
            sss = StratifiedShuffleSplit(n_splits=1, train_size=int(self.n_samples), random_state=42)
            try:
                idx, _ = next(sss.split(np.zeros_like(y), y)) # dummy X, để đảm bảo stratification trên label
                self.estimator.fit(X_np[idx], y[idx])
            except Exception as e:
                if "best_split_info_left_count" in str(e):
                    self.estimator.set_params(device_type="cpu")
                    self.estimator.fit(X_np[idx], y[idx])
                else:
                    # Fallback: simple step sampling
                    step = max(len(X_np) // int(self.n_samples), 1)
                    self.estimator.fit(X_np[::step], y[::step])

        try:
            self.classes_ = np.asarray(self.estimator.classes_)
        except Exception:
            self.classes_ = np.unique(y)
        return self

    def predict_proba(self, X):
        X_np = self._to_numpy(X)
        try:
            P = self.estimator.predict_proba(X_np)
        except Exception:
            # Handle single class case
            if len(self.classes_) == 1:
                n = len(X_np)
                c = int(self.classes_[0])
                if c == 1:
                    return np.column_stack([np.zeros(n, dtype=np.float32), np.ones(n, dtype=np.float32)])
                else:
                    return np.column_stack([np.ones(n, dtype=np.float32), np.zeros(n, dtype=np.float32)])
            return np.full((len(X_np), 2), 0.5, dtype=np.float32)

        P = np.asarray(P)
        if P.ndim == 1:
            P1 = P.astype(np.float32)
            return np.column_stack([1.0 - P1, P1])
        if P.shape[1] == 1 and len(self.classes_) == 2:
            P1 = P[:, 0].astype(np.float32)
            return np.column_stack([1.0 - P1, P1])
        return P

    def predict(self, X):
        X_np = self._to_numpy(X)
        try:
            return self.estimator.predict(X_np)
        except Exception:
            return np.argmax(self.predict_proba(X_np), axis=1)

In [10]:
def create_ensemble_models(mode="single"):
    """
    Ensemble đơn giản với 3-model ensemble
    """
    models = []
    
    # Check GPU availability
    gpu_available = torch.cuda.is_available()
    
    if mode == "single":
        n_samples_base = 2_000_000
    else:
        n_samples_base = 900_000

    # Model 1: LightGBM
    # models.append(
    #     StratifiedSubsetClassifier(
    #         lgb.LGBMClassifier(
    #             n_estimators=400,
    #             learning_rate=0.07,
    #             num_leaves=31,
    #             subsample=0.8,
    #             colsample_bytree=0.9,
    #             device_type="gpu" if gpu_available else "cpu",
    #             verbose=-1,
    #             random_state=42
    #         ),
    #         n_samples=int(n_samples_base / 1.3)
    #     )
    # )
    
    # Model 2: LightGBM
    models.append(
        StratifiedSubsetClassifier(
            lgb.LGBMClassifier(
                n_estimators=300,
                learning_rate=0.1,
                num_leaves=63,
                max_depth=8,
                device_type="gpu" if gpu_available else "cpu",
                verbose=-1,
                random_state=42
            ),
            n_samples=int(n_samples_base)
        )
    )
    
    # Model 3: XGBoost
    models.append(
        StratifiedSubsetClassifier(
            xgb.XGBClassifier(
                n_estimators=400,
                learning_rate=0.08,
                max_depth=6,
                tree_method="gpu_hist" if gpu_available else "hist",
                device="cuda" if gpu_available else "cpu",
                random_state=42
            ),
            n_samples=int(n_samples_base)
        )
    )

    return models

In [11]:
def select_threshold_map(thresholds, mode: str):
    """
    Tạo 1 defaultdict có mode(single/pair)-aware và action-specific thresholds
    """
    if isinstance(thresholds, dict):
        # Kiểm tra mode-aware structure tồn tại
        if ("single" in thresholds) or ("pair" in thresholds) or \
           ("single_default" in thresholds) or ("pair_default" in thresholds):
            # Mode-aware thresholds
            base_default = float(thresholds.get("default", 0.27))
            mode_default = float(thresholds.get(f"{mode}_default", base_default))
            mode_overrides = thresholds.get(mode, {}) or {}
            
            out = defaultdict(lambda: mode_default)
            out.update({str(k): float(v) for k, v in mode_overrides.items()})
            return out
        
        # Plain per-action dict
        out = defaultdict(lambda: float(thresholds.get("default", 0.27)))
        out.update({str(k): float(v) for k, v in thresholds.items() if k != "default"})
        return out
    
    # Fallback: constant threshold
    return defaultdict(lambda: 0.27)

## Predict functions

In [12]:
def predict_actions_ensemble(X, models_dict, actions):
    """
    Dự đoán nhiều action với ensemble models.
    
    Args:
        X: feature DataFrame
        models: dict of trained classifiers {action: model}
        actions: danh sách actions để predict
    
    Returns:
        DataFrame với xác suất mỗi actions
    """
    
    proba_df = pd.DataFrame(index=X.index)
    X_np = X.to_numpy(np.float32, copy=False)
    
    for action in actions:
        if action not in models_dict:
            # Action not trained in this section
            proba_df[action] = 0.0
            continue
        
        model_list = models_dict[action]
        
        try:
            # Get predictions from all models in ensemble
            probs_list = []
            for model in model_list:
                try:
                    prob = model.predict_proba(X_np)[:, 1]
                    probs_list.append(prob)
                except Exception:
                    pass
            
            if len(probs_list) > 0:
                # Average ensemble predictions
                proba_df[action] = np.mean(probs_list, axis=0)
            else:
                proba_df[action] = 0.0
                
        except Exception as e:
            proba_df[action] = 0.0
    
    return proba_df


def predict_multiclass_adaptive(pred, meta, action_thresholds):
    """
    Đổi frame probabilities thành chuỗi frame có action đó
    + Adaptive thresholding cho từng action + temporal smoothing
    Args:
        pred: DataFrame (num_frames, num_actions) with probabilities
        meta: DataFrame with video_frame, agent_id, target_id
        thresholds: dict of thresholds per action
    Returns:
        DataFrame with columns: video_id, agent_id, target_id, action, start_frame, stop_frame
    """
    # Apply temporal smoothing
    pred_smoothed = pred.rolling(window=5, min_periods=1, center=True).mean()

    # Determine mode (single/pair)
    mode = "pair"
    try:
        if "target_id" in meta.columns and meta["target_id"].eq("self").all():
            mode = "single"
    except Exception:
        pass

    # Lấy action có xác suất cao nhất mỗi frame
    ama = np.argmax(pred_smoothed.values, axis=1)

    # Lấy threshold map cho mode tương ứng
    th_map = select_threshold_map(action_thresholds, mode)

    # Áp dụng thresholds
    max_probs = pred_smoothed.max(axis=1).values
    threshold_mask = np.zeros(len(pred_smoothed), dtype=bool)
    
    for i, action in enumerate(pred_smoothed.columns):
        action_mask = (ama == i)
        threshold = th_map[action]
        threshold_mask |= (action_mask & (max_probs >= threshold))
    ama = np.where(threshold_mask, ama, -1)
    ama = pd.Series(ama, index=meta.video_frame.values)
    
    # Detect changes
    changes_mask = (ama != ama.shift(1)).values
    ama_changes = ama[changes_mask]
    meta_changes = meta[changes_mask]
    mask = ama_changes.values >= 0
    mask[-1] = False
    
    submission_part = pd.DataFrame({
        'video_id': meta_changes["video_id"].values[mask],
        'agent_id': meta_changes["agent_id"].values[mask],
        'target_id': meta_changes["target_id"].values[mask],
        'action': pred.columns[ama_changes.values[mask]],
        'start_frame': ama_changes.index[mask],
        'stop_frame': ama_changes.index[1:][mask[:-1]]
    })
    
    # Fix stop_frame với mỗi bộ video/agent/target
    stop_video_id = meta_changes["video_id"].values[1:][mask[:-1]]
    stop_agent_id = meta_changes["agent_id"].values[1:][mask[:-1]]
    stop_target_id = meta_changes["target_id"].values[1:][mask[:-1]]
    
    for i in range(len(submission_part)):
        video_id = submission_part.video_id.iloc[i]
        agent_id = submission_part.agent_id.iloc[i]
        target_id = submission_part.target_id.iloc[i]
        
        if i < len(stop_video_id):
            if stop_video_id[i] != video_id or stop_agent_id[i] != agent_id or stop_target_id[i] != target_id:
                new_stop_frame = meta.query("(video_id == @video_id)").video_frame.max() + 1
                submission_part.iat[i, submission_part.columns.get_loc("stop_frame")] = new_stop_frame
        else:
            meta.query("(video_id == @video_id)").video_frame.max() + 1
            submission_part.iat[i, submission_part.columns.get_loc("stop_frame")] = new_stop_frame
    
    # Lọc events rất ngắn (nhiễu)
    duration = submission_part.stop_frame - submission_part.start_frame
    submission_part = submission_part[duration >= 3].reset_index(drop=True)
    
    if len(submission_part) > 0:
        assert (submission_part.stop_frame > submission_part.start_frame).all(), "stop <= start"
    
    return submission_part

In [13]:
def clean_submission(submission, dataset, mode, traintest_directory=None):
    """
    Làm sạch submission:
    1. Bỏ các chuỗi start_frame >= stop_frame
    2. Bỏ các chuỗi bị lặp (cùng agent-target)
    3. Điền video trống với dummy predictions
    """
    if traintest_directory is None:
        traintest_directory = f"/kaggle/input/MABe-mouse-behavior-detection/{mode}_tracking"
        # traintest_directory = f"D:/UET/ML/mouse_behavior/data/{mode}_tracking"
    
    # Bỏ invalid frames
    submission = submission[submission.start_frame < submission.stop_frame].copy()

    # Bỏ rows có NaN
    submission = submission.dropna(subset=["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"])
    
    # Bỏ chuỗi bị lặp
    group_list = []
    for _, group in submission.groupby(["video_id", "agent_id", "target_id"]):
        group = group.sort_values("start_frame")
        mask = np.ones(len(group), dtype=bool)
        last_stop_frame = 0
        for i, (_, row) in enumerate(group.iterrows()):
            if row["start_frame"] < last_stop_frame:
                mask[i] = False
            else:
                last_stop_frame = row["stop_frame"]
        group_list.append(group[mask])
    
    submission = pd.concat(group_list)

    if len(group_list) > 0:
        submission = pd.concat(group_list, ignore_index=True)
    else:
        submission = pd.DataFrame(columns=["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"])
    
    # Điền video trống
    s_list = []
    for idx, row in dataset.iterrows():
        lab_id = row["lab_id"]
        if lab_id.startswith('MABe22'):
            continue
        
        video_id = row["video_id"]
        if (submission.video_id == video_id).any():
            continue

        if type(row.behaviors_labeled) != str:
            continue
        
        print(f"Video {video_id} has no predictions, filling...")
        
        path = f"{traintest_directory}/{lab_id}/{video_id}.parquet"
        try:
            vid = pd.read_parquet(path)
        except:
            continue
        
        vid_behaviors = json.loads(row["behaviors_labeled"])
        vid_behaviors = sorted(list({b.replace("'", "") for b in vid_behaviors}))
        vid_behaviors = [b.split(',') for b in vid_behaviors]
        vid_behaviors = pd.DataFrame(vid_behaviors, columns=["agent", "target", "action"])
        
        start_frame = vid.video_frame.min()
        stop_frame = vid.video_frame.max() + 1
        
        for (agent, target), actions in vid_behaviors.groupby(["agent", "target"]):
            batch_length = int(np.ceil((stop_frame - start_frame) / len(actions)))
            for i, (_, action_row) in enumerate(actions.iterrows()):
                batch_start = start_frame + i * batch_length
                batch_stop = min(batch_start + batch_length, stop_frame)
                s_list.append((video_id, agent, target, action_row["action"], batch_start, batch_stop))
    
    if len(s_list) > 0:
        submission = pd.concat([
            submission,
            pd.DataFrame(s_list, columns=["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"])
        ])
    
    submission = submission.reset_index(drop=True)
    return submission

## Pipeline train-and-submit

In [14]:
def train_models_and_submit_ensemble(body_parts_tracked_str, mode_train, X_all, y_all, meta_all, n_samples):
    """
    Theo từng section/body_part_tracked, train binary classifiers cho từng action (trên bộ train) và predict (trên bộ test).
    """

    models = []
    
    # Check GPU availability
    gpu_available = torch.cuda.is_available()

    X_all_np = X_all.to_numpy(np.float32, copy=False)
    del X_all
    gc.collect()

    # NEW: tạo Ensemble model
    # models = create_ensemble_models(mode=mode_train)

    # Model 2: LightGBM
    models.append(make_pipeline(
        StratifiedSubsetClassifier(
            lgb.LGBMClassifier(
                n_estimators=300,
                learning_rate=0.1,
                num_leaves=63,
                max_depth=8,
                device_type="gpu" if gpu_available else "cpu",
                verbose=-1,
                random_state=42
            ), n_samples=int(n_samples), # new: train on all samples
        )
    ))
    
    # Model 3: XGBoost
    models.append(make_pipeline(
        StratifiedSubsetClassifier(
            xgb.XGBClassifier(
                n_estimators=400,
                learning_rate=0.08,
                max_depth=6,
                tree_method="gpu_hist" if gpu_available else "hist",
                device="cuda" if gpu_available else "cpu",
                random_state=42
            ), n_samples=int(n_samples) # new: train on all samples
        )
    ))

    model_list = []
    for action in y_all.columns:
        y_raw = y_all[action].to_numpy()
        mask = ~pd.isna(y_raw)
        y_action = y_raw[mask].astype(int)
        
        if not (y_action == 0).all() and np.sum(y_action) >= 5:
            trained = []
            idx = np.flatnonzero(mask)
            for m in models:
                m_clone = clone(m)
                m_clone.fit(X_all_np[idx], y_action)
                trained.append(m_clone)
            model_list.append((action, trained))

    del X_all_np
    gc.collect()


    # Build test data của body_part_tracked hiện tại
    body_parts_tracked = json.loads(body_parts_tracked_str)
    if len(body_parts_tracked) > 5:
        body_parts_tracked = [b for b in body_parts_tracked if b not in CFG.drop_body_parts]
    
    test_subset = test_csv[test_csv.body_parts_tracked == body_parts_tracked_str]

    # Predict single
    generator = generate_mouse_data(
        test_subset, 
        mode="test", 
        generate_single=(mode_train == "single"), 
        generate_pair=(mode_train == "pair")
    )

    # Tạo fps_lookup cho test set
    fps_lookup = (
        test_subset[["video_id", "frames_per_second"]]
        .drop_duplicates("video_id")
        .set_index("video_id")["frames_per_second"]
        .to_dict()
    )

    for mode_test, data_test, meta_test, actions_test in generator:
        assert mode_test == mode_train
        fps_i = _fps_from_meta(meta_test, fps_lookup, default_fps=30.0)

        # Feature engineering
        if mode_test == "single":
            X_test = build_single_features(data_test, body_parts_tracked, fps_i).astype(np.float32)
        else:
            X_test = build_pair_features(data_test, body_parts_tracked, fps_i).astype(np.float32)

        X_test_np = X_test.to_numpy(np.float32, copy=False)
        del data_test
        gc.collect()

        # Predict
        # preds = predict_actions_ensemble(X_test, model_list, actions_test)

        pred = pd.DataFrame(index=meta_test.video_frame)
        for action, trained in model_list:
            if action in actions_test:
                probs = [m.predict_proba(X_test_np)[:, 1] for m in trained]
                pred[action] = np.average(probs, axis=0)

        del X_test_np
        gc.collect()

        # if len(preds.columns) > 0:
        if pred.shape[1] != 0:
            sub_part = predict_multiclass_adaptive(pred, meta_test, CFG.action_thresholds)
            submission_list.append(sub_part)

    #     del X_test
    #     gc.collect()

    # return submission_parts

## Execution

In [15]:
print("\n" + "="*60)
print("TRAIN-AND-SUBMIT PIPELINE")
print("="*60)

# thresholds_all = {"single": {}, "pair": {}}
submission_list = []

for section in range(len(body_parts_list)):
    # Lấy body_parts_tracked trong số 9 bộ của toàn dataset
    body_parts_tracked_str = body_parts_list[section]

    try:
        body_parts_tracked = json.loads(body_parts_tracked_str)

        if len(body_parts_tracked) > 5:
            body_parts_tracked = [b for b in body_parts_tracked if b not in CFG.drop_body_parts]

        # Lấy các rows/videos được thu với body_parts_tracked tương ứng
        train_subset = train[train.body_parts_tracked == body_parts_tracked_str]

        if train_subset.empty:
            print("\nNo videos in this section, skipping...")
            continue

        print("\n" + "="*60)
        print(f"SECTION {section}/{len(body_parts_list)-1} (9 sections total): {len(body_parts_tracked)} bodyparts, {len(train_subset)} videos")
        print("="*60)

        fps_lookup = (
            train_subset[["video_id", "frames_per_second"]]
            .drop_duplicates("video_id")
            .set_index("video_id")["frames_per_second"]
            .to_dict()
        )
    
        single_mouse = []
        single_meta = []
        single_y = []

        pair_mouse = []
        pair_meta = []
        pair_y = []

        # Accumulate generated data
        for mode, data, meta, labels in generate_mouse_data(train_subset, mode="train"):
            video_id = meta["video_id"].iloc[0]
            fps = fps_lookup.get(video_id, 30.0)

            if mode == "single":
                single_mouse.append(data)
                single_meta.append(meta)
                single_y.append(labels)

            else:
                pair_mouse.append(data)
                pair_meta.append(meta)
                pair_y.append(labels)

        # Single models
        if len(single_mouse) > 0:
            print(f"Processing {len(single_mouse)} single mouse videos...")
            single_X = []

            for data_i, meta_i, in zip(single_mouse, single_meta):
                fps_i = _fps_from_meta(meta_i, fps_lookup, default_fps=30.0)

                X_i = build_single_features(data_i, body_parts_tracked, fps_i).astype(np.float32)
                single_X.append(X_i)
        
            X_all = pd.concat(single_X, ignore_index=True)
            y_all = pd.concat(single_y, ignore_index=True)
            meta_all = pd.concat(single_meta, ignore_index=True)
        
            print(f"Shape: {X_all.shape[0]} frames × {X_all.shape[1]} features")
        
            # Train ENSEMBLE + ADAPTIVE THRESHOLDING và thực hiện predict
            train_models_and_submit_ensemble(body_parts_tracked_str, "single", X_all, y_all, meta_all, 2_000_000)

            del X_all, y_all, meta_all, single_X, single_mouse, single_meta, single_y
            gc.collect()

        # Train pair models
        if len(pair_mouse) > 0:
            print(f"Processing {len(pair_mouse)} pair mouse videos...")
            pair_X = []
            
            for data_i, meta_i in zip(pair_mouse, pair_meta):
                fps_i = _fps_from_meta(meta_i, fps_lookup, default_fps=30.0)

                X_i = build_pair_features(data_i, body_parts_tracked, fps_i).astype(np.float32)
                pair_X.append(X_i)   

            X_all = pd.concat(pair_X, ignore_index=True)
            y_all = pd.concat(pair_y, ignore_index=True)
            meta_all = pd.concat(pair_meta, ignore_index=True)  
        
            print(f"Shape: {X_all.shape[0]} frames × {X_all.shape[1]} features")
        
            # Train ENSEMBLE + ADAPTIVE THRESHOLDING + thực hiện predict
            train_models_and_submit_ensemble(body_parts_tracked_str, "pair", X_all, y_all, meta_all, 900_000)

            del X_all, y_all, meta_all, pair_X, pair_mouse, pair_y, pair_meta
            gc.collect()

    except Exception as e:
        print(f"***Exception*** {str(e)[:100]}")

# Save thresholds
# joblib.dump(thresholds_all, f"{CFG.model_save_dir}/thresholds.pkl")

print("\n" + "="*60)
print(f"Training complete!")
print("="*60)


# Tạo final submission
if len(submission_list) > 0:
    submission = pd.concat(submission_list, ignore_index=True)
else:
    # Empty fallback
    print("WARNING: No predictions generated!")
    submission = pd.DataFrame({
        "video_id": [], "agent_id": [], "target_id": [],
        "action": [], "start_frame": [], "stop_frame": []
    })

# Làm sạch submission
submission = clean_submission(submission, test_csv, "test", CFG.test_tracking_path)
    
# Thêm row_id
submission.insert(0, "row_id", range(len(submission)))
    
# Save
submission.to_csv("submission.csv", index=False)
    
print("\n" + "="*60)
print("SUBMISSION COMPLETE!")
print("="*60)
print(f"Total events: {len(submission):,}")
print(f"Unique videos: {submission.video_id.nunique()}")
print(f"Actions: {submission.action.value_counts().to_dict()}")
print(f"Saved to: submission.csv")


TRAIN-AND-SUBMIT PIPELINE

SECTION 0/8 (9 sections total): 9 bodyparts, 7 videos
Processing 22 single mouse videos...
Shape: 544859 frames × 59 features




Processing 60 pair mouse videos...
Shape: 1524906 frames × 111 features

SECTION 1/8 (9 sections total): 10 bodyparts, 21 videos
Processing 32 single mouse videos...
Shape: 478728 frames × 68 features
Processing 41 pair mouse videos...
Shape: 613716 frames × 130 features

SECTION 2/8 (9 sections total): 9 bodyparts, 10 videos
Processing 37 single mouse videos...
Shape: 1941885 frames × 60 features
Processing 106 pair mouse videos...
Shape: 5607030 frames × 111 features

SECTION 3/8 (9 sections total): 8 bodyparts, 42 videos
Processing 76 pair mouse videos...
Shape: 2210177 frames × 94 features


[LightGBM] [Fatal] Check failed: (best_split_info.left_count) > (0) at /tmp/lightgbm/LightGBM/lightgbm-python/src/treelearner/serial_tree_learner.cpp, line 852 .




SECTION 4/8 (9 sections total): 7 bodyparts, 74 videos
Processing 76 pair mouse videos...
Shape: 960574 frames × 79 features

SECTION 5/8 (9 sections total): 5 bodyparts, 19 videos
Processing 22 single mouse videos...
Shape: 708496 frames × 33 features
Processing 38 pair mouse videos...
Shape: 10212910 frames × 55 features


[LightGBM] [Fatal] Check failed: (best_split_info.left_count) > (0) at /tmp/lightgbm/LightGBM/lightgbm-python/src/treelearner/serial_tree_learner.cpp, line 852 .



***Exception*** Check failed: (best_split_info.left_count) > (0) at /tmp/lightgbm/LightGBM/lightgbm-python/src/treel

SECTION 6/8 (9 sections total): 4 bodyparts, 17 videos
Processing 34 single mouse videos...
Shape: 899134 frames × 29 features
Processing 34 pair mouse videos...
Shape: 899134 frames × 46 features

SECTION 7/8 (9 sections total): 7 bodyparts, 634 videos
Processing 115 single mouse videos...
Shape: 3020371 frames × 45 features
Processing 677 pair mouse videos...
Shape: 12259207 frames × 79 features

SECTION 8/8 (9 sections total): 5 bodyparts, 24 videos
Processing 9 single mouse videos...
Shape: 329777 frames × 33 features
Processing 46 pair mouse videos...
Shape: 1700260 frames × 55 features

Training complete!

SUBMISSION COMPLETE!
Total events: 636
Unique videos: 1
Actions: {'rear': 318, 'avoid': 143, 'approach': 86, 'chase': 60, 'attack': 27, 'chaseattack': 2}
Saved to: submission.csv
