In [1]:
import itertools
import json, ast
from sklearn.base import clone
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import f1_score
import xgboost as xgb
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import joblib
import os, math
import gc
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Hardware check
import torch
print("="*60)
print("HARDWARE CHECK")
print("="*60)
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
print("="*60)

HARDWARE CHECK
GPU Available: True
GPU Name: Tesla T4


In [2]:
class CFG:
    # mode = "validate"
    mode = "submit"

    if mode == "validate":
        model_save_dir = "/kaggle/working"
    else:
        model_save_dir = "/kaggle/input/lgbm-model-mabe-results"

    train_csv_path = "/kaggle/input/MABe-mouse-behavior-detection/train.csv" 
    test_csv_path = "/kaggle/input/MABe-mouse-behavior-detection/test.csv"
    train_annotation_path = "/kaggle/input/MABe-mouse-behavior-detection/train_annotation"
    train_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/train_tracking"
    test_tracking_path = "/kaggle/input/MABe-mouse-behavior-detection/test_tracking"

    drop_body_parts =  [
        'headpiece_bottombackleft', 'headpiece_bottombackright', 'headpiece_bottomfrontleft', 'headpiece_bottomfrontright', 
        'headpiece_topbackleft', 'headpiece_topbackright', 'headpiece_topfrontleft', 'headpiece_topfrontright', 
        'spine_1', 'spine_2', 'tail_middle_1', 'tail_middle_2', 'tail_midpoint'
    ]

In [3]:
train_csv = pd.read_csv(CFG.train_csv_path) 
test_csv = pd.read_csv(CFG.test_csv_path)

In [4]:
mask_lab = train_csv["lab_id"].str.startswith("MABe22")
mask_behavior = train_csv["behaviors_labeled"].isna() | (train_csv["behaviors_labeled"].str.strip() == "")
mask_drop = mask_lab | mask_behavior

train = train_csv[~mask_drop]
body_parts_list = list(np.unique(train.body_parts_tracked))

## Data Generator

In [5]:
def generate_mouse_data(datasubset, mode, traintest_directory=None, generate_single=True, generate_pair=True):
    """
    Yields:
        (mode, X, meta, y)
        mode: "single" hoặc "pair"
        X: raw features DataFrame
        meta: metadata DataFrame
        y: labels (đối với train mode) hoặc action list (đối với test mode)
    """

    if traintest_directory is None:
        traintest_directory = f"/kaggle/input/MABe-mouse-behavior-detection/{mode}_tracking"

    for idx, row in datasubset.iterrows():
        lab_id = row.lab_id
        video_id = row.video_id
        pix_per_cm = row.pix_per_cm_approx
        fps = row.frames_per_second

        # Bỏ qua MABe22 labs hoặc missing behaviors
        if lab_id.startswith("MABe22"):
            continue
        if mode == "train" and (pd.isna(row.behaviors_labeled) or str(row.behaviors_labeled).strip() == ""):
            continue

        path = f"{traintest_directory}/{lab_id}/{video_id}.parquet"

        # Load tracking
        try:
            vid = pd.read_parquet(path)
        except FileNotFoundError:
            continue

        # Bỏ bớt bodyparts
        if len(np.unique(vid.bodypart)) > 5:
            vid = vid[~vid.bodypart.isin(CFG.drop_body_parts)]

        pvid = vid.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"]
        )
        pvid = pvid.reorder_levels([1, 2, 0], axis=1).T.sort_index().T
        pvid /= pix_per_cm

        del vid
        gc.collect()

        mouse_ids = pvid.columns.get_level_values(0).unique().tolist()

        # Tìm behaviors tracked trong CSV file
        vid_behaviors = json.loads(row.behaviors_labeled)
        vid_behaviors = sorted(list({b.replace("'", "") for b in vid_behaviors}))
        vid_behaviors = [b.split(',') for b in vid_behaviors]
        vid_behaviors = pd.DataFrame(vid_behaviors, columns=["agent", "target", "action"])

        # Load annotation (đối với training mode)
        if mode == "train":
            try: 
                annot = pd.read_parquet(path.replace("train_tracking", "train_annotation"))
            except FileNotFoundError:
                continue
        else:
            annot = None


        # Build data cho single mouse 
        if generate_single:
            vid_behaviors_subset = vid_behaviors.query("target == 'self'")

            for mouse_id_str in vid_behaviors_subset.agent.unique():
                try:
                    mouse_id = int(mouse_id_str.replace("mouse", ""))
                
                    if mouse_id not in mouse_ids:
                        continue

                    vid_agent_actions = np.unique(vid_behaviors_subset.query("agent == @mouse_id_str").action)

                    # Single mouse raw features - toạ độ bodyparts
                    single_mouse = pvid.loc[:, mouse_id]
                    assert len(single_mouse) == len(pvid)
                
                    # Single mouse meta data
                    meta = pd.DataFrame({
                        "video_id": video_id,
                        "agent_id": mouse_id_str,
                        "target_id": "self",
                        "video_frame": single_mouse.index,
                    })

                    # Single mouse labels
                    if mode == "train":
                        labels = pd.DataFrame(0.0, index=single_mouse.index, columns=vid_agent_actions)
                    
                        annot_subset = annot.query("(agent_id == @mouse_id) & (target_id == @mouse_id)")

                        for i in range(len(annot_subset)):
                            annot_row = annot_subset.iloc[i]
                            labels.loc[annot_row["start_frame"]:annot_row["stop_frame"], annot_row.action] = 1.0
                        yield "single", single_mouse, meta, labels
                    else:
                        yield "single", single_mouse, meta, vid_agent_actions

                except (KeyError, ValueError):
                    pass

        # Build data cho mouse pair
        if generate_pair:
            vid_behaviors_subset = vid_behaviors.query("target != 'self'")

            if len(vid_behaviors_subset) > 0:
                for agent, target in itertools.permutations(np.unique(pvid.columns.get_level_values("mouse_id")), 2):
                    agent_str = f"mouse{agent}"
                    target_str = f"mouse{target}"

                    vid_agent_actions = np.unique(vid_behaviors_subset.query("(agent == @agent_str) & (target == @target_str)").action)

                    if len(vid_agent_actions) == 0:
                        continue

                    # Mouse pair raw features - toạ độ bodyparts của cặp chuột
                    mouse_pair = pd.concat([pvid[agent], pvid[target]], axis=1, keys=["A", "B"])  # Raw coordinates
                    assert len(mouse_pair) == len(pvid)

                    # Mouse pair meta data
                    meta = pd.DataFrame({
                        "video_id": video_id,
                        "agent_id": agent_str,
                        "target_id": target_str,
                        "video_frame": pvid.index,
                    })

                    # Mouse pair labels
                    if mode == "train":
                        labels = pd.DataFrame(0.0, index=pvid.index, columns=vid_agent_actions)
                    
                        annot_subset = annot.query("(agent_id == @agent) & (target_id == @target)")

                        for i in range(len(annot_subset)):
                            annot_row = annot_subset.iloc[i]
                            labels.loc[annot_row["start_frame"]:annot_row["stop_frame"], annot_row.action] = 1.0
                        yield "pair", mouse_pair, meta, labels
                    else:
                        yield "pair", mouse_pair, meta, vid_agent_actions

## Feature Engineering

In [6]:
def _fps_from_meta(meta_df, fallback_lookup, default_fps=30.0):
    """Get FPS with proper fallback chain"""
    if "frames_per_second" in meta_df.columns:
        fps_val = meta_df["frames_per_second"].iloc[0]
        if pd.notnull(fps_val) and fps_val > 0:
            return float(fps_val)
    
    vid = meta_df["video_id"].iloc[0]
    if vid in fallback_lookup:
        return float(fallback_lookup[vid])
    
    return default_fps

In [7]:
def calculate_centers(df):
    """
    Đảm bảo "body_center" tồn tại với mọi chuột hoặc bodyparts combination.
    Xử lý cả cột 2-level (bodypart, coord) (từng chuột) và 3-level (mouse_id, bodypart, coord) (cặp chuột).
    
    Fallback logic:
    1. Nếu nose và tail_base tồn tại → midpoint(nose, tail_base)
    2. Else if head và tail_base tồn tại → midpoint(head, tail_base)
    3. Else if chỉ tail_base tồn tại → use tail_base
    4. Else → không tính được body_center
    """
    cols = df.columns

    # Cột 2-level (bodypart, coord)
    if cols.nlevels == 2:
        if ("body_center", "x") not in df.columns or ("body_center", "y") not in df.columns:
            if ("nose", "x") in df.columns and ("tail_base", "x") in df.columns:
                df[("body_center", "x")] = (df[("nose", "x")] + df[("tail_base", "x")]) / 2
                df[("body_center", "y")] = (df[("nose", "y")] + df[("tail_base", "y")]) / 2
            elif ("head", "x") in df.columns and ("tail_base", "x") in df.columns:
                df[("body_center", "x")] = (df[("head", "x")] + df[("tail_base", "x")]) / 2
                df[("body_center", "y")] = (df[("head", "y")] + df[("tail_base", "y")]) / 2
            elif ("tail_base", "x") in df.columns:
                df[("body_center", "x")] = df[("tail_base", "x")]
                df[("body_center", "y")] = df[("tail_base", "y")]
            else:
                # no valid bodyparts → fill NaN
                df[("body_center", "x")] = np.nan
                df[("body_center", "y")] = np.nan

    # Cột 3-level (mouse_id, bodypart, coord)
    elif cols.nlevels == 3:
        mice = sorted(list(set(c[0] for c in cols)))

        for m in mice:
            has_body_center = ((m, "body_center", "x") in cols) and ((m, "body_center", "y") in cols)
            if not has_body_center:
                if ((m, "nose", "x") in cols) and ((m, "tail_base", "x") in cols):
                    df[(m, "body_center", "x")] = (df[(m, "nose", "x")] + df[(m, "tail_base", "x")]) / 2
                    df[(m, "body_center", "y")] = (df[(m, "nose", "y")] + df[(m, "tail_base", "y")]) / 2
                elif ((m, "head", "x") in cols) and ((m, "tail_base", "x") in cols):
                    df[(m, "body_center", "x")] = (df[(m, "head", "x")] + df[(m, "tail_base", "x")]) / 2
                    df[(m, "body_center", "y")] = (df[(m, "head", "y")] + df[(m, "tail_base", "y")]) / 2
                elif ((m, "tail_base", "x") in cols):
                    df[(m, "body_center", "x")] = df[(m, "tail_base", "x")]
                    df[(m, "body_center", "y")] = df[(m, "tail_base", "y")]
                else:
                    df[(m, "body_center", "x")] = np.nan
                    df[(m, "body_center", "y")] = np.nan
    return df

def calculate_speed_lag(df, part, fps, lag=10, mouse=None):
    cols = df.columns
    if mouse is not None:
        x = df[(mouse, part, "x")]
        y = df[(mouse, part, "y")]
    else:
        x = df[(part, "x")]
        y = df[(part, "y")]

    if x.isna().all() or y.isna().all():
        # all missing → return zeros
        return pd.Series(0, index=df.index)

    dx = x.diff(lag)
    dy = y.diff(lag)
    speed = np.sqrt(dx**2 + dy**2) * fps
    return speed.fillna(0)

# Tính các thống kê của 1 đại lượng theo nhiều cửa sổ thời gian
def calculate_window_stats(df, metric, name, fps, scales=[60, 90, 120]):
    """
    Thêm rolling statistics cho bất kỳ series nào.
    
    metric : pd.Series (ví dụ speed, distance, curvature...)
    fps    : frames_per_second
    scales : list window sizes quy đổi theo 30fps → mặc định [60, 90, 120]
    """
    res = pd.DataFrame(index=df.index)
    for scale in scales:
        ws = max(1, int(round(scale * float(fps) / 30)))
        roll = metric.rolling(ws, min_periods=max(1, ws//4))

        res[f"{name}_mean_{scale}"] = roll.mean()
        res[f"{name}_std_{scale}"]  = roll.std()
        res[f"{name}_min_{scale}"]  = roll.min()
        res[f"{name}_max_{scale}"]  = roll.max()

    return res

In [8]:
def build_single_features(single_mouse_df, body_parts_tracked, meta_fps):
    single_mouse_df = calculate_centers(single_mouse_df)

    # Get actual bodypart columns
    available_body_parts = single_mouse_df.columns.get_level_values(0).unique()
    
    # === Shape and Position Features ===
    # Euclidean distances giữa các cặp bodyparts
    X = pd.DataFrame({
        f"{p1}+{p2}": np.sqrt(
            (single_mouse_df[(p1, "x")] - single_mouse_df[(p2, "x")])**2 + 
            (single_mouse_df[(p1, "y")] - single_mouse_df[(p2, "y")])**2
        )
        for p1, p2 in itertools.combinations(body_parts_tracked, 2)
        if p1 in available_body_parts and p2 in available_body_parts
    })
    
    expected_cols = [f"{p1}+{p2}" for p1, p2 in itertools.combinations(body_parts_tracked, 2)]
    X = X.reindex(columns=expected_cols, copy=False)
    
    # Elongation (chỉ khi required bodyparts tồn tại)
    if "nose" in available_body_parts and "tail_base" in available_body_parts and "ear_left" in available_body_parts and "ear_right" in available_body_parts:
        X["elong"] = X["nose+tail_base"] / (X["ear_left+ear_right"] + 1e-6)
    else:
        X["elong"] = 0.0
    
    # Body angle (chỉ khi nose, tail_base, body_center tồn tại)
    if all(bp in available_body_parts for bp in ["nose", "tail_base", "body_center"]):
        v1_x = single_mouse_df[("nose","x")] - single_mouse_df[("body_center","x")]
        v1_y = single_mouse_df[("nose","y")] - single_mouse_df[("body_center","y")]
        v2_x = single_mouse_df[("tail_base","x")] - single_mouse_df[("body_center","x")]
        v2_y = single_mouse_df[("tail_base","y")] - single_mouse_df[("body_center","y")]
        X["body_angle"] = (v1_x*v2_x + v1_y*v2_y) / (np.sqrt(v1_x**2+v1_y**2) * np.sqrt(v2_x**2+v2_y**2) + 1e-6)
    else:
        X["body_angle"] = 0.0
    
    # === Movement Features ===
    if "body_center" in available_body_parts:
        X["speed"] = np.sqrt(
            single_mouse_df[("body_center", "x")].diff()**2 +
            single_mouse_df[("body_center", "y")].diff()**2
        )
        X["accelerate"] = X["speed"].diff()
    else:
        X["speed"] = 0.0
        X["accelerate"] = 0.0
    
    # For available bodyparts
    for p in ["body_center", "ear_left", "ear_right"]:
        if p in available_body_parts:
            # Speed lag features
            X[f"speed_{p}_lag_10"] = calculate_speed_lag(single_mouse_df, p, fps=meta_fps)

            # Rolling stats
            speed = np.sqrt(single_mouse_df[(p, "x")].diff()**2 + single_mouse_df[(p, "y")].diff()**2) * float(meta_fps)
            res = calculate_window_stats(single_mouse_df, speed, f"speed_{p}", fps=meta_fps)
            X = pd.concat([X, res], axis=1)

            # Curvature
            lag = max(1, int(round(10 * float(meta_fps) / 30)))
            shifted_x = single_mouse_df[(p,"x")].shift(lag)
            shifted_y = single_mouse_df[(p,"y")].shift(lag)
            X[f"curvature_{p}_lag_{lag}"] = np.sqrt((shifted_x - single_mouse_df[(p,"x")])**2 + 
                                                              (shifted_y - single_mouse_df[(p,"y")])**2)
        else:
            # Dummy columns
            X[f"speed_{p}_lag_10"] = 0.0
            
            for scale in [40, 60, 80]:
                X[f"speed_{p}_mean_{scale}"] = 0.0
                X[f"speed_{p}_std_{scale}"] = 0.0
                X[f"speed_{p}_min_{scale}"] = 0.0
                X[f"speed_{p}_max_{scale}"] = 0.0
                
            lag = max(1, int(round(10 * float(meta_fps) / 30)))
            X[f"curvature_{p}_lag_{lag}"] = 0.0

    X = X.T.drop_duplicates().T
    
    return X.astype(np.float32, copy=False).fillna(0)


def build_pair_features(mouse_pair_df, body_parts_tracked, meta_fps):
    mouse_pair_df = calculate_centers(mouse_pair_df)
    
    # Get bodyparts for both mice
    avail_A = mouse_pair_df["A"].columns.get_level_values(0).unique()
    avail_B = mouse_pair_df["B"].columns.get_level_values(0).unique()
    
    # Pairwise distances
    X = pd.DataFrame({
        f"A_{p1}+B_{p2}": np.sqrt(
            (mouse_pair_df[("A", p1, "x")] - mouse_pair_df[("B", p2, "x")])**2 +
            (mouse_pair_df[("A", p1, "y")] - mouse_pair_df[("B", p2, "y")])**2
        )
        for p1, p2 in itertools.product(body_parts_tracked, repeat=2)
        if p1 in avail_A and p2 in avail_B
    })
    
    expected_cols = [f"A_{p1}+B_{p2}" for p1, p2 in itertools.product(body_parts_tracked, repeat=2)]
    X = X.reindex(columns=expected_cols, copy=False)
    
    # Relative orientation (chỉ khi nose (không có nose thì head) và tail_base tồn tại ở cả 2 chuột)
    if all(bp in avail_A for bp in ["nose","tail_base"]) and all(bp in avail_B for bp in ["nose","tail_base"]):
        vec_A_x = mouse_pair_df[("A", "nose", "x")] - mouse_pair_df[("A", "tail_base", "x")]
        vec_A_y = mouse_pair_df[("A", "nose", "y")] - mouse_pair_df[("A", "tail_base", "y")]
        vec_B_x = mouse_pair_df[("B", "nose", "x")] - mouse_pair_df[("B", "tail_base", "x")]
        vec_B_y = mouse_pair_df[("B", "nose", "y")] - mouse_pair_df[("B", "tail_base", "y")]
        X["relative_orientation"] = (vec_A_x*vec_B_x + vec_A_y*vec_B_y) / (
            np.sqrt(vec_A_x**2 + vec_A_y**2) * np.sqrt(vec_B_x**2 + vec_B_y**2) + 1e-6
        )
    elif all(bp in avail_A for bp in ["head","tail_base"]) and all(bp in avail_B for bp in ["head","tail_base"]):
        vec_A_x = mouse_pair_df[("A", "head", "x")] - mouse_pair_df[("A", "tail_base", "x")]
        vec_A_y = mouse_pair_df[("A", "head", "y")] - mouse_pair_df[("A", "tail_base", "y")]
        vec_B_x = mouse_pair_df[("B", "head", "x")] - mouse_pair_df[("B", "tail_base", "x")]
        vec_B_y = mouse_pair_df[("B", "head", "y")] - mouse_pair_df[("B", "tail_base", "y")]
        X["relative_orientation"] = (vec_A_x*vec_B_x + vec_A_y*vec_B_y) / (
            np.sqrt(vec_A_x**2 + vec_A_y**2) * np.sqrt(vec_B_x**2 + vec_B_y**2) + 1e-6
        )
    else:
        X["relative_orientation"] = 0.0
    
    # Các đại lượng dựa trên body_center
    if "body_center" in avail_A and "body_center" in avail_B:
        dist_center = np.sqrt(
            (mouse_pair_df[("A", "body_center", "x")] - mouse_pair_df[("B", "body_center", "x")])**2 +
            (mouse_pair_df[("A", "body_center", "y")] - mouse_pair_df[("B", "body_center", "y")])**2
        )

        # Approach
        approach = dist_center.diff().fillna(0)
        X["approach_A"] = approach
        X["approach_B"] = approach
        
        # Relative distance stats
        res = calculate_window_stats(mouse_pair_df, dist_center**2, "center_distance", fps=meta_fps)
        X = pd.concat([X, res], axis=1)
        
        # Relative speed
        speed_A = calculate_speed_lag(mouse_pair_df, "body_center", meta_fps, mouse="A")
        speed_B = calculate_speed_lag(mouse_pair_df, "body_center", meta_fps, mouse="B")
        X["speed_A_lag_10"] = speed_A
        X["speed_B_lag_10"] = speed_B
        X["relative_speed_A_B_lag_10"] = (speed_A - speed_B).abs()

        # Ngưỡng khoảng cách
        thresholds = {
            "very_close": 20,
            "close": 40,
            "medium": 60
        }
        X["very_close"] = (dist_center < thresholds["very_close"]).astype(float)
        X["close"] = ((dist_center >= thresholds["very_close"]) & (dist_center < thresholds["close"])).astype(float)
        X["medium"] = ((dist_center >= thresholds["close"]) & (dist_center < thresholds["medium"])).astype(float)
        X["far"] = (dist_center >= thresholds["medium"]).astype(float)

    else:
        # Dummy columns
        X["approach_A"] = 0.0
        X["approach_B"] = 0.0
        
        for scale in [40, 60, 80]:
            X[f"center_distance_mean_{scale}"] = 0.0
            X[f"center_distance_std_{scale}"] = 0.0
            X[f"center_distance_min_{scale}"] = 0.0
            X[f"center_distance_max_{scale}"] = 0.0
        
        X["speed_A_lag_10"] = 0.0
        X["speed_B_lag_10"] = 0.0
        X["relative_speed_A_B_lag_10"] = 0.0

        X["very_close"] = 0.0
        X["close"] = 0.0
        X["medium"] = 0.0
        X["far"] = 0.0

    X = X.T.drop_duplicates().T
    
    return X.astype(np.float32, copy=False).fillna(0)

## Training Pipeline

In [9]:
def train_action_models(X_all, y_all, meta_all, section_idx, mode, n_splits=3):
    """
    Train binary classifiers cho từng action với cross-validation.
    
    Returns:
        f1_scores: list các tuple (section, mode, action, f1_score)
        thresholds: dict mapping action -> optimal threshold
    """
    
    save_dir = f"/kaggle/working/models/{section_idx}/{mode}"
    os.makedirs(save_dir, exist_ok=True)
    
    f1_scores = []
    thresholds = {}
    actions = y_all.columns
    
    print(f"\nTraining {len(actions)} action classifiers...")
    
    for action in actions:
        print(f"\n[{mode.upper()} | {action}]", end=" ")
        
        # Lấy các frame được labeled là action này
        action_mask = ~y_all[action].isna()
        if action_mask.sum() == 0:
            print("No labels, skipping")
            continue
        
        y = y_all[action][action_mask].values.astype(np.int8)
        X = X_all[action_mask].values.astype(np.float32)
        groups = meta_all["video_id"][action_mask].values
        
        if y.sum() == 0:
            print("-> No positive samples, skipping")
            continue
        
        # Kiểm tra có đủ groups cho CV không
        unique_groups = np.unique(groups)
        n_groups = len(unique_groups)
        
        if n_groups < 2:
            print(f"-> Only {n_groups} video, training without CV")
            
            # Train without CV
            clf = LGBMClassifier(
                n_estimators=100, learning_rate=0.05, max_depth=5,
                num_leaves=31, subsample=0.8, 
                min_child_samples=20, reg_alpha=0.1, reg_lambda=0.1, force_col_wise=True,
                random_state=42, n_jobs=-1, verbose=-1
            )
            clf.fit(X, y)
            
            joblib.dump(clf, f"{save_dir}/{action}_model.pkl")
            joblib.dump(0.5, f"{save_dir}/{action}_threshold.pkl")
            
            thresholds[action] = 0.5
            f1_scores.append((section_idx, mode, action, -1.0))
            
            del clf
            gc.collect()
            continue
        
        # Cross-validation
        n_splits_use = min(n_splits, n_groups)
        print(f"-> CV with {n_splits_use} folds", end=" ")
        
        gkf = StratifiedKFold(n_splits=n_splits_use, shuffle=True)
        oof_preds = np.zeros(len(y), dtype=np.float32)
        
        for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
            clf = LGBMClassifier(
                n_estimators=100, learning_rate=0.05, max_depth=5,
                num_leaves=31, subsample=0.8,
                min_child_samples=20, reg_alpha=0.1, reg_lambda=0.1, force_col_wise=True,
                random_state=42, n_jobs=-1, verbose=-1
            )
            clf.fit(X[train_idx], y[train_idx])
            oof_preds[val_idx] = clf.predict_proba(X[val_idx])[:, 1]
            
            del clf
            gc.collect()
        
        # Threshold tuning
        best_f1, best_thresh = 0.0, 0.5
        
        for thresh in np.linspace(0.1, 0.9, 17):
            y_pred = (oof_preds >= thresh).astype(int)
            f1 = f1_score(y, y_pred, zero_division=0)
            
            if f1 > best_f1:
                best_f1 = f1
                best_thresh = thresh
        
        print(f"-> F1: {best_f1:.4f} @ thresh={best_thresh:.3f}")
        
        # Train final model
        final_clf = LGBMClassifier(
            n_estimators=100, learning_rate=0.05, max_depth=5,
            num_leaves=31, subsample=0.8, colsample_bytree=0.8,
            min_child_sample=20, reg_alpha=0.1, reg_lambda=0.1, force_col_wise=True,
            random_state=42, n_jobs=-1, verbose=-1
        )
        final_clf.fit(X, y)
        
        # Save model và threshold
        joblib.dump(final_clf, f"{save_dir}/{action}_model.pkl")
        joblib.dump(best_thresh, f"{save_dir}/{action}_threshold.pkl")
        
        thresholds[action] = best_thresh
        f1_scores.append((section_idx, mode, action, best_f1))
        
        del final_clf, oof_preds
        gc.collect()
    
    return f1_scores, thresholds

## Predict functions

In [10]:
def predict_actions(X, models, actions):
    """
    Dự đoán nhiều action.
    
    Args:
        X: feature DataFrame
        models: dict of trained classifiers {action: model}
        actions: danh sách actions để predict
    
    Returns:
        DataFrame với xác suất mỗi actions
    """
    
    proba_df = pd.DataFrame(index=X.index)
    
    for action in actions:
        if action not in models:
            # Action not trained in this section
            proba_df[action] = 0.0
            continue
        
        model = models[action]
        
        try:
            proba = model.predict_proba(X.values)[:, 1]
        except (ValueError, AttributeError) as e:
            # Feature mismatch or missing feature_names_in_
            # Align features to match what model expects
            try:
                # Get features model was trained on
                expected_features = model.feature_names_in_
                
                # Reindex X to match expected features
                # Missing features filled with 0
                X_aligned = X.reindex(columns=expected_features, fill_value=0)
                proba = model.predict_proba(X_aligned.values)[:, 1]
                
            except AttributeError:
                try:
                    n_features = model.n_features_in_
                    
                    if X.shape[1] < n_features:
                        # Pad with zeros
                        padding = pd.DataFrame(
                            0, 
                            index=X.index, 
                            columns=range(X.shape[1], n_features)
                        )
                        X_padded = pd.concat([X, padding], axis=1)
                        proba = model.predict_proba(X_padded.values)[:, 1]
                    elif X.shape[1] > n_features:
                        # Truncate
                        proba = model.predict_proba(X.iloc[:, :n_features].values)[:, 1]
                    else:
                        proba = model.predict_proba(X.values)[:, 1]
                        
                except:
                    # Last resort: skip this action
                    print(f"⚠ Could not predict {action}, skipping")
                    proba_df[action] = 0.0
                    continue
        
        proba_df[action] = proba
    
    return proba_df


def predict_multiclass(pred, meta, thresholds):
    """
    Đổi frame probabilities thành chuỗi frame có action đó
    Args:
        pred: DataFrame (num_frames, num_actions) with probabilities
        meta: DataFrame with video_frame, agent_id, target_id
        thresholds: dict of thresholds per action
    Returns:
        DataFrame with columns: video_id, agent_id, target_id, action, start_frame, stop_frame
    """
    if len(pred.columns) == 0:
        return pd.DataFrame()

    # Get action có xác suất cao nhất mỗi frame
    ama = np.argmax(pred.values, axis=1)
    max_proba = pred.max(axis=1).values
    
    # Áp dụng thresholds
    threshold_array = np.array([thresholds.get(col, 0.5) for col in pred.columns])
    action_thresholds = threshold_array[ama]
    ama = np.where(max_proba >= action_thresholds, ama, -1)
    ama = pd.Series(ama, index=meta.video_frame.values)
    
    # Detect changes
    changes_mask = (ama != ama.shift(1)).values
    ama_changes = ama[changes_mask]
    meta_changes = meta[changes_mask]
    
    mask = ama_changes.values >= 0
    mask[-1] = False

    if mask.sum() == 0:
        return pd.DataFrame()
    
    submission_part = pd.DataFrame({
        'video_id': meta_changes["video_id"].values[mask],
        'agent_id': meta_changes["agent_id"].values[mask],
        'target_id': meta_changes["target_id"].values[mask],
        'action': pred.columns[ama_changes.values[mask]],
        'start_frame': ama_changes.index[mask],
        'stop_frame': ama_changes.index[1:][mask[:-1]]
    })
    
    # Fix stop_frame với mỗi bộ video/agent/target
    stop_video_id = meta_changes["video_id"].values[1:][mask[:-1]]
    stop_agent_id = meta_changes["agent_id"].values[1:][mask[:-1]]
    stop_target_id = meta_changes["target_id"].values[1:][mask[:-1]]
    
    for i in range(len(submission_part)):
        video_id = submission_part.video_id.iloc[i]
        agent_id = submission_part.agent_id.iloc[i]
        target_id = submission_part.target_id.iloc[i]
        
        if (stop_video_id[i] != video_id or 
            stop_agent_id[i] != agent_id or 
            stop_target_id[i] != target_id):

            new_stop_frame = meta[meta.video_id == video_id].video_frame.max() + 1
            submission_part.iat[i, submission_part.columns.get_loc("stop_frame")] = new_stop_frame
    
    return submission_part

In [11]:
def clean_submission(submission, dataset, traintest, traintest_directory=None):
    """
    Làm sạch submission:
    1. Bỏ các chuỗi start_frame >= stop_frame
    2. Bỏ các chuỗi bị lặp (cùng agent-target)
    3. Điền video trống với dummy predictions
    """
    if traintest_directory is None:
        traintest_directory = f"/kaggle/input/MABe-mouse-behavior-detection/{traintest}_tracking"
    
    # Bỏ invalid frames
    old_len = len(submission)
    submission = submission[submission.start_frame < submission.stop_frame]
    if len(submission) != old_len:
        print(f"⚠ Removed {old_len - len(submission)} events with start >= stop")
    
    # Bỏ chuỗi bị lặp
    old_len = len(submission)
    group_list = []
    for _, group in submission.groupby(["video_id", "agent_id", "target_id"]):
        group = group.sort_values("start_frame")
        mask = np.ones(len(group), dtype=bool)
        last_stop_frame = 0
        for i, (_, row) in enumerate(group.iterrows()):
            if row["start_frame"] < last_stop_frame:
                mask[i] = False
            else:
                last_stop_frame = row["stop_frame"]
        group_list.append(group[mask])
    
    submission = pd.concat(group_list)
    if len(submission) != old_len:
        print(f"⚠ Removed {old_len - len(submission)} overlapping events")
    
    # Điền video trống
    s_list = []
    for idx, row in dataset.iterrows():
        lab_id = row["lab_id"]
        if lab_id.startswith('MABe22'):
            continue
        
        video_id = row["video_id"]
        if (submission.video_id == video_id).any():
            continue

        if type(row.behaviors_labeled) != str:
            continue
        
        print(f"⚠ Video {video_id} has no predictions, filling...")
        
        path = f"{traintest_directory}/{lab_id}/{video_id}.parquet"
        try:
            vid = pd.read_parquet(path)
        except:
            continue
        
        vid_behaviors = json.loads(row["behaviors_labeled"])
        vid_behaviors = sorted(list({b.replace("'", "") for b in vid_behaviors}))
        vid_behaviors = [b.split(',') for b in vid_behaviors]
        vid_behaviors = pd.DataFrame(vid_behaviors, columns=["agent", "target", "action"])
        
        start_frame = vid.video_frame.min()
        stop_frame = vid.video_frame.max() + 1
        
        for (agent, target), actions in vid_behaviors.groupby(["agent", "target"]):
            batch_length = int(np.ceil((stop_frame - start_frame) / len(actions)))
            for i, (_, action_row) in enumerate(actions.iterrows()):
                batch_start = start_frame + i * batch_length
                batch_stop = min(batch_start + batch_length, stop_frame)
                s_list.append((video_id, agent, target, action_row["action"], batch_start, batch_stop))
    
    if len(s_list) > 0:
        submission = pd.concat([
            submission,
            pd.DataFrame(s_list, columns=["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"])
        ])
    
    submission = submission.reset_index(drop=True)
    return submission

## Execution

### Training pipeline

In [12]:
if CFG.mode == "validate":
    print("\n" + "="*60)
    print("TRAINING PIPELINE")
    print("="*60)

    thresholds_all = {"single": {}, "pair": {}}
    f1_list_all = []

    # Train model
    for section in range(len(body_parts_list)):
        # Lấy body_parts_tracked trong số 9 bộ của toàn dataset
        body_parts_tracked_str = body_parts_list[section]
        body_parts_tracked = json.loads(body_parts_tracked_str)

        if len(body_parts_tracked) > 5:
            body_parts_tracked = [b for b in body_parts_tracked if b not in CFG.drop_body_parts]

        # Lấy các rows/videos được thu với body_parts_tracked tương ứng
        train_subset = train[train.body_parts_tracked == body_parts_tracked_str]

        if train_subset.empty:
            print("\nNo videos in this section, skipping...")
            continue

        print("\n" + "="*60)
        print(f"SECTION {section}/{len(body_parts_list)-1} (9 sections total): {len(body_parts_tracked)} bodyparts, {len(train_subset)} videos")
        print("="*60)

        fps_lookup = (
            train_subset[["video_id", "frames_per_second"]]
            .drop_duplicates("video_id")
            .set_index("video_id")["frames_per_second"]
            .to_dict()
        )
    
        single_mouse = []
        single_meta = []
        single_y = []

        pair_mouse = []
        pair_meta = []
        pair_y = []

        # Accumulate generated data
        for mode, data, meta, labels in generate_mouse_data(train_subset, mode="train"):
            video_id = meta["video_id"].iloc[0]
            fps = fps_lookup.get(video_id, 30.0)

            if mode == "single":
                single_mouse.append(data)
                single_meta.append(meta)
                single_y.append(labels)

            else:
                pair_mouse.append(data)
                pair_meta.append(meta)
                pair_y.append(labels)

            del data, meta, labels
        gc.collect()

        # Train single models
        if len(single_mouse) > 0:
            single_X = []

            for data_i, meta_i, in zip(single_mouse, single_meta):
                fps_i = _fps_from_meta(meta_i, fps_lookup, default_fps=30.0)

                X_i = build_single_features(data_i, body_parts_tracked, fps_i).astype(np.float32)
                single_X.append(X_i)

                del X_i, fps_i
            gc.collect()
        
            X_all = pd.concat(single_X, ignore_index=True)
            y_all = pd.concat(single_y, ignore_index=True)
            meta_all = pd.concat(single_meta, ignore_index=True)

            del single_X, single_mouse, single_meta, single_y
            gc.collect()
        
            print(f"Shape: {X_all.shape[0]} frames × {X_all.shape[1]} features")
        
            f1_list, thresholds = train_action_models(
                X_all, y_all, meta_all, section, "single", n_splits=3
            )
            f1_list_all.extend(f1_list)
            thresholds_all["single"][section] = thresholds

            del X_all, y_all, meta_all
            gc.collect()

        # Train pair models
        if len(pair_mouse) > 0:
            pair_X = []
            for data_i, meta_i in zip(pair_mouse, pair_meta):
                fps_i = _fps_from_meta(meta_i, fps_lookup, default_fps=30.0)

                X_i = build_pair_features(data_i, body_parts_tracked, fps_i).astype(np.float32)
                pair_X.append(X_i)   

                del X_i, fps_i
            gc.collect()

            X_all = pd.concat(pair_X, ignore_index=True)
            y_all = pd.concat(pair_y, ignore_index=True)
            meta_all = pd.concat(pair_meta, ignore_index=True)  

            del pair_X, pair_mouse, pair_y, pair_meta
            gc.collect()
        
            print(f"Shape: {X_all.shape[0]} frames × {X_all.shape[1]} features")
        
            f1_list, thresholds = train_action_models(
                X_all, y_all, meta_all, section, "pair", n_splits=3
            )
            f1_list_all.extend(f1_list)
            thresholds_all["pair"][section] = thresholds

            del X_all, y_all, meta_all
            gc.collect()

    # Save thresholds and F1 scores
    joblib.dump(thresholds_all, "/kaggle/working/thresholds.pkl")
    f1_df = pd.DataFrame(f1_list_all, columns=["section", "mode", "action", "f1"])
    joblib.dump(f1_df, "/kaggle/working/f1_scores.pkl")

    print("\n" + "="*60)
    print(f"Training complete! Mean F1: {f1_df['f1'].mean():.4f}")
    print("="*60)

### Testing pipeline - Create submission

In [13]:
print("\n" + "="*60)
print("PREDICTION PIPELINE")
print("="*60)

# Load thresholds
thresholds_all = joblib.load(f"{CFG.model_save_dir}/thresholds.pkl")
submission_list = []

# Create fps_lookup for test set
fps_lookup = (
    test_csv[["video_id", "frames_per_second"]]
    .drop_duplicates("video_id")
    .set_index("video_id")["frames_per_second"]
    .to_dict()
)

for idx, row in test_csv.iterrows():
    video_id = row.video_id
    lab_id = row.lab_id
    pix_per_cm = row.pix_per_cm_approx
    fps = row.frames_per_second
    body_parts_tracked_str = row.body_parts_tracked
    
    # Tìm section
    try:
        section = body_parts_list.index(body_parts_tracked_str)
    except ValueError:
        print(f"Video {video_id}: body_parts not in training, skip")
        continue
    
    body_parts_tracked = json.loads(body_parts_tracked_str)
    if len(body_parts_tracked) > 5:
        body_parts_tracked = [b for b in body_parts_tracked if b not in CFG.drop_body_parts]
    
    print(f"Processing video {video_id} (section {section})...")

    # Tạo mini-dataset chỉ cho video này
    video_df = pd.DataFrame([row])

    single_dir = f"{CFG.model_save_dir}/models/{section}/single"
    pair_dir = f"{CFG.model_save_dir}/models/{section}/pair"
        
    # Load single models
    models_single = {}
    if os.path.exists(single_dir):
        for f in os.listdir(single_dir):
            if f.endswith("_model.pkl"):
                action = f.replace("_model.pkl", "")
                models_single[action] = joblib.load(f"{single_dir}/{f}")
        
    # Load pair models
    models_pair = {}
    if os.path.exists(pair_dir):
        for f in os.listdir(pair_dir):
            if f.endswith("_model.pkl"):
                action = f.replace("_model.pkl", "")
                models_pair[action] = joblib.load(f"{pair_dir}/{f}")
        
    thr_single = thresholds_all["single"].get(section, {})
    thr_pair = thresholds_all["pair"].get(section, {})

    # Predict single
    for mode, data, meta, actions in generate_mouse_data(video_df, mode="test", generate_single=True, generate_pair=False):
        fps_i = _fps_from_meta(meta, fps_lookup, default_fps=30.0)

        if len(models_single) == 0:
            continue
        # Feature engineering
        X = build_single_features(data, body_parts_tracked, fps_i)

        # Predict
        preds = predict_actions(X, models_single, actions)

        if len(preds.columns) > 0:
            sub_part = predict_multiclass(preds, meta, thr_single)
            if len(sub_part) > 0:
                submission_list.append(sub_part)

        del data, X, preds
        gc.collect()

    # Predict pair
    for mode, data, meta, actions in generate_mouse_data(video_df, mode="test", generate_single=False, generate_pair=True):
        fps_i = _fps_from_meta(meta, fps_lookup, default_fps=30.0)

        if len(models_pair) == 0:
            continue
        # Feature engineering
        X = build_pair_features(data, body_parts_tracked, fps_i)

        # Predict
        preds = predict_actions(X, models_pair, actions)

        if len(preds.columns) > 0:
            sub_part = predict_multiclass(preds, meta, thr_pair)
            if len(sub_part) > 0:
                submission_list.append(sub_part)

        del data, X, preds
        gc.collect()

    del models_single, models_pair
    gc.collect()


# Tạo final submission
if len(submission_list) > 0:
    submission = pd.concat(submission_list, ignore_index=True)
else:
    # Empty fallback
    print("WARNING: No predictions generated!")
    submission = pd.DataFrame({
        "video_id": [], "agent_id": [], "target_id": [],
        "action": [], "start_frame": [], "stop_frame": []
    })

# Làm sạch submission
submission = clean_submission(submission, test_csv, "test", CFG.test_tracking_path)
    
# Thêm row_id
submission.insert(0, "row_id", range(len(submission)))
    
# Save
submission.to_csv("submission.csv", index=False)
    
print("\n" + "="*60)
print("SUBMISSION COMPLETE!")
print("="*60)
print(f"Total events: {len(submission):,}")
print(f"Unique videos: {submission.video_id.nunique()}")
print(f"Actions: {submission.action.value_counts().to_dict()}")
print(f"Saved to: submission.csv")


PREDICTION PIPELINE
Processing video 438887472 (section 0)...

SUBMISSION COMPLETE!
Total events: 855
Unique videos: 1
Actions: {'rear': 593, 'approach': 136, 'avoid': 86, 'submit': 18, 'chase': 18, 'attack': 4}
Saved to: submission.csv
