In [None]:
from sklearn.base import clone
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import joblib
import os
import gc

In [None]:
def train_binary_classifiers(data_list, mode, model, section, n_splits=5,
                             output_root="/kaggle/working/models"):
    """
    Train one binary classifier per action column and tune its decision threshold.

    For every ``(df_meta, df_feat, df_lab)`` entry of ``data_list`` and every
    column of ``df_lab``:

    1. out-of-fold probabilities are computed with ``GroupKFold`` using
       ``df_meta["video_frame"]`` as the group key;
    2. the threshold maximising F1 on those probabilities is picked from the
       grid 0.10, 0.15, ..., 0.90 (0.5 is kept if no threshold gives F1 > 0);
    3. a fresh clone of ``model`` is refit on all rows of the entry and saved
       together with the threshold under ``{output_root}/{section}/{mode}``.

    Entries with fewer distinct groups than ``n_splits`` are skipped, as are
    label columns that have no positive rows or contain a single class.

    NOTE(review): output file names depend only on the action name, so when
    several entries of ``data_list`` contain the same action, the model and
    threshold written for a later entry overwrite those of an earlier one
    (the same applies to the returned ``thresholds`` dict).

    Args:
        data_list: iterable of ``(df_meta, df_feat, df_lab)`` DataFrame triples
            (the original notes describe it as the ``all_single`` or
            ``all_pair`` list of one video).
        mode: ``"single"`` or ``"pair"``; used for the output directory and
            the summary line only.
        model: scikit-learn estimator exposing ``predict_proba``; it is cloned
            before every fit and never fitted in place.
        section: identifier used in the output path, the summary line and the
            returned F1 tuples (the original notes say it ranges from 1 to
            ``len(body_parts_list)`` — not verifiable from this function).
        n_splits: number of ``GroupKFold`` folds.
        output_root: base directory for saved models; defaults to the Kaggle
            working directory used originally.

    Returns:
        ``(f1_list, thresholds)`` where ``f1_list`` holds one
        ``(section, action, best_f1)`` tuple per trained classifier and
        ``thresholds`` maps action name to the selected threshold.
    """
    save_dir = f"{output_root}/{section}/{mode}"
    os.makedirs(save_dir, exist_ok=True)

    thresholds = {}
    f1_list = []
    # F1 scores collected per action, used for the summary printed at the end.
    action_results = {}
    threshold_grid = np.linspace(0.1, 0.9, 17)

    # One entry per mouse (single) or per mouse pair (pair).
    for df_meta, df_feat, df_lab in data_list:
        groups = df_meta["video_frame"].values

        # GroupKFold needs at least n_splits distinct groups; this does not
        # depend on the action, so the whole entry can be skipped up front.
        if len(np.unique(groups)) < n_splits:
            continue

        X = df_feat.values

        for action in df_lab.columns:
            y = df_lab[action].values.astype(int)

            # Nothing to learn without positives, and a binary classifier
            # cannot be trained (or queried via [:, 1]) on a single class.
            if y.sum() == 0 or np.unique(y).size < 2:
                continue

            # Out-of-fold predictions.
            oof_preds = np.zeros_like(y, dtype=float)
            gkf = GroupKFold(n_splits=n_splits)

            for train_idx, val_idx in gkf.split(X, y, groups):
                fold_classes = np.unique(y[train_idx])
                if fold_classes.size < 2:
                    # With a one-class training fold many estimators either
                    # refuse to fit or return a single probability column, so
                    # `[:, 1]` would fail. Use the constant prediction instead.
                    oof_preds[val_idx] = float(fold_classes[0] == 1)
                    continue

                clf = clone(model)
                clf.fit(X[train_idx], y[train_idx])
                oof_preds[val_idx] = clf.predict_proba(X[val_idx])[:, 1]

            # Tune the threshold to maximise F1 on the out-of-fold predictions.
            best_thresh = 0.5
            best_f1 = 0
            for t in threshold_grid:
                f1 = f1_score(y, oof_preds >= t, zero_division=0)
                if f1 > best_f1:
                    best_f1 = f1
                    best_thresh = t

            thresholds[action] = best_thresh
            f1_list.append((section, action, best_f1))
            action_results.setdefault(action, []).append(best_f1)

            # Final model: refit on every row of this entry.
            final_clf = clone(model)
            final_clf.fit(X, y)

            joblib.dump(final_clf, f"{save_dir}/{action}_model.pkl")
            joblib.dump(best_thresh, f"{save_dir}/{action}_threshold.pkl")

            # Release the fitted model and OOF buffer before the next action.
            del final_clf, oof_preds
            gc.collect()

    # Summary over all actions that were trained.
    print(f"\n[Section {section}] Summary for {mode}:")
    for action, scores in action_results.items():
        print(f"  - Action {action}: mean F1 = {np.mean(scores):.4f} ({len(scores)} samples)")

    return f1_list, thresholds

In [None]:
def make_section_predictions(df_features, df_meta, model_name, section):
    """
    Apply every saved per-action classifier found in ``{model_name}/{section}``.

    Each file named ``<action>_model.pkl`` in that directory defines one
    action; its companion ``<action>_threshold.pkl`` holds the decision
    threshold. The positive-class probability (column 1 of ``predict_proba``)
    is binarised with ``>= threshold``.

    Args:
        df_features: DataFrame containing only feature columns; its index is
            reused for the prediction frame.
        df_meta: metadata DataFrame; returned as a copy, not otherwise read
            here (callers in this file rely on its ``video_frame``,
            ``agent_id`` and ``target_id`` columns).
        model_name: directory that contains the ``section`` sub-directory.
        section: sub-directory name; callers in this file pass ``"single"``
            or ``"pair"``.

    Returns:
        ``(df_meta.copy(), df_preds)`` where ``df_preds`` has one 0/1 column
        per action, ordered alphabetically by action name.

    Raises:
        FileNotFoundError: if the section directory does not exist.
    """
    model_suffix = "_model.pkl"
    section_dir = f"{model_name}/{section}"

    # os.listdir returns names in arbitrary order; sorting keeps the column
    # order (and anything derived from it downstream) reproducible. Only the
    # trailing suffix is stripped, so names that merely contain the suffix
    # text elsewhere are left intact.
    actions = sorted(
        fname[: -len(model_suffix)]
        for fname in os.listdir(section_dir)
        if fname.endswith(model_suffix)
    )

    df_preds = pd.DataFrame(0, index=df_features.index, columns=actions)
    feature_matrix = df_features.values  # same matrix for every action

    for action in actions:
        model_path = f"{section_dir}/{action}_model.pkl"
        threshold_path = f"{section_dir}/{action}_threshold.pkl"

        # The file may have disappeared since the listing; keep the zero column.
        if not os.path.exists(model_path):
            continue

        # NOTE(review): joblib.load unpickles arbitrary objects; only point
        # this function at model directories you trust.
        model = joblib.load(model_path)
        threshold = joblib.load(threshold_path)

        proba = model.predict_proba(feature_matrix)[:, 1]
        df_preds[action] = (proba >= threshold).astype(int)

    return df_meta.copy(), df_preds

In [None]:
def all_predictions(all_single, all_pair, model_dir):
    """
    Run the saved classifiers over every single-mouse and mouse-pair entry.

    Args:
        all_single: iterable of per-mouse entries. Each entry is a sequence
            starting with ``(df_meta, df_features)``; any further items (for
            example a label frame) are ignored.
        all_pair: same structure, one entry per mouse pair.
        model_dir: directory holding the ``single`` and ``pair`` model
            sub-directories; forwarded to ``make_section_predictions``.

    Returns:
        List of ``(section, df_meta, df_preds)`` tuples, all ``"single"``
        entries first, then all ``"pair"`` entries, each group in input order.
    """
    test_results = []

    for section, entries in (("single", all_single), ("pair", all_pair)):
        # `*_` accepts both (meta, features) pairs and (meta, features, labels) triples.
        for df_meta, df_feat, *_ in entries:
            meta, preds = make_section_predictions(df_feat, df_meta, model_dir, section)
            test_results.append((section, meta, preds))

    return test_results

In [None]:
def detect_action_segments(frames, pred_col):
    """
    Collapse a 0/1 prediction vector into inclusive ``(start, stop)`` frame runs.

    Args:
        frames: sequence of frame identifiers (list, NumPy array, pandas Index
            or Series). It is read positionally, so position ``i`` must
            describe the same row as ``pred_col[i]``.
        pred_col: sequence of 0/1 predictions. Entries equal to 1 are treated
            as active; every other value is treated as inactive.

    Returns:
        List of ``(start, stop)`` tuples, one per maximal run of consecutive
        active entries, in order of appearance. ``start`` and ``stop`` are the
        values of ``frames`` at the first and last position of the run (NumPy
        scalars, since ``frames`` is converted to an array). An empty or
        all-inactive ``pred_col`` yields ``[]``.
    """
    # Convert once so indexing below is positional even for a pandas Series
    # whose index labels differ from 0..n-1.
    frame_ids = np.asarray(frames)
    active = np.asarray(pred_col) == 1

    # Pad with an inactive entry on both sides so runs touching either end of
    # the vector still produce a rising and a falling edge.
    padded = np.concatenate(([False], active, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])

    # Edges alternate rising/falling: even positions are the first index of a
    # run, odd positions are one past its last index.
    first_idx = edges[::2]
    last_idx = edges[1::2] - 1

    return [(frame_ids[i], frame_ids[j]) for i, j in zip(first_idx, last_idx)]

In [None]:
def create_submission(test_results, video_id):
    """
    Turn per-frame 0/1 predictions into a segment-level submission table.

    Args:
        test_results: iterable of ``(section, df_meta, df_preds)`` tuples as
            produced by ``all_predictions``. ``df_meta`` must provide the
            columns ``video_frame``, ``agent_id`` and ``target_id``;
            ``df_preds`` holds one 0/1 column per action, row-aligned with
            ``df_meta``. ``section`` is not used here.
        video_id: identifier written into every output row.

    Returns:
        DataFrame with columns ``row_id, video_id, agent_id, target_id,
        action, start_frame, stop_frame``. There is one row per detected
        segment and per distinct ``(agent_id, target_id)`` pair that occurs in
        ``df_meta`` within that segment's frame range. ``row_id`` counts rows
        from 0.
    """
    columns = ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]
    rows = []

    for _section, df_meta, df_preds in test_results:
        frames = df_meta["video_frame"].values
        id_columns = df_meta[["agent_id", "target_id"]]

        for action in df_preds.columns:
            segments = detect_action_segments(frames, df_preds[action].values)

            for start, stop in segments:
                # Rows of this entry whose frame falls inside the segment.
                in_segment = (frames >= start) & (frames <= stop)

                # Emit only (agent, target) combinations that actually occur
                # together; crossing the two unique-id lists independently
                # would invent pairs that are never observed in the metadata.
                observed_pairs = id_columns.loc[in_segment].drop_duplicates()

                for a_id, t_id in observed_pairs.itertuples(index=False, name=None):
                    rows.append([video_id, a_id, t_id, action, int(start), int(stop)])

    submission = pd.DataFrame(rows, columns=columns)
    submission.insert(0, "row_id", range(len(submission)))
    return submission