In [None]:
import shutil
from pathlib import Path

import numpy as np
from tqdm import tqdm

from common.config_utils import base_model_from_file, base_model_to_file
from gbdt.helpers import get_any_train_config, get_train_config
from postprocess.submission_utils import Submission, copy_model_file_to_submission

dst_submission_dir = Path("submissions") / "full-models-26-nov-fix"
if dst_submission_dir.exists():
    shutil.rmtree(dst_submission_dir)
dst_submission_dir.mkdir(exist_ok=True)

submission = base_model_from_file(
    Submission, "submissions/e-nested-f1-all/submission.json"
)

mode = "same"
# mode = "max"
# mode = "median"

new_models = []
for model in tqdm(submission.models):
    config_old = get_any_train_config(model.name)
    # if config.group != model.name:
    #     print(f"model.name={model.name}, config.group={config.group}")
    new_model_name = f"{config_old.group}-full"
    dst = dst_submission_dir / new_model_name
    suff = ""
    if (Path("train_logs") / f"{new_model_name}-seed0-fix").exists():
        suff = "-fix"
    config_new = get_any_train_config(f"{new_model_name}-seed0{suff}")
    assert config_new.get_num_trees() >= max(model.steps)
    for seed in range(5):
        full_name = f"{new_model_name}-seed{seed}{suff}"
        full_path = Path("train_logs") / full_name
        if not full_path.exists():
            print(f"FAIL at {full_name}")
            continue
        dst_ckpt_path = dst / f"cv{seed}"
        if not dst_ckpt_path.exists():
            shutil.copytree(full_path / "cv0", dst_ckpt_path)
            for dir in dst_ckpt_path.rglob("*"):
                if dir.is_dir() and dir.name == "test":
                    shutil.rmtree(dir)

    # keep same set of steps
    model.name = new_model_name
    if mode == "same":
        pass
    elif mode == "max":
        step = max(model.steps)
        model.steps = [step] * len(model.steps)
    elif mode == "median":
        step = int(np.median(model.steps))
        model.steps = [step] * len(model.steps)
    config = get_train_config(f"{new_model_name}-seed0{suff}", 0)
    num_trees = config.get_num_trees()
    steps = []
    for step in model.steps:
        if step > num_trees:
            print(f"PROBLEM with {config.group}")
            print(f"step: {step}, num_trees in full model: {num_trees}, max step was {max(model.steps)}, steps were {model.steps}")
            print()
        step = min(step, num_trees - 1)
        steps.append(step)
    model.steps = steps
    new_models.append(model)
submission.models = new_models
base_model_to_file(submission, f"submission_full_{mode}.json")

In [None]:
from pathlib import Path

from gbdt.helpers import get_train_config
import json

per_action = {}
for model_path in Path("train_logs").iterdir():
    name = model_path.name
    for cv_dir in model_path.iterdir():
        if not cv_dir.is_dir():
            continue
        cv = cv_dir.name
        imps = cv_dir / "final_model" / "feature_importances" / "overall.txt"
        if imps.exists():
            config_old = get_train_config(name, cv)
            if config_old.action in per_action:
                continue
            content = map(lambda s: s.strip(), open(imps).readlines()[:50])
            per_action[config_old.action] = "\n".join(content)
json.dump(per_action, open("all_importances.json", "w"))

In [None]:
import json
from collections import defaultdict
from pathlib import Path

import numpy as np
from tqdm import tqdm

from common.constants import ACTION_NAMES_IN_TEST
from common.ensemble_building_primitives import EnsembleObjective
from config_utils import base_model_to_str
from gbdt.rebalance_utils import DurationStats
from postprocess.ensemble_utils import EnsembleApproach


def get_action_data(dir: Path, action: str) -> str | None:
    json_path = dir / f"{action}.json"
    if not json_path.exists():
        return None
    f1_map = json.load(open(json_path))

    row_per_app = []
    for app in f1_map["f1_valid"].keys():
        f1_valid = f1_map["f1_valid"][app]
        # f1_train = f1_map["f1_train"][app]
        row_per_app.append((f1_valid, f"f1_valid={f1_valid:.5f}, App: {app}"))
    row_per_app.sort(reverse=True) 

    desc = f"action={action}\n"
    for (f1_valid, s) in row_per_app:
        desc += s + "\n"
    desc += "\n"
    return desc




for action in ACTION_NAMES_IN_TEST:
    # desc = get_action_data(Path("approaches"), action)
    # if desc is None:
    #     continue
    # print(desc)
    # continue

    desc_per_action = get_action_data(Path("e_apps/approaches_per_action"), action)
    desc_logloss = get_action_data(Path("e_apps/approaches_logloss"), action)
    if desc_per_action is None or desc_logloss is None:
        continue
    print(f"PR-AUC CKPT SELECTION:")
    print(desc_per_action)
    print(f"LOGLOSS CKPT SELECTION:")
    print(desc_logloss)
    print("-----------")
# logloss ckpt, nested-f1 approach:
# chase, chaseattack, huddle, intromit, mount

In [1]:
from collections import defaultdict
from common.constants import ACTION_NAMES_IN_TEST, LAB_NAMES_IN_TEST
from common.helpers import get_train_meta
from common.parse_utils import parse_behaviors_labeled
from common.config_utils import DataSplitConfig
from common.folds_split_utils import _fold_cache_key
from pathlib import Path
import numpy as np
from gbdt.helpers import is_fully_trained

# def calc_fold_id(
#     meta: pd.DataFrame,
#     config: DataSplitConfig,
#     cache_dir: Path | str,
#     force_recalc: bool = False,
# ):
#     cache_path: Optional[Path] = None
#     if cache_dir:
#         cache_dir = Path(cache_dir)
#         cache_dir.mkdir(parents=True, exist_ok=True)
#         cache_key = _fold_cache_key(meta=meta, config=config)
#         cache_path = cache_dir / f"fold_id_{cache_key}.npy"
#         if cache_path.exists() and not force_recalc:
#             cached = np.load(cache_path, allow_pickle=False)
#             return cached.astype(int)

# train_configs = []
# for model_path in Path("train_logs").iterdir():
#     name = model_path.name
#     train_config = get_any_train_config(name)
#     if train_config:
#         train_configs.append(train_config)

cache_dir = Path("split_cache")

all_paths = set()
for path in cache_dir.iterdir():
    all_paths.add(path)

print(f"Cnt paths: {len(all_paths)}")
by_action = defaultdict(list)
train_meta = get_train_meta()
for action in ACTION_NAMES_IN_TEST:
    config_old = DataSplitConfig(seed=0, num_folds=5, test_fold=0, train_folds=None, actions=[action])

    should_keep_video = train_meta["behaviors_labeled"].apply(
        lambda beh: any(
            item.action in config_old.actions for item in parse_behaviors_labeled(beh)
        )
    )

    meta = train_meta[should_keep_video]

    always_train_mask = ~meta["lab_id"].isin(LAB_NAMES_IN_TEST)
    always_train_rows = meta[always_train_mask].copy()
    meta = meta[~always_train_mask]

    cache_key = _fold_cache_key(meta=meta, config=config_old)
    cache_path = cache_dir / f"fold_id_{cache_key}.npy"
    print(cache_path.exists(), cache_path, action)
    if cache_path.exists():
        all_paths.remove(cache_path)
        fold_id = np.load(cache_path)
        by_action[action].append(fold_id)

for action in ACTION_NAMES_IN_TEST:
    config_old = DataSplitConfig(seed=0, num_folds=5, test_fold=0, train_folds=None, actions=[action])

    should_keep_video = train_meta["behaviors_labeled"].apply(
        lambda beh: any(
            item.action in config_old.actions for item in parse_behaviors_labeled(beh)
        )
    )

    meta = train_meta[should_keep_video]

    always_train_mask = ~meta["lab_id"].isin(LAB_NAMES_IN_TEST)
    always_train_rows = meta[always_train_mask].copy()
    meta = meta[~always_train_mask]

    del config_old.train_folds
    cache_key = _fold_cache_key(meta=meta, config=config_old)
    cache_path = cache_dir / f"fold_id_{cache_key}.npy"
    print(cache_path.exists(), cache_path, action)
    if cache_path.exists():
        all_paths.remove(cache_path)
        fold_id = np.load(cache_path)
        by_action[action].append(fold_id)

print(f"Remain: {len(all_paths)}")

assert len(by_action.keys()) == len(ACTION_NAMES_IN_TEST)
for action in by_action.keys():
    print(f"action={action}, cnt caches: {len(by_action[action])}")
    fold_id = by_action[action][0]
    for i in range(1, len(by_action[action])):
        fold_id_another = by_action[action][i]
        assert np.all(fold_id == fold_id_another)
        print(f"{id(fold_id)} == {id(fold_id_another)}")

for path in all_paths:
    fold_id = np.load(path)
    found = False
    for action, fs in by_action.items():
        for f in fs:
            if f.shape == fold_id.shape and np.all(f == fold_id):
                print(f"Found action={action} for {path}")
                found = True
    if not found:
        print(f"Nothing for {path}, fold_id: {fold_id.shape}, {fold_id.dtype}")

Cnt paths: 71
True split_cache/fold_id_61526ff9937bbcd3ae1a6f7d798e041d3f29df58be50138ab9a03d004fa88ac0.npy allogroom
True split_cache/fold_id_931f537df70aca17baf487cb4c69ae72a5cc19fb162fee0312d32accd8075c8d.npy approach
True split_cache/fold_id_e38878aa3d1cfcd6c13bb518ba05be8eef1d15340032365f4338afe6cc49c810.npy attack
True split_cache/fold_id_bc7f447d7fd01e5bfd5b262ff11481be2f3b7123b51c50ca6856223b6148d985.npy attemptmount
True split_cache/fold_id_545c5635583db71b5b8ac74ef7e5c5f958d57efb276343c19c961eb291409e33.npy avoid
True split_cache/fold_id_1744ec9954b4a604e15a028bd392d2e930a66fe4373bd3639be1d9ea7bba2ce8.npy biteobject
True split_cache/fold_id_795da20bcc92b50023625958b6e52a7c14abed22afb187809f6de944a2e4f699.npy chase
True split_cache/fold_id_054bdd5a71ca01827810ff257bfd0a4b8f0374986956965645bb5c9c4aaac47e.npy chaseattack
True split_cache/fold_id_9abe77f707dc2145cd0b99bdce21e3e47f3a53be167f510328f18143947ec761.npy climb
True split_cache/fold_id_0607428e9a9fd69843c5c343e4b5b1abbbe

KeyError: PosixPath('split_cache/fold_id_61526ff9937bbcd3ae1a6f7d798e041d3f29df58be50138ab9a03d004fa88ac0.npy')