In [None]:
import shutil
from pathlib import Path
from omegaconf import OmegaConf
from typing import Callable
import warnings


def datasets_to_storage(
    path: Path, is_date_dir: bool, is_multirun_dir: bool, target_dir: Path, subdir_id_creator: Callable
):
    """Copy the dataset files of every run below ``path`` into ``target_dir``.

    ``path`` is walked two levels deep (``path/<timestamp_dir>/<run_dir>``). For each
    run directory, ``.hydra/config.yaml`` is loaded and handed to ``subdir_id_creator``
    to obtain the name of the destination subdirectory. ``dataset_dict.json`` is
    copied there (replacing an earlier copy), followed by every subdirectory of the
    run except one named ``runs``. Subdirectories that already exist at the
    destination are skipped with a warning instead of being overwritten.

    Note that the filter only excludes ``runs``: any other directory in the run
    (including ``.hydra`` and leftover ``checkpoint*`` directories) is copied too.

    Args:
        path: Directory containing one subdirectory per timestamp, each containing
            one subdirectory per run.
        is_date_dir: Must be True; other layouts are not implemented.
        is_multirun_dir: Must be True; other layouts are not implemented.
        target_dir: Root directory that receives one subdirectory per run id.
            Created if it does not exist yet.
        subdir_id_creator: Callable mapping the loaded run config to the name of
            the destination subdirectory.

    Raises:
        NotImplementedError: If ``is_date_dir`` or ``is_multirun_dir`` is False.
    """
    if not is_date_dir or not is_multirun_dir:
        raise NotImplementedError()

    for timestamp_dir in path.iterdir():
        if not timestamp_dir.is_dir():
            continue

        for run_dir in timestamp_dir.iterdir():
            if not run_dir.is_dir():
                continue

            cfg = OmegaConf.load(run_dir / ".hydra" / "config.yaml")
            target_subdir = target_dir / subdir_id_creator(cfg)
            # parents=True: the storage root may not exist yet on a fresh machine.
            target_subdir.mkdir(parents=True, exist_ok=True)

            shutil.copy2(str(run_dir / "dataset_dict.json"), target_subdir)

            dirs = [d for d in run_dir.iterdir() if d.is_dir() and d.name != "runs"]  # filtering tensorflow logs
            for d in dirs:
                split_dir = target_subdir / d.name
                # Several runs can map to the same dataset id; the first copy is kept.
                if split_dir.exists():
                    warnings.warn(f"{split_dir} already exists. Skipping.")
                    continue
                print(split_dir)
                shutil.copytree(str(d), str(split_dir))


def experiment_results_to_storage(
    path: Path, is_date_dir: bool, is_multirun_dir: bool, target_dir: Path, subdir_id_creator: Callable
):
    """Copy the result files of every run below ``path`` into ``target_dir``.

    ``path`` is walked two levels deep (``path/<timestamp_dir>/<run_dir>``). For each
    run directory, ``.hydra/config.yaml`` is loaded and handed to ``subdir_id_creator``
    to obtain the name of the destination subdirectory. Four files are then copied
    into it, keeping their file names: ``model.safetensors``, ``training_args.bin``,
    ``config.json`` and ``.hydra/config.yaml``. Existing copies are overwritten.

    Args:
        path: Directory containing one subdirectory per timestamp, each containing
            one subdirectory per run.
        is_date_dir: Must be True; other layouts are not implemented.
        is_multirun_dir: Must be True; other layouts are not implemented.
        target_dir: Root directory that receives one subdirectory per run id.
            Created if it does not exist yet.
        subdir_id_creator: Callable mapping the loaded run config to the name of
            the destination subdirectory.

    Raises:
        NotImplementedError: If ``is_date_dir`` or ``is_multirun_dir`` is False.
    """
    if not is_date_dir or not is_multirun_dir:
        raise NotImplementedError()

    for timestamp_dir in path.iterdir():
        if not timestamp_dir.is_dir():
            continue

        for run_dir in timestamp_dir.iterdir():
            if not run_dir.is_dir():
                continue

            # The Hydra config both names the destination and is stored alongside the model.
            hydra_cfg = run_dir / ".hydra" / "config.yaml"
            cfg = OmegaConf.load(hydra_cfg)
            target_subdir = target_dir / subdir_id_creator(cfg)
            # parents=True: the storage root may not exist yet on a fresh machine.
            target_subdir.mkdir(parents=True, exist_ok=True)

            weights = run_dir / "model.safetensors"
            training_args = run_dir / "training_args.bin"
            model_cfg = run_dir / "config.json"
            for f_to_be_copied in [weights, training_args, model_cfg, hydra_cfg]:
                destination = target_subdir / f_to_be_copied.name
                print(destination)
                shutil.copy2(str(f_to_be_copied), str(destination))


def delete_unused_checkpoints(path: Path, is_date_dir: bool, is_multirun_dir: bool):
    """Delete every ``checkpoint*`` directory inside the run directories below ``path``.

    ``path`` is walked two levels deep (``path/<timestamp_dir>/<run_dir>``). Inside
    each run directory, every directory whose name matches ``checkpoint*`` is removed
    together with all of its contents, including nested directories. Plain files that
    happen to match the pattern are left untouched.

    Args:
        path: Directory containing one subdirectory per timestamp, each containing
            one subdirectory per run.
        is_date_dir: Must be True; other layouts are not implemented.
        is_multirun_dir: Must be True; other layouts are not implemented.

    Raises:
        NotImplementedError: If ``is_date_dir`` or ``is_multirun_dir`` is False.
    """
    if not is_date_dir or not is_multirun_dir:
        raise NotImplementedError()

    for timestamp_dir in path.iterdir():
        if not timestamp_dir.is_dir():
            continue

        for run_dir in timestamp_dir.iterdir():
            if not run_dir.is_dir():
                continue

            # Materialise the matches first so the directory is not modified while it is being scanned.
            for dir_to_delete in list(run_dir.glob("checkpoint*")):
                if not dir_to_delete.is_dir():
                    continue
                print(f"Deleting {dir_to_delete}")
                # rmtree also handles nested directories, which unlink() + rmdir() cannot.
                shutil.rmtree(dir_to_delete)

## Shortcut models

- copy best model to `experiments` directory
- delete unused checkpoints

In [None]:
# Destination for the stored shortcut models and the directory scanned for their runs.
target_dir = Path("/root/similaritybench/experiments/models/nlp/shortcut")
path = Path("/root/similaritybench/multirun/2024-03-20")
# The storage helpers raise NotImplementedError unless both layout flags are True.
is_date_dir = is_multirun_dir = True


def shortcut_subdir_id(cfg):
    """Return the storage subdirectory name for a shortcut run.

    The name combines the dataset id (``path``, or ``path__name`` when the dataset
    has a name), the model seed, the finetuning seed and the shortcut rate with its
    decimal point removed.

    Args:
        cfg: Run config exposing ``dataset.path``, ``dataset.name``, ``model.seed``,
            ``dataset.finetuning.trainer.args.seed`` and ``shortcut_rate``.
    """
    if cfg.dataset.name:
        dataset_id = f"{cfg.dataset.path}__{cfg.dataset.name}"
    else:
        dataset_id = cfg.dataset.path

    # Directory names carry the rate without the dot, e.g. 0.5 -> "05".
    shortcut_rate = str(cfg.shortcut_rate)
    shortcut_rate = shortcut_rate.replace(".", "")
    return f"{dataset_id}_pre{cfg.model.seed}_ft{cfg.dataset.finetuning.trainer.args.seed}_scrate{shortcut_rate}"

# Store the final model files first; only then remove the intermediate checkpoints.
experiment_results_to_storage(
    path=path,
    is_date_dir=is_date_dir,
    is_multirun_dir=is_multirun_dir,
    target_dir=target_dir,
    subdir_id_creator=shortcut_subdir_id,
)
delete_unused_checkpoints(path=path, is_date_dir=is_date_dir, is_multirun_dir=is_multirun_dir)


New shortcut rates that take the base rate of each class into account.

In [None]:
# Destination for the stored shortcut models and the directory scanned for their runs.
target_dir = Path("/root/similaritybench/experiments/models/nlp/shortcut")
path = Path("/root/similaritybench/multirun/2024-04-02")
# The storage helpers raise NotImplementedError unless both layout flags are True.
is_date_dir = is_multirun_dir = True


def shortcut_subdir_id(cfg):
    """Return the storage subdirectory name for a shortcut run.

    Same naming scheme as the definition in the previous shortcut cell: dataset id
    (``path``, or ``path__name`` when the dataset has a name), model seed,
    finetuning seed and the shortcut rate with its decimal point removed.

    Args:
        cfg: Run config exposing ``dataset.path``, ``dataset.name``, ``model.seed``,
            ``dataset.finetuning.trainer.args.seed`` and ``shortcut_rate``.
    """
    if cfg.dataset.name:
        dataset_id = f"{cfg.dataset.path}__{cfg.dataset.name}"
    else:
        dataset_id = cfg.dataset.path

    # Directory names carry the rate without the dot, e.g. 0.5 -> "05".
    shortcut_rate = str(cfg.shortcut_rate)
    shortcut_rate = shortcut_rate.replace(".", "")
    return f"{dataset_id}_pre{cfg.model.seed}_ft{cfg.dataset.finetuning.trainer.args.seed}_scrate{shortcut_rate}"

# Store the final model files first; only then remove the intermediate checkpoints.
experiment_results_to_storage(
    path=path,
    is_date_dir=is_date_dir,
    is_multirun_dir=is_multirun_dir,
    target_dir=target_dir,
    subdir_id_creator=shortcut_subdir_id,
)
delete_unused_checkpoints(path=path, is_date_dir=is_date_dir, is_multirun_dir=is_multirun_dir)


## Augmented models

(copied over from Viserion)

In [None]:
# Destinations for the augmented models and for the datasets they were trained on.
target_dir = Path("/root/similaritybench/experiments/models/nlp/augmentation")
datasets_target_dir = Path("/root/similaritybench/experiments/datasets/nlp/robustness")
# Directory scanned for runs; unlike the other sections it is not below similaritybench/.
path = Path("/root/multirun/2024-03-20")
# The storage helpers raise NotImplementedError unless both layout flags are True.
is_date_dir = is_multirun_dir = True


def augmentation_subdir_id(cfg):
    """Return the storage subdirectory name for an augmented model.

    The name combines the dataset id (``path``, or ``path__name`` when the dataset
    has a name), the model seed, the finetuning seed, the fixed augmentation id
    ``eda`` and the augmentation strength with its decimal point removed.

    Args:
        cfg: Run config exposing ``dataset.path``, ``dataset.name``, ``model.seed``,
            ``dataset.finetuning.trainer.args.seed`` and
            ``augmentation.recipe.pct_words_to_swap``.
    """
    if cfg.dataset.name:
        dataset_id = f"{cfg.dataset.path}__{cfg.dataset.name}"
    else:
        dataset_id = cfg.dataset.path

    augmentation_id = "eda"
    # float() first so that e.g. 1 and 1.0 both become "10".
    strength = str(float(cfg.augmentation.recipe.pct_words_to_swap))
    strength = strength.replace(".", "")
    return f"{dataset_id}_pre{cfg.model.seed}_ft{cfg.dataset.finetuning.trainer.args.seed}_{augmentation_id}_strength{strength}"


def augmentation_dataset_subdir_id(cfg):
    """Return the storage subdirectory name for an augmented dataset.

    The name combines the dataset id (``path``, or ``path__name`` when the dataset
    has a name), the fixed augmentation id ``eda`` and the augmentation strength
    with its decimal point removed. No seeds are included, so runs that differ
    only in their seeds map to the same name.

    Args:
        cfg: Run config exposing ``dataset.path``, ``dataset.name`` and
            ``augmentation.recipe.pct_words_to_swap``.
    """
    if cfg.dataset.name:
        dataset_id = f"{cfg.dataset.path}__{cfg.dataset.name}"
    else:
        dataset_id = cfg.dataset.path

    augmentation_id = "eda"
    # float() first so that e.g. 1 and 1.0 both become "10".
    strength = str(float(cfg.augmentation.recipe.pct_words_to_swap))
    strength = strength.replace(".", "")
    return f"{dataset_id}_{augmentation_id}_strength{strength}"


# NOTE(review): the model copy and checkpoint cleanup are disabled here, so this cell
# only stores the datasets — presumably the models were handled separately; TODO confirm.
# experiment_results_to_storage(path, is_date_dir, is_multirun_dir, target_dir, augmentation_subdir_id)
# delete_unused_checkpoints(path, is_date_dir, is_multirun_dir)
datasets_to_storage(
    path=path,
    is_date_dir=is_date_dir,
    is_multirun_dir=is_multirun_dir,
    target_dir=datasets_target_dir,
    subdir_id_creator=augmentation_dataset_subdir_id,
)

## Memorizing Models

In [None]:
# Destinations for the memorizing models and for the datasets they were trained on.
target_dir = Path("/root/similaritybench/experiments/models/nlp/memorizing")
datasets_target_dir = Path("/root/similaritybench/experiments/datasets/nlp/memorizing")
# The storage helpers raise NotImplementedError unless both layout flags are True.
is_date_dir = is_multirun_dir = True


def memorizing_subdir_id(cfg):
    """Return the storage subdirectory name for a memorizing model.

    The name combines the dataset id (``path``, or ``path__name`` when the dataset
    has a name), the model seed, the finetuning seed, the number of new labels and
    the memorization rate with its decimal point removed.

    Args:
        cfg: Run config exposing ``dataset.path``, ``dataset.name``, ``model.seed``,
            ``dataset.finetuning.trainer.args.seed``, ``memorization_n_new_labels``
            and ``memorization_rate``.
    """
    if cfg.dataset.name:
        dataset_id = f"{cfg.dataset.path}__{cfg.dataset.name}"
    else:
        dataset_id = cfg.dataset.path

    # float() first so that e.g. 1 and 1.0 both become "10".
    strength = str(float(cfg.memorization_rate))
    strength = strength.replace(".", "")
    return f"{dataset_id}_pre{cfg.model.seed}_ft{cfg.dataset.finetuning.trainer.args.seed}_labels{cfg.memorization_n_new_labels}_strength{strength}"


def memorizing_dataset_subdir_id(cfg):
    """Return the storage subdirectory name for a memorization dataset.

    The name combines the dataset id (``path``, or ``path__name`` when the dataset
    has a name), the number of new labels and the memorization rate with its
    decimal point removed. No seeds are included, so runs that differ only in
    their seeds map to the same name.

    Args:
        cfg: Run config exposing ``dataset.path``, ``dataset.name``,
            ``memorization_n_new_labels`` and ``memorization_rate``.
    """
    if cfg.dataset.name:
        dataset_id = f"{cfg.dataset.path}__{cfg.dataset.name}"
    else:
        dataset_id = cfg.dataset.path

    # float() first so that e.g. 1 and 1.0 both become "10".
    strength = str(float(cfg.memorization_rate))
    strength = strength.replace(".", "")
    return f"{dataset_id}_labels{cfg.memorization_n_new_labels}_strength{strength}"


# Both run directories go through the same three steps, in the same order as before.
for path in (
    Path("/root/similaritybench/multirun/2024-03-25"),
    Path("/root/similaritybench/multirun/2024-03-26"),
):
    experiment_results_to_storage(path, is_date_dir, is_multirun_dir, target_dir, memorizing_subdir_id)
    # Checkpoints are removed before the dataset copy, which copies every run
    # subdirectory except "runs" and would otherwise pick them up as well.
    delete_unused_checkpoints(path, is_date_dir, is_multirun_dir)
    datasets_to_storage(path, is_date_dir, is_multirun_dir, datasets_target_dir, memorizing_dataset_subdir_id)

## Standard Models

The models have already been copied over; only the Hydra configs still need to be added.

In [None]:
# Add the Hydra config of the original training run to every stored standard model.
path = Path("/root/similaritybench/experiments/models/nlp/standard")
for model_path in path.iterdir():
    # Stray files next to the model directories have no path.txt to read.
    if not model_path.is_dir():
        continue

    # The first line of path.txt is used as the directory of the original run.
    with (model_path / "path.txt").open() as f:
        # readline() keeps the trailing newline; without strip() it would become
        # part of the path and the config lookup below would fail.
        original_hydra_path = f.readline().strip()

    config_path = Path(original_hydra_path) / ".hydra" / "config.yaml"
    print(config_path, model_path / config_path.name)
    shutil.copy2(str(config_path), str(model_path / config_path.name))
