In [1]:
from pathlib import Path


# Directory under which embeddings and derived files are looked up
# (laid out as <model>/<dataset>/layer_<k>/ by the checks in this notebook).
ROOT = Path("/scratch/SCRATCH_SAS/roman/SMTB/embeddings")

# Model names; each one is also a directory name directly below ROOT.
models = [
    "esm_t6",
    "esm_t12",
    "esm_t30",
    "esm_t33",
    "esm_t36",
    "esmc_300m",
    "esmc_600m",
    "ankh-base",
    "ankh-large",
    "prostt5",
    "prott5",
    "ohe",
]

# Datasets that are currently checked.
# "esol" and "casp7" are disabled for now; re-add them here (and in `last`) to enable them.
datasets = [
    "fluorescence",
    "stability",
    "deeploc2",
    "deeploc2_bin",
    "scope_40_208",
]

# Per dataset: ID of the protein whose "<ID>.pkl" file is probed to decide
# whether embeddings are present (described as the "last" protein of the dataset).
# Disabled entries: "esol" -> "P03100", "casp7" -> "70#1JAV_1_A".
last = dict(
    stability="P68976",
    fluorescence="P54024",
    deeploc2="P28302",
    deeploc2_bin="P28302",
    scope_40_208="P15176",
)

def model_to_depth(model):
    """Return the number of layers that are indexed for ``model``.

    The name is matched against the rules below, first match wins:

    * exactly ``"ohe"``            -> 1
    * contains ``"ankh"``          -> 49
    * contains ``"t5"``            -> 25
    * contains ``"esmc"``          -> 31 if it also contains ``"300m"``, else 37
    * anything else                -> the integer after the last ``"t"`` plus one
      (e.g. ``"esm_t6"`` -> 7; presumably the extra layer is the input
      embedding -- TODO confirm)

    Raises:
        ValueError: if no rule matches and the text after the last ``"t"``
            is not an integer.
    """
    if model == "ohe":
        return 1
    if "ankh" in model:
        return 49
    if "t5" in model:
        return 25
    if "esmc" in model:
        if "300m" in model:
            return 31
        return 37

    # Fallback for names of the form "..._t<N>": take <N> and add one.
    layer_suffix = model.rpartition("t")[2]
    return int(layer_suffix) + 1

## Protein-Level Tasks

In [3]:
# Completeness check for the protein-level tasks.
#
# For every (model, layer, dataset) combination this cell verifies that
#   1. the embeddings exist (probed via the "<last[dataset]>.pkl" file), for both the
#      model itself and its "empty_"-prefixed counterpart,
#   2. the dataset statistics files (ids / noverlap / density) exist,
#   3. the downstream prediction files (LR and KNN) exist.
# Steps 2 and 3 are skipped entirely while EMBEDDINGS_ONLY is True.
#
# Layers per model: 7 + 13 + 31 + 34 + 37 + 31 + 37 + 49 + 49 + 25 + 25 + 1 = 339 total layers
# Expected number of files according to the original note: 8494 (not re-derived here).
EMBEDDINGS_ONLY = True  # True: only report missing embeddings, do not count any files
MAX_LAYERS = 1  # how many leading layers to inspect per model; None inspects every layer

count, total = 0, 0  # files found / files expected (statistics + predictions only)
prints = []  # collected "Missing ..." messages, printed sorted at the end
for m, model in enumerate(models):
    depth = model_to_depth(model)
    for layer in range(depth)[:MAX_LAYERS]:
        is_last_layer = layer == depth - 1
        for d, dataset in enumerate(datasets):
            # Progress indicator, overwritten in place via the leading "\r".
            print(f"\r{m}-{layer}-{d} | {count}/{total}", end=" "*10)
            layer_dir = ROOT / model / dataset / f"layer_{layer}"

            # --- 1. Embeddings ------------------------------------------------
            # deeploc2_bin has no embeddings of its own (same as deeploc2).
            if dataset != "deeploc2_bin":
                for prefix in ["", "empty_"]:
                    emb_dir = ROOT / (prefix + model) / dataset / f"layer_{layer}"
                    # If the embedding of the last protein of the dataset is missing,
                    # there are likely no embeddings at all.
                    if not (emb_dir / f"{last[dataset]}.pkl").exists():
                        prints.append(f"\rMissing embeddings: {emb_dir}")
            # NOTE(review): missing embeddings do NOT skip the statistics/prediction
            # checks below, so those files still count towards `total`.
            # Confirm that this is the intended behaviour.
            if EMBEDDINGS_ONLY:
                continue

            # --- 2. Dataset statistics (intrinsic dimension etc.) --------------
            # All of these files are expected to be newer than ~15:10 GMT+1 on 2025-11-03
            # (epoch 1762178956.2564127). An mtime-based "Outdated ... file" check for that
            # cutoff is currently disabled and not implemented here.
            if dataset != "deeploc2_bin":  # no dataset statistics for deeploc2_bin
                for filename in ["ids", "noverlap", "density"]:
                    # noverlap is an in-between-layer metric, so the last layer has none.
                    if filename == "noverlap" and is_last_layer:
                        continue
                    total += 1
                    if (layer_dir / f"{filename}.csv").exists():
                        count += 1
                    else:
                        prints.append(f"\rMissing {filename} file: {layer_dir}")

            # --- 3. Downstream models -------------------------------------------
            for algo in ["lr", "knn"]:  # each model should have two models trained per layer
                if dataset == "scope_40_208":
                    # The scope dataset has two levels of labels: superfamily and fold.
                    # TODO: Check for min_10-files
                    pred_files = [f"predictions_{algo}_{level}_4.pkl" for level in ["superfamily", "fold"]]
                else:
                    pred_files = [f"predictions_{algo}_42.pkl"]

                for pred_file in pred_files:
                    total += 1
                    if (layer_dir / pred_file).exists():
                        count += 1
                    else:
                        # Name the exact file so that e.g. the two scope levels are distinguishable.
                        prints.append(f"\rMissing {algo.upper()} file: {layer_dir / pred_file}")

print(f"\rFound {count} out of {total} files.")
print("\n".join(sorted(prints)))

Found 0 out of 0 files.
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-base/deeploc2/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-base/fluorescence/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-base/scope_40_208/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-base/stability/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-large/deeploc2/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-large/fluorescence/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-large/scope_40_208/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_ankh-large/stability/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_esm_t33/deeploc2/layer_0
Missing embeddings: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/empty_esm_t33/fluorescence/la

## Amino-Acid Level Tasks