In [1]:
from pathlib import Path


# Base directory of all embedding and result files. The cells below address it as
# ROOT / <model> / <dataset> / layer_<k> / <file>.
ROOT = Path("/").joinpath("scratch", "SCRATCH_SAS", "roman", "SMTB", "embeddings")

# Models to audit. Order matters: "ohe" is kept last so that `models[:-1]` drops it.
models = [
    "esm_t6",
    "esm_t12",
    "esm_t30",
    "esm_t33",
    "esm_t36",
    "esmc_300m",
    "esmc_600m",
    "ankh-base",
    "ankh-large",
    "prostt5",
    "prott5",
    "ohe",
]

# Datasets whose ids.csv / prediction files are audited per model and layer.
datasets = [
    "fluorescence",
    "stability",
    "esol",
    "deeploc2",
    "deeploc2_bin",
]

# Per dataset, the file stem whose `<stem>.pkl` is probed to decide whether the
# embeddings of a layer are present (presumably the last entry written for that
# dataset -- TODO confirm).
last = dict(
    stability="P68976",
    fluorescence="P54024",
    esol="P03100",
    deeploc2="P28302",
    deeploc2_bin="P28302",
    scope_40_208="P15176",
)

def model_to_depth(model):
    """Return the number of layer indices to scan for ``model``.

    Callers iterate ``range(model_to_depth(model))`` to visit ``layer_0`` ...
    ``layer_{depth - 1}``.

    Rules are applied in this order:
      * ``"ohe"``                    -> 1
      * name contains ``"ankh"``     -> 49
      * name contains ``"t5"``       -> 25
      * name contains ``"esmc"``     -> 31 if it also contains ``"300m"``, else 37
      * anything else                -> integer after the last ``"t"`` plus one
        (e.g. ``"esm_t6"`` -> 7)

    Raises:
        ValueError: if no rule matches and the text after the last ``"t"`` is
            not an integer.
    """
    if model == "ohe":
        return 1

    # Fixed depths keyed by a substring of the model name; order is significant.
    for marker, depth in (("ankh", 49), ("t5", 25)):
        if marker in model:
            return depth

    if "esmc" in model:
        if "300m" in model:
            return 31
        return 37

    # Remaining names end in "t<N>"; the depth is N + 1.
    layer_suffix = model.rpartition("t")[2]
    return int(layer_suffix) + 1

In [2]:
# Expected total: 4746 = (7 + 13 + 31 + 34 + 37 + 31 + 37 + 49 + 49 + 25 + 25 + 1) * (5 * 3 - 1)
#   first factor : model_to_depth(model) summed over `models`
#   second factor: per layer, 5 datasets x 3 files (ids.csv, LR, KNN), minus the ids.csv
#                  that is not counted for "deeploc2_bin"
# NOTE: when the embedding marker file of a layer is missing, that layer is skipped and its
# files are added to neither `count` nor `total` -- so also compare `total` with the target.
count, total = 0, 0
for m, model in enumerate(models):
    for layer in range(model_to_depth(model)):
        for d, dataset in enumerate(datasets):
            print(f"\r{m}-{layer}-{d} | {count}/{total}", end=" "*10)

            # Build the layer directory once; every check and message below reuses it.
            layer_dir = ROOT / model / dataset / f"layer_{layer}"

            # "deeploc2_bin" is exempt from the embedding check and from the ids.csv count.
            is_deeploc2_bin = dataset == "deeploc2_bin"
            if not (is_deeploc2_bin or (layer_dir / f"{last[dataset]}.pkl").exists()):
                print(f"\rMissing embedding: {layer_dir}")
                continue

            # Count Intrinsic Dimension calculations
            if not is_deeploc2_bin:
                # One filesystem lookup feeds both the counter and the message.
                has_ids = (layer_dir / "ids.csv").exists()
                count += has_ids
                if not has_ids:
                    print(f"\rMissing IDs file: {layer_dir}")
                total += 1

            # Check for downstream models
            for algo in ["lr", "knn"]:
                has_predictions = (layer_dir / f"predictions_{algo}_42.pkl").exists()
                count += has_predictions
                total += 1
                if not has_predictions:
                    print(f"\rMissing {algo.upper()} file: {layer_dir}")

print(f"\rFound {count} out of {total} files.")

Found 4746 out of 4746 files.


In [4]:
import pickle


# Print the shape of one stored embedding per model. `models[:-1]` leaves out the
# last entry of `models` ("ohe").
for model in models[:-1]:
    sample_path = ROOT / model / "esol" / "layer_0" / "P00000.pkl"
    # pickle can execute arbitrary code on load; only use it on files under ROOT that
    # this project wrote itself.
    with sample_path.open("rb") as f:
        embedding = pickle.load(f)
    print(model, embedding.shape)

esm_t6 (320,)
esm_t12 (480,)
esm_t30 (640,)
esm_t33 (1280,)
esm_t36 (2560,)
esmc_300m (960,)
esmc_600m (1152,)
ankh-base (768,)
ankh-large (1536,)
prostt5 (1024,)
prott5 (1024,)


In [3]:
# Count the correlation result files of the "scope_40_208" dataset for every model and layer.
count, total = 0, 0
for m, model in enumerate(models):
    for layer in range(model_to_depth(model)):
        print(f"\r{m}-{layer} | {count}/{total}", end=" "*10)

        # Build the layer directory once. Keeping the path out of the f-strings also avoids
        # reusing double quotes inside a double-quoted f-string, which is a SyntaxError on
        # Python versions before 3.12.
        layer_dir = ROOT / model / "scope_40_208" / f"layer_{layer}"

        # Disabled (kept for reference): skip layers whose embedding marker file is missing.
        # if not (layer_dir / f"{last['scope_40_208']}.pkl").exists():
        #     print(f"\rMissing embedding: {layer_dir}")
        #     continue

        # Disabled (kept for reference): count Intrinsic Dimension calculations.
        # has_ids = (layer_dir / "ids.csv").exists()
        # count += has_ids
        # if not has_ids:
        #     print(f"\rMissing IDs file: {layer_dir}")
        # total += 1

        # Check for downstream results: one correlations file per classification level.
        for form in ["fold", "superfamily"]:
            has_correlations = (layer_dir / f"correlations_{form}_4.pkl").exists()
            count += has_correlations
            total += 1
            if not has_correlations:
                print(f"\rMissing {form} file: {layer_dir}")

print(f"\rFound {count} out of {total} files.")

Missing fold file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_0
Missing superfamily file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_0
Missing fold file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_1
Missing superfamily file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_1
Missing fold file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_2
Missing superfamily file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_2
Missing fold file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_3
Missing superfamily file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_3
Missing fold file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_4
Missing superfamily file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm_t6/scope_40_208/layer_4
Missing fold file: /scratch/SCRATCH_SAS/roman/SMTB/embeddings/esm