# CAFA6 — Generate Optional Embedding Artefacts

Generates optional `.npy` artefacts under `artefacts_local/artefacts/features/` using `scripts/02_generate_optional_embeddings.py`.

Modes:
- `--mode esm2_3b` (ESM2-3B)
- `--mode ankh` (Ankh)
- `--mode text` (10279D TF-IDF; requires `artefacts_local/artefacts/external/entryid_text.tsv`)

In [None]:
# Colab bootstrap: get the repo onto this runtime (/content).
# Clones the repository when it is missing, records its location in the
# CAFA_REPO_ROOT environment variable, and makes it the working directory.
from pathlib import Path
import os

REPO_URL = 'https://github.com/PeterOla/cafa-6-protein-function-prediction.git'
REPO_DIR = Path('/content/cafa-6-protein-function-prediction')

# Skip the clone when the target directory already exists.
if not REPO_DIR.exists():
    print('Cloning repo into:', REPO_DIR)
    # IPython shell escape: {REPO_URL} and {REPO_DIR} are expanded from the
    # Python variables above. --depth 1 requests a shallow clone.
    !git clone --depth 1 {REPO_URL} {REPO_DIR}
else:
    print('Repo already present:', REPO_DIR)

# Publish the checkout location through the environment, then switch into it.
# NOTE(review): if the clone failed, REPO_DIR does not exist and os.chdir
# raises FileNotFoundError here.
os.environ['CAFA_REPO_ROOT'] = str(REPO_DIR)
os.chdir(REPO_DIR)
print('CAFA_REPO_ROOT:', os.environ.get('CAFA_REPO_ROOT'))
print('CWD:', Path.cwd())

In [None]:
# Locate the repository checkout and confirm it looks like the CAFA repo.
# The path comes from CAFA_REPO_ROOT, falling back to the default Colab location.
from pathlib import Path
import os

repo_root = Path(os.getenv('CAFA_REPO_ROOT', '/content/cafa-6-protein-function-prediction'))

# A usable checkout has a requirements.txt file and a scripts/ directory at its
# top level; fail early with an actionable message when either is missing.
if not repo_root.joinpath('requirements.txt').is_file() or not repo_root.joinpath('scripts').is_dir():
    raise FileNotFoundError(
        f'Repo root not found at {repo_root}. '
        'Run the previous bootstrap cell (git clone), or set CAFA_REPO_ROOT.'
    )

# Work from the repo root so relative paths such as requirements.txt and
# scripts/ resolve against the checkout.
os.chdir(repo_root)
print('Repo root:', repo_root)
print('CWD:', Path.cwd())

In [None]:
# Install dependencies (skip if your kernel already has them).
# requirements.txt is given as a relative path, so the current working
# directory must be the repo root when this runs. -q reduces pip's output.
%pip -q install -r requirements.txt

## 1) Generate TF-IDF text embeddings (10279D)
Requires `artefacts_local/artefacts/external/entryid_text.tsv`, which is produced by the Colab_01 notebook.

In [None]:
# TF-IDF (text) embeddings.
# Runs the generator script in text mode via an IPython shell escape; the script
# path is relative, so the working directory must be the repo root.
# NOTE(review): --text-dim 10279 is presumably the output vector dimensionality
# (the section heading says 10279D) — confirm against the script's argument parser.
!python scripts/02_generate_optional_embeddings.py --mode text --text-dim 10279

## 2) Generate protein embeddings (optional)
These require a GPU and can take hours, depending on the model and the dataset size.

In [None]:
# Both commands below are disabled by default; remove the leading "# " from the
# one you want to run.

# ESM2-3B (GPU recommended)
# !python scripts/02_generate_optional_embeddings.py --mode esm2_3b --batch-size 1

# Ankh-Large (GPU recommended)
# !python scripts/02_generate_optional_embeddings.py --mode ankh --batch-size 1