# CAFA6 — Build EntryID → Text Corpus (UniProt + PubMed)

Produces `artefacts_local/artefacts/external/entryid_text.tsv` for the TF-IDF text modality (10,279-dimensional).

Notes:
- This step is network- and CPU-bound; a GPU is not needed.
- If you’re running on a remote runtime (e.g. Colab), the repo must exist on that runtime’s filesystem.

In [None]:
# Colab bootstrap: get the repo onto this runtime (/content)
# If you are running in VS Code attached to a Colab kernel, this is required
# because your local Windows filesystem is NOT visible on the Colab VM.

from pathlib import Path
import os

REPO_URL = 'https://github.com/PeterOla/cafa-6-protein-function-prediction.git'
REPO_DIR = Path('/content/cafa-6-protein-function-prediction')

if not REPO_DIR.exists():
    print('Cloning repo into:', REPO_DIR)
    !git clone --depth 1 {REPO_URL} {REPO_DIR}
else:
    print('Repo already present:', REPO_DIR)

os.environ['CAFA_REPO_ROOT'] = str(REPO_DIR)
os.chdir(REPO_DIR)
print('CAFA_REPO_ROOT:', os.environ.get('CAFA_REPO_ROOT'))
print('CWD:', Path.cwd())

In [None]:
# Resolve and validate repo root
from pathlib import Path
import os

# Take the root from CAFA_REPO_ROOT when set; otherwise use the Colab clone path.
repo_root = Path(
    os.environ.get('CAFA_REPO_ROOT', '/content/cafa-6-protein-function-prediction')
)

# A usable checkout must contain both requirements.txt and a scripts/ directory.
requirements_file = repo_root / 'requirements.txt'
scripts_dir = repo_root / 'scripts'
if not requirements_file.is_file() or not scripts_dir.is_dir():
    message = (
        f'Repo root not found at {repo_root}. '
        'Run the previous bootstrap cell (git clone), or set CAFA_REPO_ROOT.'
    )
    raise FileNotFoundError(message)

# Relative paths used by later cells are resolved against the repo root.
os.chdir(repo_root)
print('Repo root:', repo_root)
print('CWD:', Path.cwd())

In [None]:
# Path diagnostics: what does this runtime actually see?
from pathlib import Path
import os
import sys

# Interpreter and working-directory facts for this runtime.
print('Python:', sys.version)
print('Executable:', sys.executable)
print('CWD:', Path.cwd())

# Directories to probe: the working directory, the home directory, and a few
# fixed absolute locations.
home_dir = Path.home()
candidates = [
    Path.cwd(),
    home_dir,
    home_dir / 'Documents',
    Path('/content'),
    Path('/workspace'),
    Path('/kaggle'),
]

for p in candidates:
    # Best-effort probe: any failure while checking is reported as "not there".
    try:
        exists = p.exists()
    except Exception:
        exists = False
    print(f'-- {p} (exists={exists})')

    # Only existing directories get a listing.
    if not (exists and p.is_dir()):
        continue

    try:
        # Show at most the first 50 entry names, alphabetically.
        children = sorted(entry.name for entry in p.iterdir())[:50]
        print('   children[0:50]:', children)
    except Exception as e:
        print('   listdir error:', repr(e))

print('CAFA_REPO_ROOT env:', os.environ.get('CAFA_REPO_ROOT'))

In [None]:
# Install dependencies (skip if your kernel already has them)
%pip -q install -r requirements.txt

## Configure NCBI settings
NCBI recommends supplying a contact email. If you have an NCBI API key, set it too (higher rate limits).

In [None]:
import os
# NCBI contact email, exported through the process environment; change if needed.
os.environ.update({'NCBI_EMAIL': 'Olalerupeter@gmail.com'})
# Optional API key: uncomment the next line and fill in your key.
# os.environ.update({'NCBI_API_KEY': '...'})

## Build corpus
Tip: start with `--max-ids 1000` to validate end-to-end, then remove it for full scale.

In [None]:
# Build EntryID -> text corpus (UniProt + PubMed abstracts)
# Produces: artefacts_local/artefacts/external/entryid_text.tsv
!python scripts/03_build_entryid_text_from_uniprot_pubmed.py --max-ids 1000 --max-pubmed-per-protein 3 --strip-go --sleep-uniprot 0.1 --sleep-pubmed 0.34

In [None]:
# Quick sanity check (won’t crash if the TSV isn’t produced yet)
from pathlib import Path
import pandas as pd

p = Path('artefacts_local/artefacts/external/entryid_text.tsv')
if p.is_file():
    # Corpus is present: load the TSV and print a short summary.
    df = pd.read_csv(p, sep='\t')
    print('Shape:', df.shape)
    print(df.head(3))
    # Missing values are treated as empty text before measuring length.
    text_lengths = df['text'].fillna('').str.len()
    print('Non-empty text rows:', int(text_lengths.gt(0).sum()))
else:
    # Corpus is absent: report what is on disk instead of raising.
    print('Missing:', p)
    ext_dir = p.parent
    print('External dir exists:', ext_dir.exists(), ext_dir)
    if ext_dir.exists():
        listing = sorted(entry.name for entry in ext_dir.iterdir())
        print('External dir contents:', listing[:50])
    print('Run the previous cell to build the corpus. If it failed, scroll that cell output.')


In [None]:
# Package output + caches (Windows-safe zip)
from pathlib import Path
import zipfile

# Archive to create (in the current working directory) and the inputs to pack:
# the corpus TSV plus the UniProt/PubMed cache directory.
out_zip = Path('entryid_text_and_cache.zip')
paths = [
    Path('artefacts_local/artefacts/external/entryid_text.tsv'),
    Path('artefacts_local/artefacts/external/uniprot_pubmed_cache'),
]

# Inputs that do not exist are still skipped, but now reported, so an empty or
# partial archive is never produced silently.
missing = [candidate for candidate in paths if not candidate.exists()]
for p in missing:
    print('Skipping (not found):', p)

n_files = 0
with zipfile.ZipFile(out_zip, 'w', compression=zipfile.ZIP_DEFLATED) as z:
    for p in paths:
        if p.is_file():
            members = [p]
        elif p.is_dir():
            # Sorted so the member order does not depend on directory
            # enumeration order.
            members = sorted(fp for fp in p.rglob('*') if fp.is_file())
        else:
            members = []
        for fp in members:
            # Store each member under its relative path with forward slashes.
            z.write(fp, fp.as_posix())
            n_files += 1

print('Wrote:', out_zip.resolve())
print('Files archived:', n_files)

In [None]:
# Optional: verify GPU (not required for this notebook).
# Runs the `nvidia-smi` command in the runtime's shell and shows its output;
# nothing else in this notebook depends on the result.
!nvidia-smi