In [None]:
# Bootstrap repo into /content (safe even if re-run)
from pathlib import Path
import os
import shutil
import subprocess

# Clone source and checkout location; both can be overridden through environment variables.
REPO_URL = os.getenv('CAFA_REPO_GIT_URL', 'https://github.com/PeterOla/cafa-6-protein-function-prediction.git')
REPO_DIR = Path(os.getenv('CAFA_REPO_DIR', '/content/cafa-6-protein-function-prediction'))
# Working directory used while running git: /content when it exists, otherwise the filesystem root.
SAFE_CWD = Path('/content')
if not SAFE_CWD.exists():
    SAFE_CWD = Path('/')

def run(cmd: list[str]) -> None:
    """Echo and execute a command, relaying whatever it wrote.

    The command is started with SAFE_CWD as its working directory. Its
    stdout and stderr are captured and printed (stdout first) once the
    process has finished, skipping streams that are blank.

    Args:
        cmd: Program and arguments, one list element per argument.

    Raises:
        RuntimeError: If the process exits with a non-zero status.
    """
    rendered = ' '.join(cmd)
    print('+', rendered)
    proc = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=str(SAFE_CWD),
    )
    for stream in (proc.stdout, proc.stderr):
        # Whitespace-only output is not worth echoing.
        if stream.strip():
            print(stream)
    if proc.returncode:
        raise RuntimeError(f'Command failed (exit={proc.returncode}): {rendered}')

# Move to a stable directory first so the process is not sitting inside a
# checkout that may be removed below.
os.chdir(SAFE_CWD)
print('SAFE_CWD:', SAFE_CWD)
print('REPO_URL:', REPO_URL)
print('REPO_DIR:', REPO_DIR)

git_in_repo = ['git', '-C', str(REPO_DIR)]
if not REPO_DIR.exists() or not (REPO_DIR / '.git').is_dir():
    # No usable checkout: discard whatever is at the target path and clone afresh.
    if REPO_DIR.exists():
        shutil.rmtree(REPO_DIR, ignore_errors=True)
    run(['git', 'clone', '--depth', '1', REPO_URL, str(REPO_DIR)])
else:
    # Existing checkout: fetch shallowly and hard-reset to origin/HEAD,
    # which discards any local modifications.
    run(git_in_repo + ['fetch', '--depth', '1', 'origin'])
    run(git_in_repo + ['reset', '--hard', 'origin/HEAD'])

# Record the repo root in the environment and make it the working directory.
os.environ['CAFA_REPO_ROOT'] = str(REPO_DIR)
os.chdir(REPO_DIR)
print('CWD:', Path.cwd())

In [None]:
# Install dependencies
# Quietly (-q) installs everything listed in requirements.txt. The path is
# relative, so it is resolved against the current working directory.
%pip -q install -r requirements.txt

In [None]:
# Phase 1 (minimal): parse FASTA → feather (so downstream scripts can run)
from pathlib import Path
import pandas as pd
from Bio import SeqIO

# Output layout: artefacts_local/artefacts/parsed/ holds the parsed sequence tables.
artefacts_dir = Path('artefacts_local', 'artefacts')
parsed_dir = artefacts_dir.joinpath('parsed')
parsed_dir.mkdir(exist_ok=True, parents=True)

# FASTA inputs (relative to the working directory) and the feather file built from each.
train_fasta = Path('Train') / 'train_sequences.fasta'
test_fasta = Path('Test') / 'testsuperset.fasta'
train_out = parsed_dir.joinpath('train_seq.feather')
test_out = parsed_dir.joinpath('test_seq.feather')

def fasta_to_feather(inp: Path, out: Path) -> None:
    """Parse a FASTA file and store its records as a feather table.

    Each record becomes one row with two string columns: ``id`` (the
    record id) and ``sequence`` (the record sequence).

    The table is first written to a sibling ``<name>.tmp`` file and then
    renamed onto ``out``. Callers in this notebook treat the mere existence
    of ``out`` as "already built", so a failed or interrupted write must
    not leave a truncated file at the final path.

    Args:
        inp: Path of the FASTA file to read.
        out: Destination feather file; parent directories are created.

    Raises:
        RuntimeError: If no sequences were parsed from ``inp``.
    """
    rows = [
        {'id': str(rec.id), 'sequence': str(rec.seq)}
        for rec in SeqIO.parse(str(inp), 'fasta')
    ]
    df = pd.DataFrame(rows)
    if df.empty:
        raise RuntimeError(f'Parsed 0 sequences from {inp}')
    out.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so the final rename stays on one filesystem.
    tmp = out.with_name(out.name + '.tmp')
    try:
        df.to_feather(tmp)
        tmp.replace(out)
    finally:
        # No-op after a successful rename; removes the partial file on failure.
        tmp.unlink(missing_ok=True)
    print('Wrote:', out, 'shape=', df.shape)

# Build each feather file only when it is not already on disk (train first, then test).
for fasta_path, feather_path in ((train_fasta, train_out), (test_fasta, test_out)):
    if feather_path.exists():
        print('Exists:', feather_path)
    else:
        print('Building:', feather_path)
        fasta_to_feather(fasta_path, feather_path)

In [None]:
# NCBI settings (recommended)
import os
# Keep an NCBI_EMAIL that is already set; otherwise fall back to the placeholder address.
os.environ.setdefault('NCBI_EMAIL', 'your.email@example.com')
# os.environ['NCBI_API_KEY'] = '...'  # optional
print('NCBI_EMAIL:', os.environ.get('NCBI_EMAIL'))

In [None]:
# Build EntryID → text corpus (UniProt + PubMed abstracts)
# Produces: artefacts_local/artefacts/external/entryid_text.tsv
# Start small with --max-ids, then remove/raise it for a full run.
# The script path is relative, so this must run from the repo root.
# NOTE(review): the flag semantics (--strip-go, --sleep-uniprot, --sleep-pubmed,
# --max-pubmed-per-protein) are defined by the script, which is not shown here.
# NOTE(review): a failing `!` command presumably does not stop the notebook —
# check its output before moving on.
!python scripts/03_build_entryid_text_from_uniprot_pubmed.py --max-ids 1000 --max-pubmed-per-protein 3 --strip-go --sleep-uniprot 0.1 --sleep-pubmed 0.34

In [None]:
# Generate TF‑IDF embeddings (10279D) from entryid_text.tsv
# --text-path points at the TSV written by the corpus-building step, and
# --text-dim is passed the same 10279 named in the heading above.
# Both paths are relative, so this must run from the repo root.
!python scripts/02_generate_optional_embeddings.py --mode text --text-path artefacts_local/artefacts/external/entryid_text.tsv --text-dim 10279

In [None]:
# Optional: execute the solution notebook in this same runtime.
# If it is Kaggle-specific, it may need path tweaks before it will run on Colab.
# Uncomment the next line to run it; the executed copy is written under /content.
# !jupyter nbconvert --to notebook --execute notebooks/CAFA6_Rank1_Solution.ipynb --output /content/CAFA6_Rank1_Solution.executed.ipynb
# NOTE(review): this message is printed unconditionally — it does not check
# that the corpus or the TF‑IDF output actually exist.
print('Ready: corpus + TF‑IDF are built. Run your solution next.')