In [1]:
# Install all the project dependencies
# `uv` is the tool invoked by the next two commands; `toml` is imported by the
# config-editing cells further down in this notebook.
!pip install uv toml
# Sync the project's dependencies. The recorded output of this cell shows uv
# creating a virtual environment at `.venv`.
# NOTE(review): that `.venv` may not be the environment this kernel runs in —
# confirm the synced packages are importable from the notebook.
!uv sync
# Install the project in the current directory together with its `cuda` extra.
!uv pip install '.[cuda]'

# --- NEW FIX for PyTorch Geometric ---
# The 'torch-geometric' library needs special packages (pyg-lib or torch-cluster)
# that are in a separate repository. We must install them manually
# after the main packages are installed.
#
# The wheels are selected by PyTorch version and CUDA tag, and they are
# installed with the *kernel's own interpreter* (`sys.executable -m pip`) so
# they land in the same environment that `import torch` was resolved from.

print("🔧 Figuring out PyTorch/CUDA versions to install PyG dependencies...")

try:
    import os  # not used below; kept so `os` stays defined for later cells, as before
    import shlex
    import subprocess
    import sys

    import torch

    # 1. Get PyTorch version, dropping any local build tag
    # (e.g., "2.8.0+cu121" -> "2.8.0")
    torch_version = torch.__version__.split('+')[0]

    # 2. Get CUDA version
    if torch.cuda.is_available():
        # (e.g., "12.1")
        cuda_version_str = torch.version.cuda
        if cuda_version_str:
            # (e.g., "cu121")
            cuda_full = f"cu{cuda_version_str.replace('.', '')}"
        else:
            print("⚠️ PyTorch reports CUDA is available, but version is unknown. Falling back to CPU.")
            cuda_full = "cpu"
    else:
        print("🧑‍💻 No CUDA detected. Installing CPU-only version of PyG helpers.")
        cuda_full = "cpu"

    print(f"✅ Detected PyTorch {torch_version} and {cuda_full}")

    # 3. Construct the install command
    # The -f (find-links) flag tells pip to look for these packages at the
    # special PyG wheel index for this exact torch/CUDA combination.
    wheel_index_url = f"https://data.pyg.org/whl/torch-{torch_version}+{cuda_full}.html"
    install_args = [
        sys.executable, "-m", "pip", "install",
        "torch-cluster", "pyg-lib",
        "-f", wheel_index_url,
    ]
    # Human-readable form of the argument list, for logging only.
    install_command = shlex.join(install_args)

    print(f"🚀 Running: {install_command}")

    # Run the command without a shell. A bare `pip` on PATH may belong to a
    # different Python than this kernel; `sys.executable -m pip` cannot.
    # `returncode` is the real exit code (os.system returned an encoded
    # wait status instead).
    install_status = subprocess.run(install_args, check=False).returncode

    if install_status != 0:
        print(f"🚨 Install command failed with status {install_status}. Please check errors.")
    else:
        print("✅ PyG dependencies (torch-cluster, pyg-lib) installed successfully!")

except ImportError:
    print("🚨 Failed to import PyTorch. Please ensure 'uv sync' ran correctly.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Collecting uv
  Downloading uv-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting toml
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Downloading uv-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m227.8 MB/s[0m  [33m0:00:00[0m
[?25hDownloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: uv, toml
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [toml]
[1A[2KSuccessfully installed toml-0.10.2 uv-0.9.5
Using CPython [36m3.11.14[39m
Creating virtual environment at: [36m.venv[39m
[2mResolved [1m102 packages[0m [2min 2ms[0m[0m
[2K[2mPrepared [1m98 packages[0m [2min 33.69s[0m[0m                                           
[2K[2mInstalled [1m98 packages[0m [2min 340ms[0m[0m                              [0m
 [32m+[39m [1maccelerate[0m[2m==1.10.1[0m


In [2]:
!mkdir -p data/raw/

print("⬇️ Downloading ClinVar (variant_summary.txt.gz)...")
!wget -q "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz" -O "data/raw/variant_summary.txt.gz"

print("⬇️ Downloading GRCh38 Genome (GCF_000001405.40_GRCh38.p14_genomic.fna.gz)...")
!wget -q "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz" -O "data/raw/GCF_000001405.40_GRCh38.p14_genomic.fna.gz"

print("⬇️ Downloading Gene Ontology (goa_human.gaf.gz)...")
!wget -q "https://current.geneontology.org/annotations/goa_human.gaf.gz" -O "data/raw/goa_human.gaf.gz"

print("⬇️ Downloading GO Hierarchy (go.json)...")
!wget -q "https://purl.obolibrary.org/obo/go.json" -O "data/raw/go.json"

print("✅ All raw data downloaded!")

⬇️ Downloading ClinVar (variant_summary.txt.gz)...
⬇️ Downloading GRCh38 Genome (GCF_000001405.40_GRCh38.p14_genomic.fna.gz)...
⬇️ Downloading Gene Ontology (goa_human.gaf.gz)...
⬇️ Downloading GO Hierarchy (go.json)...
✅ All raw data downloaded!


In [17]:
import os

fasta_gz_path = "data/raw/GCF_000001405.40_GRCh38.p14_genomic.fna.gz"
fasta_path = "data/raw/GCF_000001405.40_GRCh38.p14_genomic.fna"

if os.path.exists(fasta_gz_path):
    print(f"🧬 Decompressing the genome FASTA file: {fasta_gz_path}...")
    print("This might take a minute...")
    # Use gunzip to decompress the file
    !gunzip -k {fasta_gz_path}
    print(f"✅ FASTA file decompressed to: {fasta_path}")
    # Verify it exists
    if not os.path.exists(fasta_path):
         print(f"🚨 Error: Decompression failed. {fasta_path} not found.")
    else:
        # Optionally remove the compressed version to save space, but let's keep it for now
        # !rm {fasta_gz_path}
        pass
elif os.path.exists(fasta_path):
    print(f"✅ FASTA file already decompressed at: {fasta_path}")
else:
    print(f"🚨 Error: Cannot find FASTA file at {fasta_gz_path} or {fasta_path}. Please run Cell 2 first.")

🧬 Decompressing the genome FASTA file: data/raw/GCF_000001405.40_GRCh38.p14_genomic.fna.gz...
This might take a minute...
✅ FASTA file decompressed to: data/raw/GCF_000001405.40_GRCh38.p14_genomic.fna


In [3]:
!mkdir -p data/processed/

print("🧬 Starting GO Node2Vec embedding generation... (This may take a few minutes)")

!python src/go/go_node2vec.py \
    --go-json data/raw/go.json \
    --gaf data/raw/goa_human.gaf.gz \
    --out-prefix data/processed/go_n2v \
    --dim 256 \
    --epochs 8 \
    --batch-size 384 \
    --walk-len 40 \
    --walks-per-node 12 \
    --ctx-size 6 \
    --neg-samples 5 \
    --aspect PF \
    --drop-roots \
    --prune-term-degree 800

print("✅ GO embeddings created at data/processed/go_n2v_genes.npz")

🧬 Starting GO Node2Vec embedding generation... (This may take a few minutes)
Parsing GO (is_a only) + GAF…
  term edges (is_a): 60,096 | gene-term edges (taxon=9606, aspects=FP): 694,422
Pruned isolates: 18,267
Graph: 62,439 nodes, 274,239 edges
Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  total += float(loss)
epoch 1/8: 100%|██████████████████████████████| 163/163 [00:10<00:00, 14.98it/s]
epoch 1 avg loss 8.2448
epoch 2/8: 100%|██████████████████████████████| 163/163 [00:09<00:00, 17.06it/s]
epoch 2 avg loss 2.9974
epoch 3/8: 100%|██████████████████████████████| 163/163 [00:09<00:00, 17.31it/s]
epoch 3 avg loss 1.3573
epoch 4/8: 100%|██████████████████████████████| 163/163 [00:09<00:00, 17.17it/s]
epoch 4 avg loss 0.9868
epoch 5/8: 100%|██████████████████████████████| 163/163 [00:09<00:00, 17.62it/s]
epoch 5 avg loss 0.8974
epoch 6/8: 100%|██████████████████████████████| 163/163 [00:09<00:00, 

In [18]:
import toml
import os

# --- CHOOSE YOUR CONFIG ---
# Pick which pipeline config to patch: the 'full' one or the 'missense' one.
CONFIG_NAME = 'pipeline.full.toml'
# CONFIG_NAME = 'pipeline.missense.toml'
# --------------------------

# Location of the selected config; later cells reuse this variable.
config_path = os.path.join('configs', CONFIG_NAME)

# Values written into the [Paths] table so the pipeline reads the files
# produced by the download / preprocessing cells.
path_overrides = {
    'clinvar': 'data/raw/variant_summary.txt.gz',  # stays compressed
    'fasta': 'data/raw/GCF_000001405.40_GRCh38.p14_genomic.fna',  # decompressed genome
    'go': 'data/processed/go_n2v_genes.npz',  # GO embeddings
}

if os.path.exists(config_path):
    print(f"Updating {config_path} with correct file paths...")

    # Read the current settings ...
    with open(config_path, 'r') as config_file:
        config = toml.load(config_file)

    # ... overwrite the three path entries ...
    config['Paths'].update(path_overrides)

    # ... and write the whole document back to the same file.
    with open(config_path, 'w') as config_file:
        toml.dump(config, config_file)

    print(f"✅ Config file {config_path} updated successfully!")
else:
    print(f"🚨 Error: Config file not found at {config_path}")
    print("Please make sure your 'configs' folder is uploaded correctly.")

Updating configs/pipeline.full.toml with correct file paths...
✅ Config file configs/pipeline.full.toml updated successfully!


In [19]:
# Prepare a `vep_cache` directory (relative to the current working directory).
# NOTE(review): the later cells in this notebook point the config and the Docker
# mount at /home/zeus/vep_cache instead — confirm this relative directory is
# still needed.

# 1. Ensure the directory and its subdirectory exist
# (`-p` creates missing parents and does not fail if they already exist.)
!mkdir -p vep_cache/Plugins

# 2. Give it full read/write/execute permissions for everyone
# This is the simplest way to solve the Docker user permission issue
# NOTE(review): mode 777 applied recursively makes the whole tree
# world-writable; tighten if this runs on a shared machine.
!chmod -R 777 vep_cache

In [20]:
import toml
import os

# --- Point the pipeline config at a writeable VEP cache directory ---
# Relies on the 'config_path' variable set in Cell 4.
if 'config_path' in locals() and os.path.exists(config_path):
    # Home-directory location on the server, which IS writeable
    writeable_cache_path = "/home/zeus/vep_cache"

    print(f"🔧 Fixing VEP cache path in {config_path}...")

    # Read the existing settings
    with open(config_path, 'r') as config_file:
        config = toml.load(config_file)

    # Redirect 'vep_cache_dir' to the writeable location
    protein_section = config['Protein']
    protein_section['vep_cache_dir'] = writeable_cache_path

    # Persist the modified settings back to the same file
    with open(config_path, 'w') as config_file:
        toml.dump(config, config_file)

    print(f"✅ Config file updated. VEP cache will now use {writeable_cache_path}")
else:
    print("🚨 Error: 'config_path' is not set or the file is missing.")
    print("Please make sure you have run Cell 4 successfully before running this cell.")

🔧 Fixing VEP cache path in configs/pipeline.full.toml...
✅ Config file updated. VEP cache will now use /home/zeus/vep_cache


In [15]:
import os

# --- This cell downloads the VEP cache data, based on your .sh script ---
# It creates host directories, then runs VEP's INSTALL.pl inside the
# `ensemblorg/ensembl-vep` Docker image with those directories mounted.

# 1. Define the writeable paths in your home directory
# NOTE(review): the cache path is also hard-coded in the config-fixing cell;
# the two values must stay in sync.
writeable_cache_path = "/home/zeus/vep_cache"
writeable_data_path = "/home/zeus/vep_data" # From your script

# 2. Create all the directories Docker will need
# (`tmp` is the directory the container's TMPDIR points at via the mount below.)
!mkdir -p {writeable_cache_path}/tmp
!mkdir -p {writeable_data_path}
!mkdir -p {writeable_cache_path}/Plugins # Pre-make the plugins dir

print(f"✅ Created directories at {writeable_cache_path} and {writeable_data_path}")

print(f"⬇️ Downloading VEP cache & FASTA for Homo Sapiens (GRCh38)...")
print(f"This is a one-time download (10-20GB+) and will be saved to {writeable_cache_path}")
print("Please be patient, this will take a long time...")

# 3. Run the adapted docker install command from your .sh file
# We use our writeable_cache_path instead of $(pwd)/vep_cache
#   --rm                 remove the container when it exits
#   -it                  interactive + TTY
#   --user 1000:1000     run as uid/gid 1000 inside the container
#                        (NOTE(review): presumably matches the host user that
#                        owns the mounted directories — confirm)
#   -e TMPDIR=...        temp files go to the mounted cache dir's tmp/ folder
#   -v cache:/opt/vep/.vep, -v data:/data    host directories mounted in
#   bash -lc '...'       `set -euo pipefail` aborts on the first failing step,
#                        then INSTALL.pl runs for homo_sapiens / GRCh38
#                        (`-a cf` presumably selects cache + FASTA, matching
#                        the message printed above — confirm in INSTALL.pl docs)
!docker run \
    --rm -it \
    --user 1000:1000 \
    -e TMPDIR=/opt/vep/.vep/tmp \
    -v "{writeable_cache_path}:/opt/vep/.vep" \
    -v "{writeable_data_path}:/data" \
    ensemblorg/ensembl-vep \
    bash -lc 'set -euo pipefail; \
      cd /opt/vep/src/ensembl-vep; \
      perl INSTALL.pl -a cf -s homo_sapiens -y GRCh38 --ASSEMBLY GRCh38 --CACHEDIR /opt/vep/.vep'

# NOTE(review): this message is printed regardless of how the docker command
# exited; the recorded output of this cell ends with a "cannot remove
# directory" line, so check the log above before trusting it.
print("✅ VEP cache and FASTA download complete!")

✅ Created directories at /home/zeus/vep_cache and /home/zeus/vep_data
⬇️ Downloading VEP cache & FASTA for Homo Sapiens (GRCh38)...
This is a one-time download (10-20GB+) and will be saved to /home/zeus/vep_cache
Please be patient, this will take a long time...
 - getting list of available cache files
 - downloading https://ftp.ensembl.org/pub/release-115/variation/indexed_vep_cache/homo_sapiens_vep_115_GRCh38.tar.gz
 - unpacking homo_sapiens_vep_115_GRCh38.tar.gz
 - converting cache, this may take some time but will allow VEP to look up variants and frequency data much faster
 - use CTRL-C to cancel if you do not wish to convert this cache now (you may run convert_cache.pl later)
2025-10-23 12:01:19 - Processing homo_sapiens
2025-10-23 12:01:19 - Processing version 115_GRCh38
2025-10-23 12:01:19 - No unprocessed types remaining, skipping
2025-10-23 12:01:19 - All done!
Looks like you already have the FASTA file for homo_sapiens, skipping
cannot remove directory for /opt/vep/.vep/tmp/h

In [21]:
import toml
import os

# --- This cell uses the 'config_path' variable set in Cell 4 ---
if 'config_path' not in locals() or not os.path.exists(config_path):
    print("🚨 Error: 'config_path' is not set or the file is missing.")
    print("Please make sure you have run Cell 4 successfully before running this cell.")
else:
    # Get the config path from the previous cell
    with open(config_path, 'r') as f:
        config = toml.load(f)

    print(f"🚀 Starting main pipeline using {config_path}...")
    print(f"All artifacts will be saved in: {config['Paths']['artifacts']}")
    print(f"LLM model outputs will be in: {config['LLM']['out_dir']}")

    # Run the master training script
    # We use 'python -u' to make the output print in real-time
    !python -u train.py --config {config_path}

    print("🎉🎉🎉 Pipeline Finished! 🎉🎉🎉")

🚀 Starting main pipeline using configs/pipeline.full.toml...
All artifacts will be saved in: artifacts/full
LLM model outputs will be in: artifacts/full/qwen3
[config] loaded from /teamspace/studios/this_studio/configs/pipeline.full.toml
[config] artifacts dir: /teamspace/studios/this_studio/artifacts/full
[config] device: cuda
[splits] reusing cached ClinVar splits
  [train] /teamspace/studios/this_studio/artifacts/full/splits/clinvar_train.feather (rows=272799)
  [val] /teamspace/studios/this_studio/artifacts/full/splits/clinvar_val.feather (rows=35383)
  [test] /teamspace/studios/this_studio/artifacts/full/splits/clinvar_test.feather (rows=31852)
[vep] reuse cached annotations for train: /teamspace/studios/this_studio/artifacts/full/vep/train/vep_combined.feather
  [vep-report] train: rows=287,418 hgvsc=95.1% hgvsp=67.3% MANE_Select=99.6% seq_wt_missing=2.5% seq_mt_missing=38.8%
               top consequences: synonymous variant: 113825, intron variant: 45985, missense variant: 419