<a href="https://colab.research.google.com/github/VUzan-bio/DNA-Bacteria-JEPA/blob/main/dna_jepa_bacteria.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Enable GPU
import torch
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Cell 2: Clone your repo
!git clone https://github.com/VUzan-bio/DNA-Bacteria-JEPA.git
%cd DNA-Bacteria-JEPA

# Cell 3: Install dependencies
!pip install -q pandas numpy scipy scikit-learn tqdm
# PyTorch is already installed in Colab with CUDA support

# Cell 4: Verify imports work
from src.cas12a.tokenizer import Cas12aTokenizer, TokenizerConfig
from src.cas12a.encoder import SparseTransformerEncoder
print("Imports successful")

GPU available: True
GPU: NVIDIA A100-SXM4-40GB
Cloning into 'DNA-Bacteria-JEPA'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 59 (delta 4), reused 55 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (59/59), 23.56 MiB | 16.42 MiB/s, done.
Resolving deltas: 100% (4/4), done.
/content/DNA-Bacteria-JEPA
Imports successful


In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [2]:
# Run download script
!python scripts/download_bacterial_genomes.py \
    --output-dir data/raw/bacterial_genomes

# Extract pretraining sequences
!python scripts/extract_pretraining_sequences.py \
    --genome-dir data/raw/bacterial_genomes \
    --output-csv data/processed/pretrain_sequences.csv \
    --window-size 512 \
    --stride 256


NCBI Bacterial Genome Downloader
Output directory: /content/DNA-Bacteria-JEPA/data/raw/bacterial_genomes
Total accessions: 8

[1/8]
[downloading] Escherichia_coli_K12_MG1655 (NC_000913.3)
[ok] NC_000913.3_Escherichia_coli_K12_MG1655.fasta (4.49 MB)

[2/8]
[downloading] Bacillus_subtilis_168 (NC_000964.3)
[ok] NC_000964.3_Bacillus_subtilis_168.fasta (4.08 MB)

[3/8]
[downloading] Pseudomonas_putida_KT2440 (NC_002947.4)
[ok] NC_002947.4_Pseudomonas_putida_KT2440.fasta (5.98 MB)

[4/8]
[downloading] Streptomyces_coelicolor_A3 (NC_003888.3)
[ok] NC_003888.3_Streptomyces_coelicolor_A3.fasta (8.38 MB)

[5/8]
[downloading] Lactobacillus_acidophilus_NCFM (NC_006814.1)
[ok] NC_006814.1_Lactobacillus_acidophilus_NCFM.fasta (1.93 MB)

[6/8]
[downloading] Staphylococcus_aureus_NCTC8325 (NC_007795.1)
[ok] NC_007795.1_Staphylococcus_aureus_NCTC8325.fasta (2.73 MB)

[7/8]
[downloading] Escherichia_coli_536 (NC_008253.1)
[ok] NC_008253.1_Escherichia_coli_536.fasta (4.78 MB)

[8/8]
[downloading] Acinet

In [4]:
!sed -i 's/total_mem/total_memory/g' /content/DNA-Bacteria-JEPA/scripts/01_pretrain_jepa.py
!grep "total_memory" /content/DNA-Bacteria-JEPA/scripts/01_pretrain_jepa.py

# 1. First, save checkpoints to Google Drive so you don't lose them again
from google.colab import drive
drive.mount('/content/drive')

!mkdir -p "/content/drive/MyDrive/DNA-JEPA-checkpoints"

!python /content/DNA-Bacteria-JEPA/scripts/01_pretrain_jepa.py \
    --data-path data/processed/pretrain_sequences.csv \
    --epochs 200 --batch-size 512 --lr 6e-4 \
    --warmup-epochs 10 --weight-decay 0.05 \
    --ema-decay-start 0.996 --ema-decay-end 1.0 \
    --mask-ratio 0.30 --num-mask-blocks 4 \
    --sim-weight 1.0 --var-weight 25.0 --cov-weight 0.04 \
    --precision auto --save-every 25

        print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Device: cuda
  GPU: NVIDIA A100-SXM4-40GB
  Memory: 42.4 GB
  _C._set_float32_matmul_precision(precision)
Tokenizer: vocab_size=9
Loaded 146223 sequences from /content/DNA-Bacteria-JEPA/data/processed/pretrain_sequences.csv
Dataset: 146,223 sequences
Encoder: 8.5M params | Predictor: 0.7M params

Config: batch=512 x accum=1 = eff_batch=512, precision=bfloat16
Schedule: 200 epochs, warmup=10 epochs, peak_lr=6.0e-04, min_lr=1.0e-06
EMA: tau=0.996 -> 1.0 (cosine)
VICReg: sim=1.0, var=25.0, cov=0.04
Masking: ratio=0.3, blocks=4, min_len=3

  Starting pretraining: 200 epochs remaining

Epoch 1/200: 100%|█████████████████████| 285/285 [00:50<00:00,  5.60it/s, loss=2.576, inv=1.762, var=0.0020, lr=6.0e-05]
  output = torch._nested_tensor_from_mask(

  Epoch 1/200 (50.9s, ETA 

In [6]:
# ── 1. Install UMAP (t-SNE comes with sklearn) ──
!pip install umap-learn -q

# ── 2. After pretraining finishes, generate t-SNE + UMAP (4 panels each) ──
#    Colored by: Genome, KMeans Clusters, GC Content, Sequence Length
!cd /content/DNA-Bacteria-JEPA && python scripts/03_visualize_embeddings.py \
    --checkpoint /content/DNA-Bacteria-JEPA/checkpoints/pretrain/checkpoint_epoch200.pt \
    --data-path data/processed/pretrain_sequences.csv \
    --max-samples 5000 \
    --n-clusters 10 \
    --output-dir figures/

# ── 3. Training curves (6-panel: loss, RankMe, VICReg components, LR, pred_std) ──
#    Option A: from log file (copy your training output to a .log file)
!cd /content/DNA-Bacteria-JEPA && python scripts/04_plot_training_curves.py \
    --log-file training_output.log \
    --output figures/training_curves.png

#    Option B: from checkpoints only (fewer panels but no log needed)
!cd /content/DNA-Bacteria-JEPA && python scripts/04_plot_training_curves.py \
    --checkpoint-dir /content/drive/MyDrive/DNA-JEPA-checkpoints/ \
    --output figures/training_curves.png

Device: cuda
Loaded 146223 sequences from /content/DNA-Bacteria-JEPA/data/processed/pretrain_sequences.csv
Sampled 5,000 sequences for visualization
Loaded checkpoint: epoch 199
  output = torch._nested_tensor_from_mask(
Embeddings shape: (5000, 384)
KMeans: 10 clusters

Running t-SNE...
  t-SNE done (KL divergence: 1.43)
Saved: /content/DNA-Bacteria-JEPA/figures/tsne_embeddings_epoch200.png
2026-02-12 10:58:51.671959: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-12 10:58:51.690375: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770893931.712753   51505 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting 

In [7]:
# ── Training curves from LOCAL checkpoints ──
!cd /content/DNA-Bacteria-JEPA && python scripts/04_plot_training_curves.py \
    --checkpoint-dir /content/DNA-Bacteria-JEPA/checkpoints/pretrain/ \
    --output /content/drive/MyDrive/DNA-JEPA-checkpoints/figures/training_curves.png

Parsed 8 checkpoints
Saved: /content/drive/MyDrive/DNA-JEPA-checkpoints/figures/training_curves.png


In [None]:
%cd /content/DNA-Bacteria-JEPA

from src.cas12a.tokenizer import Cas12aTokenizer, TokenizerConfig
tok = Cas12aTokenizer(TokenizerConfig())
print([m for m in dir(tok) if not m.startswith('_')])

In [None]:
%cd /content/DNA-Bacteria-JEPA
!python /content/DNA-Bacteria-JEPA/scripts/plot_jepa_results.py \
    --checkpoint checkpoints/pretrain/checkpoint_epoch100.pt \
    --metrics checkpoints/pretrain/pretrain_metrics.csv \
    --data-path data/processed/pretrain_sequences.csv \
    --num-samples 5000 \
    --output-dir figures/