In [None]:
import os
import glob
import shutil
import subprocess
import h5py
import time

# --- Pipeline switches ------------------------------------------------------
# RUN_SETUP gates the install/download section further down in this cell.
RUN_SETUP = True
# When true, POD5 files / per-file BAMs already on disk are reused.
SKIP_EXISTING_POD5 = True
SKIP_EXISTING_BAM = True

# --- Locations --------------------------------------------------------------
INPUT_DIR = "/content/drive/MyDrive/fast 5/DeepSignal"   # directory scanned for *.fast5
OUTPUT_DIR = "/content/deepsignal_paper"                 # all generated files go here
POD5_DIR = os.path.join(OUTPUT_DIR, "pod5")              # scratch area for converted POD5
REF_PATH = os.path.join(OUTPUT_DIR, "grch38_chr20.fa")
BISULFITE_BED = os.path.join(OUTPUT_DIR, "NA12878_bisulfite.bed")

# --- Dorado basecalling parameters (passed on the dorado command line) ------
MODEL_NAME = "dna_r10.4.1_e8.2_400bps_hac@v4.3.0"
MODEL_CACHE = "/root/.dorado/models"
MODS = "5mCG_5hmCG"      # value for --modified-bases
DEVICE = "cuda:0"        # value for --device
MIN_QSCORE = 9           # value for --min-qscore
BATCHSIZE = 64           # value for --batchsize

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(POD5_DIR, exist_ok=True)

# Exported for child processes; same directory as MODEL_CACHE.
os.environ["DORADO_MODEL_CACHE"] = MODEL_CACHE
# Guarantee PATH exists so later code can prepend to it unconditionally.
os.environ.setdefault("PATH", "")

# Runs a command, returns (rc, stdout, stderr)
def run_cmd(cmd, shell=False, capture=False):
    """Run an external command and report how it finished.

    Args:
        cmd: A list/tuple of arguments (executed directly, without a shell)
            or any other value, typically a string, which is executed
            through the shell.
        shell: Accepted for backward compatibility only; the execution mode
            is derived from the type of ``cmd``.
        capture: When true, stdout and stderr are collected as text.
            Otherwise the child inherits the caller's streams and both
            returned streams are ``None``.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple. When the program cannot be
        started at all, the shell conventions are used instead of raising:
        127 for a missing executable, 126 for any other launch error. This
        keeps the argument-list path consistent with the shell path, where
        the shell itself reports a missing command through its exit status.
    """
    use_shell = not isinstance(cmd, (list, tuple))
    try:
        proc = subprocess.run(cmd, shell=use_shell, capture_output=capture, text=True)
    except OSError as exc:
        rc = 127 if isinstance(exc, FileNotFoundError) else 126
        if capture:
            return rc, "", str(exc)
        # Nothing is returned to the caller in this mode, so surface the
        # reason here rather than losing it.
        print(f"[run_cmd] could not launch {cmd!r}: {exc}")
        return rc, None, None

    if capture:
        return proc.returncode, proc.stdout, proc.stderr
    return proc.returncode, None, None

# Builds mapping of read IDs to FAST5 file paths
def build_read_to_fast5_mapping(fast5_dir):
    """Index the reads stored in the ``.fast5`` files directly under *fast5_dir*.

    Every file is opened read-only with h5py. Each top-level group whose name
    starts with ``read_`` is treated as one read, and the text after that
    prefix is used as the read ID.

    Args:
        fast5_dir: Directory to scan (not recursive).

    Returns:
        Dict mapping read ID -> path of the FAST5 file containing it. Files
        are visited in sorted name order, so when a read ID occurs in more
        than one file the alphabetically last file wins.

    Raises:
        OSError: If *fast5_dir* itself cannot be listed.

    Files that cannot be opened or parsed are reported and skipped; any
    reads indexed from such a file before the failure are kept.
    """
    prefix = "read_"
    read_to_fast5 = {}
    for fname in sorted(os.listdir(fast5_dir)):
        if not fname.endswith(".fast5"):
            continue
        path = os.path.join(fast5_dir, fname)
        try:
            with h5py.File(path, "r") as f:
                for read_group in f.keys():
                    if read_group.startswith(prefix):
                        # Strip only the leading prefix; str.replace would
                        # also eat any later "read_" inside the ID.
                        read_to_fast5[read_group[len(prefix):]] = path
        except Exception as e:
            # Best effort: one bad file must not abort the whole scan, but
            # the user should know it was ignored.
            print(f"[warn] skipping unreadable FAST5 {path}: {e}")
    return read_to_fast5

if RUN_SETUP:
    !apt-get update -qq
    !apt-get install -y samtools
    !pip install -q pod5 h5py pysam pandas numpy scipy
    !nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
    bam_file = os.path.join(OUTPUT_DIR, "hg002_aligned.bam")
    if os.path.exists(bam_file):
        try:
            os.remove(bam_file)
            if os.path.exists(f"{bam_file}.bai"):
                os.remove(f"{bam_file}.bai")
        except Exception as e:
            pass
    os.makedirs("/root/.dorado/models", exist_ok=True)
    if not os.path.exists(REF_PATH):
        !wget -qO- http://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.20.fa.gz | gunzip -c > {REF_PATH}
        !samtools faidx {REF_PATH}
    if not os.path.exists(BISULFITE_BED):
        os.system(f'wget -qO- https://www.encodeproject.org/files/ENCFF835NTC/@@download/ENCFF835NTC.bed.gz | gunzip -c > {BISULFITE_BED}')
    dorado_bin = "/content/dorado/bin/dorado"
    if not os.path.exists(dorado_bin):
        !wget -qO- https://cdn.oxfordnanoportal.com/software/analysis/dorado-0.9.6-linux-x64.tar.gz | tar -xz
        !mv dorado-0.9.6-linux-x64 /content/dorado
        os.environ['PATH'] = "/content/dorado/bin:" + os.environ['PATH']
    rc, out, err = run_cmd(["/content/dorado/bin/dorado", "--version"], capture=True) if os.path.exists("/content/dorado/bin/dorado") else run_cmd(["dorado", "--version"], capture=True)
    env = os.environ.copy()
    env["DORADO_MODEL_CACHE"] = "/root/.dorado/models"
    try:
        subprocess.run(["dorado", "download", "--model", MODEL_NAME], check=True, env=env)
    except subprocess.CalledProcessError as e:
        pass
    candidate = f"/content/{MODEL_NAME}"
    if os.path.exists(candidate):
        dest = os.path.join("/root/.dorado/models", os.path.basename(candidate))
        if not os.path.exists(dest):
            shutil.move(candidate, "/root/.dorado/models")
    found = glob.glob(os.path.join("/root/.dorado/models", MODEL_NAME + "*"))
    read_to_fast5_map = build_read_to_fast5_mapping(INPUT_DIR)
else:
    if not os.path.exists(REF_PATH):
        raise FileNotFoundError("Reference not found. Either run RUN_SETUP=True or upload the ref file.")
    if not os.path.exists(MODEL_CACHE):
        pass
    read_to_fast5_map = build_read_to_fast5_mapping(INPUT_DIR)

# Part B: convert every FAST5 to POD5, basecall + align it with dorado and
# collect the per-file BAMs that look usable (larger than 1000 bytes).
# NOTE: output names are derived from the position in the sorted file list,
# so reuse of temp_<i>.bam across runs assumes the same set of input files.
fast5_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.fast5")))
bams = []

for i, f5 in enumerate(fast5_files):
    base = os.path.basename(f5)
    tmp_dir = os.path.join(POD5_DIR, f"tmp_{i}")
    out_bam = os.path.join(OUTPUT_DIR, f"temp_{i}.bam")

    # Check for a reusable BAM before doing any work, so a finished file
    # does not pay for another POD5 conversion.
    if SKIP_EXISTING_BAM and os.path.exists(out_bam) and os.path.getsize(out_bam) > 1000:
        bams.append(out_bam)
        shutil.rmtree(tmp_dir, ignore_errors=True)
        continue

    os.makedirs(tmp_dir, exist_ok=True)
    try:
        pod5_out = os.path.join(tmp_dir, os.path.splitext(base)[0] + ".pod5")
        if not (SKIP_EXISTING_POD5 and os.path.exists(pod5_out)):
            rc, out, err = run_cmd(["pod5", "convert", "fast5", "-o", pod5_out, f5], capture=True)
            if rc != 0:
                print(f"[warn] pod5 convert failed for {base} (exit {rc}): {(err or '').strip()[-500:]}")
                continue

        # Prefer the file we asked for; fall back to whatever POD5 the
        # converter left in the scratch directory.
        if os.path.exists(pod5_out):
            p5 = pod5_out
        else:
            pod5_candidates = sorted(glob.glob(os.path.join(tmp_dir, "*.pod5")))
            if not pod5_candidates:
                print(f"[warn] no POD5 produced for {base}; skipping")
                continue
            p5 = pod5_candidates[0]

        basecall_cmd = [
            "dorado", "basecaller", MODEL_NAME,
            p5,
            "--reference", REF_PATH,
            "--modified-bases", MODS,
            "--device", DEVICE,
            "--min-qscore", str(MIN_QSCORE),
            "--batchsize", str(BATCHSIZE),
        ]
        # Write to a side file and rename only on success, so an interrupted
        # or failed run can never leave a truncated BAM that a later run
        # would accept through SKIP_EXISTING_BAM.
        partial_bam = out_bam + ".partial"
        with open(partial_bam, "wb") as outfh:
            proc = subprocess.run(basecall_cmd, stdout=outfh, stderr=subprocess.PIPE, text=True)
        if proc.returncode != 0:
            print(f"[warn] dorado failed for {base} (exit {proc.returncode}): {(proc.stderr or '').strip()[-500:]}")
            if os.path.exists(partial_bam):
                os.remove(partial_bam)
            continue
        os.replace(partial_bam, out_bam)

        if os.path.getsize(out_bam) > 1000:
            bams.append(out_bam)
        else:
            print(f"[warn] {out_bam} is suspiciously small; not merged")
    finally:
        # The scratch directory is always discarded, whatever happened above.
        shutil.rmtree(tmp_dir, ignore_errors=True)

# Part C: combine the per-file BAMs into one sorted, indexed final.bam.
if not bams:
    raise RuntimeError("No BAMs produced in Part B.")

final_bam = os.path.join(OUTPUT_DIR, "final.bam")
bam_list_file = os.path.join(OUTPUT_DIR, "bam_list.txt")
with open(bam_list_file, "w") as f:
    f.writelines(b + "\n" for b in bams)

# samtools index only works on coordinate-sorted input, and the per-file BAMs
# are written straight from dorado's stdout without a sort step (dorado's
# aligned output is presumably unsorted - see its README). So merge into a
# scratch file first and sort that into final.bam.
merged_unsorted = os.path.join(OUTPUT_DIR, "final.unsorted.bam")
rc, out, err = run_cmd(["samtools", "merge", "-f", "-b", bam_list_file, merged_unsorted], capture=True)
if rc != 0:
    raise RuntimeError(f"samtools merge failed: {(err or '').strip()}")

rc, out, err = run_cmd(["samtools", "sort", "-o", final_bam, merged_unsorted], capture=True)
if rc != 0:
    raise RuntimeError(f"samtools sort failed: {(err or '').strip()}")
if os.path.exists(merged_unsorted):
    os.remove(merged_unsorted)

rc, out, err = run_cmd(["samtools", "index", final_bam], capture=True)
if rc != 0:
    raise RuntimeError(f"samtools index failed: {(err or '').strip()}")

# Record count of the final BAM; the result is left in rc/out/err.
rc, out, err = run_cmd(["samtools", "view", "-c", final_bam], capture=True)