In [None]:
import hashlib, os, re

def verify_md5(file_path: str, chunk_size: int = 1 << 20):
    """Check a file against the MD5 digest(s) listed in its ``.md5`` sidecar.

    The sidecar is expected at ``file_path + ".md5"``. Every line of it is
    scanned for a standalone 32-character hexadecimal string; the check passes
    when the file's own digest equals any of them (case-insensitively).

    Args:
        file_path: Path of the file to verify.
        chunk_size: Number of bytes read per iteration while hashing, so the
            file is never loaded into memory in one piece.

    Returns:
        None. A confirmation line is printed on success.

    Raises:
        Exception: If either file cannot be read, the sidecar contains no MD5
            digest, or the digests differ. The underlying error is attached
            as ``__cause__``.
    """
    md5_file = file_path + ".md5"
    try:
        digest = hashlib.md5()
        with open(file_path, "rb") as f:
            # iter() with a sentinel stops at the empty bytes object read() returns at EOF.
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
        local = digest.hexdigest()

        # The lookarounds reject 32-char slices of longer hex runs (e.g. a SHA-256 digest).
        md5_pattern = re.compile(r"(?<![a-fA-F0-9])[a-fA-F0-9]{32}(?![a-fA-F0-9])")
        with open(md5_file) as f:
            refs = [m.group(0).lower() for line in f if (m := md5_pattern.search(line))]

        if not refs:
            raise ValueError(f"no MD5 digest found in {md5_file}")
        if local not in refs:
            raise ValueError(f"MD5 mismatch: {local} not in {refs}")
    except Exception as e:
        # Keep the original exception chained so the root cause stays in the traceback.
        raise Exception(f"MD5 verification failed for {file_path}: {e}") from e

    print(f"✅ MD5 checksum for {file_path} matches.")


In [None]:
# Create the download directory; -p makes this a no-op when it already exists.
!mkdir -p ./downloads

# Download ClinVar (GRCh37)

In [None]:
# Fetch the ClinVar VCF archive into ./downloads, resuming a partial download if one exists.
!wget --continue --directory-prefix=./downloads https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz

In [None]:
# Fetch the MD5 sidecar for the ClinVar archive into the same directory.
!wget --continue --directory-prefix=./downloads https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz.md5

In [None]:
# Compare the downloaded archive with the digest(s) in ./downloads/clinvar.vcf.gz.md5;
# verify_md5 raises if they do not match, which stops a Run All before decompression.
verify_md5("./downloads/clinvar.vcf.gz")

In [None]:
# Note: the decompressed file is about 1.6 GB.

In [None]:
# Decompress the ClinVar archive in place, keeping the .gz and overwriting any earlier output.
!gzip --decompress --keep --force ./downloads/clinvar.vcf.gz

# Download some real 23andMe data

In [None]:
# Fetch the Kaggle "family-genome-dataset" archive.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error page as the .zip;
# --location follows any redirects issued by the download endpoint.
!curl --fail --location --output ./downloads/family-genome-dataset.zip https://www.kaggle.com/api/v1/datasets/download/zusmani/family-genome-dataset

In [None]:
# Verify the Kaggle archive against its known MD5 digest.
# Hashing is done with hashlib rather than the external `md5sum` binary, so the check does not
# depend on that tool being installed, and the assertion message reports both digests.
FAMILY_GENOME_ZIP = "./downloads/family-genome-dataset.zip"
FAMILY_GENOME_ZIP_MD5 = "3d0ddd9afe1ac5a9f30f1b0358755d73"

zip_digest = hashlib.md5()
with open(FAMILY_GENOME_ZIP, "rb") as zip_file:
    # Read in 1 MiB chunks so the whole archive is never held in memory.
    for chunk in iter(lambda: zip_file.read(1 << 20), b""):
        zip_digest.update(chunk)

assert zip_digest.hexdigest() == FAMILY_GENOME_ZIP_MD5, (
    f"MD5 mismatch for {FAMILY_GENOME_ZIP}: "
    f"expected {FAMILY_GENOME_ZIP_MD5}, got {zip_digest.hexdigest()}"
)

In [None]:
# Background on the 23andMe raw-data format: https://jade-cheng.com/au/23andme-to-plink/

In [None]:
# Extract the archive into its own directory; -o overwrites existing files without prompting,
# so the cell can be re-run.
!unzip -o ./downloads/family-genome-dataset.zip -d ./downloads/family-genome-dataset

In [None]:
# Partial, sparse clone of biovault-data limited to snp/23andme (cone mode also keeps the
# repository's top-level files).
# - The clone is skipped when ./biovault-data already exists, so the cell can be re-run.
# - The sparse-checkout cone is configured BEFORE `git checkout main`: with --filter=blob:none,
#   checking out first would fetch the blobs of every file in the repository.
!test -d biovault-data/.git || \
    git clone --filter=blob:none --no-checkout https://github.com/OpenMined/biovault-data.git && \
    cd biovault-data \
    && git sparse-checkout init --cone \
    && git ls-tree -d --name-only origin/main snp/23andme >/dev/null \
    && git sparse-checkout set snp/23andme \
    && git checkout main