In [1]:
import hashlib, os, re

def verify_md5(file_path: str) -> None:
    """Verify ``file_path`` against the digest(s) listed in ``file_path + ".md5"``.

    The file is hashed with MD5 and the result is compared, case-insensitively,
    against every 32-hex-digit token found in the sidecar ``.md5`` file (at most
    one token per line). A success message is printed when a match is found.

    Args:
        file_path: Path of the file to check. The reference digest is read from
            the sibling file with ``.md5`` appended to this path.

    Raises:
        Exception: If either file cannot be read, the ``.md5`` file contains no
            digest, or the computed digest is not among the reference digests.
            The underlying error is attached as ``__cause__``.
    """
    try:
        md5_file = file_path + ".md5"

        # Hash in fixed-size chunks so large downloads are never held in memory whole.
        hasher = hashlib.md5()
        with open(file_path, "rb") as f:
            while chunk := f.read(1 << 20):
                hasher.update(chunk)
        local = hasher.hexdigest()

        # The lookarounds require a standalone 32-digit run, so the prefix of a
        # longer hex string (e.g. a SHA-1 digest) is not mistaken for an MD5.
        with open(md5_file) as f:
            refs = [
                m.group(0).lower()
                for line in f
                if (m := re.search(r"(?<![a-fA-F0-9])[a-fA-F0-9]{32}(?![a-fA-F0-9])", line))
            ]

        if not refs:
            raise ValueError(f"no MD5 digest found in {md5_file}")
        if local not in refs:
            raise ValueError(f"MD5 mismatch: {local} not in {refs}")
        print(f"✅ MD5 checksum for {file_path} matches.")
    except Exception as e:
        # Chain the original error so the root cause stays visible in the traceback.
        raise Exception(f"MD5 verification failed for {file_path}: {e}") from e


In [2]:
# Create the download directory; -p makes this a no-op when it already exists.
!mkdir -p ./downloads

# Download ClinVar (GRCh37)

In [3]:
# Fetch the ClinVar VCF into ./downloads (-P). -c continues a partial download
# and does nothing when the local file is already complete.
# NOTE(review): -c only continues from the local file's current length, so it
# does not replace the file when the upstream release changes — confirm this
# is intended, or delete the local copy to re-download.
!wget -cP ./downloads https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz

--2025-09-09 15:17:37--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::7, 2607:f220:41e:250::10, 2607:f220:41e:250::12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::7|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [4]:
# Fetch the published MD5 sidecar for clinvar.vcf.gz into ./downloads (-P);
# -c skips the transfer when the local copy is already complete.
!wget -cP ./downloads https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz.md5

--2025-09-09 15:17:38--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz.md5
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::31, 2607:f220:41e:250::10, 2607:f220:41e:250::12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::31|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [5]:
# Check the downloaded archive against ./downloads/clinvar.vcf.gz.md5 before
# decompressing it; verify_md5 raises if the digests do not match.
verify_md5("./downloads/clinvar.vcf.gz")

✅ MD5 checksum for ./downloads/clinvar.vcf.gz matches.


In [6]:
# clinvar.vcf is about 1.6 GB once decompressed

In [7]:
# Decompress to ./downloads/clinvar.vcf: -d decompresses, -k keeps the .gz,
# -f overwrites an existing output file so re-running the cell does not fail.
!gzip -dkf ./downloads/clinvar.vcf.gz

# Download some real 23andMe data

In [8]:
# Download the Kaggle "family-genome-dataset" archive: -L follows redirects,
# -o writes the response to the given path. Unlike the wget -c cells, this
# transfers the file again on every run.
!curl -L -o ./downloads/family-genome-dataset.zip https://www.kaggle.com/api/v1/datasets/download/zusmani/family-genome-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 26.2M  100 26.2M    0     0  6680k      0  0:00:04  0:00:04 --:--:-- 9297k


In [9]:
# Pin the Kaggle archive to the MD5 digest recorded in this notebook so that a
# changed or truncated download fails with a readable message.
expected_md5 = "3d0ddd9afe1ac5a9f30f1b0358755d73"
output = !md5sum ./downloads/family-genome-dataset.zip
# md5sum prints "<digest>  <path>"; compare the digest field exactly, and
# tolerate empty output (e.g. the command printed nothing) instead of raising IndexError.
actual_md5 = output[0].split()[0] if output and output[0].split() else None
assert actual_md5 == expected_md5, (
    f"MD5 mismatch for ./downloads/family-genome-dataset.zip: "
    f"expected {expected_md5}, md5sum output was {list(output)}"
)

In [10]:
# Background on the 23andMe raw-data format: https://jade-cheng.com/au/23andme-to-plink/

In [11]:
# Extract the archive into ./downloads/family-genome-dataset (-d); -o overwrites
# existing files without prompting so the cell can be re-run unattended.
!unzip -o ./downloads/family-genome-dataset.zip -d ./downloads/family-genome-dataset

Archive:  ./downloads/family-genome-dataset.zip
  inflating: ./downloads/family-genome-dataset/Child 1 Genome.csv  
  inflating: ./downloads/family-genome-dataset/Child 2 Genome.csv  
  inflating: ./downloads/family-genome-dataset/Child 3 Genome.csv  
  inflating: ./downloads/family-genome-dataset/Father Genome.csv  
  inflating: ./downloads/family-genome-dataset/Mother Genome.csv  
