In [1]:
# clinvar format
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
# 1	66926	3385321	AG	A	.	.	ALLELEID=3544463;CLNDISDB=Human_Phenotype_Ontology:HP:0000547,MONDO:MONDO:0019200,MeSH:D012174,MedGen:C0035334,OMIM:268000,OMIM:PS268000,Orphanet:791;CLNDN=Retinitis_pigmentosa;CLNHGVS=NC_000001.10:g.66927del;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGSCV=SCV005419006;CLNVC=Deletion;CLNVCSO=SO:0000159;GENEINFO=OR4F5:79501;MC=SO:0001627|intron_variant;ORIGIN=0

In [2]:
# 23andme format
# rsid,chromosome,position,genotype
# rs12564807,1,734462,AA

In [3]:
# Let's filter ClinVar and keep only the records whose (chromosome, position) appears in work/merged_rsids_positions.csv

In [4]:
import random
import csv
from pathlib import Path

VCF_IN   = Path("downloads/clinvar.vcf")
CSV_PATH = Path("work/merged_rsids_positions.csv")
VCF_OUT  = Path("work/clinvar.23andme.vcf")
UNMATCH_SAMPLE = Path("work/unmatched_sample.csv")
SAMPLE_SIZE = 100  # maximum number of unmatched CSV rows exported for manual inspection
SAMPLE_SEED = 0    # fixed seed so Restart & Run All writes the same sample every time

# --- load CSV targets ---
# `targets` holds the unique (chromosome, position) pairs used for matching;
# `all_rows` keeps every CSV row (duplicates included) for the unmatched report.
# NOTE(review): matching is on exact chromosome strings, so this assumes the CSV
# names chromosomes the same way the VCF does (e.g. "1", not "chr1") — confirm.
targets: set[tuple[str,int]] = set()
all_rows: list[dict] = []
with CSV_PATH.open(newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        targets.add((row["chromosome"], int(row["position"])))
        all_rows.append(row)

print(f"Loaded {len(targets):,} targets")

# --- filter VCF ---
# Header lines are copied through unchanged; a data line is kept when its
# (CHROM, POS) is one of the targets. Several records can share a position,
# so `kept` counts records while `matched` collects the distinct keys hit.
kept = 0
total = 0
matched: set[tuple[str,int]] = set()
VCF_OUT.parent.mkdir(parents=True, exist_ok=True)
with VCF_IN.open("r", encoding="utf-8", errors="replace") as fin, VCF_OUT.open("w", encoding="utf-8") as fout:
    for line in fin:
        if line.startswith("#"):
            fout.write(line)
            continue
        # Only CHROM and POS are needed, so stop splitting after two tabs.
        parts = line.rstrip("\n").split("\t", 2)
        if len(parts) < 2:
            # Blank or truncated line: not a variant, so it is not counted.
            continue
        total += 1
        key = (parts[0], int(parts[1]))  # a non-numeric POS raises ValueError (fail loudly)
        if key in targets:
            fout.write(line)
            kept += 1
            matched.add(key)

print(f"Processed {total:,} VCF variants. Kept {kept:,}.")

# --- find unmatched ---
unmatched = [row for row in all_rows if (row["chromosome"], int(row["position"])) not in matched]

# Take a reproducible random sample of up to SAMPLE_SIZE rows; a private
# Random instance avoids touching the global random state.
sample_size = min(SAMPLE_SIZE, len(unmatched))
sample = random.Random(SAMPLE_SEED).sample(unmatched, sample_size)

# Save the sample to CSV, and only report a file as written when one was.
if sample:
    with UNMATCH_SAMPLE.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(sample[0].keys()))
        writer.writeheader()
        writer.writerows(sample)
    print(f"Unmatched rows: {len(unmatched):,}. Wrote {sample_size} sample rows -> {UNMATCH_SAMPLE}")
else:
    print(f"Unmatched rows: {len(unmatched):,}. No sample written.")


Loaded 1,056,671 targets
Processed 3,683,762 VCF variants. Kept 65,246.
Unmatched rows: 1,004,529. Wrote 100 sample rows -> work/unmatched_sample.csv


In [5]:
# OK, so Dawn says this is plausible: many hundreds of thousands of the SNPs
# that 23andMe tests for may be non-medical, ancestry-based markers with no clinical
# significance, which would explain why most targets have no ClinVar match.
# https://www.reddit.com/r/23andme/comments/3dd3lp/snp_coverage_analysiscomparisons_23andme_v3v4

In [6]:
# Let's save this as a .vcf.gz

In [7]:
!head ./downloads/clinvar.23andme.vcf

##fileformat=VCFv4.1
##fileDate=2025-09-07
##source=ClinVar
##reference=GRCh37
##ID=<Description="ClinVar Variation ID">
##INFO=<ID=AF_ESP,Number=1,Type=Float,Description="allele frequencies from GO-ESP">
##INFO=<ID=AF_EXAC,Number=1,Type=Float,Description="allele frequencies from ExAC">
##INFO=<ID=AF_TGP,Number=1,Type=Float,Description="allele frequencies from TGP">
##INFO=<ID=ALLELEID,Number=1,Type=Integer,Description="the ClinVar Allele ID">
##INFO=<ID=CLNDN,Number=.,Type=String,Description="ClinVar's preferred disease name for the concept specified by disease identifiers in CLNDISDB">


In [8]:
# Create the output directory for the published artifacts; -p makes this a no-op if it already exists
!mkdir -p ./results

In [9]:
!bgzip -c ./work/clinvar.23andme.vcf > ./results/clinvar.23andme.vcf.gz
!tabix -p vcf ./results/clinvar.23andme.vcf.gz

In [10]:
# Sanity check: have bcftools read the whole compressed VCF and discard the output,
# so only warnings/errors (if any) appear below this cell
!bcftools view ./results/clinvar.23andme.vcf.gz > /dev/null

In [11]:
import hashlib
from pathlib import Path

def write_md5(path: str, chunk_size: int = 1 << 20):
    """Write an md5 checksum file next to ``path`` and print its content.

    The checksum file is named ``<original name>.md5`` and holds one line of
    the form ``<hex digest>  <file name>`` (two spaces, base name only).

    Args:
        path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration; the file is hashed in
            chunks so large archives are never fully loaded into memory.

    Raises:
        OSError: If ``path`` cannot be read or the checksum file cannot be written.
    """
    file = Path(path)
    digest = hashlib.md5()
    with file.open("rb") as fh:
        # iter() with a sentinel keeps reading until read() returns b"" at EOF
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    md5 = digest.hexdigest()
    # Construct ClinVar-style line
    line = f"{md5}  {file.name}\n"
    # Write alongside the file with .md5 appended to the existing suffix
    md5_path = file.with_suffix(file.suffix + ".md5")
    md5_path.write_text(line, encoding="utf-8")
    print(f"✅ Wrote {md5_path}")
    print(line.strip())

# Write the checksum file for the compressed, filtered VCF in ./results
write_md5("./results/clinvar.23andme.vcf.gz")

✅ Wrote results/clinvar.23andme.vcf.gz.md5
b8d480a2e595dabef0414501bd183485  clinvar.23andme.vcf.gz
