# Figure out if there are any duplicates

In [1]:
# clinvar format
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
# 1	66926	3385321	AG	A	.	.	ALLELEID=3544463;CLNDISDB=Human_Phenotype_Ontology:HP:0000547,MONDO:MONDO:0019200,MeSH:D012174,MedGen:C0035334,OMIM:268000,OMIM:PS268000,Orphanet:791;CLNDN=Retinitis_pigmentosa;CLNHGVS=NC_000001.10:g.66927del;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGSCV=SCV005419006;CLNVC=Deletion;CLNVCSO=SO:0000159;GENEINFO=OR4F5:79501;MC=SO:0001627|intron_variant;ORIGIN=0

In [2]:
# 23andme format
# rsid,chromosome,position,genotype
# rs12564807,1,734462,AA

In [3]:
!mkdir -p work

In [4]:
from pathlib import Path
import pandas as pd
import io
import sys
import re

DATA_DIR = Path("./downloads/family-genome-dataset")
WORK_DIR = Path("./work")
OUTPUT = WORK_DIR / "merged_rsids_positions.csv"
DUP_RSID_FILE = WORK_DIR / "duplicate_rsids.csv"
DUP_POS_FILE = WORK_DIR / "duplicate_positions.csv"

WORK_DIR.mkdir(parents=True, exist_ok=True)

def tuple_str(values):
    """Format a sequence like ('a','b') as (a,b,) with no quotes, trailing comma."""
    vals = [str(v) for v in values]
    return "(" + ",".join(vals) + ("," if len(vals) >= 1 else "") + ")"

csv_paths = sorted(DATA_DIR.glob("*.csv"))
all_dfs = []

for path in csv_paths:
    try:
        with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
            data_lines = [ln for ln in f if not ln.lstrip().startswith("#") and ln.strip()]
        if not data_lines:
            continue

        df = pd.read_csv(
            io.StringIO("".join(data_lines)),
            header=None,
            names=["rsid", "chromosome", "position", "genotype"],
            dtype={"rsid": "string", "chromosome": "string", "genotype": "string"},
            low_memory=False,
        )
        df = df[["rsid", "chromosome", "position"]].copy()
        df["position"] = pd.to_numeric(df["position"], errors="coerce").astype("Int64")
        df = df.dropna(subset=["rsid", "position"])

        all_dfs.append(df)
    except Exception as e:
        print(f"⚠️ Skipping {path.name}: {e}", file=sys.stderr)

if not all_dfs:
    pd.DataFrame(columns=["rsid", "chromosome", "position"]).to_csv(OUTPUT, index=False)
    print(f"⚠️ No records parsed. Wrote empty header to {OUTPUT}", file=sys.stderr)
    raise SystemExit(0)

# Merge and drop exact duplicates (rsid, chr, pos) silently
merged = pd.concat(all_dfs, ignore_index=True)
distinct = merged.drop_duplicates(subset=["rsid", "chromosome", "position"]).reset_index(drop=True)

# ---------- Drop all iXXXXX ids FIRST ----------
is_i_id = distinct["rsid"].str.match(r"^i\d+$", na=False)
i_count = int(is_i_id.sum())
print(f"Dropping iXXXXX IDs before duplicate checks: {i_count:,}")
distinct = distinct.loc[~is_i_id].copy()

# ---------- Duplicate RSIDs with different positions ----------
g = distinct.groupby("rsid", dropna=False)
dup_rsid_groups = g.filter(lambda d: d[["chromosome","position"]].drop_duplicates().shape[0] > 1)

if dup_rsid_groups.empty:
    dup_rsid_df = pd.DataFrame(columns=["rsid", "chromosomes", "positions"])
else:
    rows = []
    for rsid, sub in dup_rsid_groups.groupby("rsid"):
        pairs = sub[["chromosome","position"]].drop_duplicates().sort_values(by=["chromosome","position"])
        chroms = [str(c) for c in pairs["chromosome"].tolist()]
        poss = [str(int(p)) for p in pairs["position"].tolist()]
        rows.append({
            "rsid": rsid,
            "chromosomes": tuple_str(chroms),
            "positions": tuple_str(poss),
        })
    dup_rsid_df = pd.DataFrame(rows, columns=["rsid","chromosomes","positions"])

dup_rsid_df.to_csv(DUP_RSID_FILE, index=False)
print(f"Duplicate RSIDs (different positions): {len(dup_rsid_df):,} -> {DUP_RSID_FILE}")

# ---------- Duplicate positions with different RSIDs ----------
gp = distinct.groupby(["chromosome","position"], dropna=False)
dup_pos_groups = gp.filter(lambda d: d["rsid"].nunique() > 1)

if dup_pos_groups.empty:
    dup_pos_df = pd.DataFrame(columns=["rsid", "chromosomes", "positions"])
else:
    rows = []
    for (chr_, pos_), sub in dup_pos_groups.groupby(["chromosome","position"]):
        rsids = sub["rsid"].drop_duplicates().sort_values(
            key=lambda s: s.str.extract(r"(\d+)", expand=False).fillna("0").astype(int)
        ).tolist()
        chroms = sub["chromosome"].drop_duplicates().tolist()
        rows.append({
            "rsid": tuple_str(rsids),
            "chromosomes": tuple_str([str(c) for c in chroms]),
            "positions": str(int(pos_)),
        })
    dup_pos_df = pd.DataFrame(rows, columns=["rsid","chromosomes","positions"])\
                   .sort_values(by=["positions","rsid"]).reset_index(drop=True)

dup_pos_df.to_csv(DUP_POS_FILE, index=False)
print(f"Duplicate positions (multiple RSIDs): {len(dup_pos_df):,} -> {DUP_POS_FILE}")

final_df = distinct.copy()
final_df = final_df.sort_values(
    by=["chromosome", "position", "rsid"],
    key=lambda col: (
        pd.to_numeric(col, errors="coerce")
        if col.name in ["chromosome", "position"]
        else col
    ),
)

# Save final
final_df.to_csv(OUTPUT, index=False)
print(f"✅ Wrote {len(final_df):,} rows to {OUTPUT}")

Dropping iXXXXX IDs before duplicate checks: 60,880
Duplicate RSIDs (different positions): 0 -> work/duplicate_rsids.csv
Duplicate positions (multiple RSIDs): 35 -> work/duplicate_positions.csv
✅ Wrote 1,056,706 rows to work/merged_rsids_positions.csv


In [5]:
# it looks like there are a few positions with multiple rsids

In [6]:
!head work/duplicate_positions.csv

rsid,chromosomes,positions
"(rs62642906,rs62642946,)","(12,)",103310863
"(rs1131454,rs3741981,)","(12,)",113348870
"(rs770990,rs113777878,)","(12,)",133525460
"(rs8176719,rs56231711,)","(9,)",136132909
"(rs2822142,rs114956511,)","(21,)",15181318
"(rs61748415,rs61748416,)","(X,)",153296798
"(rs61748408,rs61748409,)","(X,)",153296811
"(rs9341274,rs373506129,)","(Y,)",15591446
"(rs57077886,rs58727209,)","(1,)",156084738


In [7]:
# https://www.ncbi.nlm.nih.gov/snp/?term=rs62642906 #indel
# https://www.ncbi.nlm.nih.gov/snp/?term=rs62642946 #snp

In [8]:
# lets check the first one to see whats up

In [9]:
import glob

def grep_csvs(search: str, base_dir: str = "downloads/**/*.csv"):
    """Search recursively through CSV files for a given string."""
    matches = []
    for path in glob.glob(base_dir, recursive=True):
        with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
            for ln in f:
                if search in ln:
                    matches.append((path, ln.strip()))
    return matches

In [10]:
for path, line in grep_csvs("103310863"):
    print(path, "→", line)

downloads/family-genome-dataset/Mother Genome.csv → rs62642906,12,103310863,II
downloads/family-genome-dataset/Mother Genome.csv → rs62642946,12,103310863,AA
downloads/family-genome-dataset/Child 1 Genome.csv → rs62642906,12,103310863,II
downloads/family-genome-dataset/Child 1 Genome.csv → rs62642946,12,103310863,AA
downloads/family-genome-dataset/Father Genome.csv → rs62642906,12,103310863,II
downloads/family-genome-dataset/Father Genome.csv → rs62642946,12,103310863,AA


In [11]:
# okay looks like some people have both which makes sense

In [12]:
# lets check another common one like APO-E pos 45411941

In [13]:
for path, line in grep_csvs("45411941"):
    print(path, "→", line)

downloads/family-genome-dataset/Mother Genome.csv → rs429358,19,45411941,TT
downloads/family-genome-dataset/Child 2 Genome.csv → rs429358,19,45411941,TT
downloads/family-genome-dataset/Child 1 Genome.csv → rs429358,19,45411941,TT
downloads/family-genome-dataset/Child 3 Genome.csv → rs429358,19,45411941,TT
downloads/family-genome-dataset/Father Genome.csv → rs429358,19,45411941,TT


In [14]:
# https://www.ncbi.nlm.nih.gov/snp/?term=rs429358
# """
# Alleles:T>C [Show Flanks]Chromosome:19:44908684 (GRCh38)
# 19:45411941 (GRCh37)
# """
# looks valid to me

In [15]:
# okay now lets compare each file to see if there are common subsets

In [16]:
from pathlib import Path
import pandas as pd
import io
import sys
import re
from collections import defaultdict, Counter

DATA_DIR = Path("./downloads/family-genome-dataset")
MERGED = Path("./work/merged_rsids_positions.csv")  # optional canonical positions
COMMON_OUT = Path("./work/rsids_common_subset.csv")

# Optional: load merged file (sanity check/use later if needed)
if MERGED.exists():
    merged_df = pd.read_csv(
        MERGED,
        dtype={"rsid": "string", "chromosome": "string"},
        low_memory=False,
    )
    merged_df["rsid"] = merged_df["rsid"].astype("string")
    # Make quick lookup: rsid -> (chromosome, position)
    merged_lookup = {
        r["rsid"]: (str(r["chromosome"]), int(r["position"]))
        for _, r in merged_df.dropna(subset=["rsid", "chromosome", "position"]).iterrows()
    }
else:
    merged_df = pd.DataFrame(columns=["rsid", "chromosome", "position"])
    merged_lookup = {}

# ---- load per-file unique RSIDs (rs* only) and keep chr/pos ----
csv_paths = sorted(DATA_DIR.glob("*.csv"))
if not csv_paths:
    print(f"⚠️ No CSVs found in {DATA_DIR}", file=sys.stderr)
    raise SystemExit(1)

file_rsids: dict[str, set[str]] = {}
file_maps: dict[str, dict[str, tuple[str, int]]] = {}  # filename -> {rsid: (chrom, pos)}

rs_re = re.compile(r"^rs\d+$")

for path in csv_paths:
    try:
        with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
            data_lines = [ln for ln in f if not ln.lstrip().startswith("#") and ln.strip()]
        if not data_lines:
            print(f"⚠️ {path.name} has no data rows after skipping comments.", file=sys.stderr)
            file_rsids[path.name] = set()
            file_maps[path.name] = {}
            continue

        df = pd.read_csv(
            io.StringIO("".join(data_lines)),
            header=None,
            names=["rsid", "chromosome", "position", "genotype"],
            usecols=[0, 1, 2],
            dtype={"rsid": "string", "chromosome": "string"},
            low_memory=False,
        )

        # Clean + filter to rsIDs
        df = df.dropna(subset=["rsid", "chromosome", "position"]).copy()
        df = df[df["rsid"].str.match(rs_re, na=False)]

        # Coerce position to int where possible
        # (23andMe files are integers; be robust if strings sneak in)
        df["position"] = pd.to_numeric(df["position"], errors="coerce").astype("Int64")
        df = df.dropna(subset=["position"]).copy()
        df["position"] = df["position"].astype(int)

        # Build set and map
        file_rsids[path.name] = set(df["rsid"].tolist())
        file_maps[path.name] = {
            r["rsid"]: (str(r["chromosome"]), int(r["position"]))
            for _, r in df.iterrows()
        }

    except Exception as e:
        print(f"⚠️ Skipping {path.name}: {e}", file=sys.stderr)
        file_rsids[path.name] = set()
        file_maps[path.name] = {}

if not file_rsids:
    print("⚠️ No RSIDs collected.", file=sys.stderr)
    raise SystemExit(1)

# ---- group files by identical RSID set ----
groups: dict[frozenset[str], list[str]] = defaultdict(list)
for fname, rset in file_rsids.items():
    groups[frozenset(rset)].append(fname)

all_identical = (len(groups) == 1)

print("\n=== RSID set identity groups (files sharing identical unique RSIDs) ===")
for i, (rset, files) in enumerate(groups.items(), start=1):
    print(f"Group {i}: {len(files)} file(s) -> {', '.join(sorted(files))}  [|RSIDs|={len(rset)}]")

if all_identical:
    print("\n✅ All files share an identical RSID set.")
else:
    print(f"\nℹ️ Found {len(groups)} distinct RSID sets across {len(file_rsids)} files.")

# ---- pairwise difference summary (counts only) ----
print("\n=== Pairwise RSID set comparison (counts) ===")
fnames = sorted(file_rsids.keys())
for i, fa in enumerate(fnames):
    for fb in fnames[i+1:]:
        A, B = file_rsids[fa], file_rsids[fb]
        inter = len(A & B)
        only_a = len(A - B)
        only_b = len(B - A)
        print(f"{fa} vs {fb}: |∩|={inter}, |A−B|={only_a}, |B−A|={only_b}")

# ---- compute RSIDs common to ALL files ----
common_rsids = None
for s in file_rsids.values():
    common_rsids = s if common_rsids is None else (common_rsids & s)
common_rsids = common_rsids or set()

# ---- choose chromosome/position per common rsid ----
# Preference: MERGED canonical -> majority vote across files -> first seen
rows = []
for rsid in sorted(common_rsids):
    if rsid in merged_lookup:
        chrom, pos = merged_lookup[rsid]
    else:
        # Collect all (chrom,pos) across files that have this rsid
        pairs = []
        for fmap in file_maps.values():
            if rsid in fmap:
                pairs.append(fmap[rsid])
        if pairs:
            chrom, pos = Counter(pairs).most_common(1)[0][0]
        else:
            # Shouldn't happen if rsid is in the intersection, but be safe
            chrom, pos = ("NA", -1)
    rows.append((rsid, str(chrom), int(pos)))

common_df = pd.DataFrame(rows, columns=["rsid", "chromosome", "position"])
common_df.to_csv(COMMON_OUT, index=False)
print(f"\n✅ Wrote common RSID subset with chr/pos (n={len(common_df)}) to {COMMON_OUT}")


=== RSID set identity groups (files sharing identical unique RSIDs) ===
Group 1: 3 file(s) -> Child 1 Genome.csv, Father Genome.csv, Mother Genome.csv  [|RSIDs|=553773]
Group 2: 2 file(s) -> Child 2 Genome.csv, Child 3 Genome.csv  [|RSIDs|=614871]

ℹ️ Found 2 distinct RSID sets across 5 files.

=== Pairwise RSID set comparison (counts) ===
Child 1 Genome.csv vs Child 2 Genome.csv: |∩|=111938, |A−B|=441835, |B−A|=502933
Child 1 Genome.csv vs Child 3 Genome.csv: |∩|=111938, |A−B|=441835, |B−A|=502933
Child 1 Genome.csv vs Father Genome.csv: |∩|=553773, |A−B|=0, |B−A|=0
Child 1 Genome.csv vs Mother Genome.csv: |∩|=553773, |A−B|=0, |B−A|=0
Child 2 Genome.csv vs Child 3 Genome.csv: |∩|=614871, |A−B|=0, |B−A|=0
Child 2 Genome.csv vs Father Genome.csv: |∩|=111938, |A−B|=502933, |B−A|=441835
Child 2 Genome.csv vs Mother Genome.csv: |∩|=111938, |A−B|=502933, |B−A|=441835
Child 3 Genome.csv vs Father Genome.csv: |∩|=111938, |A−B|=502933, |B−A|=441835
Child 3 Genome.csv vs Mother Genome.csv: |∩|

In [19]:
# make sure all positions are accounted for even the duplicates
!cat work/merged_rsids_positions.csv | grep 103310863

rs62642906,12,103310863
rs62642946,12,103310863


In [18]:
# apparently they changed the snps a lot over different generations so we need to get the superset from clinvar
# This seems to agree
# https://www.reddit.com/r/23andme/comments/3dd3lp/snp_coverage_analysiscomparisons_23andme_v3v4

In [20]:
from pathlib import Path

def guess_23andme_version(csv_path: Path) -> str:
    """
    Guess 23andMe chip version (v2, v3, v4, v5) based on row count.
    Skips header/comment lines starting with '#'.
    """
    count = 0
    with open(csv_path, "r", encoding="utf-8-sig", errors="replace") as f:
        for ln in f:
            if not ln.lstrip().startswith("#") and ln.strip():
                count += 1

    # Heuristic thresholds
    if 500_000 <= count < 600_000:
        version = "v2 (~550k SNPs)"
    elif 900_000 <= count < 1_000_000:
        version = "v3 (~960k SNPs)"
    elif 580_000 <= count < 620_000:
        version = "v4 (~600k SNPs)"
    elif 620_000 <= count < 660_000:
        version = "v5 (~640k SNPs)"
    else:
        version = "Unknown/ambiguous"

    return f"{csv_path.name}: {count:,} rows → {version}"


In [21]:
DATA_DIR = Path("./downloads/family-genome-dataset")

csv_paths = sorted(DATA_DIR.glob("*.csv"))
for path in csv_paths:
    print(guess_23andme_version(path))

Child 1 Genome.csv: 601,802 rows → v4 (~600k SNPs)
Child 2 Genome.csv: 631,983 rows → v5 (~640k SNPs)
Child 3 Genome.csv: 631,983 rows → v5 (~640k SNPs)
Father Genome.csv: 601,802 rows → v4 (~600k SNPs)
Mother Genome.csv: 601,802 rows → v4 (~600k SNPs)
