In [305]:
import os
import pandas as pd
import gzip
from pysam import VariantFile
from dataclasses import dataclass, field
from collections import defaultdict
from typing import Set, Dict, List
import random
import tqdm
import numpy as np

In [254]:
# Seed Python's built-in `random` module so that anything drawn from it is reproducible across runs.
random.seed(10)

In [255]:
# Directory holding the input files; `grf` and the `pd.read_csv` call in this notebook build paths from it.
working_dir = "./data/"
# Name of the tab-separated table (inside `working_dir`) that is loaded into `df`.
local_filename = "GMKF-OFC-GREGoR-denovo-SV-Master-20240918.txt"

In [256]:
@dataclass
class IndividualRelationship:
    """One individual's record as read from a PED file (first six tab-separated columns)."""

    # PED column 1
    family_id: str
    # PED column 2; also the key under which this record is stored in `Pedigree.relationships`
    individual_id: str
    # PED column 3
    father_id: str
    # PED column 4
    mother_id: str
    # PED columns 5 and 6.
    # NOTE(review): the PED-parsing loop in this notebook passes the raw text of these
    # columns, so at runtime they hold `str` despite the `int` annotation — confirm
    # whether a conversion is wanted before relying on numeric comparisons.
    gender: int
    affected: int


@dataclass
class Pedigree:
    """Parent IDs and per-individual records collected from a single PED file."""

    # IDs collected from the father column of the PED file
    fathers: Set[str] = field(default_factory=set)
    # IDs collected from the mother column of the PED file
    mothers: Set[str] = field(default_factory=set)
    # individual ID -> that individual's PED record
    relationships: Dict[str, IndividualRelationship] = field(default_factory=dict)

    
@dataclass
class FilePair:
    """A VCF file together with the PED file that describes its samples."""

    # Path of the VCF; opened with `pysam.VariantFile` when variants are looked up
    vcf_filename: str
    # Path of the matching PED file
    ped_filename: str
    # Starts empty; populated when the PED file is parsed
    pedigree: Pedigree = field(default_factory=Pedigree)


def grf(filename):
    """Get resolved filename: return `filename` joined onto `working_dir`."""
    resolved = os.path.join(working_dir, filename)
    return resolved

# One entry per cohort batch: the annotated VCF and the PED file describing its samples.
# The trailing comment on each entry is the cohort label.
# NOTE(review): three VCF names start with `m4_`/`m2_` while every PED file (and every
# other VCF) starts with `mg_`, and their batch numbers differ from the paired PED
# (e.g. m4_batch03 <-> mg_batch12) — confirm these pairings and prefixes are intended.
vcf_ped_pair_filenames = [
    FilePair(grf("mg_batch20.annotated.vcf.gz"), grf("mg_batch20_ware-20231120.VCFids.ped")),  # ware
    FilePair(grf("m4_batch03.annotated.vcf.gz"), grf("mg_batch12_krantz-20230922.VCFids.ped")),  # krantz
    FilePair(grf("m2_batch11.annotated.vcf.gz"), grf("mg_batch02_alba.ped.txt")),  # butali (not in the workspace)
    FilePair(grf("m2_batch10.annotated.vcf.gz"), grf("mg_batch01_ped.tsv")),  # beaty
    FilePair(grf("mg_batch13.annotated.vcf.gz"), grf("mg_batch13_alba.ped.txt")),  # leslie
    FilePair(grf("mg_batch03.annotated.vcf.gz"), grf("ped_mg_batch03.tsv")), # chung (not in the workspace)
    FilePair(grf("mg_batch05.annotated.vcf.gz"), grf("mg_batch05_ped.txt")), # engle (not in the workspace)
    FilePair(grf("mg_batch17.annotated.vcf.gz"), grf("mg_batch17_ped.tsv")), # marazita (not in the workspace)
    FilePair(grf("mg_batch08.annotated.vcf.gz"), grf("mg_batch08_gleeson-20230727.VCFids.ped")),  # gleeson
    FilePair(grf("mg_batch07.annotated.vcf.gz"), grf("ped_mg_batch07.tsv")),  # gharavi
    # Not loaded yet: no PED file has been specified for these two callsets.
    # FilePair("phase4_all_batches.annotated.vcf.gz", "..."),  # gregor
    # FilePair("second_run_cp_cohort.annotated.vcf.gz", "...")  # OFC
]

In [257]:
# Load the master table of labeled SV calls (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk. The warning echoed in this cell's saved output looks like pandas'
# mixed-type DtypeWarning, which is what chunked inference produces; a column such as
# `chrom` holding a mix of ints and strings would later break region lookups in the VCF.
df = pd.read_csv(os.path.join(working_dir, local_filename), sep="\t", low_memory=False)
# to make sure indexes pair with the number of rows
# (note: without drop=True this also keeps the previous index as an "index" column)
df = df.reset_index()

  df = pd.read_csv(os.path.join(working_dir, local_filename), sep="\t")


In [258]:
# Keep only the rows that actually have a value in the result_final column
df = df.dropna(subset=["result_final"])

### Extract variants labeled as not de novo (inherited variants)

In [261]:
# Every row whose final label is anything other than "yes" is treated as inherited (not de novo).
is_not_de_novo = df["result_final"].ne("yes")
inherited_variants = df.loc[is_not_de_novo]

In [262]:
# Row counts before and after selecting the inherited variants
n_all_variants = len(df)
n_inherited_variants = len(inherited_variants)
print(f"All variants:       {n_all_variants:,}")
print(f"Inherited variants: {n_inherited_variants:,}")

All variants:       19,575
Inherited variants: 16,337


### Data preparation: Extract parents in each PED file

In [263]:
# Values that the PED convention uses in the father/mother columns to say "this parent is
# not in the dataset" ("0"), plus the empty string. They are not sample IDs, so they must
# not end up among the candidate fathers/mothers considered for swapping.
MISSING_PARENT_IDS = {"0", ""}

# Parse every PED file into its FilePair's Pedigree: collect the set of fathers, the set
# of mothers, and one IndividualRelationship per individual.
for file_pair in vcf_ped_pair_filenames:
    ped_filename = file_pair.ped_filename
    pedigree = file_pair.pedigree
    with open(ped_filename, "r") as ped_file:
        ped_file.readline()  # skip the header
        for line in ped_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g., an extra newline at the end of the file)
                # instead of failing with an IndexError on cols[1].
                continue
            cols = stripped.split("\t")
            indiv = cols[1]
            father = cols[2]
            mother = cols[3]

            if father not in MISSING_PARENT_IDS:
                pedigree.fathers.add(father)
            if mother not in MISSING_PARENT_IDS:
                pedigree.mothers.add(mother)
            # The record keeps the parent columns exactly as written in the file.
            # NOTE(review): cols[4] and cols[5] are stored as raw strings although the
            # dataclass annotates them as int — confirm whether a conversion is wanted.
            pedigree.relationships[indiv] = IndividualRelationship(cols[0], indiv, father, mother, cols[4], cols[5])

### Reorganize the inherited variants

We assume that `sample` in `df` refers to the individual ID, and that each sample ID can be found in only one of the PED files.

In [264]:
def get_pair(individual_id):
    """Return the first FilePair whose pedigree lists `individual_id`.

    Raises:
        KeyError: if no loaded PED file contains the individual.
    """
    matching_pair = next(
        (
            candidate
            for candidate in vcf_ped_pair_filenames
            if individual_id in candidate.pedigree.relationships
        ),
        None,
    )
    if matching_pair is None:
        raise KeyError(f"Individual ID ({individual_id}) not found in the PED files.")
    return matching_pair

In [281]:
def get_samples_genotypes(variant, sample_ids):
    """Flag, for each sample ID, whether that sample is genotyped (0, 0) at `variant`.

    Returns a dict mapping sample ID -> bool:
      True:  the sample's GT is (0, 0), i.e. the variant is not in that parent
             (de novo w.r.t. that parent), so the parent can be used for swapping.
      False: either the sample is absent from the variant record, or its GT is anything
             other than (0, 0) (it carries the variant or has no usable genotype), so it
             cannot be used for swapping.
    """
    homozygous_reference = (0, 0)

    def lacks_variant(sample_id):
        call = variant.samples.get(sample_id)
        return call is not None and call.get("GT") == homozygous_reference

    return {sample_id: lacks_variant(sample_id) for sample_id in sample_ids}

In [322]:
def get_best_parent_swap(fathers, mothers, father_id, mother_id):
    """Pick the (father, mother) pair for which the most variants look de novo.

    Args:
        fathers: dict of variant ID -> {candidate father ID: bool}, where True means the
            candidate does not carry that variant (as built by `get_samples_genotypes`).
        mothers: same structure for candidate mothers; must have the same variant IDs.
        father_id: the individual's original father ID.
        mother_id: the individual's original mother ID.

    Returns:
        The (father ID, mother ID) tuple with the highest number of variants that
        neither member carries. Among equally scored pairs, the first one that keeps
        the original father or the original mother is preferred; otherwise the first
        top-scoring pair is returned. Returns None when there are no candidate pairs
        at all (e.g., `fathers` is empty).
    """
    pair_scores = defaultdict(int)
    # Loop variables are deliberately NOT named father_id/mother_id: reusing those names
    # would overwrite the original parent IDs needed for the tie-break further down.
    for variant_id, candidate_fathers in fathers.items():
        candidate_mothers = mothers[variant_id]
        for candidate_father, father_not_carries_variant in candidate_fathers.items():
            for candidate_mother, mother_not_carries_variant in candidate_mothers.items():
                # Adding 0 still registers the pair, so pairs that never score remain
                # available as a fallback when every pair scores 0.
                pair_scores[(candidate_father, candidate_mother)] += (
                    1 if father_not_carries_variant and mother_not_carries_variant else 0
                )

    if not pair_scores:
        # No variants or no candidates: nothing to rank.
        return None

    highest_score = max(pair_scores.values())
    good_permutations = [pair for pair, score in pair_scores.items() if score == highest_score]

    # Prefer a swap that keeps one of the original parents.
    for pair in good_permutations:
        if pair[0] == father_id or pair[1] == mother_id:
            return pair
    return good_permutations[0]

In [323]:
# For each sample with inherited variants: look the variants up in the sample's VCF,
# record which candidate fathers/mothers are (0, 0) at each one, and suggest the parent
# pair for which the most of those variants would look de novo.
for sample, sample_variants_df in inherited_variants.groupby("sample"):

    try:
        file_pair = get_pair(sample)
    except KeyError:
        # The sample is not listed in any loaded PED file; skip it.
        # (Only KeyError is caught so that real errors and Ctrl-C are not swallowed.)
        continue
    relationship = file_pair.pedigree.relationships[sample]

    fathers_genotypes = {}
    mothers_genotypes = {}

    # The context manager closes the VCF handle once this sample is done.
    with VariantFile(file_pair.vcf_filename) as vcf:
        for _, row in sample_variants_df.iterrows():
            # NOTE(review): fetch() yields every record overlapping the region, which may
            # include records other than the labeled variant, and pysam treats `start` as
            # 0-based — confirm the coordinate convention of the table.
            for variant in vcf.fetch(row["chrom"], row["start"], row["end"]):
                individual = variant.samples.get(sample)
                if individual is None:
                    print(f"A sample with the ID ({sample}) not found in the VCF. Skipping this variant")
                    continue
                if individual.get("GT") == (0, 0):
                    # Discussed with Harrison.
                    # There are cases this could be a valid de novo,
                    # but since very rare, we're excluding them.
                    continue

                original_father = variant.samples.get(relationship.father_id)
                original_mother = variant.samples.get(relationship.mother_id)
                if original_father is None:
                    print(f"A sample with the father ID ({relationship.father_id}) not found in the VCF. Skipping this variant")
                    continue
                if original_mother is None:
                    print(f"A sample with the mother ID ({relationship.mother_id}) not found in the VCF. Skipping this variant")
                    continue

                if original_father.get("GT") == (0, 0) and original_mother.get("GT") == (0, 0):
                    # Same as above, discussed with Harrison.
                    # There are cases this could be a valid de novo,
                    # but since very rare, we're excluding them.
                    continue

                fathers_genotypes[variant.id] = get_samples_genotypes(variant, file_pair.pedigree.fathers)
                mothers_genotypes[variant.id] = get_samples_genotypes(variant, file_pair.pedigree.mothers)

    if not fathers_genotypes:
        # Every variant of this sample was skipped, so there is nothing to rank.
        print(f"No usable variants found for sample ({sample}). Skipping this sample")
        continue

    suggested_swap = get_best_parent_swap(fathers_genotypes, mothers_genotypes, relationship.father_id, relationship.mother_id)
    print(suggested_swap)
    print(len(sample_variants_df))
    # TODO: development stop — only the first sample with usable variants is processed.
    # Remove this `break` (and collect the suggestions instead of printing them) to run
    # over all samples.
    break

('BS_2PEATVQM', 'BS_16ENYEWK')
4


In [64]:
# vcf = VariantFile(vcf_filename)

In [66]:
# for idx, row in df.iterrows():
#     for variant in vcf.fetch(row["chrom"], row["start"], row["end"]):
#         print("---")
#     break