In [1]:
from graph_var.utils import *
from graph_var.graph import PangenomeGraph
from graph_var_analysis.evaluating_functions import *
import pandas as pd
from math import inf

In [8]:
import os
# Raw string: '\P' and '\G' are not valid escape sequences, so a normal string
# literal here triggers an invalid-escape warning on recent Python versions.
# NOTE(review): machine-specific absolute path — adjust to the local checkout.
os.chdir(r'D:\Pycharm\GFA_Project')

In [9]:
# Directory layout, relative to the working directory set by os.chdir above.
# Of these, only graph_obj_dir is referenced by the cells shown in this notebook.
graph_obj_dir = './Graph_objs'  # chr22.pkl is loaded from here
raw_vcf_dir = './VCFs_chr'
sub_vcf_dir = './VCFs_subset'
graph_vcf_dir = './VCFs_update'
stats_dir = './Stats_update'
ac_dir = './Bubble_result_update'  # same path as bubble_result_dir below
tree_dir = './Tree'
region_dir = './Region_files'
bubble_summary_dir = './Bubble_summary'
bubble_result_dir = './Bubble_result_update'

In [15]:
def genotype_and_linear_coverage_by_sample(graph, walks):
    """
    Sum the per-walk edge counts produced by ``genotype`` over all walks of one sample.

    :param graph: graph object, passed through unchanged to ``genotype``
    :param walks: list of walks for each sample
    :return: tuple ``(ref_totals, alt_totals)``; each is a dict mapping an edge to
        its count summed over all walks (first / second dict returned by
        ``genotype`` respectively). Both are empty when ``walks`` is empty.
    """
    ref_totals, alt_totals = {}, {}

    for single_walk in walks:
        walk_ref, walk_alt = genotype(graph, single_walk)
        # Fold this walk's two count dicts into the matching running totals.
        for totals, partial in ((ref_totals, walk_ref), (alt_totals, walk_alt)):
            for edge, count in partial.items():
                totals[edge] = totals.get(edge, 0) + count

    return ref_totals, alt_totals

def genotype(graph, walk, representative=True):
    """
    Count how often each edge of a walk is traversed, split by tree membership.

    The walk is first extended with one terminal node at each end, chosen from
    ``graph.termini`` according to ``graph.direction`` of the first / last node.

    :param graph: object providing ``termini``, ``direction``, ``has_edge`` and ``edges``
    :param walk: list of oriented node ids (must be non-empty)
    :param representative: when True, an edge whose ``is_representative`` attribute
        is false is replaced by ``edge_complement(edge)`` before counting
    :return: tuple ``(count_ref, count_alt)`` of dicts edge -> count; edges with a
        true ``is_in_tree`` attribute go to the first dict, all others to the second
    :raises ValueError: if two consecutive nodes are not joined by an edge of the graph
    """
    # Pick the terminal node to prepend, then the one to append.
    if graph.direction(walk[0]) == 1:
        head = graph.termini[0] + '_+'
    else:
        head = graph.termini[1] + '_-'
    if graph.direction(walk[-1]) == 1:
        tail = graph.termini[1] + '_+'
    else:
        tail = graph.termini[0] + '_-'
    path = [head] + walk + [tail]

    ref_counts, alt_counts = {}, {}
    for step in zip(path, path[1:]):
        if not graph.has_edge(*step):
            raise ValueError(f"Specified list contains edge {step} which is not present in the graph")

        if representative and not graph.edges[step]['is_representative']:
            step = edge_complement(step)

        bucket = ref_counts if graph.edges[step]['is_in_tree'] else alt_counts
        bucket[step] = bucket.get(step, 0) + 1

    return ref_counts, alt_counts

def summarise_walk(G, walk):
    """
    Tally the edges of a walk by category.

    Each pair of consecutive nodes is looked up in ``G.edges``. An edge with a true
    ``is_in_tree`` attribute is counted under ``'in_tree'``; any other edge is
    counted under ``G.identify_variant_type(edge)``, after being swapped for
    ``edge_complement(edge)`` when its ``is_representative`` attribute is false.

    :param G: graph object providing ``edges`` and ``identify_variant_type``
    :param walk: sequence of node ids
    :return: dict category -> count, with keys inserted in sorted order
    """
    tally = {}
    for step in zip(walk, walk[1:]):
        attrs = G.edges[step]
        if attrs['is_in_tree']:
            label = 'in_tree'
        else:
            if not attrs['is_representative']:
                step = edge_complement(step)
            label = G.identify_variant_type(step)
        tally[label] = tally.get(label, 0) + 1
    return {label: tally[label] for label in sorted(tally)}

def get_combinations(n):
    """
    Return every index pair ``(i, j)`` with ``0 <= i < j < n``, in lexicographic order.

    :param n: number of items to pair up
    :return: list of 2-tuples; empty when ``n < 2``
    """
    # Imported locally so the helper stays self-contained within this cell.
    from itertools import combinations

    return list(combinations(range(n), 2))

Count edges shared between walks of the same haplotype, just looking at those with allele count > 1

In [11]:
# Load the chr22 graph object together with its walks and sample names.
# NOTE(review): load_graph_from_pkl presumably unpickles the file — only load
# .pkl files from a trusted source.
graph_path = f"{graph_obj_dir}/chr22.pkl"
G, walks, sample_names = load_graph_from_pkl(graph_path)

In [12]:
# Later cells index this dict by names such as 'HG00438_1' and iterate over the
# walks stored under each name.
sample_walks_dict = group_walks_by_name(walks, sample_names)

In [6]:
# Graph size: node count, edge count, and number of entries in G.variant_edges.
len(G.nodes()), len(G.edges()), len(G.variant_edges)

(2558670, 3562850, 502091)

Number of edges that have allele count > 1

In [6]:
# For every sample: one set of edges (consecutive node pairs) per walk.
sample_walk_edges_dict = {
    sample_name: [set(zip(walk[:-1], walk[1:])) for walk in sample_walks]
    for sample_name, sample_walks in sample_walks_dict.items()
}

In [10]:
# Per sample: the (ref counts, alt counts) pair summed over all of its walks.
sample_data_dict = {
    name: genotype_and_linear_coverage_by_sample(G, sample_walks)
    for name, sample_walks in sample_walks_dict.items()
}

In [13]:
# Edges whose summed count within a single sample exceeds 1, pooled over all samples.
# Each sample_data_dict value is a (ref counts, alt counts) pair.
suspected_overlap_ref_edges = {
    edge
    for ref_counts, _ in sample_data_dict.values()
    for edge, count in ref_counts.items()
    if count > 1
}

suspected_overlap_alt_edges = {
    edge
    for _, alt_counts in sample_data_dict.values()
    for edge, count in alt_counts.items()
    if count > 1
}

In [14]:
# Sizes of the two candidate sets built in the previous cell (ref, alt).
len(suspected_overlap_ref_edges), len(suspected_overlap_alt_edges)

(194976, 14074)

Number of edges that are shared between walks of the same haplotype

In [48]:
# For each sample with more than one walk, collect every edge that appears in at
# least two of its walks. Samples with zero or one walk get no entry.
overlap_edge_dict = dict()

# The loop variable is deliberately NOT called `walks`: at notebook top level that
# would rebind the `walks` list returned by load_graph_from_pkl.
for sample, walk_edge_sets in sample_walk_edges_dict.items():
    if len(walk_edge_sets) <= 1:
        continue
    overlap_edges = set()
    for idx1, idx2 in get_combinations(len(walk_edge_sets)):
        # Grow the set in place instead of building a new union for every pair.
        overlap_edges |= walk_edge_sets[idx1] & walk_edge_sets[idx2]
    overlap_edge_dict[sample] = overlap_edges

In [51]:
# Union of the per-sample overlap sets.
all_overlap_edges = set().union(*overlap_edge_dict.values())

In [53]:
# Number of distinct edges shared by at least two walks of some sample.
len(all_overlap_edges)

47015

Examples of shared edges

In [57]:
# First five samples (in dict order) that have at least one shared edge.
[sample for sample, shared in overlap_edge_dict.items() if shared][:5]

['HG00438_1', 'HG00621_1', 'HG00673_1', 'HG00733_2', 'HG00741_2']

In [58]:
# Five shared edges for one sample. The value is a set, so which five are shown
# (and in what order) is arbitrary and may differ between runs.
list(overlap_edge_dict['HG00438_1'])[:5]

[('44479139_+', '44479137_+'),
 ('44479227_+', '44479224_+'),
 ('45343553_+', '45343554_+'),
 ('44479285_+', '44479284_+'),
 ('44478856_+', '44478855_+')]

In [None]:
# Reloads the same graph file as the earlier load cell and regroups the walks,
# so the case studies below start from fresh G, walks, sample_names and
# sample_walks_dict objects.
graph_path = f"{graph_obj_dir}/chr22.pkl"
G, walks, sample_names = load_graph_from_pkl(graph_path)
sample_walks_dict = group_walks_by_name(walks, sample_names)

Case 1: Two walks do share the same edge

In [74]:
# Sample and edge taken from the shared-edge examples listed above.
sample_example = 'HG00438_1'
edge_example = ('44479139_+', '44479137_+')

In [76]:
# Genotype every walk of the example sample separately.
sample_genotypes = [genotype(G, walk) for walk in sample_walks_dict[sample_example]]

# Sanity check: one genotype per walk.
print(len(sample_genotypes), len(sample_walks_dict[sample_example]))

tree_ref_edge = G.reference_tree_edge(edge_example)
print(tree_ref_edge)

# Ref count of that edge in each walk (0 when the walk never uses it).
print([ref_counts.get(tree_ref_edge, 0) for ref_counts, _ in sample_genotypes])

4 4
('44479139_+', '44479137_+')
[1, 0, 1, 0]


In [77]:
# Edge sets of the two walks (indices 0 and 2) whose ref count was 1 above.
walk_edges_1 = set(zip(sample_walks_dict[sample_example][0][:-1], sample_walks_dict[sample_example][0][1:]))
walk_edges_2 = set(zip(sample_walks_dict[sample_example][2][:-1], sample_walks_dict[sample_example][2][1:]))

tree_ref_edge in walk_edges_1, tree_ref_edge in walk_edges_2

(True, True)

In [None]:
# 1-based walk number paired with the edge-category tally of that walk.
[(number, summarise_walk(G, walk)) for number, walk in enumerate(sample_walks_dict[sample_example], start=1)]

Case 2: Two walks share complementary edges: one walk visits the edge and the other visits the complement of the edge

In [69]:
sample_example = 'HG01071_2'
variant_edge_example = ('44987811_-', '44987812_+')
# Genotype every walk of the example sample separately.
sample_genotypes = [genotype(G, walk) for walk in sample_walks_dict[sample_example]]

# Sanity check: one genotype per walk.
print(len(sample_genotypes), len(sample_walks_dict[sample_example]))

tree_ref_edge = G.reference_tree_edge(variant_edge_example)
print(tree_ref_edge)

# Ref count of that edge in each walk (0 when the walk never uses it).
print([ref_counts.get(tree_ref_edge, 0) for ref_counts, _ in sample_genotypes])

6 6
('44986610_+', '44987812_+')
[0, 1, 0, 1, 0, 0]
False False


In [73]:
# Edge sets of the two walks (indices 1 and 3) whose ref count was 1 above.
walk_edges_1 = set(zip(sample_walks_dict[sample_example][1][:-1], sample_walks_dict[sample_example][1][1:]))
walk_edges_2 = set(zip(sample_walks_dict[sample_example][3][:-1], sample_walks_dict[sample_example][3][1:]))

# One walk holds the complement of the edge, the other the edge itself.
edge_complement(tree_ref_edge) in walk_edges_1, tree_ref_edge in walk_edges_2

(True, True)

Case 3: One walk visits the same edge two times

In [22]:
def find_repeated_edge_in_walk():
    """
    Print the first ref edge that a single walk traverses more than once.

    Scans the notebook-level ``sample_walks_dict`` in order, genotyping each walk
    against ``G`` with ``representative=False``. On the first hit it prints the
    sample id, the walk index and the edge, then stops; nothing is printed when
    there is no hit.
    """
    for sample_id, sample_walks in sample_walks_dict.items():
        for position, path in enumerate(sample_walks):
            ref_counts, _ = genotype(G, path, representative=False)
            for edge, times in ref_counts.items():
                if times > 1:
                    print(sample_id, position, edge)
                    return

In [23]:
# Run the search defined in the previous cell; it prints at most one hit.
find_repeated_edge_in_walk()

HG00741_1 0 ('44514220_-', '44514219_-')


In [33]:
def find_repeated_edge_in_walk():
    """
    Print the first alt edge that a single walk traverses more than once.

    Scans the notebook-level ``sample_walks_dict`` in order, genotyping each walk
    against ``G`` with ``representative=False``. On the first hit it prints the
    sample id, the walk index and the edge, then stops; nothing is printed when
    there is no hit.
    """
    for sample_id, sample_walks in sample_walks_dict.items():
        for position, path in enumerate(sample_walks):
            _, alt_counts = genotype(G, path, representative=False)
            for edge, times in alt_counts.items():
                if times > 1:
                    print(sample_id, position, edge)
                    return

In [34]:
# Run the search defined in the previous cell; it prints at most one hit.
find_repeated_edge_in_walk()

HG00741_1 0 ('44514190_-', '44514189_+')


In [26]:
sample_example = 'HG00741_1'
edge_example = ('44514220_-', '44514219_-')
# representative=False keeps each edge under the orientation the walk actually used.
sample_genotypes = [genotype(G, walk, representative=False) for walk in sample_walks_dict[sample_example]]

# Ref count of the example edge in each walk of the sample.
print([ref_counts.get(edge_example, 0) for ref_counts, _ in sample_genotypes])

[2, 0, 0]


In [36]:
sample_example = 'HG00741_1'
edge_example = ('44514190_-', '44514189_+')
# representative=False keeps each edge under the orientation the walk actually used.
sample_genotypes = [genotype(G, walk, representative=False) for walk in sample_walks_dict[sample_example]]

# Alt count of the example edge in each walk of the sample.
print([alt_counts.get(edge_example, 0) for _, alt_counts in sample_genotypes])

[2, 0, 0]


Case 4: One walk visits the edge once and the complement of the edge once

In [29]:
def find_repeated_edge_in_walk():
    """
    Print the first ref edge that a single walk traverses together with its complement.

    Scans the notebook-level ``sample_walks_dict`` in order, genotyping each walk
    against ``G`` with ``representative=False`` so that an edge and its complement
    are counted under separate keys. On the first hit it prints the sample id, the
    walk index and the edge, then stops; nothing is printed when there is no hit.
    """
    for sample_id, walks in sample_walks_dict.items():
        for idx, walk in enumerate(walks):
            cr_dict, _ = genotype(G, walk, representative=False)
            for edge in cr_dict:
                complement = edge_complement(edge)
                # Every key of cr_dict was visited at least once, so only the
                # complement needs checking. One visit each is enough: requiring
                # both counts to exceed 1 would miss the "once each" case.
                # An edge equal to its own complement is skipped, since it would
                # otherwise match trivially.
                if complement != edge and cr_dict.get(complement, 0) >= 1:
                    print(sample_id, idx, edge)
                    return

In [30]:
# Run the search defined in the previous cell; it prints at most one hit.
find_repeated_edge_in_walk()

In [31]:
def find_repeated_edge_in_walk():
    """
    Print the first alt edge that a single walk traverses together with its complement.

    Scans the notebook-level ``sample_walks_dict`` in order, genotyping each walk
    against ``G`` with ``representative=False`` so that an edge and its complement
    are counted under separate keys. On the first hit it prints the sample id, the
    walk index and the edge, then stops; nothing is printed when there is no hit.
    """
    for sample_id, walks in sample_walks_dict.items():
        for idx, walk in enumerate(walks):
            _, ca_dict = genotype(G, walk, representative=False)
            for edge in ca_dict:
                complement = edge_complement(edge)
                # Every key of ca_dict was visited at least once, so only the
                # complement needs checking. One visit each is enough: requiring
                # both counts to exceed 1 would miss the "once each" case.
                # An edge equal to its own complement is skipped, since it would
                # otherwise match trivially.
                if complement != edge and ca_dict.get(complement, 0) >= 1:
                    print(sample_id, idx, edge)
                    return

In [32]:
# Run the search defined in the previous cell; it prints at most one hit.
find_repeated_edge_in_walk()