In [13]:
from collections import defaultdict
from itertools import product
import math

import numpy as np
import pandas as pd

In [14]:
# Load data
print('Loading Metadata...')
data_folder = 'data/PsychAD_freeze2_personalized_grpahs/'
meta = pd.read_csv(data_folder + 'syn26527784_latest.csv')

Loading Metadata...


In [15]:
# Utility functions
# Defaultdict with depth
def deepdefaultdict(depth, default=None):
    if depth == 0:
        return None
    return defaultdict(lambda: deepdefaultdict(depth-1, default=default))

# Prune zero-length array entries from d=1 dict
def prunedict(dic):
    newdic = {}
    for k in dic:
        if len(dic[k]) > 0:
            newdic[k] = dic[k]
    return newdic

# Convert to dict
def dd2d(dd):
    if type(dd) == type(defaultdict()):
        for k in dd:
            dd[k] = dd2d(dd[k])
        return(dict(dd))
    return dd

In [47]:
# Stratify by column
print('Gathering...')
stratify_cols = ['BRAAK_AD', 'Sex']
graph_ids = {}
unique_vals = [np.unique(meta[col]) for col in stratify_cols]
for vals in product(*unique_vals):
    print('\t'.join([f'{col}: {val}' for col, val in zip(stratify_cols, vals)]), end='')
    current_filter = np.array([True for _ in range(meta.shape[0])])
    for col, val in zip(stratify_cols, vals):
        current_filter *= (meta[col] == val)
    # Don't include zero-length entries
    graph_ids_vals = list(meta.loc[current_filter]['SubID'])
    if len(graph_ids_vals) > 0:
        graph_ids[vals] = graph_ids_vals
    print(f'\t({len(graph_ids_vals)} graphs)')

Gathering...
BRAAK_AD: 0.0	Sex: Female	(58 graphs)
BRAAK_AD: 0.0	Sex: Male	(128 graphs)
BRAAK_AD: 1.0	Sex: Female	(64 graphs)
BRAAK_AD: 1.0	Sex: Male	(65 graphs)
BRAAK_AD: 2.0	Sex: Female	(105 graphs)
BRAAK_AD: 2.0	Sex: Male	(82 graphs)
BRAAK_AD: 3.0	Sex: Female	(104 graphs)
BRAAK_AD: 3.0	Sex: Male	(80 graphs)
BRAAK_AD: 4.0	Sex: Female	(86 graphs)
BRAAK_AD: 4.0	Sex: Male	(52 graphs)
BRAAK_AD: 5.0	Sex: Female	(108 graphs)
BRAAK_AD: 5.0	Sex: Male	(44 graphs)
BRAAK_AD: 6.0	Sex: Female	(253 graphs)
BRAAK_AD: 6.0	Sex: Male	(153 graphs)
BRAAK_AD: nan	Sex: Female	(0 graphs)
BRAAK_AD: nan	Sex: Male	(0 graphs)


In [None]:
# Aggregate graphs
print('Processing...')
processed_graphs = {}
num_graphs = defaultdict(lambda: 0)
for k, v in graph_ids.items():
    print('\t'.join([f'{col}: {val}' for col, val in zip(stratify_cols, k)]), end='')
    running_graph = defaultdict(lambda: defaultdict(list))
    for graph_id in v:
        try:
            graph = pd.read_csv(data_folder + f'regulon_grn/{graph_id}_regulon_list.csv')[['TF', 'gene', 'CoexWeight']]
            num_graphs[k] += 1
        except:
            continue
        graph = graph.rename(columns={'gene': 'TG', 'CoexWeight': 'coef'})
        for _, row in graph.iterrows():
            tf, tg, coef = row
            running_graph[tf][tg].append(coef)
    print(f' ({num_graphs[k]} graphs)')
    processed_graphs[k] = dd2d(running_graph)
num_graphs = dict(num_graphs)

Processing...
BRAAK_AD: 0.0	Sex: Female (13 graphs)
BRAAK_AD: 0.0	Sex: Male

In [6]:
# Keep edges which have some references in common
print('Filtering...')
present_pct = .8
edgelists = {}
for k0, v0 in processed_graphs.items():
    print(f'{stratify_col}: {k0}', end='')
    min_edges = math.ceil(present_pct * num_graphs[k0])
    edges = []
    num_edges = 0
    for k1, v1 in v0.items():
        for k2, v2 in v1.items():
            if len(v2) >= min_edges:
                edges.append([k1, k2, np.mean(v2)])
                num_edges += 1
    print(f' ({num_edges} edges)')
    edgelists[k0] = edges

Filtering...
BRAAK_AD: 0.0 (889 edges)
BRAAK_AD: 1.0 (456 edges)
BRAAK_AD: 2.0 (410 edges)
BRAAK_AD: 3.0 (1369 edges)
BRAAK_AD: 4.0 (3383 edges)
BRAAK_AD: 5.0 (527 edges)
BRAAK_AD: 6.0 (531 edges)
BRAAK_AD: nan (0 edges)


In [7]:
print('Saving...')
for k, v in processed_graphs.items():
    print(f'{stratify_col}: {k}')
    pd.DataFrame(edgelists[k], columns=['TF', 'TG', 'coef']).to_csv(data_folder + f'processed/grn_{stratify_col}_{k}.csv')

Saving...
BRAAK_AD: 0.0
BRAAK_AD: 1.0
BRAAK_AD: 2.0
BRAAK_AD: 3.0
BRAAK_AD: 4.0
BRAAK_AD: 5.0
BRAAK_AD: 6.0
BRAAK_AD: nan


In [8]:
# Get binary edges
binary_edgelists = {k: np.array(v)[:, :2] for k, v in edgelists.items() if len(v) > 0}

# Filter to uniquely appearing edges (per stage)
print('Detecting Appearing Edges...')
binary_unique_edgelists = defaultdict(list)
for k0, graph0 in binary_edgelists.items():
    print(f'{stratify_col}: {k0}', end='')
    num_edges = 0
    for edge in graph0:
        for k1, graph1 in binary_edgelists.items():
            if k1 >= k0:
                continue
            if edge in graph1:
                break
        else:  # Only if for loop completes
            binary_unique_edgelists[k0].append(edge)
            num_edges += 1
    print(f' ({num_edges})')
binary_unique_edgelists = {k: np.stack(v) for k, v in binary_unique_edgelists.items()}

print()
# Filter to uniquely disappearing edges (per stage)
print('Detecting Vanishing Edges...')
binary_unique_edgelists_rev = defaultdict(list)
for k0, graph0 in binary_edgelists.items():
    print(f'{stratify_col}: {k0}', end='')
    num_edges = 0
    for edge in graph0:
        for k1, graph1 in binary_edgelists.items():
            if k1 <= k0:
                continue
            if edge in graph1:
                break
        else:  # Only if for loop completes
            binary_unique_edgelists_rev[k0].append(edge)
            num_edges += 1
    print(f' ({num_edges})')
binary_unique_edgelists_rev = {k: np.stack(v) for k, v in binary_unique_edgelists_rev.items()}

Detecting Appearing Edges...
BRAAK_AD: 0.0 (889)
BRAAK_AD: 1.0 (1)
BRAAK_AD: 2.0 (5)
BRAAK_AD: 3.0 (8)
BRAAK_AD: 4.0 (284)
BRAAK_AD: 5.0 (0)
BRAAK_AD: 6.0 (0)

Detecting Vanishing Edges...
BRAAK_AD: 0.0 (0)
BRAAK_AD: 1.0 (1)
BRAAK_AD: 2.0 (0)
BRAAK_AD: 3.0 (6)
BRAAK_AD: 4.0 (1052)
BRAAK_AD: 5.0 (0)
BRAAK_AD: 6.0 (531)


In [15]:
print('Appearing Edges...')
for k, v in binary_unique_edgelists.items():
    print(f'{stratify_col}: {k}')
    print('Top TFs: ', end='')
    genes, counts = np.unique(v[:, 0], return_counts=True)
    for gene in genes[np.argsort(-counts)][:20]:
        print(f' {gene}', end='')
    print()
    print('Top TGs: ', end='')
    genes, counts = np.unique(v[:, 1], return_counts=True)
    for gene in genes[np.argsort(-counts)][:20]:
        print(f' {gene}', end='')
    print()
    print()

Appearing Edges...
BRAAK_AD: 0.0
Top TFs:  RUNX1 FLI1 ZEB1 ETV6 TCF7L2 FOXP2 MAF SOX10 DLX1 TCF7L1 BACH1 SOX5 SOX8 ETS1 ETS2 ARX LHX6
Top TGs:  ARHGAP6 ZFHX3 DISC1 MEF2C NCK2 RIN3 CHST11 MAML3 HS3ST4 DENND3 ZFP36L1 RCSD1 BNC2 LHFPL2 LPAR6 KCNIP1 LPCAT2 LYN LRRK1 LY86

BRAAK_AD: 1.0
Top TFs:  JUND
Top TGs:  PTMA

BRAAK_AD: 2.0
Top TFs:  MXI1 SOX6
Top TGs:  CADM1 FBXL7 ITGB8 TJP1 ZNF462

BRAAK_AD: 3.0
Top TFs:  CREB5 ETV4 JUN STAT1
Top TGs:  CDH19 DHX8 DLG1 FILIP1L ID2 MAP7 PRUNE2 USP54

BRAAK_AD: 4.0
Top TFs:  IKZF1 FOS REL SREBF2 ERG JUNB ONECUT2 SPI1 SREBF1
Top TGs:  TFEC SRGN CD4 NCKAP1L C1QB PIK3R5 AAK1 RALGAPA2 RAC1 RAB8B RAB2A QSER1 PYGL PUM1 PTPN22 PTPN2 RAB20 RANBP9 RAP1GAP2 RAPH1



In [14]:
print('Vanishing Edges...')
for k, v in binary_unique_edgelists_rev.items():
    print(f'{stratify_col}: {k}')
    print('Top TFs: ', end='')
    genes, counts = np.unique(v[:, 0], return_counts=True)
    for gene in genes[np.argsort(-counts)][:20]:
        print(f' {gene}', end='')
    print()
    print('Top TGs: ', end='')
    genes, counts = np.unique(v[:, 1], return_counts=True)
    for gene in genes[np.argsort(-counts)][:20]:
        print(f' {gene}', end='')
    print()
    print()

Vanishing Edges...
BRAAK_AD: 1.0
Top TFs:  JUND
Top TGs:  PTMA

BRAAK_AD: 3.0
Top TFs:  CREB5 ETV4 STAT1
Top TGs:  DHX8 DLG1 FILIP1L MAP7 PRUNE2 USP54

BRAAK_AD: 4.0
Top TFs:  FLI1 IKZF1 DLX1 FOXO1 LHX6 ARX MXI1 REL FOS ELF1 SREBF2 JUN JUNB BCL6 ERG SREBF1 MEF2C ONECUT2 SPI1 ETS1
Top TGs:  NCKAP1L SAMSN1 HIVEP3 TFEC C1QB PRKCD SRGN LCP2 GRIN3A CD53 CD4 ANK1 PIK3R5 ITM2B JAK3 JDP2 SPTLC3 SRBD1 ZNRF2 ITGAX

BRAAK_AD: 6.0
Top TFs:  ETV6 ZEB1 RUNX1 TCF7L2 SOX10 TCF7L1 SOX8 FOXP2 ETS2 MAF SOX5
Top TGs:  PRRX1 CELF2 ZFP36L1 ZFHX3 DENND3 SLCO2B1 CHST11 HS3ST4 GLIS3 GNB4 GPRIN3 GRB2 SFMBT2 HIF1A ZNF710 IFNGR1 INPP5D ITPR2 JAK2 RREB1

