In [1]:
from collections import defaultdict
from itertools import product
import math
from tqdm import tqdm

import numpy as np
import pandas as pd

In [2]:
# Load the subject-level metadata table used by every later cell.
print('Loading Metadata...')
# NOTE(review): 'grpahs' looks like a misspelling of 'graphs'; it is kept as-is
# because it has to match the directory name on disk — confirm before renaming.
data_folder = 'data/PsychAD_freeze2_personalized_grpahs/'
metadata_path = f'{data_folder}syn26527784_latest.csv'
meta = pd.read_csv(metadata_path)

Loading Metadata...


In [3]:
# Utility functions
# Defaultdict with depth
def deepdefaultdict(depth, default=None):
    """Create a `defaultdict` that auto-nests `depth` levels deep.

    Args:
        depth: Number of nested `defaultdict` levels. `0` yields a leaf value
            directly instead of a dictionary.
        default: Leaf value produced once `depth` levels have been indexed.
            If it is callable (e.g. `list`), it is called to build a fresh
            leaf each time; otherwise the object itself is used as the leaf.

    Returns:
        A `defaultdict` whose missing keys create the next nesting level, or
        the leaf value when `depth == 0`.
    """
    if depth == 0:
        # Previously this always returned None, silently ignoring `default`.
        # Calling a factory avoids sharing one mutable leaf between all keys.
        return default() if callable(default) else default
    return defaultdict(lambda: deepdefaultdict(depth - 1, default=default))

# Prune zero-length array entries from d=1 dict
def prunedict(dic):
    """Return a new plain dict holding only the entries of `dic` whose value is non-empty.

    A value counts as empty when `len(value) == 0`. Key order is preserved and
    `dic` itself is not modified.
    """
    return {key: value for key, value in dic.items() if len(value) > 0}

# Convert to dict
def dd2d(dd):
    """Recursively convert nested `defaultdict`s into plain `dict`s.

    Args:
        dd: Any object. `defaultdict` instances (including subclasses) are
            converted level by level; anything else is returned unchanged, so
            a `defaultdict` nested inside a plain `dict` or list is not reached.

    Returns:
        A new plain `dict` tree when `dd` is a `defaultdict`, otherwise `dd`
        itself. The input is left untouched (the previous version overwrote
        the caller's nested values in place while converting).
    """
    if isinstance(dd, defaultdict):
        return {key: dd2d(value) for key, value in dd.items()}
    return dd

# Hyperparameters
# Number of regulons kept per subject when a cell type is selected: the
# aggregation code takes the `nlargest(cell_type_top_regulons)` entries of the
# subject's rss table row for that cell type.
cell_type_top_regulons = 20
# Percentile of `coef` used as a strict lower bound (`coef > percentile`) on
# edges; only applied in the cell-type branch. With 0 the threshold is the
# minimum, so edges equal to the minimum coef are dropped.
edge_percentile = 0
# Percentile taken over per-edge occurrence counts within a group; an edge is
# kept only if it occurs in at least that many subject graphs.
edge_present_pct = 99

# Step-by-Step

In [4]:
# # Hyperparameters
# stratify_cols = ['BRAAK_AD', 'nps_PsychoAgiHxValue']
# # Hx - Historical/Ever Seen
# # 'BRAAK_AD', 'Sex'
# # 'nps_PsychoAgiCurValue', 'nps_PsychoAgiHxValue'
# # 'nps_RumCurValue', 'nps_RumHxValue'
# # 'nps_PsychoRetardCurValue', 'nps_PsychoRetardHxValue'
# # 'nps_FatCurValue', 'nps_FatHxValue'
# stratify_cols.sort()
# cell_type = 'Mural'  # EN Endo Glial Immune IN Mural None

In [5]:
# # Stratify by column
# print('Gathering...')
# graph_ids = {}
# unique_vals = [np.unique(meta[col].astype(str)) for col in stratify_cols]
# for vals in product(*unique_vals):
#     print('\t'.join([f'{col}: {val}' for col, val in zip(stratify_cols, vals)]), end='')
#     current_filter = np.array([True for _ in range(meta.shape[0])])
#     for col, val in zip(stratify_cols, vals):
#         current_filter *= (meta[col].astype(str) == val)
#     graph_ids[vals] = list(meta.loc[current_filter]['SubID'])
#     print(f'\t({len(graph_ids[vals])} IDs)')
# graph_ids = prunedict(graph_ids)

In [6]:
# # Aggregate graphs
# print('Processing...')
# processed_graphs = {}
# num_graphs = defaultdict(lambda: 0)
# for k, v in graph_ids.items():
#     print('\t'.join([f'{col}: {val}' for col, val in zip(stratify_cols, k)]), end='')
#     running_graph = defaultdict(lambda: defaultdict(list))
#     for graph_id in v:
#         # Load individual graph
#         try:
#             graph = pd.read_csv(data_folder + f'regulon_grn/{graph_id}_regulon_list.csv')[['TF', 'gene', 'CoexWeight', 'regulon']]
#             graph = graph.rename(columns={'gene': 'TG', 'CoexWeight': 'coef'})  # TF, TG, coef, regulon
#         except:
#             continue
#         # Filter to regulons based on cell-type
#         if cell_type is not None:
#             try:
#                 rss = pd.read_csv(data_folder + f'rss/{graph_id}_6_celltype_rss.csv', index_col=0)
#                 assert cell_type in rss.index
#                 # Top x regulons
#                 regulons = rss.loc[cell_type].nlargest(cell_type_top_regulons).index
#                 graph = graph.loc[np.isin(np.array(graph['regulon']), regulons)]
#             except:
#                 continue
#             # Top x percent of coefs
#             graph = graph.loc[np.array(graph['coef']) > np.percentile(graph['coef'], edge_percentile)]
#         graph = graph[['TF', 'TG', 'coef']]
#         num_graphs[k] += 1
#         for _, row in graph.iterrows():
#             tf, tg, coef = row
#             running_graph[tf][tg].append(coef)
#     print(f'\t({num_graphs[k]} Graphs)')
#     processed_graphs[k] = dd2d(running_graph)
# num_graphs = dict(num_graphs)
# processed_graphs = prunedict(processed_graphs)

In [7]:
# # Keep edges which have some references in common
# print('Filtering...')
# edgelists = {}
# for k0, v0 in processed_graphs.items():
#     print('\t'.join([f'{col}: {val}' for col, val in zip(stratify_cols, k0)]), end='')
#     # Calculate min edges
#     edge_counts = []
#     for k1, v1 in v0.items():
#         for k2, v2 in v1.items():
#             edge_counts.append(len(v2))
#     min_edges = np.percentile(edge_counts, edge_present_pct)
#     edges = []
#     num_edges = 0
#     for k1, v1 in v0.items():
#         for k2, v2 in v1.items():
#             if len(v2) >= min_edges:
#                 edges.append([k1, k2, np.mean(v2)])
#                 num_edges += 1
#     print(f'\t({num_edges} Edges)')
#     edgelists[k0] = edges
# edgelists = prunedict(edgelists)

In [8]:
# print('Saving...')
# for k in edgelists:
#     print('\t'.join([f'{col}: {val}' for col, val in zip(stratify_cols, k)]))
#     pd.DataFrame(edgelists[k], columns=['TF', 'TG', 'coef']).to_csv(
#         data_folder
#         + f'processed/grn_{f"{cell_type}_" if cell_type is not None else ""}'
#         f'{"_".join([f"{col}_{val}" for col, val in zip(stratify_cols, k)])}.csv')

# Together

In [9]:
# https://github.com/DiseaseNeuroGenomics/psychad_metadata/blob/main/psychad_clinical_metadata_querying.R
# Diagnosis columns of the metadata table, grouped by category.
diagnosis_classification = {
  'neurodegenerative': ['AD', 'MCI', 'Dementia', 'PD', 'PD_uncertain_plus_encephalitic', 'DLBD', 'FTD', 'ALS', 'Others_Neurodegenerative'],
  'neurological': ['MS', 'PSP', 'Epilepsy', 'Seizures', 'Tumor', 'Migraine_headaches', 'Head_Injury', 'Vascular', 'Others_Neurological'],
  'neuropsychiatric': ['SCZ', 'MDD', 'BD_unspecific', 'BD_I', 'BD_II', 'PTSD', 'ADHD', 'OCD', 'Tardive_Dyskinesia_Neuroleptic_induced', 'Schizoaffective_bipolar', 'Schizoaffective_depressive', 'Anorexia', 'Bulimia', 'Anxiety', 'Binge_Purge', 'Eating_disorder', 'Others_Neuropsychiatric'],
  'metabolic': ['Diabetes_mellitus_unspecified', 'TD_I', 'TD_II'],
}
# Diagnoses that do not disqualify a subject from a disease/control group.
diagnosis_benign = ['Anxiety', 'Migraine_headaches'] + diagnosis_classification['metabolic']
all_dx = [j for i in diagnosis_classification.values() for j in i]  # Different than file
# Every non-benign diagnosis, de-duplicated. Built in `all_dx` order rather than
# through a set so the list is identical from run to run.
deleterious_dx = list(dict.fromkeys(dx for dx in all_dx if dx not in set(diagnosis_benign)))


def _no_dx_outside(x, allowed):
    """Mask of rows carrying no deleterious diagnosis other than those in `allowed`.

    Only the needed diagnosis columns are selected before `fillna`, so the full
    metadata frame is not copied on every call. Missing values count as 0.
    """
    excluded = [dx for dx in deleterious_dx if dx not in allowed]
    return x[excluded].fillna(0).sum(axis=1) == 0


# Macros
# Each function takes the metadata DataFrame `x` and returns a boolean Series
# (one entry per row). Masks are combined with `&` / `|`, the boolean operators
# pandas documents for element-wise AND / OR.
def MSSM_OVER_60(x):
    """Rows from the MSSM brain bank with Age >= 60."""
    return (x['Age'].astype(int) >= 60) & (x['Brain_bank'].astype(str) == 'MSSM')


def RUSH_OVER_60(x):
    """Rows from the RUSH brain bank with Age >= 60."""
    return (x['Age'].astype(int) >= 60) & (x['Brain_bank'].astype(str) == 'RUSH')


def MSSM(x):
    """Rows from the MSSM brain bank."""
    return x['Brain_bank'].astype(str) == 'MSSM'


def HBCC(x):
    """Rows from the HBCC brain bank."""
    return x['Brain_bank'].astype(str) == 'HBCC'


# Disease
def AD(x):
    """Rows flagged AD == 1 with no deleterious diagnosis besides AD, MCI or Dementia."""
    return (
        _no_dx_outside(x, {'MCI', 'Dementia', 'AD'})
        & (x['AD'].fillna(-1).astype(int) == 1))


def AD_STRICT(x):
    """Stricter AD definition based on pathology scores instead of the AD flag.

    Requires CERAD == 4, BRAAK_AD in 3..6, Dementia == 1, a brain bank other
    than HBCC, and no deleterious diagnosis besides AD, MCI or Dementia.
    Missing scores are mapped to -1 and therefore never match.
    """
    return (
        _no_dx_outside(x, {'MCI', 'Dementia', 'AD'})
        & x['CERAD'].fillna(-1).astype(int).isin([4])
        & x['BRAAK_AD'].fillna(-1).astype(int).isin([3, 4, 5, 6])
        & (x['Dementia'].fillna(-1).astype(int) == 1)
        & (x['Brain_bank'].astype(str) != 'HBCC'))


def SCZ(x):
    """Rows with at least one schizophrenia-spectrum flag and no other deleterious diagnosis.

    Dementia and MCI are tolerated alongside the SCZ-spectrum diagnoses.
    """
    scz_spectrum = ['SCZ', 'Schizoaffective_bipolar', 'Schizoaffective_depressive']
    return (
        _no_dx_outside(x, set(scz_spectrum) | {'Dementia', 'MCI'})
        & (x[scz_spectrum].fillna(0).sum(axis=1) > 0))


# Control (Missing metadata)
def CONTROLS_NEUROPATHOLOGICAL_CLINICAL(x):
    """Controls: no deleterious diagnosis besides AD/MCI/Dementia and low pathology.

    Low pathology means CERAD in {1, 2} and BRAAK_AD in {0, 1, 2}; HBCC rows
    pass this part regardless of their scores.
    """
    low_pathology = (
        x['CERAD'].fillna(-1).astype(int).isin([1, 2])
        & x['BRAAK_AD'].fillna(-1).astype(int).isin([0, 1, 2]))
    return (
        _no_dx_outside(x, {'MCI', 'Dementia', 'AD'})
        & (low_pathology | (x['Brain_bank'].astype(str) == 'HBCC')))


def CONTROLS_SUPERCONTROLS(x):
    """Stricter controls: no deleterious diagnosis besides AD, and minimal pathology.

    Minimal pathology means CERAD == 1 and BRAAK_AD in {0, 1, 2}; HBCC rows
    pass this part regardless of their scores. Unlike the lenient controls,
    MCI and Dementia are disqualifying here.
    """
    minimal_pathology = (
        x['CERAD'].fillna(-1).astype(int).isin([1])
        & x['BRAAK_AD'].fillna(-1).astype(int).isin([0, 1, 2]))
    return (
        _no_dx_outside(x, {'AD'})
        & (minimal_pathology | (x['Brain_bank'].astype(str) == 'HBCC')))



In [10]:
# CELLS
# EN Endo Glial Immune IN Mural None
# Cell types to iterate over. `None` means "no cell-type filter": the
# aggregation loop skips the regulon/coef filtering when cell_type is None.
cell_type_list = ['EN', 'Endo', 'Glial', 'Immune', 'IN', 'Mural', None]

# STRATIFY COLUMNS
# Each entry is a list of metadata columns that are stratified jointly; every
# entry here holds a single column.
stratify_cols_list = [
    ['nps_PsychoAgiCurValue'],
    ['HippoPlaquesValue'],
    ['AmygPlaquesValue'],
    ['HippoPlaquesWCoresValue'],
    ['AmygTanglesValue'],
    ['OcciPlaquesValue'],
    ['OcciPlaquesWCoresValue'],
]

# FILTERS
# contrast name -> [first-arm mask function, second-arm mask function].
# Each function takes the metadata DataFrame and returns a boolean mask; on
# boolean masks `*` acts as element-wise AND and `+` as element-wise OR.
# NOTE(review): the control arms of c02x, c03x and c11x apply no brain-bank or
# age restriction, and the counts printed below are identical (395) for all
# three — confirm this matches the contrast definitions in the referenced R file.
contrast_list = {
    'c01x': [lambda x: MSSM_OVER_60(x) * AD_STRICT(x), lambda x: CONTROLS_SUPERCONTROLS(x)],
    'c02x': [lambda x: MSSM_OVER_60(x) * AD(x), lambda x: CONTROLS_NEUROPATHOLOGICAL_CLINICAL(x)],
    'c03x': [lambda x: RUSH_OVER_60(x) * AD(x), lambda x: CONTROLS_NEUROPATHOLOGICAL_CLINICAL(x)],
    'c06x': [lambda x: (MSSM(x) + HBCC(x)) * AD(x), lambda x: (MSSM(x) + HBCC(x)) * SCZ(x)],
    'c11x': [lambda x: (MSSM_OVER_60(x) + RUSH_OVER_60(x)) * AD(x), lambda x: CONTROLS_NEUROPATHOLOGICAL_CLINICAL(x)],
    'strict_vs_lenient': [lambda x: AD_STRICT(x), lambda x: AD(x)],
}
# Human-readable labels for the two arms of each contrast, matched by position
# to the functions in `contrast_list`; they also end up in output file names.
contrast_titles = {
    'c01x': ['StrictAD', 'SuperControl'],
    'c02x': ['AD', 'Control'],
    'c03x': ['AD', 'Control'],
    'c06x': ['AD', 'SCZ'],
    'c11x': ['AD', 'Control'],
    'strict_vs_lenient': ['StrictAD', 'AD'],
}
# Sanity check: number of metadata rows selected by each arm of each contrast.
for k, v in contrast_list.items():
    print(f'{k}:\t{sum(v[0](meta))} {sum(v[1](meta))}')

c01x:	257 283
c02x:	455 395
c03x:	68 395
c06x:	463 140
c11x:	523 395
strict_vs_lenient:	303 531


In [11]:
# For every (cell type, stratification columns, contrast) combination:
#   1. collect the subject IDs belonging to each (column values..., contrast arm) group,
#   2. pool the per-subject regulon edges of each group,
#   3. keep the edges seen in enough subjects and average their coefficients,
#   4. write one edge-list CSV per group under `processed/`.
n_combinations = len(cell_type_list) * len(stratify_cols_list) * len(contrast_list)
for cell_type, stratify_cols, contrast_name in tqdm(
        product(cell_type_list, stratify_cols_list, contrast_list), total=n_combinations):

    # --- 1. Gather subject IDs per group ---
    graph_ids = {}
    # Values are compared as strings, so missing entries form their own 'nan' group
    unique_vals = [np.unique(meta[col].astype(str)) for col in stratify_cols]

    # The contrast is handled as one extra trailing "column" whose values are
    # the two arm titles. `+` builds a new list, so the shared entry inside
    # stratify_cols_list is never mutated.
    if contrast_name:
        stratify_cols = stratify_cols + [contrast_name]
        unique_vals = unique_vals + [contrast_titles[contrast_name]]

    for vals in product(*unique_vals):
        # Real metadata columns exclude the trailing contrast pseudo-column
        if contrast_name:
            iter_stratify = zip(stratify_cols[:-1], vals[:-1])
        else:
            iter_stratify = zip(stratify_cols, vals)

        # Start from "all rows" and AND in each stratification constraint
        current_filter = np.ones(meta.shape[0], dtype=bool)
        for col, val in iter_stratify:
            current_filter &= (meta[col].astype(str) == val).to_numpy()

        # Apply the mask function of the contrast arm whose title is vals[-1]
        if contrast_name:
            arm = list(contrast_titles[contrast_name]).index(vals[-1])
            arm_mask = contrast_list[contrast_name][arm](meta)
            current_filter &= np.asarray(arm_mask, dtype=bool)

        graph_ids[vals] = list(meta.loc[current_filter]['SubID'])
    graph_ids = prunedict(graph_ids)

    # --- 2. Aggregate graphs ---
    processed_graphs = {}
    num_graphs = defaultdict(lambda: 0)
    for k, v in graph_ids.items():
        # running_graph[TF][TG] -> list of coefs, one per subject graph containing the edge
        running_graph = defaultdict(lambda: defaultdict(list))
        for graph_id in v:
            # Load individual graph. Best effort: subjects whose file is missing
            # or malformed are skipped. `except Exception` (rather than a bare
            # `except:`) keeps Ctrl-C / kernel interrupts working in this long loop.
            try:
                graph = pd.read_csv(data_folder + f'regulon_grn/{graph_id}_regulon_list.csv')[['TF', 'gene', 'CoexWeight', 'regulon']]
                graph = graph.rename(columns={'gene': 'TG', 'CoexWeight': 'coef'})  # TF, TG, coef, regulon
            except Exception:
                continue
            # Filter to regulons based on cell-type
            if cell_type is not None:
                try:
                    rss = pd.read_csv(data_folder + f'rss/{graph_id}_6_celltype_rss.csv', index_col=0)
                    # Explicit check instead of `assert`, which is stripped under `python -O`
                    if cell_type not in rss.index:
                        continue
                    # Top x regulons
                    regulons = rss.loc[cell_type].nlargest(cell_type_top_regulons).index
                    graph = graph.loc[np.isin(np.array(graph['regulon']), regulons)]
                except Exception:
                    continue
                # Top x percent of coefs (strictly above the percentile)
                graph = graph.loc[np.array(graph['coef']) > np.percentile(graph['coef'], edge_percentile)]
            num_graphs[k] += 1
            # Column-wise iteration yields the same (TF, TG, coef) triples as
            # iterrows() without building a Series per row
            for tf, tg, coef in zip(graph['TF'], graph['TG'], graph['coef']):
                running_graph[tf][tg].append(coef)
        processed_graphs[k] = dd2d(running_graph)
    num_graphs = dict(num_graphs)
    processed_graphs = prunedict(processed_graphs)

    # --- 3. Keep edges which have some references in common ---
    edgelists = {}
    for k0, v0 in processed_graphs.items():
        # Number of subject graphs in which each edge occurs
        edge_counts = [len(coefs) for targets in v0.values() for coefs in targets.values()]
        min_edges = np.percentile(edge_counts, edge_present_pct)
        # Keep sufficiently common edges, averaging coef across subjects
        edgelists[k0] = [
            [tf, tg, np.mean(coefs)]
            for tf, targets in v0.items()
            for tg, coefs in targets.items()
            if len(coefs) >= min_edges
        ]
    edgelists = prunedict(edgelists)

    # --- 4. Save one CSV per group ---
    for k in edgelists:
        pd.DataFrame(edgelists[k], columns=['TF', 'TG', 'coef']).to_csv(
            data_folder
            + f'processed/grn_{f"{cell_type}_" if cell_type is not None else ""}'
            f'{"_".join([f"{col}_{val}" for col, val in zip(stratify_cols, k)])}.csv')
    
    # print()

  1%|          | 3/294 [02:04<2:57:28, 36.59s/it]