In [2]:
import os

import numpy as np
import pandas as pd


In [3]:
# Get files for contrast
contrast = 'c02x'
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# the analysis loop below reads it.
dir = '../data/scenic_outs/'
# os.listdir returns entries in arbitrary order, so sort for reproducible runs
fnames = sorted(fname for fname in os.listdir(dir) if fname.startswith(f'{contrast}_'))

# Get labels
gene_dir = '../data/new_labels/'
gene_fnames = sorted(fname for fname in os.listdir(gene_dir) if fname.endswith('.txt'))
# Maps label name (file name without the '.txt' suffix) -> 1-D array of gene names.
# ndmin=1 keeps a single-gene file as a 1-element array rather than a 0-d array.
gene_lists = {
    fname[:-len('.txt')]: np.loadtxt(os.path.join(gene_dir, fname), dtype=str, ndmin=1)
    for fname in gene_fnames
}


# Get groups and cell-type based on fname
def _split_fname(fname):
    """Split a SCENIC output file name into ``(group, cell_type)``.

    The name is split on '_'. The first token is the contrast and the last two
    tokens are the file suffix (e.g. 'regulon' and 'list.csv'); everything in
    between is '<group>_<cell type>'.

    A group normally occupies a single token. To support groups that contain
    underscores (e.g. 'AD_resilient'), the longest multi-token prefix that
    matches a key of ``gene_lists`` is used as the group, always leaving at
    least one token for the cell type. If no such prefix matches, the group is
    the single token after the contrast.
    """
    tokens = fname.split('_')
    body = tokens[1:-2]
    # Try multi-token groups first, longest candidate first
    for n_group in range(len(body) - 1, 1, -1):
        candidate = '_'.join(body[:n_group])
        if candidate in gene_lists:
            return candidate, '_'.join(body[n_group:])
    # Fallback: single-token group
    return tokens[1], '_'.join(tokens[2:-2])


def get_group(fname):
    """Return the group encoded in ``fname`` (e.g. 'AD')."""
    return _split_fname(fname)[0]


def get_cell_type(fname):
    """Return the cell type encoded in ``fname`` (e.g. 'EN_L3_5_IT_1')."""
    return _split_fname(fname)[1]


In [50]:
# For every SCENIC output of the contrast: build a TF-by-TF similarity matrix
# from the TF-TG weights, score each TF against the annotated gene list of the
# file's group, and print summary statistics.
for fname in fnames:
    ### Reading
    # Choose graph
    group = get_group(fname)
    cell_type = get_cell_type(fname)
    print(' - '.join([fname, group, cell_type]))

    # Escape if not compatible (no label file for this group)
    if group not in gene_lists:
        continue

    # Get TF-TG linkages
    graph_list = pd.read_csv(os.path.join(dir, fname), index_col=0)
    graph_list = graph_list.rename(columns={'gene': 'TG', 'CoexWeight': 'coex'})

    # Get matrix: rows are TFs, columns are TGs, missing links are 0
    graph_matrix = graph_list.pivot(index='TF', columns='TG', values='coex').fillna(0)
    weights = graph_matrix.to_numpy()
    # Entry (i, j) is the dot product of the TG weight vectors of TF i and TF j
    tf_matrix = pd.DataFrame(weights @ weights.T, index=graph_matrix.index, columns=graph_matrix.index)
    similarity = tf_matrix.to_numpy()
    tf_names = tf_matrix.index.to_numpy()

    # Annotate: boolean mask over TFs, True where the TF is in the group's gene list.
    # atleast_1d guards against a 0-d array from a single-entry label file;
    # isin + asarray yields a plain boolean ndarray that is safe for masking and `~`.
    gene_list = np.atleast_1d(gene_lists[group])
    annotation = np.asarray(graph_matrix.index.isin(gene_list), dtype=bool)

    ### Analysis
    # Sort genes based on dot with known TFs
    # Score = sum of dot products with all annotated TFs minus the TF's own
    # self dot product (the diagonal is subtracted for every TF, annotated or not).
    # NOTE: Generally, scores >=0 are positive
    score = similarity[:, annotation].sum(axis=1) - similarity.diagonal()
    sorted_idx = score.argsort()[::-1]

    # Evaluate performance
    # Rank percentile: 1 for the highest score, 0 for the lowest
    percentile = np.linspace(1, 0, num=tf_matrix.shape[0])
    sorted_annotation = annotation[sorted_idx]
    # Without any annotated TF the mean is undefined; report nan without a warning
    average_positive_percentile = percentile[sorted_annotation].mean() if annotation.any() else np.nan
    print(f'Average positive percentile of {average_positive_percentile:.3f}')
    positive_unknown_genes = tf_names[(score >= 0) & ~annotation]
    print(f'Positive unknown genes: {positive_unknown_genes}')
    negative_positive_genes = tf_names[(score < 0) & annotation]
    print(f'Negative positive genes: {negative_positive_genes}')
    # NOTE(review): within this cell `df` is rebuilt on every iteration and not
    # stored anywhere, so only the last file's table survives the loop.
    df = pd.DataFrame({
        'TF': tf_names[sorted_idx],
        'score': score[sorted_idx],
        'percentile': percentile,
        'annotation': sorted_annotation,
    })
    print()


c02x_AD_Astro_regulon_list.csv - AD - Astro
Average positive percentile of 0.978
Positive unknown genes: ['TFCP2L1']
Negative positive genes: []

c02x_AD_Endo_regulon_list.csv - AD - Endo
Average positive percentile of 0.939
Positive unknown genes: ['ARID3A' 'CEBPB' 'CFL2' 'CUX2' 'HIC1' 'HMX1' 'HOXD1' 'NFATC1' 'NFIL3'
 'OLIG1' 'SOX9' 'TFEB' 'TRPS1']
Negative positive genes: []

c02x_AD_EN_L3_5_IT_1_regulon_list.csv - AD - EN_L3_5_IT_1
Average positive percentile of 0.972
Positive unknown genes: []
Negative positive genes: []

c02x_AD_EN_L3_5_IT_3_regulon_list.csv - AD - EN_L3_5_IT_3
Average positive percentile of 0.945
Positive unknown genes: ['BACH1' 'FOXO1' 'NFATC2' 'PBX3']
Negative positive genes: []

c02x_AD_EN_L5_ET_regulon_list.csv - AD - EN_L5_ET
Average positive percentile of 0.974
Positive unknown genes: []
Negative positive genes: []

c02x_AD_EN_L6B_regulon_list.csv - AD - EN_L6B
Average positive percentile of 0.985
Positive unknown genes: []
Negative positive genes: []

c02x

Average positive percentile of 0.981
Positive unknown genes: []
Negative positive genes: []

c02x_AD_EN_L6_IT_2_regulon_list.csv - AD - EN_L6_IT_2
Average positive percentile of 0.957
Positive unknown genes: ['KLF2' 'NFKB2' 'SOX4' 'TEAD4']
Negative positive genes: []

c02x_AD_EN_NF_regulon_list.csv - AD - EN_NF
Average positive percentile of 0.967
Positive unknown genes: []
Negative positive genes: []

c02x_AD_Immune_regulon_list.csv - AD - Immune
Average positive percentile of 0.961
Positive unknown genes: ['CREB5' 'GABPB1' 'TBX2']
Negative positive genes: []

c02x_AD_IN_LAMP5_LHX6_regulon_list.csv - AD - IN_LAMP5_LHX6
Average positive percentile of 0.971
Positive unknown genes: []
Negative positive genes: []

c02x_AD_IN_LAMP5_RELN_regulon_list.csv - AD - IN_LAMP5_RELN
Average positive percentile of 0.971
Positive unknown genes: []
Negative positive genes: []

c02x_AD_IN_PVALB_CHC_regulon_list.csv - AD - IN_PVALB_CHC
Average positive percentile of 0.956
Positive unknown genes: ['GABPB