In [11]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pecanpy as pp
import seaborn as sns
import torch
import torch.nn as nn

# Style
sns.set_theme(context='talk', style='white', palette='Set2')
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42


# Preparation

In [2]:
# Get files for contrast
contrast = 'c02x'
dir = '../data/scenic_outs/'
fnames = [fname for fname in os.listdir(dir) if fname.startswith(f'{contrast}_')]

# Get groups and cell-type based on fname
# TODO: Doesn't work for groups like 'AD_resilient'
get_group = lambda fname: fname.split('_')[1]
get_cell_type = lambda fname: '_'.join(fname.split('_')[2:-2])

# Get labels
gene_dir = '../data/new_labels/'
gene_fnames = [fname for fname in os.listdir(gene_dir) if fname.endswith('.txt')]
gene_lists = {'.'.join(fname.split('.')[:-1]): np.loadtxt(os.path.join(gene_dir, fname), dtype=str) for fname in gene_fnames}


# Analyses

### TF Dot Analysis

In [23]:
for fname in fnames:
    ### Reading
    # Choose graph
    group = get_group(fname)
    cell_type = get_cell_type(fname)
    print(' - '.join([fname, group, cell_type]))

    # Escape if not compatible
    if group not in gene_lists:
        continue

    # Get TF-TG linkages
    graph_list = pd.read_csv(os.path.join(dir, fname), index_col=0)
    graph_list = graph_list.rename(columns={'gene': 'TG', 'CoexWeight': 'coex'})

    # Get matrix
    graph_matrix = graph_list.pivot(index='TF', columns='TG', values='coex').fillna(0)

    # Get tf matrix
    tf_matrix = pd.DataFrame(graph_matrix.to_numpy() @ graph_matrix.to_numpy().T, index=graph_matrix.index, columns=graph_matrix.index)

    # Annotate
    gene_list = gene_lists[group]
    annotation = graph_matrix.index.map(lambda g: g in gene_list).to_numpy()

    ### Processing
    # Sort genes based on dot with known TFs
    # NOTE: Generally, scores >=0 are positive
    score = tf_matrix.to_numpy()[:, annotation].sum(axis=1) - tf_matrix.to_numpy().diagonal()
    sorted_idx = score.argsort()[::-1]

    ### Analysis
    # Evaluate performance
    average_positive_percentile = np.linspace(1, 0, num=tf_matrix.shape[0])[annotation[sorted_idx]].mean()
    print(f'Average positive percentile of {average_positive_percentile:.3f}')
    positive_unknown_genes = tf_matrix.index.to_numpy()[(score >= 0) * ~annotation]
    print(f'Positive unknown genes: {positive_unknown_genes}')
    negative_positive_genes = tf_matrix.index.to_numpy()[(score < 0) * annotation]
    print(f'Negative positive genes: {negative_positive_genes}')
    df = pd.DataFrame({
        'TF': tf_matrix.index.to_numpy()[sorted_idx],
        'score': score[sorted_idx],
        'percentile': np.linspace(1, 0, num=tf_matrix.shape[0]),
        'annotation': annotation[sorted_idx],
    })

    ### Visualization
    fig, ax = plt.subplots(1, 1, figsize=(9, 9))
    plt.title(f'{group}_{cell_type} - APP {average_positive_percentile:.3f}')
    sns.heatmap(tf_matrix, cmap='rocket_r', ax=ax)
    fig.savefig(f'./plots/tf_matrix_{group}_{cell_type}.pdf', format='pdf', transparent=True)
    plt.close()

    print()


c02x_AD_Astro_regulon_list.csv - AD - Astro
Average positive percentile of 0.978
Positive unknown genes: ['TFCP2L1']
Negative positive genes: []

c02x_AD_Endo_regulon_list.csv - AD - Endo
Average positive percentile of 0.939
Positive unknown genes: ['ARID3A' 'CEBPB' 'CFL2' 'CUX2' 'HIC1' 'HMX1' 'HOXD1' 'NFATC1' 'NFIL3'
 'OLIG1' 'SOX9' 'TFEB' 'TRPS1']
Negative positive genes: []

c02x_AD_EN_L3_5_IT_1_regulon_list.csv - AD - EN_L3_5_IT_1
Average positive percentile of 0.972
Positive unknown genes: []
Negative positive genes: []

c02x_AD_EN_L3_5_IT_3_regulon_list.csv - AD - EN_L3_5_IT_3
Average positive percentile of 0.945
Positive unknown genes: ['BACH1' 'FOXO1' 'NFATC2' 'PBX3']
Negative positive genes: []

c02x_AD_EN_L5_ET_regulon_list.csv - AD - EN_L5_ET
Average positive percentile of 0.974
Positive unknown genes: []
Negative positive genes: []

c02x_AD_EN_L6B_regulon_list.csv - AD - EN_L6B
Average positive percentile of 0.985
Positive unknown genes: []
Negative positive genes: []

c02x

### Node2Vec Analysis

In [4]:
class MLP(torch.nn.Module):
    def __init__(self, input_size, dropout=0):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_size, input_size//2),
            nn.Dropout(dropout),
            nn.BatchNorm1d(input_size//2),
            nn.LeakyReLU(),

            nn.Linear(input_size//2, input_size//4),
            nn.Dropout(dropout),
            nn.BatchNorm1d(input_size//4),
            nn.LeakyReLU(),

            nn.Linear(input_size//4, 2),
            nn.Dropout(dropout),
            nn.BatchNorm1d(2),
            nn.Softmax(1),
        )

    def forward(self, X):
        return self.mlp(X)


In [5]:
# Reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Parameters
dim = 64
epochs = 1_001
batch_size = 32
lr = 1e-1
gamma = .99

# Books
graph_type = []
performance = []

# Loop
for fname in fnames:
    ### Reading
    # Choose graph
    group = get_group(fname)
    cell_type = get_cell_type(fname)
    print(' - '.join([fname, group, cell_type]))

    # Escape if not compatible
    if group not in gene_lists:
        continue

    # Get TF-TG linkages
    graph_list = pd.read_csv(os.path.join(dir, fname), index_col=0)
    graph_list = graph_list.rename(columns={'gene': 'TG', 'CoexWeight': 'coex'})

    # Format
    graph_list[['TF', 'TG', 'coex']].to_csv('_elist.edg', sep='\t', header=None, index=None)

    ### Processing
    # Generate embeddings
    g = pp.pecanpy.SparseOTF(p=1, q=1, workers=4, verbose=False, random_state=42)
    g.read_edg('_elist.edg', weighted=True, directed=True)  # False if coexpression
    embeddings = g.embed(dim=dim, num_walks=20, walk_length=10)
    labels = np.array(g._node_ids)  # As long as no removing, `_node_idmap` isn't needed

    # Annotate
    gene_list = gene_lists[group]
    annotation = np.array([g in gene_list for g in labels])

    # Predict relevancy
    X = torch.Tensor(embeddings)
    unique, inverse = np.unique(annotation, return_inverse=True)
    y = np.zeros((annotation.shape[0], unique.shape[0]))
    y[np.arange(y.shape[0]), inverse] = 1
    y = torch.Tensor(y)

    # Make model
    model = MLP(dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=gamma)

    # Train model
    model.train()
    for epoch in range(epochs):
        epoch_loss = []
        for batch in range(X.shape[0] // batch_size):
            # Sample
            batch_idx = np.random.choice(X.shape[0], batch_size)
            batch_X = X[batch_idx]
            batch_y = y[batch_idx]

            # Generate logits
            logits = model(batch_X)

            # Loss
            loss = ((logits - batch_y)**2).mean()
            epoch_loss.append(loss.detach())

            # Iterate
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
        epoch_loss = torch.Tensor(epoch_loss).mean()
        if epoch % (epochs // 5) == 0: print(f'Epoch: {epoch:03d}\tLoss: {epoch_loss:.4f}')
    model.eval()

    ### Analysis
    # Processing
    logits = model(X).detach()
    probability = logits[:, 1]
    sorted_idx = torch.argsort(probability).flip(dims=(0,))

    # Outcomes
    average_positive_percentile = np.linspace(1, 0, num=X.shape[0])[annotation[sorted_idx]].mean()
    print(f'Average positive percentile of {average_positive_percentile:.3f}')

    ### Books
    graph_type.append(f'{group}_{cell_type}')
    performance.append(average_positive_percentile)

    print()

### Visualize
df = pd.DataFrame({'Graph': graph_type, 'Average Positive Percentile': performance})
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
sns.lineplot(df, x='Graph', y='Average Positive Percentile', ax=ax)
fig.savefig(f'./plots/grn_performance.pdf', format='pdf', transparent=True)
plt.close()


c02x_AD_Astro_regulon_list.csv - AD - Astro
Epoch: 000	Loss: 0.0303
Epoch: 200	Loss: 0.0016
Epoch: 400	Loss: 0.0586
Epoch: 600	Loss: 0.0016
Epoch: 800	Loss: 0.0303
Epoch: 1000	Loss: 0.0016
Average positive percentile of 0.546

c02x_AD_Endo_regulon_list.csv - AD - Endo
Epoch: 000	Loss: 0.0877
Epoch: 200	Loss: 0.0302
Epoch: 400	Loss: 0.0303
Epoch: 600	Loss: 0.1450
Epoch: 800	Loss: 0.0588
Epoch: 1000	Loss: 0.0876
Average positive percentile of 0.538

c02x_AD_EN_L3_5_IT_1_regulon_list.csv - AD - EN_L3_5_IT_1
Epoch: 000	Loss: 0.0017
Epoch: 200	Loss: 0.0304
Epoch: 400	Loss: 0.0304
Epoch: 600	Loss: 0.0019
Epoch: 800	Loss: 0.0877
Epoch: 1000	Loss: 0.0019
Average positive percentile of 0.528

c02x_AD_EN_L3_5_IT_3_regulon_list.csv - AD - EN_L3_5_IT_3
Epoch: 000	Loss: 0.0304
Epoch: 200	Loss: 0.0873
Epoch: 400	Loss: 0.1457
Epoch: 600	Loss: 0.0301
Epoch: 800	Loss: 0.0302
Epoch: 1000	Loss: 0.0301
Average positive percentile of 0.538

c02x_AD_EN_L5_ET_regulon_list.csv - AD - EN_L5_ET
Epoch: 000	Loss:

In [22]:
# Manual
graph_type = []
for fname in fnames:
    # Choose graph
    group = get_group(fname)
    cell_type = get_cell_type(fname)

    # Escape if not compatible
    if group not in gene_lists:
        continue

    graph_type.append(f'{group}_{cell_type}')

performance = [0.546, 0.538, 0.528, 0.538, 0.529, 0.579, 0.544, 0.553, 0.554, 0.833, 0.598, 0.579, 0.538, 0.574, 0.564, 0.628, 0.527, 0.574, 0.563, 0.542, 0.591, 0.577, 0.518, 0.571]

# Visualize
df = pd.DataFrame({'Graph': graph_type, 'Average Positive Percentile': performance})
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
sns.lineplot(df, x='Graph', y='Average Positive Percentile', ax=ax)
plt.axhline(.5, color='black', linestyle='--')
plt.xticks(rotation=90)
plt.tight_layout()
fig.savefig(f'./plots/grn_performance.pdf', format='pdf', transparent=True)
plt.close()
