In [None]:
import sys
import os
import os.path
import re
import logging
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as sps
from anndata import AnnData
import anndata
from collections import defaultdict, OrderedDict
import plotly.express.colors as pxcolors
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# local to this analysis
import de
import plotting
import scoring
import signatures
import util

# for development
from importlib import reload


# Notebook-level logging: records are rendered as "<timestamp> <message>".
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # this notebook's logger emits INFO and above

# scanpy setup: verbosity level 3, print the scanpy header, and use 120 dpi
# for scanpy-generated figures.
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=120)

In [None]:
# Alternative dataset name, currently disabled.
#dataset = 'North_2019_DRG'

# Output locations (relative paths).
figure_dir = '../../build/figures'
sc.settings.figdir = figure_dir  # point scanpy's figure output at figure_dir
supplement_dir = '../../build/supplement'  # directory the Excel workbook is written to

# Dataset name; it is both the sub-directory and the file stem of the .h5ad
# that is loaded from ../../build/datasets.
dataset = 'North_2019'
# Values extracted per observation. Each entry must be either a gene present in
# adata.var_names or a column of adata_scaled.obs (e.g. the computed senmayo_score).
metrics = ['CDKN1A', 'CDKN2A', 'ATF3', 'senmayo_score']
# obs columns whose 'Yes' / 'No' groups are compared with a t-test.
categoricals = ['associated_pain']
# obs columns that are rank-correlated (Spearman) with every metric.
numericals = ['age']


In [None]:
# Read the dataset from the local build tree:
#   ../../build/datasets/<dataset>/<dataset>.h5ad
# Disabled alternative, kept for reference: load through the data broker.
#client = dev_instance()
#broker = client.get_databroker(dataset)
#adata = broker.load_variant(variant)
dataset_file = os.path.join('../../build/datasets', dataset, f'{dataset}.h5ad')
adata = sc.read_h5ad(dataset_file)
adata

In [None]:
# Score every observation against the SenMayo gene set.
# The log1p / scale calls below modify their argument in place, so they are
# applied to a copy; `adata` itself keeps the values it was loaded with.
senmayo_genes = util.get_geneset('senmayo_human')

adata_scaled = adata.copy()
sc.pp.log1p(adata_scaled)
sc.pp.scale(adata_scaled, zero_center=True)

# The control set is sized to match the gene set; the result is stored under
# the 'senmayo_score' key, which later cells read from adata_scaled.obs.
sc.tl.score_genes(
    adata_scaled,
    gene_list=senmayo_genes,
    ctrl_size=len(senmayo_genes),
    score_name='senmayo_score',
)

In [None]:
# extract some values
# Build one row per observation: obs index, categorical covariates, numerical
# covariates, then every metric. Gene metrics are read from `adata.X` (the
# object as loaded); all other metrics are read from `adata_scaled.obs`.
# Rows are matched by position, which relies on `adata_scaled` being a copy of
# `adata` with the same observation order.
obs = adata_scaled.obs
var_names = list(adata.var_names)

# Resolve every metric once, before iterating over observations, instead of
# rebuilding and scanning the gene-name list for each row.
gene_columns = {}  # gene metric -> column position in adata.X
for m in metrics:
    if m in adata.var_names:
        gene_columns[m] = var_names.index(m)
    elif m not in obs:
        raise Exception(f'metric {m} not found in dataset as gene or obs key')

metric_rows = []
for i, (idx, row) in enumerate(obs.iterrows()):
    metric_values = [
        adata.X[i, gene_columns[m]] if m in gene_columns else row[m]
        for m in metrics
    ]
    cats = [row[c] for c in categoricals]
    nums = [row[n] for n in numericals]
    metric_rows.append([idx] + cats + nums + metric_values)

metric_df = pd.DataFrame(metric_rows, columns=['index'] + categoricals + numericals + metrics)
metric_df

In [None]:
# git differential expression by associated pain
contrasts = {"pain:no_pain": ({'associated_pain': 'Yes'}, {'associated_pain': 'No'})}
de_pain = de.differential_expression(adata, contrasts, tests=['ttest'])
de = de_pain['pain:no_pain']
de


In [None]:
rows = []
for c in categoricals:
    for m in metrics:
        if m in adata.var_names:
            p = de.loc[m]['ttest-p']
            l = de.loc[m]['log2fc']
            rows.append([m, 'categorical', c, 't-test', l, None, p])
        else:
            ydf = metric_df[metric_df[c] == 'Yes']
            ndf = metric_df[metric_df[c] == 'No']
            yv = ydf[m]
            nv = ndf[m]
            r = sps.ttest_ind(yv, nv)
            l = np.log2(np.mean(yv) - np.mean(nv))
            p = r.pvalue
            rows.append([m, 'categorical', c, 't-test', l, None, p])

for n in numericals:
    for m in metrics:
        num = metric_df[n]
        v = metric_df[m]

        r = sps.spearmanr(num, v)
        rows.append([m, 'numeric', n, 'spearman', None, r.statistic, r.pvalue])

stats_df = pd.DataFrame(rows, columns=['metric', 'association_type', 'association', 'test', 'log2fc', 'correlation_coeff', 'p'])
stats_df
    

In [None]:
# Write the extracted per-observation values and the test results to a single
# workbook, one sheet each.
# ExcelWriter does not create missing parent directories, so make sure the
# output directory exists first (e.g. on a fresh build tree).
os.makedirs(supplement_dir, exist_ok=True)
with pd.ExcelWriter(os.path.join(supplement_dir, 'north_2019_tests.xlsx')) as writer:
    metric_df.to_excel(writer, sheet_name='extracted_values')
    stats_df.to_excel(writer, sheet_name='statistical_tests')