In [None]:
import sys
import os
import os.path
import re
import logging
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as sps
from anndata import AnnData
import anndata
from collections import defaultdict, OrderedDict
import plotly.express.colors as pxcolors
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# local to this analysis
import de
import plotting
import scoring
import signatures
import util

# Log record layout: timestamp (left-aligned, minimum width 15) followed by the message.
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
# Logger for this notebook; its level is set so INFO and above are processed.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# scanpy session settings: verbosity level 3, print scanpy's header, 120 dpi figures.
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=120)

In [None]:
# Output locations (relative paths). scanpy's figure directory is pointed at figure_dir.
figure_dir = '../../build/figures'
sc.settings.figdir = figure_dir
supplement_dir = '../../build/supplement'

# Dataset identifier; it is used to build the path of the .h5ad file that gets loaded.
dataset = 'GSE249746'
# Gene signatures to evaluate, one comma-separated string of gene names per signature.
# NOTE(review): a leading '-' presumably marks a gene that must NOT be expressed
# (the bar-chart title in this notebook describes 'CDKN2A,-LMNB1,-MKI67,-TOP2A' as
# "CDKN2A+ and MKI67-,LMNB1-,TOP2A-"); confirm against signatures.assess_signatures.
gene_signatures = [
    # signatures that do not mention ATF3
    'CDKN1A,-LMNB1,-MKI67,-TOP2A',
    'CDKN2A,-LMNB1,-MKI67,-TOP2A',
    'CDKN1A,CDKN2A,-LMNB1,-MKI67,-TOP2A',
    # signatures that include ATF3
    'ATF3',
    'ATF3,CDKN1A,-LMNB1,-MKI67,-TOP2A',
    'ATF3,CDKN2A,-LMNB1,-MKI67,-TOP2A',
    'ATF3,CDKN1A,CDKN2A,-LMNB1,-MKI67,-TOP2A',
    
    # the same combinations with '-ATF3'
    '-ATF3,CDKN1A,-LMNB1,-MKI67,-TOP2A',
    '-ATF3,CDKN2A,-LMNB1,-MKI67,-TOP2A',
    '-ATF3,CDKN1A,CDKN2A,-LMNB1,-MKI67,-TOP2A',
]


In [None]:
# Read the dataset's .h5ad file from the local build directory and display it.
# Disabled alternative (loading through a databroker client), kept for reference:
#client = dev_instance()
#broker = client.get_databroker(dataset)
#adata = broker.load_variant(variant)
h5ad_path = os.path.join('../../build/datasets/', dataset, f'{dataset}.h5ad')
adata = sc.read_h5ad(h5ad_path)
adata

In [None]:
# evaluate signatures
# Work on a log1p-transformed copy so `adata` itself is left untouched, then scale
# that copy in place without zero-centering before assessing the signatures on it.
adata_scaled = sc.pp.log1p(adata, copy=True)
sc.pp.scale(adata_scaled, zero_center=False)
signatures.assess_signatures(adata_scaled, gene_signatures)

In [None]:
# Summarize the signature results held in adata_scaled.obs. 'donor_id' is passed as the
# third argument (presumably the grouping column - verify against
# signatures.summarize_signatures); 'age' is requested as an extra column and sort key.
summary_options = dict(include_columns=['age'], sort_by=['age'])
signatures_df = signatures.summarize_signatures(
    adata_scaled.obs, gene_signatures, 'donor_id', **summary_options
)
signatures_df

In [None]:
# example plot (not published) of signature positive cell percentages
# The plotted signature is named once; the column name and the y-axis label are built from it.
plot_signature = 'CDKN2A,-LMNB1,-MKI67,-TOP2A'
percent_column = plot_signature + '_percent_group_positive'

fig = px.bar(signatures_df, x='age', y=percent_column)
layout_options = dict(
    height=800,
    width=1000,
    font=dict(size=14, family='arial'),
    title='Human Single Soma: Percent of cells that are CDKN2A+ and MKI67-,LMNB1-,TOP2A-',
)
fig.update_layout(**layout_options)
fig.update_xaxes(title='age')
fig.update_yaxes(title='Percent cells expressing ' + plot_signature)
fig.show()

In [None]:
# Use a chi-square test of independence to see whether the positive/negative cell
# counts of each signature depend on the group (one table column per row of
# signatures_df).
#
# chi2 test of cell counts by age and senescence signature
#           | 23 | 56 | 61
#-----------+----+----+-----
# senescent | a  | b  | c
#-----------+----+----+-----
# not-senes | d  | e  | f

count_suffix = 'num_cells_positive'
totals = signatures_df['total']  # loop-invariant, so looked up once
test_columns = [x for x in signatures_df.columns if x.endswith(count_suffix)]
for c in test_columns:
    positive = signatures_df[c]
    # 2 x k contingency table: row 0 = positive cells, row 1 = remaining cells.
    ct = np.vstack([positive, totals - positive])
    if (ct.sum(axis=0) == 0).any() or (ct.sum(axis=1) == 0).any():
        # A zero row/column total gives a zero expected frequency, for which
        # chi2_contingency raises ValueError. Record NaN instead of aborting
        # the whole cell (e.g. a signature with no positive cells at all).
        logger.warning(
            'chi2 test skipped for %s: contingency table has an all-zero row or column', c
        )
        pvalue = np.nan
    else:
        pvalue = sps.chi2_contingency(ct).pvalue
    # Swap only the trailing suffix for 'chi2_p'. The scalar p-value is broadcast,
    # so every row of the new column carries the same value.
    signatures_df[c[:-len(count_suffix)] + 'chi2_p'] = pvalue

# reorder columns again: non-signature columns first, then the columns of each
# signature grouped together in gene_signatures order
sig_cols = [
    c
    for sig in gene_signatures
    for c in signatures_df.columns
    if c.startswith(f'{sig}_')
]
sig_col_set = set(sig_cols)
signatures_df = signatures_df[[c for c in signatures_df.columns if c not in sig_col_set] + sig_cols]
signatures_df

In [None]:
# Compact table of the chi2 p-values. Each chi2_p column holds one value repeated on
# every row, so the first row is enough. Values are formatted to 4 significant digits.
chi2_columns = [col for col in signatures_df.columns if col.endswith('chi2_p')]
first_row = signatures_df[chi2_columns].iloc[0]
rows = [[column_name, f'{value:.4}'] for column_name, value in first_row.items()]
pd.DataFrame(rows, columns=['signature', 'pvalue'])

In [None]:
# write to excel
# Create the supplement directory first: to_excel does not create missing parent
# directories, so the write would otherwise fail on a fresh checkout/build tree.
os.makedirs(supplement_dir, exist_ok=True)
signatures_df.to_excel(os.path.join(supplement_dir, 'human_single_soma_sen_signatures.xlsx'))