In [1]:
# General modules
import sys
import os
import session_info
import warnings
from pyprojroot.here import here
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as plines
import matplotlib.colors as mcolors
from IPython.display import display
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.lines import Line2D  # for legend handle

# Specific modules
import scanpy as sc
import decoupler as dc
import anndata as ad
import scienceplots

# Global notebook settings
# All warnings are silenced for the rest of the notebook.
warnings.filterwarnings("ignore")

# Make the helpers stored in the project's 'bin' folder importable
sys.path.insert(1, str(here('bin')))
from customPalette import *
from customPythonFunctions import (
    mean_by_category,
    filter_low_represented_cell_group,
    RelativeDiff_mean_by_category,
    mscatter,
    aggregating_features,
)

# Figure defaults: plotting style plus on-screen / saved resolution
plt.style.use(['nature'])
dpi_fig_save = 300
sc.set_figure_params(dpi=100, dpi_save=dpi_fig_save, vector_friendly=True)

# Switches for overwriting previously saved figures / data
overwriteFigures = True
overwriteData = True

# Let pandas display wide and long tables without truncation
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 1000)

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.multitest as smm
import patsy

In [3]:
# Show the project root resolved by pyprojroot, which anchors every path below
print(f"Main directory path: {here()}")

Main directory path: /scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas


In [4]:
# Minimum number of observations a disease must have to be kept in the
# mixed-model fit; diseases with fewer observations are dropped per cell type.
n_min_obs = 3

In [5]:
# Optimizers passed to the mixed-model `.fit(method=...)` call.
# NOTE(review): presumably tried in the given order until one converges — confirm in the statsmodels docs.
methodList = ['BFGS','Powell','CG','NM']

# Evaluate ULM results

Reference: https://www.statsmodels.org/stable/api.html#api-reference

## Level 1

In [6]:
# Level 1 evaluation of the ULM estimates.
# For every cell type and every factor a linear mixed model is fitted:
#     factor ~ disease (reference level: healthy) + chemistry, grouped by studyID
# The disease coefficients of all models are collected in one table, p-values are
# FDR-corrected across the whole table, and both the full and the filtered
# (Pval_adj < 0.05) tables are written to CSV.
summary_UMLeval_list = []

for celltype in ["B", "Plasma", "pDC", "DC", "Mono",  "T_CD4_Naive", "T_CD4_NonNaive", "T_CD8_Naive", "T_CD8_NonNaive", "ILC", "UTC"]:
    print(celltype)

    # Load data
    anndata_path = here('03_downstream_analysis/06_inflammation_signatures/results/DecoupleR_ulmestimates_{}_L1_Corr.h5ad'.format(celltype))
    actsPS = ad.read_h5ad(anndata_path)

    # Drop the 'X-global-X' factors from the matrix and from both obsm tables.
    # The list of kept factors is computed once and reused.
    keep_vars = [var for var in actsPS.var_names if 'X-global-X' not in var]
    actsPS = actsPS[:, keep_vars].copy()
    actsPS.obsm["ulm_estimate"] = actsPS.obsm["ulm_estimate"][keep_vars]
    actsPS.obsm["ulm_pvals"] = actsPS.obsm["ulm_pvals"][keep_vars]

    # Prepare data: one row per observation, one column per factor, plus covariates
    # ('sex' and 'binned_age' are intentionally not part of the model)
    factor_list = actsPS.var_names.to_list()
    actsPS_df = pd.DataFrame(actsPS.X, index=actsPS.obs.index, columns=factor_list)
    covariates = actsPS.obs[['studyID', 'disease', 'chemistry']]
    actsPS_covar_df = pd.merge(actsPS_df, covariates, left_index=True, right_index=True)

    # Encode categorical variables
    actsPS_covar_df['disease'] = actsPS_covar_df['disease'].astype('category')
    actsPS_covar_df['chemistry'] = actsPS_covar_df['chemistry'].astype('category')

    summary_table_list = []

    ## Check that there are at least n_min_obs observations for each disease
    diseaseCount = actsPS_covar_df.disease.value_counts().reset_index()
    removeDisease = diseaseCount.query("count < @n_min_obs")['disease'].tolist()
    if len(removeDisease) > 0:
        print(f"{', '.join(removeDisease)} include less than {n_min_obs}. They won't be considered")

    # Explicit copy: the columns are re-assigned below, and writing into the
    # result of .query() directly is a chained assignment.
    actsPS_covar_df_filt = actsPS_covar_df.query("disease not in @removeDisease").copy()

    # Removed diseases must not survive as empty categorical levels in the design matrix
    actsPS_covar_df_filt['disease'] = actsPS_covar_df_filt['disease'].cat.remove_unused_categories()
    actsPS_covar_df_filt['chemistry'] = actsPS_covar_df_filt['chemistry'].cat.remove_unused_categories()

    # Fit Mixed Linear Model per each factor
    for factor in factor_list:
        formula = f'Q("{factor}")~ C(disease, Treatment(reference="healthy")) + 'f'C(chemistry)'

        try:
            model = smf.mixedlm(formula, actsPS_covar_df_filt, groups=actsPS_covar_df_filt['studyID']).fit(method=methodList, maxiter=1000)

            # Keep only the disease-vs-healthy coefficient rows.
            # NOTE(review): summary().tables[1] appears to hold display-formatted
            # (rounded) values; consider model.pvalues / model.params if full
            # precision is needed for the FDR correction — confirm.
            summary_table = model.summary().tables[1]
            summary_table = summary_table.loc[summary_table.index.str.contains(r"C\(disease,", regex=True)]
            summary_table = summary_table.applymap(pd.to_numeric, errors='ignore')

            summary_table["AnnotationLevel"] = "Level1"
            summary_table["CellType"] = celltype
            summary_table["FactorName"] = factor
            summary_table["disease"] = summary_table.index.str.extract(r'C\(disease, Treatment\(reference="healthy"\)\)\[T\.(.*)\]')[0].to_list()

            summary_table_list.append(summary_table)

            print(f"{factor} done")

        except Exception as error:
            # Best effort: a failing factor is reported and skipped, the loop goes on
            print(f"ERROR: Not computing mixedLM for {celltype} and {factor} due to {error} error.")
            print("probably due to:")
            print(actsPS_covar_df_filt.value_counts(['disease','chemistry']).reset_index().sort_values('disease'))

        print()

    # If no model could be fitted there is nothing to append for this cell type.
    # Skipping here avoids re-appending the previous cell type's table (or a
    # NameError on the first cell type).
    if not summary_table_list:
        print(f"ERROR: Not concatenating mixedLM for {celltype}: no model could be fitted.")
        continue

    # Append results
    summary_table_byCell = pd.concat(summary_table_list)
    summary_UMLeval_list.append(summary_table_byCell)

summary_table_UMLeval = pd.concat(summary_UMLeval_list)
# Correct for multiple Factor testing
corrected_pvalues = smm.fdrcorrection(summary_table_UMLeval['P>|z|'], method='indep', alpha=0.05, is_sorted=False)[1]
summary_table_UMLeval['Pval_adj'] = corrected_pvalues

summary_table_UMLeval.to_csv(here('03_downstream_analysis/06_inflammation_signatures/results/DecoupleR_ulmestimates_mixedmlEval_L1_Corr.csv'))

# Filtering: keep only the significant disease effects
summary_table_UMLeval = summary_table_UMLeval[summary_table_UMLeval['Pval_adj'] < 0.05]
summary_table_UMLeval.to_csv(here('03_downstream_analysis/06_inflammation_signatures/results/DecoupleR_ulmestimates_mixedmlEval_L1_Corr_filt.csv'))

B
10-X-B-X-cytokine_and_receptors_proinflammatory done

11-X-B-X-cytokine_andreceptors_antiinflammatory done

12-X-B-X-IFN_Type_1_2_Lambda done

13-X-B-X-IFN_response done

14-X-B-X-TNF_receptors_ligands done

15-X-B-X-adhesion_molecules done

16-X-B-X-antigen_presentation_molecules done

7-X-B-X-effector done

8-X-B-X-chemokines done

9-X-B-X-chemokine_receptors done

Plasma
MS, PS include less than 3. They won't be considered
51-X-Plasma-X-chemokines done

52-X-Plasma-X-chemokine_receptors done

53-X-Plasma-X-cytokine_and_receptors_proinflammatory done

54-X-Plasma-X-cytokine_andreceptors_antiinflammatory done

55-X-Plasma-X-IFN_Type_1_2_Lambda done

56-X-Plasma-X-IFN_response done

57-X-Plasma-X-TNF_receptors_ligands done

58-X-Plasma-X-adhesion_molecules done

59-X-Plasma-X-antigen_presentation_molecules done

pDC
flu, NPC include less than 3. They won't be considered
125-X-pDC-X-chemokines done

126-X-pDC-X-chemokine_receptors done

127-X-pDC-X-cytokine_and_receptors_proinflammato

In [7]:
# Display the session information (via the session_info package) at the end of the notebook
session_info.show()