# scCODA - Compositional analysis of single-cell data

The scCODA package ([Büttner, Ostner et al., 2020](https://www.biorxiv.org/content/10.1101/2020.12.14.422688v2)) is intended to be used with cell composition data from single-cell RNA-seq experiments; however, there are no technical restrictions that prevent the use of data from other sources.

**Load modules**

In [None]:
# Setup
import sys
import os
import session_info
import warnings
#from pyprojroot.here import here

import pandas as pd
import pickle
import matplotlib.pyplot as plt
import anndata as an
import scanpy as sc
import pickle

warnings.filterwarnings("ignore")
#sys.path.insert(1, str(here('bin')))

#from customPalette import *

from sccoda.util import comp_ana as mod
from sccoda.util import cell_composition_data as dat
from sccoda.util import data_visualization as viz

import matplotlib.colors as mcolors
import matplotlib.colors as mcolors


**Define palette**

In [None]:
# Hex colour for every disease label.
# The insertion order is meaningful: it is reused below as the disease order.
diseases_palette = {
    "healthy": "#808080",

    "RA": "#264653",
    "PS": "#287271",
    "PSA": "#2a9d8f",
    "CD": "#e76f51",
    "UC": "#e9c46a",
    "SLE": "#941c2f",
    "MS": "#8ab17d",

    "asthma": "#ea698b",
    "COPD": "#c05299",
    "cirrhosis": "#973aa8",

    "sepsis": "#ef233c",

    "HIV": "#e7ecef",
    "HBV": "#a3cef1",
    "COVID": "#6096ba",
    "flu": "#274c77",

    "BRCA": "#fff75e",
    "NPC": "#fdb833",
    "HNSCC": "#d9981a",
    "CRC": "#9e7524",
}

# Hex colour for every Level1 cell type label.
# The insertion order is meaningful: it is reused below as the cell type order.
annotation_Level1_palette = {
    "B": "#7bc6d6",
    "Plasma": "#025566",

    "pDC": "#a7c957",
    "DC": "#6a994e",
    "Mono": "#386641",

    "T_CD4_Naive": "#fff3b0",
    "T_CD4_NonNaive": "#e09f3e",
    "T_CD8_Naive": "#9e2a2b",
    "T_CD8_NonNaive": "#540b0e",

    "UTC": "#88657f",
    "ILC": "#67253a",

    "Cycling_cells": "#d4a373",
    "Progenitors": "#ccd5ae",

    # 'Platelets' ('#808080') and 'RBC' ('#000000') were marked for removal
    # and are intentionally left out of this palette.
}


In [None]:
# Turn both palettes into matplotlib colormaps. Colours follow the insertion
# order of each palette dict.
# `ListedColormap` is reached through the `mcolors` alias imported in the
# setup cell, and the cell type colormap is built from `ct_colors`.
ct_colors = list(annotation_Level1_palette.values())
ct_map = mcolors.ListedColormap(ct_colors)

dis_colors = list(diseases_palette.values())
dis_cmap = mcolors.ListedColormap(dis_colors)

**Define parameters**

In [None]:
# Folder layout: everything lives below the project root `workDir`.
workDir = "Inflammation-PBMCs-Atlas/"
# Root folder of the scCODA compositional analysis
scCODADir = f"{workDir}/03_downstream_analysis/01_compositional_analysis/scCODA/"
# One sub-folder per disease comparison is created below this folder
comparisonDir = f"{scCODADir}comparisons"

# Load data

In [None]:
# Names of the `.obs` columns used by the cells below.
level = celltype_column = "Level1"
disease_column = "disease"
patient_column = "sampleID"
# Combined "<disease>_<sample>" identifier, created later for each comparison
disease_patient_col = "sampleID_disease"

# Plotting/processing orders are taken from the palettes' insertion order.
disease_order = [*diseases_palette]
celltype_order = [*annotation_Level1_palette]

In [None]:
# Read the AnnData object from disk.
h5ad_path = f"{workDir}/03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse.log1p.h5ad"
adata = sc.read_h5ad(h5ad_path)

# Discard the cells whose cell type label is 'RBC' or 'Platelets'.
is_excluded = adata.obs[celltype_column].isin(['RBC', 'Platelets'])
adata = adata[~is_excluded].copy()
adata

# Differential abundance analysis: Disease vs healthy

In [None]:
# Build one scCODA input object per disease (disease samples + healthy
# samples) and collect them in `dis_df`, keyed by disease name. The count
# table of every comparison is also written to "<disease>_counts.csv".
dis_df = {}
for disease in disease_order:
    # "healthy" is the baseline of every comparison, not a comparison itself
    if disease == "healthy":
        continue
    print(disease)

    # Create folder to store the results
    dis_comparisonDir = "{}/{}/".format(comparisonDir, disease)
    os.makedirs(dis_comparisonDir, exist_ok=True)

    # Subset the comparison to healthy + disease
    data = adata[adata.obs[disease_column].isin(["healthy", disease])].copy()

    # Process data for scCODA
    ## Identifier combining disease and sample: "<disease>_<sample>"
    data.obs[disease_patient_col] = (
        data.obs[disease_column].astype(str) + "_" + data.obs[patient_column].astype(str)
    )

    ## Count the cells of every cell type in every sample (rows: samples,
    ## columns: cell types)
    counts_df = (
        data.obs.groupby([disease_patient_col, celltype_column])
        .size()
        .reset_index(name='cell_count')
    )
    counts_pivot = counts_df.pivot(
        index=disease_patient_col, columns=celltype_column, values='cell_count'
    )
    ## `reindex` fixes the column order and, unlike plain column selection,
    ## does not raise when a cell type is absent from this subset: such a
    ## column is created and filled with 0 like any other missing count.
    counts_pivot = counts_pivot.reindex(columns=celltype_order).fillna(0).astype(int)
    counts_pivot.reset_index(inplace=True)

    ## Attach the sample-level covariates
    covariate_columns = [disease_patient_col,
                         patient_column,
                         disease_column,
                         "binned_age",
                         "sex"]
    patient_info = data.obs[covariate_columns].drop_duplicates()
    counts_pivot = pd.merge(counts_pivot, patient_info, on=disease_patient_col, how='left')

    ## Order the rows: samples of the disease first, healthy samples last
    counts_pivot[disease_column] = pd.Categorical(
        counts_pivot[disease_column], categories=[disease, "healthy"], ordered=True
    )
    counts_pivot = counts_pivot.sort_values(by=disease_column)

    ## Save
    df_path = os.path.join(dis_comparisonDir, "{}_counts.csv".format(disease))
    counts_pivot.to_csv(df_path, index=True)

    ## Convert to the scCODA data format
    data_all = dat.from_pandas(
        counts_pivot,
        covariate_columns=covariate_columns)
    dis_df[disease] = data_all

## Find reference celltype for each disease

**Define reference celltypes**


In [None]:
# Imported once instead of on every iteration; neither name is used in this
# cell, the import is only kept so both stay available to later cells.
from matplotlib import cm, rcParams

# For every disease comparison, draw scCODA's relative abundance / dispersion
# plot (used to choose the reference cell type) and save it as a PDF in the
# folder of that comparison.
for disease in disease_order:
    dis_comparisonDir = "{}/{}/".format(comparisonDir, disease)
    if disease == "healthy":
        print("****** Skipping {}".format(disease))
        continue

    print("****** Computing plots for {}".format(disease))
    data_all = dis_df[disease]
    figure_name = "{}/02_reference_celltype.pdf".format(dis_comparisonDir)
    viz.rel_abundance_dispersion_plot(
        data=data_all,
        abundant_threshold=0.9
    )
    # The plot is drawn on the current figure; resize it before saving
    figure = plt.gcf()
    figure.set_size_inches(25, 15)
    figure.savefig(figure_name,
                   bbox_inches='tight')

In [None]:
# Cell type passed to scCODA as `reference_cell_type` for every disease model.
ref_celltype = "DC"

## Compute scCODA

**All diseases**

In [None]:
# Fit one scCODA model per disease (disease vs healthy, with sex and binned
# age as additional covariates). For each disease the sampling result is
# pickled and its effect table is written to CSV in the comparison folder.
formula_parameter = "C(disease, Treatment('healthy')) + C(sex) + C(binned_age)"
comparisonDir = "{}/comparisons".format(scCODADir)
for disease in disease_order:
    if disease == "healthy":
        continue
    print("****** Computing scCODA model for {}".format(disease))
    dis_comparisonDir = "{}/{}".format(comparisonDir, disease)
    os.makedirs(dis_comparisonDir, exist_ok=True)
    data_all = dis_df[disease]

    # Samples with binned_age = NA make the model fail (first seen for one
    # HBV patient), so they are dropped for any disease where they occur.
    # The mask is computed without adding a helper column to `.obs`, so the
    # object stored in `dis_df` is left untouched.
    has_age = (data_all.obs['binned_age'].astype(str) != 'nan').values
    if not has_age.all():
        print("****** Dropping {} sample(s) with missing binned_age".format((~has_age).sum()))
        valid_indices = data_all.obs.index[has_age]
        data_all = data_all[valid_indices].copy()

    model = mod.CompositionalAnalysis(
        data_all,
        formula=formula_parameter,
        reference_cell_type=ref_celltype
    )
    all_results = model.sample_hmc()

    # Save results (file names are unchanged: they embed the formula and the
    # reference cell type)
    model_path = os.path.join(
        dis_comparisonDir,
        "{}_scCODA_output_{}_{}.pkl".format(disease, formula_parameter, ref_celltype),
    )
    df_path = os.path.join(
        dis_comparisonDir,
        "{}_all_results_{}_{}.csv".format(disease, formula_parameter, ref_celltype),
    )
    ## Model (the `with` block closes the file, no explicit close() needed)
    with open(model_path, 'wb') as file:
        pickle.dump(all_results, file)
    ## DF
    df = all_results.effect_df
    df.to_csv(df_path, index=True)