In [1]:
import pandas as pd
import anndata
import os

In [15]:
from IPython.display import HTML, display
from cellphonedb.utils import db_releases_utils

# Ask the helper for the available database versions and render the
# HTML table it returns inline in the notebook.
releases = db_releases_utils.get_remote_database_versions_html()
display(HTML(releases['db_releases_html_table']))

Version,Release date
v4.1.0,2023-03-09
,
,
,


In [16]:
from cellphonedb.utils import db_utils

# -- Database release to download
cpdb_version = 'v5.0.0'

# -- Directory the release files are downloaded into (one sub-folder per version)
cpdb_target_dir = os.path.join('mydirectory/run_cellphonedb/', cpdb_version)

db_utils.download_database(cpdb_target_dir, cpdb_version)

Downloaded cellphonedb.zip into mydirectory/run_cellphonedb/v5.0.0
Downloaded complex_input.csv into mydirectory/run_cellphonedb/v5.0.0
Downloaded gene_input.csv into mydirectory/run_cellphonedb/v5.0.0
Downloaded interaction_input.csv into mydirectory/run_cellphonedb/v5.0.0
Downloaded protein_input.csv into mydirectory/run_cellphonedb/v5.0.0
Downloaded uniprot_synonyms.tsv into mydirectory/run_cellphonedb/v5.0.0/sources
Downloaded transcription_factor_input.csv into mydirectory/run_cellphonedb/v5.0.0/sources


In [17]:
# Input / output locations for the analysis; everything lives under `base`.
base = 'mydirectory/run_cellphonedb'

# Database zip: taken from the directory the download cell wrote into
# (cpdb_target_dir) rather than repeating the version string here, so that
# changing cpdb_version cannot leave this path pointing at another release.
cpdb_file_path = os.path.join(cpdb_target_dir, 'cellphonedb.zip')

# Directory the result tables are written to.
out_path = os.path.join(base, 'outputs')

# Table assigning cell types to microenvironments.
microenvs_file_path = os.path.join(base, 'microenvs.csv')

# Metadata table linking each cell name to its cell type.
meta_file_path = os.path.join(base, 'mdata.csv')

# Counts matrix stored as an AnnData .h5ad file
# (presumably log2-transformed, judging by the file name -- confirm).
counts_file_path = os.path.join(base, 'log2_counts.h5ad')

In [18]:
# Load the metadata table (its 'Cell' column is compared against the counts
# object further down) and display it for a quick visual check.
mdata = pd.read_csv(meta_file_path)
mdata

Unnamed: 0,Cell,cell_type
0,TMA24_TMA24001_MAC,MAC_CRSwNP_UNC
1,TMA24_TMA24001_IMM,IMM_CRSwNP_UNC
2,TMA24_TMA24001_EPI,EPI_CRSwNP_UNC
3,TMA24_TMA24002_MAC,MAC_CRSwNP_UNC
4,TMA24_TMA24002_IMM,IMM_CRSwNP_UNC
...,...,...
528,TMA21_TMA21062_IMM,IMM_CRSsNP
529,TMA21_TMA21062_EPI,EPI_CRSsNP
530,TMA21_TMA21063_MAC,MAC_CRSsNP
531,TMA21_TMA21063_IMM,IMM_CRSsNP


In [19]:
# Load the counts matrix as an AnnData object and show its shape
# (observations x variables).
adata = anndata.read_h5ad(counts_file_path)
adata.shape

(533, 18676)

In [20]:
# Sanity check: the observation names of the counts object and the 'Cell'
# column of the metadata must contain exactly the same entries.
sorted(adata.obs.index) == sorted(mdata['Cell'])

True

In [21]:
# Load the microenvironment table and list, for every microenvironment,
# the distinct cell types assigned to it (most frequent first).
microenv = pd.read_csv(microenvs_file_path)
(
    microenv
    .groupby('microenvironment', group_keys=False)['cell_type']
    .apply(lambda cell_types: cell_types.value_counts().index.tolist())
)

microenvironment
sNP                    [MAC_CRSsNP, IMM_CRSsNP, EPI_CRSsNP]
wNP_NP        [MAC_CRSwNP_NP, IMM_CRSwNP_NP, EPI_CRSwNP_NP]
wNP_UNC    [MAC_CRSwNP_UNC, IMM_CRSwNP_UNC, EPI_CRSwNP_UNC]
Name: cell_type, dtype: object

In [22]:
from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

# Run the CellphoneDB statistical analysis on the inputs defined above.
# Cell-type pairs are restricted by the microenvironments file, interactions
# are scored, and the result tables are written to `out_path`.
cpdb_results = cpdb_statistical_analysis_method.call(
    cpdb_file_path = cpdb_file_path,                 # mandatory: CellphoneDB database zip file.
    meta_file_path = meta_file_path,                 # mandatory: metadata file mapping each barcode/cell name to its cell label (a .csv here).
    counts_file_path = counts_file_path,             # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object.
    counts_data = 'gene_name',                       # defines the gene annotation used in the counts matrix.
    active_tfs_file_path = None,                     # optional: defines cell types and their active TFs.
    microenvs_file_path = microenvs_file_path,       # optional (default: None): defines cells per microenvironment.
    score_interactions = True,                       # optional: whether to score interactions or not.
    iterations = 10000,                              # number of shufflings performed in the analysis.
    threshold = 0.1,                                 # minimum proportion of cells expressing a gene for it to be used in the analysis.
    threads = 10,                                     # number of threads to use in the analysis.
    debug_seed = 42,                                 # debug random seed. NOTE(review): upstream docs describe values >= 0 as enabling the seed and negative values as disabling it - confirm.
    result_precision = 3,                            # sets the rounding for the mean values in significant_means.
    pvalue = 0.05,                                   # P-value threshold to employ for significance.
    subsampling = False,                             # enables subsampling of the data (geometric sketching).
    subsampling_log = False,                         # (mandatory when subsampling) apply log1p to non log-transformed data inputs.
    subsampling_num_pc = 100,                        # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells = 1000,                    # number of cells to subsample (integer) (default: 1/3 of the dataset).
    separator = '|',                                 # string used to separate cell types in the results dataframes, e.g. "cellA|CellB".
    debug = False,                                   # saves all intermediate tables employed during the analysis in pkl format.
    output_path = out_path,                          # path to save results.
    output_suffix = None                             # replaces the timestamp in the output file names with a user-defined string (default: None).
    )

Reading user files...
The following user files were loaded successfully:
mydirectory/run_cellphonedb/log2_counts_updated.h5ad
mydirectory/run_cellphonedb/mdata_v2.csv
mydirectory/run_cellphonedb/microenvs.csv
[ ][CORE][28/03/24-11:43:05][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:10000 Debug-seed:42 Threads:10 Precision:3
[ ][CORE][28/03/24-11:43:07][INFO] Running Real Analysis
[ ][CORE][28/03/24-11:43:07][INFO] Limiting cluster combinations using microenvironments
[ ][CORE][28/03/24-11:43:07][INFO] Running Statistical Analysis


100%|██████████| 10000/10000 [00:33<00:00, 300.49it/s]

[ ][CORE][28/03/24-11:43:40][INFO] Building Pvalues result





[ ][CORE][28/03/24-11:43:41][INFO] Building results
[ ][CORE][28/03/24-11:43:41][INFO] Scoring interactions: Filtering genes per cell type..


100%|██████████| 15/15 [00:00<00:00, 558.53it/s]

[ ][CORE][28/03/24-11:43:41][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..



100%|██████████| 15/15 [00:00<00:00, 1170.83it/s]


[ ][CORE][28/03/24-11:43:41][INFO] Scoring interactions: Calculating scores for all interactions and cell types..


100%|██████████| 27/27 [00:02<00:00,  9.39it/s]


Saved deconvoluted to mydirectory/run_cellphonedb/outputs/statistical_analysis_deconvoluted_03_28_2024_114344.txt
Saved deconvoluted_percents to mydirectory/run_cellphonedb/outputs/statistical_analysis_deconvoluted_percents_03_28_2024_114344.txt
Saved means to mydirectory/run_cellphonedb/outputs/statistical_analysis_means_03_28_2024_114344.txt
Saved pvalues to mydirectory/run_cellphonedb/outputs/statistical_analysis_pvalues_03_28_2024_114344.txt
Saved significant_means to mydirectory/run_cellphonedb/outputs/statistical_analysis_significant_means_03_28_2024_114344.txt
Saved interaction_scores to mydirectory/run_cellphonedb/outputs/statistical_analysis_interaction_scores_03_28_2024_114344.txt
