In [1]:
import pandas as pd
import anndata
import os

In [2]:
# Show the remote CellPhoneDB database releases as an HTML table
from IPython.display import HTML, display
from cellphonedb.utils import db_releases_utils

# The lookup result is indexed by key; 'db_releases_html_table' holds the markup to render.
db_versions = db_releases_utils.get_remote_database_versions_html()
display(HTML(db_versions['db_releases_html_table']))

Version,Release date
v4.1.0,2023-03-09
,
,
,


In [7]:
from cellphonedb.utils import db_utils

# CellPhoneDB database release to fetch
cpdb_version = 'v5.0.0'

# Directory the database files are downloaded into (named after the release)
cpdb_target_dir = os.path.join('/mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/', cpdb_version)

db_utils.download_database(cpdb_target_dir, cpdb_version)

Downloaded cellphonedb.zip into /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/v5.0.0
Downloaded complex_input.csv into /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/v5.0.0
Downloaded gene_input.csv into /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/v5.0.0
Downloaded interaction_input.csv into /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/v5.0.0
Downloaded protein_input.csv into /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/v5.0.0
Downloaded uniprot_synonyms.tsv into /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/v5.0.0/sources
Downloaded transcription_factor_input.csv into /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/v5.0.0/sources


In [8]:
# Input and output locations; everything lives under a single working directory
run_dir = '/mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb'

cpdb_file_path = f'{run_dir}/v5.0.0/cellphonedb.zip'          # downloaded database archive
out_path = f'{run_dir}/outputs'                               # where result tables are written
microenvs_file_path = f'{run_dir}/microenvs.csv'              # cell_type -> microenvironment
meta_file_path = f'{run_dir}/mdata.csv'                       # Cell -> cell_type
counts_file_path = f'{run_dir}/batchcorrected_counts.h5ad'    # count matrix (AnnData)

In [9]:
# check metadata
mdata = pd.read_csv(meta_file_path)
mdata

Unnamed: 0,Cell,cell_type
0,DSP-1001660020890-A-A02,EBV+_Tumor
1,DSP-1001660020890-A-A03,EBV+_CD4mem
2,DSP-1001660020890-A-A04,EBV+_CD4naive
3,DSP-1001660020890-A-A05,EBV+_CD8mem
4,DSP-1001660020890-A-A06,EBV+_CD8naive
...,...,...
726,DSP-1001660018928-H-F09,EBV+_CD4mem
727,DSP-1001660018928-H-F10,EBV+_CD4naive
728,DSP-1001660018928-H-F11,EBV+_CD8mem
729,DSP-1001660018928-H-F12,EBV+_CD8naive


In [10]:
# Provenance (kept disabled): one-off conversion of the batch-corrected CSV into the
# .h5ad file at the same path as counts_file_path. The table is transposed before
# being wrapped in AnnData.
# csv = pd.read_csv('/mnt/nfs/home/jasonyeung/cHL_EBV/EBV_chl_batchcorrected-CPM-nNCGH_1500-K_11.csv', index_col=0).transpose()
# adata = anndata.AnnData(csv)
# adata.write_h5ad('/mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/batchcorrected_counts.h5ad')

# Read the count matrix back in and report its dimensions
adata = anndata.read_h5ad(filename=counts_file_path)
adata.shape

(731, 17300)

In [11]:
# Confirm the count matrix and the metadata describe exactly the same cells:
# order-insensitive comparison of the AnnData obs index with the 'Cell' column.
adata_cells = sorted(adata.obs.index)
mdata_cells = sorted(mdata['Cell'])
adata_cells == mdata_cells

True

In [12]:
# Load the cell_type -> microenvironment table and display it for a visual check
microenv = pd.read_csv(filepath_or_buffer=microenvs_file_path)
microenv

Unnamed: 0,cell_type,microenvironment
0,EBV+_Tumor,EBV+
1,EBV+_CD4mem,EBV+
2,EBV+_CD4naive,EBV+
3,EBV+_CD8mem,EBV+
4,EBV+_CD8naive,EBV+
5,EBV+_Other,EBV+
6,EBV-_Tumor,EBV-
7,EBV-_CD4mem,EBV-
8,EBV-_CD4naive,EBV-
9,EBV-_CD8mem,EBV-


In [13]:
# For every microenvironment, list the cell types assigned to it
cell_types_by_env = microenv.groupby('microenvironment', group_keys=False)['cell_type']
cell_types_by_env.apply(lambda labels: labels.value_counts().index.tolist())

microenvironment
EBV+    [EBV+_Tumor, EBV+_CD4mem, EBV+_CD4naive, EBV+_...
EBV-    [EBV-_Tumor, EBV-_CD4mem, EBV-_CD4naive, EBV-_...
Name: cell_type, dtype: object

In [15]:
from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

# Run the CellPhoneDB statistical analysis. The return value is kept in `cpdb_results`
# and the result tables are written under `out_path`.
cpdb_results = cpdb_statistical_analysis_method.call(
    # --- inputs ---
    cpdb_file_path = cpdb_file_path,                 # mandatory: CellPhoneDB database zip file.
    meta_file_path = meta_file_path,                 # mandatory: file mapping each barcode to its cell label (a CSV here).
    counts_file_path = counts_file_path,             # mandatory: normalized count matrix.
    counts_data = 'gene_name',                       # gene annotation used in the counts matrix.
    microenvs_file_path = microenvs_file_path,       # optional (default: None): cell types per microenvironment.
    # --- statistics ---
    iterations = 1000,                               # number of shufflings performed in the analysis.
    threshold = 0.1,                                 # min % of cells expressing a gene for it to be used in the analysis.
    pvalue = 0.05,                                   # p-value threshold used for significance.
    result_precision = 3,                            # rounding for the mean values in significant_means.
    debug_seed = 42,                                 # debug random seed. NOTE(review): upstream comment says "To disable >=0" - confirm which values disable seeding.
    threads = 4,                                     # number of threads to use in the analysis.
    # --- subsampling (switched off) ---
    subsampling = False,                             # enable subsampling of the data (geometric sketching).
    subsampling_log = False,                         # (mandatory) enable log1p for non log-transformed inputs when subsampling.
    subsampling_num_pc = 100,                        # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells = 1000,                    # number of cells to subsample (integer) (default: 1/3 of the dataset).
    # --- output ---
    separator = '|',                                 # string separating cell names in the result dataframes, "cellA|cellB".
    debug = False,                                   # saves all intermediate tables used during the analysis in pkl format.
    output_path = out_path,                          # path to save results.
    output_suffix = None,                            # replaces the timestamp in the output file names by a user-defined string (default: None).
)

Reading user files...
The following user files were loaded successfully:
/mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/batchcorrected_counts.h5ad
/mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/mdata.csv
/mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/microenvs.csv
[ ][CORE][12/06/24-11:29:37][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:42 Threads:4 Precision:3
[ ][CORE][12/06/24-11:29:37][INFO] Running Real Analysis
[ ][CORE][12/06/24-11:29:37][INFO] Limiting cluster combinations using microenvironments
[ ][CORE][12/06/24-11:29:37][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [00:06<00:00, 154.84it/s]

[ ][CORE][12/06/24-11:29:43][INFO] Building Pvalues result





[ ][CORE][12/06/24-11:29:44][INFO] Building results
Saved deconvoluted to /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/outputs/statistical_analysis_deconvoluted_06_12_2024_112944.txt
Saved deconvoluted_percents to /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/outputs/statistical_analysis_deconvoluted_percents_06_12_2024_112944.txt
Saved means to /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/outputs/statistical_analysis_means_06_12_2024_112944.txt
Saved pvalues to /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/outputs/statistical_analysis_pvalues_06_12_2024_112944.txt
Saved significant_means to /mnt/nfs/home/jasonyeung/cHL_EBV/run_cellphonedb/outputs/statistical_analysis_significant_means_06_12_2024_112944.txt


### **pvalue fields**
- **id_cp_interaction**: interaction identifier.
- **interacting_pair**: Name of the interacting pairs separated by separator (default : "|").
- **partner A/B**: Identifier for the first interacting partner (A) or the second (B). It could be: UniProt (prefix simple:) or complex (prefix complex:)
- **gene A/B**: Gene identifier for the first interacting partner (A) or the second (B).
- **secreted**: True if one of the partners is secreted.
- **receptor A/B**: True if the first interacting partner (A) or the second (B) is annotated as a receptor in our database.
- **annotation_strategy**: Curated if the interaction was annotated by the CellPhoneDB developers; another value if it was added by the user.
- **is_integrin**: True if one of the partners is integrin.
- **cell_a|cell_b**: p-value obtained by random shuffling.

### **deconvoluted fields**
- **gene_name**: Gene identifier for one of the subunits that are participating in the interaction defined in “means.csv” file. The identifier will depend on the input of the user list.
- **uniprot**: UniProt identifier for one of the subunits that are participating in the interaction defined in “means.csv” file.
- **is_complex**: True if the subunit is part of a complex. Single if it is not, complex if it is.
- **protein_name**: Protein name for one of the subunits that are participating in the interaction defined in “means.csv” file.
- **complex_name**: Complex name if the subunit is part of a complex. Empty if not.
- **id_cp_interaction**: Unique CellPhoneDB identifier for each of the interactions stored in the database.
- **mean**: Mean expression of the corresponding gene in each cluster.

### **means fields**
- **id_cp_interaction**: Unique CellPhoneDB identifier for each interaction stored in the database.
- **interacting_pair**: Name of the interacting pairs separated by “|”.
- **partner A or B**: Identifier for the first interacting partner (A) or the second (B). It could be: UniProt (prefix simple:) or complex (prefix complex:)
- **gene A or B**: Gene identifier for the first interacting partner (A) or the second (B). The identifier will depend on the input user list.
- **secreted**: True if one of the partners is secreted.
- **Receptor A or B**: True if the first interacting partner (A) or the second (B) is annotated as a receptor in our database.
- **annotation_strategy**: Curated if the interaction was annotated by the CellPhoneDB developers. Otherwise, the name of the database where the interaction has been downloaded from.
- **is_integrin**: True if one of the partners is integrin.
- **means**: Mean values for all the interacting partners: mean value refers to the total mean of the individual partner average expression values in the corresponding interacting pairs of cell types. If one of the mean values is 0, then the total mean is set to 0.

### **significant means fields**
- **id_cp_interaction**: Unique CellPhoneDB identifier for each interaction stored in the database.
- **interacting_pair**: Name of the interacting pairs separated by “|”.
- **partner A or B**: Identifier for the first interacting partner (A) or the second (B). It could be: UniProt (prefix simple:) or complex (prefix complex:)
- **gene A or B**: Gene identifier for the first interacting partner (A) or the second (B). The identifier will depend on the input user list.
- **secreted**: True if one of the partners is secreted.
- **Receptor A or B**: True if the first interacting partner (A) or the second (B) is annotated as a receptor in our database.
- **annotation_strategy**: Curated if the interaction was annotated by the CellPhoneDB developers. Otherwise, the name of the database where the interaction has been downloaded from.
- **is_integrin**: True if one of the partners is integrin.
- **significant_mean**: Significant mean calculation for all the interacting partners. If the interaction has been found relevant, the value will be the mean; otherwise, the value is set to 0.

In [None]:
# The analysis output is stored in `cpdb_results`; no standalone `pvalues` variable exists,
# so the table is looked up by its result name.
cpdb_results['pvalues'].head()

In [None]:
# The analysis output is stored in `cpdb_results`; no standalone `deconvoluted` variable exists,
# so the table is looked up by its result name.
cpdb_results['deconvoluted'].head()

In [None]:
# The analysis output is stored in `cpdb_results`; no standalone `means` variable exists,
# so the table is looked up by its result name.
cpdb_results['means'].head()

In [None]:
# The analysis output is stored in `cpdb_results`; no standalone `significant_means` variable
# exists, so the table is looked up by its result name.
cpdb_results['significant_means'].head()

In [None]:
# can ignore this, did plotting/querying in R
from cellphonedb.utils import search_utils

# The result tables are read from `cpdb_results` (the object returned by the statistical
# analysis); standalone `significant_means` / `deconvoluted` variables are never defined.
search_results = search_utils.search_analysis_results(
    query_cell_types_1 = 'All',                                 # List of cells 1, will be paired to cells 2 (list or 'All').
    query_cell_types_2 = 'All',                                 # List of cells 2, will be paired to cells 1 (list or 'All').
    # query_genes = ['TGFBR1'],                                 # filter interactions based on the genes participating (list).
    # query_interactions = ['CSF1_CSF1R'],                      # filter interactions based on their name (list).
    significant_means = cpdb_results['significant_means'],      # significant_means table generated by CellPhoneDB.
    deconvoluted = cpdb_results['deconvoluted'],                # deconvoluted table generated by CellPhoneDB.
    separator = '|',                                            # separator (default: |) employed to split cells (cellA|cellB).
    long_format = True                                          # request long-format output; NOTE(review): upstream comment says this also removes non-significant interactions - confirm.
)

search_results

In [None]:
import ktplotspy as kpy

# this thing just shows total # of significant interactions, not that useful

# The p-value table is read from `cpdb_results`; a standalone `pvalues` variable is never defined.
kpy.plot_cpdb_heatmap(
    pvals=cpdb_results['pvalues'],
    figsize=(5, 5),
    title="Sum of significant interactions",
    symmetrical=False,
)