In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import napari_sparrow as nas
from spatialdata import read_zarr
import os
import scanpy as sc
from spatialdata import SpatialData
from typing import Dict, List, Optional, Tuple
from napari_sparrow.table._table import _back_sdata_table_to_zarr
from napari_sparrow.table._annotation import _annotate_celltype
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import anndata as ad
from random import sample 
import seaborn as sns
import matplotlib 
import matplotlib.ticker as ticker
import squidpy as sq

the value of the environment variable BASIC_DCT_BACKEND is not in ["JAX","SCIPY"]


In [3]:
# df_atlas_percentages = pd.read_csv("/home/wout/Documents/Thesis_lokaal/Mouse_Liver_Resolve_Data/basic_annotation_percentage_atlas.csv", index_col=0)

In [4]:
def make_umap(anndata,n_PCAs,n_neighbors):
    """Compute the neighbour graph and a UMAP, then snapshot ``uns['umap']``.

    Parameters
    ----------
    anndata
        Object handed to ``sc.pp.neighbors`` and ``sc.tl.umap``; it must
        expose a dict-like ``uns``.
    n_PCAs
        Forwarded to ``sc.pp.neighbors`` as ``n_pcs``.
    n_neighbors
        Forwarded to ``sc.pp.neighbors`` as ``n_neighbors``; also used to
        build the snapshot key ``'umap_<n_neighbors>'``.

    Returns
    -------
    None
        ``anndata`` is modified in place.
    """
    snapshot_key = 'umap_' + str(n_neighbors)
    sc.pp.neighbors(anndata, n_pcs=n_PCAs, n_neighbors=n_neighbors)
    sc.tl.umap(anndata)
    # NOTE(review): only the `uns['umap']` entry is snapshotted here; scanpy is
    # expected to keep the coordinates themselves in `obsm['X_umap']`, which a
    # later `make_umap` call would overwrite — confirm this is sufficient.
    umap_entry = anndata.uns['umap']
    anndata.uns[snapshot_key] = umap_entry

In [5]:
def visualize_classification(anndata,classification_name,umap_name,path_mg,plot_umap=True,plot_dot_plot=True,plot_rank_genes_groups=True,plot_cluster_homogeneity=True,plot_corr_scores=True,plot_count_dens=True):
    """Print and plot diagnostics for one cell-type annotation.

    The annotation is read from ``anndata.obs['annotation_' + classification_name]``
    and the per-cell score table from ``anndata.uns[classification_name]``.

    Parameters
    ----------
    anndata
        Annotated data object; ``obs``, ``uns`` and the scanpy plotting
        functions are used on it.
    classification_name : str
        Name of the annotation (without the ``annotation_`` prefix).
    umap_name : str
        Key in ``anndata.uns`` that is copied into ``anndata.uns['umap']``
        before the UMAP is drawn.
    path_mg : str
        Path to the marker-gene CSV, forwarded to ``make_dot_plot``.
    plot_umap, plot_dot_plot, plot_rank_genes_groups, plot_cluster_homogeneity, plot_corr_scores, plot_count_dens : bool
        Switches for the individual plots. The homogeneity numbers and the
        section headers are printed regardless of these switches.

    Returns
    -------
    None

    Notes
    -----
    Side effects on ``anndata``: ``uns['umap']`` is overwritten when
    ``plot_umap`` is set, ``obs['count_density']`` is added when
    ``plot_count_dens`` is set, and ``uns['log1p']['base']`` is set to ``None``
    when ``plot_rank_genes_groups`` is set.
    """
    annotation_key = "annotation_" + classification_name

    # UMAP coloured by the annotation
    if plot_umap:
        anndata.uns['umap'] = anndata.uns[umap_name]
        sc.pl.umap(anndata, color=[annotation_key])

    # Cell type proportions, expressed as percentages
    proportions = pd.DataFrame(anndata.obs[annotation_key].value_counts(normalize=True))
    proportions = proportions.sort_index()
    percentages = proportions * 100
    print("Cell type proportions:")
    print(percentages.round(6).abs())
    print("\n")

    # Agreement between the annotation and the Leiden clustering
    print("Leiden clusters")
    print("Only a cell type assigned to a Leiden cluster, if more than 50 percent of cells have the same cell type, otherwise 'Unknown':")
    if plot_cluster_homogeneity:
        clusteringVSleiden(anndata, annotation_key, "leiden", cell_types = 'all', print_results=True)

    fr_cells_unknown = compare_annotations_samples_cluster_homogeneity_percentage_unknown(
        [anndata], [""], [annotation_key], [""], plot=False
    )
    print("Fraction of cells in a Leiden cluster with unknown cell type:")
    print(fr_cells_unknown[0][0].round(3))

    average_homog = compare_annotations_samples_cluster_homogeneity(
        [anndata], [""], [annotation_key], [""], plot=False
    )
    print("Average cluster homogeneity:")
    print(average_homog[0][0].round(3))

    cell_types, all_homog_per_ct = compare_annotations_cluster_homogeneity(
        anndata, [""], [annotation_key], [""], plot=False
    )
    print("Average cluster homogeneity per cell_type:")
    for position, cell_type in enumerate(cell_types):
        print(cell_type + ": " + str(all_homog_per_ct[position][0].round(3)))
    print("\n")

    # Pearson correlation between the per-cell-type score columns
    print("Correlation between cell type scores:")
    corr_matrix = anndata.uns[classification_name].corr(method='pearson')
    if plot_corr_scores:
        sns.heatmap(np.round(corr_matrix, 2), annot=True)
        plt.show()

    # Distribution of counts, size and count density per annotated cell type
    if plot_count_dens:
        for obs_column in ("total_counts", "shapeSize"):
            sns.violinplot(data=anndata.obs, x=obs_column, y=annotation_key)
            plt.show()
        anndata.obs['count_density'] = anndata.obs['total_counts'] / anndata.obs['shapeSize']
        sns.violinplot(data=anndata.obs, x="count_density", y=annotation_key)
        plt.show()

    # Expression of the marker genes for each annotated group
    if plot_dot_plot:
        make_dot_plot(anndata, path_mg, classification_dot_plot=annotation_key)

    # Differentially expressed genes per annotated group
    if plot_rank_genes_groups:
        anndata.uns['log1p']["base"] = None
        # NOTE(review): scanpy documents `key_added` (not `key`) for choosing
        # where `rank_genes_groups` stores its result; the plot below reads the
        # default key, so this `key=` argument may have no effect — confirm.
        sc.tl.rank_genes_groups(anndata, groupby=annotation_key, key=annotation_key + '_rank_genes')
        sc.pl.rank_genes_groups(anndata, n_genes=8, sharey=False, show=False)
    
    

In [6]:
# make dot plot
def make_dot_plot(anndata,path_mg,classification_dot_plot):
    """Draw a scanpy dot plot of all genes, bracketed by the cell types they mark.

    The marker table is temporarily joined onto ``anndata.var`` so that the
    positional index of each marker gene can be looked up; ``anndata.var`` is
    restored afterwards, also when plotting raises.

    Parameters
    ----------
    anndata
        Object with ``var`` (DataFrame indexed by gene) and ``var_names``;
        passed to ``sc.pl.dotplot``.
    path_mg : str
        Comma-separated marker file; the first column is used as gene index,
        the remaining columns are cell types, a value > 0 marks a marker.
    classification_dot_plot : str
        ``obs`` column used as ``groupby`` for the dot plot.

    Returns
    -------
    None
    """
    marker_genes = pd.read_csv(path_mg, sep=',', index_col=0)
    cell_types = marker_genes.columns
    var_backup = anndata.var.copy(deep=True)

    # Add an all-zero row for every (capitalised) gene missing from the marker
    # table, in one concat instead of growing the frame row by row.
    known_genes = set(marker_genes.index)
    capitalised = anndata.var.index.str.capitalize()
    missing = [gene for gene in dict.fromkeys(capitalised) if gene not in known_genes]
    if missing:
        padding = pd.DataFrame(0, index=missing, columns=marker_genes.columns)
        marker_genes = pd.concat([marker_genes, padding])
    marker_genes = marker_genes.sort_index()

    try:
        # NOTE(review): the join uses the original `var` index while the padding
        # above uses capitalised names — confirm gene names are already
        # capitalised in `anndata.var`.
        anndata.var = anndata.var.join(marker_genes)

        # One single-gene bracket (p, p) per marker gene; a gene marking several
        # cell types gets their lower-cased names joined with '_'.
        positions_labels_dict = {}
        for cell_type in cell_types:
            for p in np.where(anndata.var[cell_type] > 0)[0]:
                bracket = (p, p)
                if bracket in positions_labels_dict:
                    positions_labels_dict[bracket] = positions_labels_dict[bracket] + '_' + cell_type.lower()
                else:
                    positions_labels_dict[bracket] = cell_type.lower()

        sc.pl.dotplot(
            anndata,
            var_names=anndata.var_names,
            groupby=classification_dot_plot,
            dendrogram=True,
            var_group_positions=list(positions_labels_dict.keys()),
            var_group_labels=list(positions_labels_dict.values()),
        )
    finally:
        # Never leave the temporary marker columns behind in `anndata.var`.
        anndata.var = var_backup
    

In [None]:
def annotate_celltype_score_genes_bins(anndata,celltypes,row_norm = False, celltype_column = "annotation"):
    """Pick a cell type per cell from score columns already present in ``obs``.

    Parameters
    ----------
    anndata
        Object whose ``obs`` DataFrame holds one score column per cell type.
    celltypes
        Container of cell-type names; every ``obs`` column whose name is in it
        is treated as a score column.
    row_norm : bool
        If true, the scores are standardised per cell (row mean / row std),
        written back to ``obs`` and used for the cleanliness value. The label
        itself is still chosen from the un-normalised copy taken beforehand.
    celltype_column : str
        Name of the ``obs`` column receiving the label.

    Returns
    -------
    tuple
        ``(anndata, scoresper_cluster)`` where ``scoresper_cluster`` is the
        un-normalised score table selected from ``obs``.

    Notes
    -----
    ``obs['Cleanliness']`` is the difference between the two highest scores
    divided by their mean; it is set to 0 for cells labelled
    ``'unknown_celltype'`` (highest score <= 0).
    """
    score_columns = [name for name in anndata.obs if name in celltypes]
    scoresper_cluster = anndata.obs[score_columns]

    if row_norm:
        centred = scoresper_cluster.sub(scoresper_cluster.mean(axis=1).values, axis="rows")
        standardised = centred.div(scoresper_cluster.std(axis=1).values, axis="rows")
        anndata.obs[scoresper_cluster.columns.values] = standardised
        ranked = np.sort(standardised)
    else:
        ranked = np.sort(scoresper_cluster)

    # Columns 0 and 1 hold the second-highest and highest score of each cell.
    top_two = pd.DataFrame(ranked[:, -2:])
    best = top_two[1]
    runner_up = top_two[0]
    cleanliness = (best - runner_up) / ((best + runner_up) / 2)
    anndata.obs["Cleanliness"] = cleanliness.values

    def _label_for(row):
        # A highest score <= 0 means no cell type scored above scanpy's
        # random reference set, so the cell stays unassigned.
        winner = row.idxmax()
        return "unknown_celltype" if row[winner] <= 0 else winner

    anndata.obs[celltype_column] = scoresper_cluster.apply(_label_for, axis=1)
    anndata.obs[celltype_column] = anndata.obs[celltype_column].astype("category")

    # Unassigned cells are by definition not clean.
    is_unknown = anndata.obs[celltype_column] == "unknown_celltype"
    anndata.obs.loc[is_unknown, "Cleanliness"] = 0

    return anndata, scoresper_cluster

In [None]:
def score_genes_bins(
    anndata,
    path_marker_genes: str,
    bins: int = 25,
    delimiter=",",
    row_norm: bool = False,
    repl_columns: Optional[Dict[str, str]] = None,
    del_celltypes: Optional[List[str]] = None,
    input_dict=False,
    suffix = "",
) -> Tuple[dict, pd.DataFrame]:
    """
    Score every cell for every cell type with scanpy's ``score_genes`` and annotate it.

    Marker genes are loaded from a CSV file, either as a one-hot encoded matrix
    (cell types in the first row, marker genes in the first column) or, with
    ``input_dict=True``, as one row per cell type followed by its marker genes.

    Parameters
    ----------
    anndata
        Object passed to ``sc.tl.score_genes`` and ``sc.pl.umap``; its ``obs``
        and ``uns`` are modified in place.
    path_marker_genes : str
        Path to the CSV file containing the marker genes.
    bins : int, optional
        Forwarded to ``sc.tl.score_genes`` as ``n_bins``, default is 25.
    delimiter : str, optional
        Delimiter used in the CSV file, default is ','.
    row_norm : bool, optional
        Forwarded to ``annotate_celltype_score_genes_bins``, default is False.
    repl_columns : dict, optional
        Replacements applied to the cell type (column) names of the one-hot
        matrix: keys are replaced by their values. Ignored when
        ``input_dict=True``.
    del_celltypes : list, optional
        Cell types removed from the candidates after scoring. Cells are scored
        for these cell types but are never assigned one of them; their score
        columns are left in ``anndata.obs``.
    input_dict : bool, optional
        If True, the CSV is read without header: first column is the cell type
        name, subsequent columns are its marker genes. Default is False.
    suffix : str, optional
        Appended to ``'score_genes_original'`` to build the names of the
        columns / keys written to ``anndata``.

    Returns
    -------
    dict
        Cell types (minus ``del_celltypes``) mapped to their marker genes.
    pd.DataFrame
        Scores per cell (rows) and candidate cell type (columns), as returned
        by ``annotate_celltype_score_genes_bins``.

    Notes
    -----
    The cell type 'unknown_celltype' is reserved for cells that could not be
    assigned a specific cell type; it is shown as 'Unknown' in the final
    ``obs['annotation_score_genes_original' + suffix]`` column. The matching
    cleanliness is stored in ``obs['cleanliness_score_genes_original' + suffix]``
    and the score table in ``uns['score_genes_original' + suffix]``.
    A cell type whose markers cannot be scored (``ValueError`` from scanpy) is
    skipped with a logged warning.
    """
    import logging  # local import: `log` is not defined in the notebook's import cell

    # Load marker genes from csv
    if input_dict:
        df_markers = pd.read_csv(
            path_marker_genes, header=None, index_col=0, delimiter=delimiter
        )
        df_markers = df_markers.T
        genes_dict = df_markers.to_dict("list")
        for i in genes_dict:
            genes_dict[i] = [x for x in genes_dict[i] if str(x) != "nan"]
    else:
        df_markers = pd.read_csv(path_marker_genes, index_col=0, delimiter=delimiter)
        # Replace column names in marker genes
        if repl_columns:
            for column, replace in repl_columns.items():
                df_markers.columns = df_markers.columns.str.replace(column, replace)

        # Create genes dict with all marker genes for every celltype
        genes_dict = {
            celltype: df_markers.index[df_markers[celltype] > 0].tolist()
            for celltype in df_markers
        }

    assert (
        "unknown_celltype" not in genes_dict.keys()
    ), "Cell type 'unknown_celltype' is reserved for cells that could not be assigned a specific cell type"

    # Score all cells for all celltypes (key = cell type, value = its marker genes)
    for key, value in genes_dict.items():
        try:
            sc.tl.score_genes(anndata, value, score_name=key, n_bins=bins)
        except ValueError:
            logging.getLogger(__name__).warning(
                f"Markergenes {value} not present in region, celltype {key} not found"
            )

    # Remove the excluded cell types from the candidates
    if del_celltypes:
        for gene in del_celltypes:
            if gene in df_markers.columns:
                del df_markers[gene]
            if gene in genes_dict.keys():
                del genes_dict[gene]

    anndata, scoresper_cluster = annotate_celltype_score_genes_bins(anndata,celltypes=df_markers.columns,row_norm=row_norm,celltype_column="annotation")

    name_clustering = 'score_genes_original' + suffix
    annotation_key = 'annotation_' + name_clustering
    cleanliness_key = 'cleanliness_' + name_clustering
    anndata.uns[name_clustering] = scoresper_cluster

    # Drop results of an earlier run so the rename below cannot create
    # duplicate column names when the cell is executed again.
    stale = [c for c in (annotation_key, cleanliness_key) if c in anndata.obs.columns]
    if stale:
        anndata.obs.drop(columns=stale, inplace=True)
    anndata.obs.rename(
        columns={'annotation': annotation_key, 'Cleanliness': cleanliness_key},
        inplace=True,
    )

    # Remove the per-cell-type score columns; a cell type whose scoring failed
    # above never got a column, so only drop what exists.
    score_columns = [k for k in genes_dict if k in anndata.obs.columns]
    anndata.obs.drop(columns=score_columns, inplace=True)

    # Put the annotation and its cleanliness last, in that order.
    others = [c for c in anndata.obs.columns if c not in (annotation_key, cleanliness_key)]
    anndata.obs = anndata.obs.reindex(columns=others + [annotation_key, cleanliness_key])

    # Show 'unknown_celltype' as 'Unknown'; assign the result instead of a
    # chained in-place replace, which is not guaranteed to reach the frame.
    anndata.obs[annotation_key] = anndata.obs[annotation_key].cat.rename_categories(
        {'unknown_celltype': 'Unknown'}
    )

    print((anndata.obs[annotation_key].value_counts()/len(anndata.obs[annotation_key]))*100)
    sc.pl.umap(anndata,color=[annotation_key])

    return genes_dict, scoresper_cluster

    

In [8]:
def own_score_genes(anndata,path_mg,norm_expr_var=False,min_score='Zero',min_score_q=25,scale_score='MinMax',scale_score_q=1,suffix='',mean = 'all',mean_values = None)->Tuple[pd.DataFrame, pd.DataFrame]:
    """Annotate every cell with the cell type whose marker score is highest.

    Based on scanpy's ``score_genes`` but without bins: for each cell and cell
    type the score is the weighted sum, over the markers present in
    ``anndata``, of (expression - reference mean), divided by the column sum of
    the marker table for that cell type.

    Parameters
    ----------
    anndata
        Object providing ``to_df()``, ``var_names``, ``obs`` and ``uns``.
    path_mg : str
        Comma-separated marker file: genes in the first column, one column per
        cell type, values > 0 are marker weights.
    norm_expr_var : bool
        Divide every gene by its standard deviation first.
    min_score : {'Zero', 'Quantile', 'None'}
        Minimum raw score a cell type needs to be assignable: above 0, above
        the ``min_score_q``-th percentile of that cell type's scores, or no
        minimum. Python ``None`` is accepted as an alias of ``'None'``.
    min_score_q : number
        Percentile (0-100) used by ``min_score='Quantile'``.
    scale_score : str
        Per-cell-type scaling: 'MinMax', 'ZeroMax', 'Robust' or 'Rank'. Any
        other value leaves the scores unscaled.
    scale_score_q : number
        Lower percentile used by 'Robust' (upper one is ``100 - scale_score_q``).
    suffix : str
        Appended to every ``obs`` / ``uns`` key written.
    mean : {'all', 'given'}
        Reference mean: over all cells, or ``mean_values``.
    mean_values
        Per-gene reference means, required when ``mean='given'``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(scores, scores_before_min_score)``: the scaled scores with
        non-qualifying entries set to NaN, and the scaled scores before that
        masking. Both use a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        For an unknown ``mean`` / ``min_score``, a missing ``mean_values``, or
        a marker table with fewer than two cell types.

    Notes
    -----
    Writes ``annotation_own_score_genes``, ``score_celltype_own_score_genes``,
    ``second_score_celltype_own_score_genes`` and
    ``cleanliness_own_score_genes`` (each + ``suffix``) to ``obs`` and
    ``own_score_genes`` + ``suffix`` to ``uns``. Highest / second-highest score
    and cleanliness are computed on the scaled scores before the minimum-score
    masking.
    """
    if mean not in ('all', 'given'):
        raise ValueError("mean must be 'all' or 'given', got " + repr(mean))
    if mean == 'given' and mean_values is None:
        raise ValueError("mean_values is required when mean='given'")
    if min_score not in ('Zero', 'Quantile', 'None', None):
        raise ValueError("min_score must be 'Zero', 'Quantile' or 'None', got " + repr(min_score))

    marker_genes = pd.read_csv(path_mg, sep=',', index_col=0)
    cell_types = marker_genes.columns.tolist()
    if len(cell_types) < 2:
        raise ValueError("at least two cell types are needed to compute a cleanliness")

    matrix = anndata.to_df()
    # correct for the variance of the expression of each gene
    if norm_expr_var:
        matrix = matrix.div(matrix.std(axis=0))
    mean_expression = matrix.mean(axis=0) if mean == 'all' else mean_values
    matrix_minus_mean = matrix - mean_expression

    genes_in_anndata = set(anndata.var_names.to_list())
    n_markers = marker_genes.sum(axis=0)
    per_type_scores = {}
    for position, cell_type in enumerate(cell_types):
        weights = marker_genes[cell_type]
        used = [gene for gene in weights.index[weights > 0] if gene in genes_in_anndata]
        if used:
            weighted = matrix_minus_mean[used].mul(weights[used], axis=1)
            # skipna=False: a missing expression value makes the score missing
            per_type_scores[cell_type] = weighted.sum(axis=1, skipna=False) / n_markers.iloc[position]
        else:
            per_type_scores[cell_type] = pd.Series(0.0, index=matrix.index)
    scores_cell_celltype = pd.DataFrame(per_type_scores, columns=cell_types).reset_index(drop=True)

    # min score to obtain for a cell type, otherwise 'Unknown'; decided on the
    # raw scores and kept as a real boolean mask
    if min_score == 'Zero':
        passes_min_score = scores_cell_celltype > 0
    elif min_score == 'Quantile':
        thresholds = scores_cell_celltype.quantile(min_score_q/100)
        passes_min_score = scores_cell_celltype.gt(thresholds, axis='columns')
    else:
        passes_min_score = pd.DataFrame(
            True, index=scores_cell_celltype.index, columns=scores_cell_celltype.columns
        )

    def _min_max(column):
        low, high = np.min(column), np.max(column)
        return (column - low) / (high - low)

    # scale scores per cell type to make them more comparable between cell
    # types (because some cell types have more markers etc.)
    if scale_score == 'MinMax':
        # with this choice the '- mean_expression' above has no effect
        scores_cell_celltype = scores_cell_celltype.apply(_min_max)
    elif scale_score == 'ZeroMax':
        scores_cell_celltype = scores_cell_celltype.apply(lambda x: x / np.max(x))
    elif scale_score == 'Robust':
        for cell_type in cell_types:
            column = scores_cell_celltype[cell_type]
            low = np.percentile(column, scale_score_q)
            high = np.percentile(column, 100-scale_score_q)
            if low < high:
                scores_cell_celltype[cell_type] = (column - low) / (high - low)
            else:  # MinMax scaling if percentiles are equal
                scores_cell_celltype[cell_type] = _min_max(column)
    elif scale_score == 'Rank':
        scores_cell_celltype = scores_cell_celltype.rank(pct=True)

    ordered = np.sort(scores_cell_celltype.values)
    max_scores = ordered[:, -1]
    second_scores = ordered[:, -2]
    with np.errstate(divide='ignore', invalid='ignore'):
        cleanliness = (max_scores - second_scores) / ((max_scores + second_scores + 0.0000001) / 2)

    scores_cell_celltype_before_min_score = scores_cell_celltype.copy(deep=True)
    scores_cell_celltype = scores_cell_celltype.where(passes_min_score)

    # Cells without any qualifying cell type become 'Unknown'; idxmax is only
    # applied to rows that have at least one candidate.
    has_candidate = scores_cell_celltype.notna().any(axis=1)
    labels = pd.Series('Unknown', index=scores_cell_celltype.index, dtype=object)
    if has_candidate.any():
        labels[has_candidate] = scores_cell_celltype.loc[has_candidate].idxmax(axis=1)

    anndata.obs["annotation_own_score_genes"+suffix] = labels.to_numpy()
    anndata.obs["score_celltype_own_score_genes"+suffix] = max_scores
    anndata.obs["second_score_celltype_own_score_genes"+suffix] = second_scores
    anndata.obs["cleanliness_own_score_genes"+suffix] = cleanliness
    anndata.uns["own_score_genes"+suffix] = scores_cell_celltype_before_min_score
    return scores_cell_celltype, scores_cell_celltype_before_min_score

In [None]:
def own_score_genes_iterative(anndata,path_mg,suffix='',nr_iterations=10,save=False,saved_as=''):
    """Run ``own_score_genes`` repeatedly with a cell-type-balanced reference mean.

    The first pass uses the mean expression over all cells, which is dominated
    by the majority cell type. Every following pass recomputes the reference as
    the unweighted mean of the per-cell-type mean expressions from the previous
    annotation ('Unknown' cells excluded), until at most 0.05 % of the cells
    change label or ``nr_iterations`` passes have been made.

    Parameters
    ----------
    anndata
        Object passed to ``own_score_genes`` and ``sc.pl.umap``; it is subset
        by observation names, and its ``obs`` / ``uns`` are modified in place.
    path_mg : str
        Marker-gene CSV, forwarded to ``own_score_genes``.
    suffix : str
        Appended to every ``obs`` / ``uns`` key.
    nr_iterations : int
        Maximum number of refinement passes.
    save : bool
        Save the convergence plot to ``saved_as + '.png'``.
    saved_as : str
        Path (without extension) for the saved plot.

    Returns
    -------
    tuple
        The return value of the last ``own_score_genes`` call.

    Raises
    ------
    ValueError
        If every cell is 'Unknown', so no per-cell-type mean can be computed.

    Notes
    -----
    The initial annotation and scores are kept in
    ``obs['annotation_own_score_genes_start_iterative' + suffix]`` and
    ``uns['own_score_genes_start_iterative' + suffix]``.
    """
    annotation_key = 'annotation_own_score_genes' + suffix
    previous_key = 'annotation_own_score_genes_previous' + suffix
    diff_key = 'own_score_genes_diff_iter' + suffix

    # initial clustering: 'mean expression' is over all cells
    scores = own_score_genes(anndata,path_mg,mean = 'all',scale_score='No',suffix=suffix)
    print((anndata.obs[annotation_key].value_counts()/len(anndata.obs[annotation_key]))*100)
    sc.pl.umap(anndata,color=[annotation_key])
    anndata.obs['annotation_own_score_genes_start_iterative'+suffix] = anndata.obs[annotation_key]
    anndata.uns["own_score_genes_start_iterative"+suffix] = scores[0]

    # iterative clustering: reference mean with equal contribution of each cell type
    changes = []
    completed = 0
    for iteration in range(nr_iterations):
        cell_types = np.unique(anndata.obs[annotation_key]).tolist()
        if 'Unknown' in cell_types:
            cell_types.remove('Unknown')
        if not cell_types:
            raise ValueError("all cells are 'Unknown'; cannot compute a per-cell-type mean")
        mean_per_ct = []
        for ct in cell_types:
            cells = anndata.obs.index[(anndata.obs[annotation_key] == ct).to_numpy()].tolist()
            mean_per_ct.append(anndata[cells,:].to_df().mean(axis=0))
        next_mean = pd.concat(mean_per_ct,axis=1).mean(axis=1)

        # keep the current labels as 'previous' before they are overwritten
        if previous_key in anndata.obs.columns:
            anndata.obs.drop(columns=[previous_key], inplace=True)
        anndata.obs.rename(columns={annotation_key: previous_key}, inplace=True)
        scores = own_score_genes(anndata,path_mg,scale_score='No',suffix=suffix,mean='given',mean_values=next_mean)

        # 1 = label unchanged, 0 = label changed in this pass
        unchanged = (anndata.obs[annotation_key] == anndata.obs[previous_key]).to_numpy()
        anndata.obs[diff_key] = unchanged.astype(int)
        # Measured directly, so "every cell changed" is not mistaken for
        # "no cell changed".
        changed_pct = float((~unchanged).mean() * 100) if len(unchanged) else 0.0
        completed = completed + 1

        print('Percentage of cells with changed annotation: '+str(np.round(changed_pct,2)))
        if changed_pct > 0.05:
            changes.append(changed_pct)
            sc.pl.umap(anndata,color=[diff_key])
            sc.pl.umap(anndata,color=[annotation_key])
            print((anndata.obs[annotation_key].value_counts()/len(anndata.obs[annotation_key]))*100)
        else:
            print('converged')
            changes.append(0)
            break

    plt.plot(list(range(1,completed+1,1)),changes)
    # make x-axis integers and start from 1
    ax = plt.gca()
    ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
    plt.xlabel('Iteration')
    plt.ylabel('Percentage of cells with changed annotation')
    if save:
        plt.savefig(saved_as+'.png',dpi=300)

    # remove the helper columns; they do not exist when nr_iterations is 0
    anndata.obs.drop(columns=[diff_key, previous_key], inplace=True, errors='ignore')
    return scores
    


In [9]:
def make_umap_and_perform_leiden_annotation(sdata,path_mg,n_PCAs,n_neighbors,cluster_resolution,norm_expr_var=True,min_score='Quantile',min_score_q=25,scale_score='Robust',scale_score_q=1,clean_th=0.5)->pd.DataFrame:
    """Compute a UMAP, run Leiden clustering and annotate each cluster with marker genes.

    Similar to ``own_score_genes`` but clusters instead of cells are scored:
    for each Leiden cluster and cell type the score is the sum, over the
    markers (value == 1 in the marker table), of the cluster's mean expression
    minus the mean expression over all cells.

    Parameters
    ----------
    sdata
        Object whose ``table`` attribute is the annotated data matrix; only
        ``sdata.table`` is used.
    path_mg : str
        Comma-separated marker file: genes in the first column, one column per
        cell type.
    n_PCAs, n_neighbors
        Forwarded to ``make_umap``.
    cluster_resolution
        Forwarded to ``sc.tl.leiden`` as ``resolution``.
    norm_expr_var : bool
        Divide every gene by its standard deviation over all cells.
    min_score : {'Zero', 'Quantile', 'None'}
        Minimum raw score a cell type needs to be assignable to a cluster.
        Python ``None`` is accepted as an alias of ``'None'``.
    min_score_q : number
        Percentile (0-100) used by ``min_score='Quantile'``.
    scale_score : str
        'MinMax', 'ZeroMax', 'Nmarkers', 'Robust' or 'Rank'; any other value
        leaves the scores unscaled.
    scale_score_q : number
        Lower percentile used by 'Robust'.
    clean_th : float
        Clusters whose cleanliness (relative gap between the best and the
        second / third best score) is below this threshold get a combined
        label such as ``'A/B(60%/40%)'``.

    Returns
    -------
    pd.DataFrame
        Scaled scores per cluster (rows) and cell type (columns), with
        non-qualifying entries set to NaN.

    Raises
    ------
    ValueError
        For an unknown ``min_score``.

    Notes
    -----
    With ``column_name = 'leiden_<n_PCAs>_<n_neighbors>_<cluster_resolution>'``
    this writes ``obs[column_name]``, ``obs['annotation_' + column_name]``,
    ``obs['cleanliness_' + column_name]`` and
    ``uns['mapping_cell_type_' + column_name]`` on ``sdata.table``. Cluster
    labels are assumed to be the strings '0' .. 'n-1'.
    """
    if min_score not in ('Zero', 'Quantile', 'None', None):
        raise ValueError("min_score must be 'Zero', 'Quantile' or 'None', got " + repr(min_score))

    table = sdata.table

    # make umap and do leiden clustering with scanpy functions; the table (not
    # the container) is what scanpy operates on
    make_umap(table,n_neighbors=n_neighbors,n_PCAs=n_PCAs)
    column_name = 'leiden_'+str(n_PCAs)+'_'+str(n_neighbors)+'_'+str(cluster_resolution)
    sc.tl.leiden(table,resolution=cluster_resolution,key_added=column_name)

    # mean expression per cluster and over all cells
    n_clusters = np.unique(table.obs[column_name]).size
    expression = table.to_df()
    leiden_mean_expression = {}
    for i in range(n_clusters):
        in_cluster = (table.obs[column_name] == str(i)).to_numpy()
        leiden_mean_expression[i] = expression.loc[in_cluster].mean(axis=0)
    if norm_expr_var:
        gene_std = expression.std(axis=0)
        all_mean_expression = expression.div(gene_std).mean(axis=0)
        for i in range(n_clusters):
            leiden_mean_expression[i] = leiden_mean_expression[i].div(gene_std)
    else:
        all_mean_expression = expression.mean(axis=0)

    # score every cluster for every cell type
    marker_genes = pd.read_csv(path_mg, sep=',',index_col=0)
    cell_types = marker_genes.columns.tolist()
    scores_leiden_celltype = pd.DataFrame()
    for cell_type in cell_types:
        markers = marker_genes[marker_genes[cell_type] == 1].index.tolist()
        scores_leiden_celltype[cell_type] = [
            sum(leiden_mean_expression[i][gene] - all_mean_expression[gene] for gene in markers)
            for i in range(n_clusters)
        ]

    # min score to obtain for a cell type, otherwise 'Unknown'; decided on the
    # raw scores and kept as a real boolean mask
    if min_score == 'Zero':
        passes_min_score = scores_leiden_celltype > 0
    elif min_score == 'Quantile':
        thresholds = scores_leiden_celltype.quantile(min_score_q/100)
        passes_min_score = scores_leiden_celltype.gt(thresholds, axis='columns')
    else:
        passes_min_score = pd.DataFrame(
            True, index=scores_leiden_celltype.index, columns=scores_leiden_celltype.columns
        )

    def _min_max(column):
        low, high = np.min(column), np.max(column)
        return (column - low) / (high - low)

    # scale scores per cell type to make them more comparable between cell types
    if scale_score == 'MinMax':
        scores_leiden_celltype = scores_leiden_celltype.apply(_min_max)
    elif scale_score == 'ZeroMax':
        scores_leiden_celltype = scores_leiden_celltype.apply(lambda x: x / np.max(x))
    elif scale_score == 'Nmarkers':
        Nmarkers = marker_genes.sum(axis=0).to_list()
        scores_leiden_celltype = scores_leiden_celltype.div(Nmarkers)
    elif scale_score == 'Robust':
        for cell_type in cell_types:
            column = scores_leiden_celltype[cell_type]
            low = np.percentile(column, scale_score_q)
            high = np.percentile(column, 100-scale_score_q)
            if low < high:
                scores_leiden_celltype[cell_type] = (column - low) / (high - low)
            else:  # MinMax scaling if percentiles are equal
                scores_leiden_celltype[cell_type] = _min_max(column)
    elif scale_score == 'Rank':
        scores_leiden_celltype = scores_leiden_celltype.rank(pct=True)

    # cluster is annotated with the cell type with the highest score (provided
    # that cell type passed the minimum score); idxmax is only applied to
    # clusters that have at least one candidate
    scores_leiden_celltype = scores_leiden_celltype.where(passes_min_score)
    has_candidate = scores_leiden_celltype.notna().any(axis=1)
    sc_leiden_cellt = {i: np.nan for i in scores_leiden_celltype.index}
    if has_candidate.any():
        sc_leiden_cellt.update(scores_leiden_celltype.loc[has_candidate].idxmax(axis=1).to_dict())
    unknown_clusters = [k for k, v in sc_leiden_cellt.items() if pd.isnull(v)]

    max_scores = scores_leiden_celltype.max(axis=1)
    second_scores = scores_leiden_celltype.apply(lambda x: x.nlargest(2).values[-1], axis=1)
    cleanl_per_cluster = (max_scores - second_scores) / ((max_scores + second_scores) / 2)
    third_scores = scores_leiden_celltype.apply(lambda x: x.nlargest(3).values[-1], axis=1)
    cleanl_per_cluster_extra = (max_scores - third_scores) / ((max_scores + third_scores) / 2)

    # Ambiguous clusters get a combined label. The best score is blanked in a
    # scratch copy (single .loc assignment, so it really reaches the frame) to
    # find the next-best cell type.
    scores_draft = scores_leiden_celltype.copy(deep=True)
    for i in range(n_clusters):
        if cleanl_per_cluster[i] < clean_th:
            scores_draft.loc[i, scores_draft.loc[i].idxmax()] = np.nan
            sc_leiden_cellt[i] = sc_leiden_cellt[i] + '/' + scores_draft.loc[i].idxmax()
            if cleanl_per_cluster_extra[i] < clean_th:
                scores_draft.loc[i, scores_draft.loc[i].idxmax()] = np.nan
                sc_leiden_cellt[i] = sc_leiden_cellt[i] + '/' + scores_draft.loc[i].idxmax()
                total = abs(max_scores[i] + second_scores[i] + third_scores[i])
                if max_scores[i] > 0:
                    p1 = round(100*max_scores[i]/total)
                    p2 = round(100*second_scores[i]/total)
                    p3 = round(100*third_scores[i]/total)
                else:
                    p1 = round(100*(total + max_scores[i])/(2*total))
                    p2 = round(100*(total + second_scores[i])/(2*total))
                    p3 = round(100*(total + third_scores[i])/(2*total))
                sc_leiden_cellt[i] = sc_leiden_cellt[i] + '(' + str(p1) + '%/' + str(p2) + '%/' + str(p3) + '%)'
            else:
                total = abs(max_scores[i] + second_scores[i])
                if max_scores[i] > 0:
                    p1 = round(100*max_scores[i]/total)
                    p2 = round(100*second_scores[i]/total)
                else:
                    p1 = round(100*(total + max_scores[i])/total)
                    p2 = round(100*(total + second_scores[i])/total)
                sc_leiden_cellt[i] = sc_leiden_cellt[i] + '(' + str(p1) + '%/' + str(p2) + '%)'

    for i in unknown_clusters:
        sc_leiden_cellt[i] = 'Unknown'
    sc_leiden_cellt = {str(k): v for k, v in sc_leiden_cellt.items()}

    # Assign the replaced column instead of a chained in-place replace, which
    # is not guaranteed to reach the frame.
    # NOTE(review): `replace` on a categorical column is deprecated in recent
    # pandas — consider mapping the string labels instead.
    table.obs["annotation_"+column_name] = table.obs[column_name].replace(
        list(sc_leiden_cellt.keys()), list(sc_leiden_cellt.values())
    )

    # cell type -> list of cluster ids, cell types in sorted order
    clusters_per_label = {}
    for cluster, label in sc_leiden_cellt.items():
        clusters_per_label.setdefault(label, []).append(cluster)
    table.uns["mapping_cell_type_"+column_name] = {
        label: clusters_per_label[label] for label in sorted(clusters_per_label)
    }

    # cleanliness of the annotation based on highest and second highest score
    sc_leiden_cleanl = cleanl_per_cluster.to_dict()
    for i in unknown_clusters:
        sc_leiden_cleanl[i] = 0
    sc_leiden_cleanl = {str(k): v for k, v in sc_leiden_cleanl.items()}
    table.obs["cleanliness_"+column_name] = table.obs[column_name].replace(
        list(sc_leiden_cleanl.keys()), list(sc_leiden_cleanl.values())
    )

    return scores_leiden_celltype



In [10]:
def plot_dendrogram(model,N_clusters,labels) -> dict:
    """Plot a dendrogram truncated to the last ``N_clusters`` merges of a fitted model.

    Parameters
    ----------
    model
        Fitted hierarchical model exposing ``children_``, ``distances_`` and
        ``labels_``.
    N_clusters : int
        Number of leaves shown (``truncate_mode='lastp'``).
    labels
        One cluster label per sample; the distinct labels, in the leaf order of
        the full dendrogram, are prefixed to the leaf texts of the plot.

    Returns
    -------
    dict
        The scipy dendrogram dictionary of the truncated, un-plotted tree.
    """
    # Number of samples below every merge node.
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for node, children in enumerate(model.children_):
        counts[node] = sum(
            1 if child < n_samples else counts[child - n_samples]
            for child in children
        )

    # scipy linkage layout: child a, child b, distance, sample count
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    truncated = dendrogram(linkage_matrix, truncate_mode='lastp', p=N_clusters, no_plot=True)
    complete = dendrogram(linkage_matrix, labels=labels, no_plot=True)

    # Distinct labels in the order in which they first appear along the leaves.
    clusters = np.array(list(dict.fromkeys(complete["ivl"]))).astype(str)

    # Leaf id -> "<cluster label> <default leaf text>"
    leaf_text = {
        leaf: clusters[position] + ' ' + truncated["ivl"][position]
        for position, leaf in enumerate(truncated["leaves"])
    }

    dendrogram(
        linkage_matrix,
        leaf_label_func=lambda leaf_id: "{}".format(leaf_text[leaf_id]),
        leaf_rotation=60.,
        leaf_font_size=10.,
        truncate_mode='lastp',
        p=N_clusters,
    )

    return truncated

In [11]:
def KMeans_clustering(sdata,matrix_score_genes,N_clusters,suffix_name='',random_state=None):
    """Cluster the rows of ``matrix_score_genes`` with k-means and store the labels in ``sdata``.

    The labels are written, as strings, to
    ``sdata.table.obs["KMeans" + str(N_clusters) + suffix_name]``. Nothing is returned.

    Parameters
    ----------
    sdata
        Object whose ``table.obs`` receives the new annotation column.
    matrix_score_genes
        Feature matrix handed to ``KMeans.fit_predict`` (one row per observation).
    N_clusters
        Number of clusters.
    suffix_name
        Optional suffix appended to the name of the annotation column.
    random_state
        Seed forwarded to ``KMeans``. The default ``None`` keeps the previous, unseeded
        behaviour; pass an integer to make the clustering reproducible between runs.
    """
    # n_init=10: run 10 times with different centroid seeds
    kmeans = KMeans(n_clusters=N_clusters, n_init=10, random_state=random_state)
    kmeans_annotation = kmeans.fit_predict(matrix_score_genes).astype(str)
    sdata.table.obs["KMeans"+str(N_clusters)+suffix_name] = kmeans_annotation

In [12]:
def Hierarchical_clustering(sdata,matrix_score_genes,N_clusters,suffix_name='',levels_dendrogram=4)-> dict:
    """Cluster the rows of ``matrix_score_genes`` hierarchically and plot the dendrogram.

    The labels are written, as strings, to
    ``sdata.table.obs["Hierarchical" + str(N_clusters) + suffix_name]``.

    Parameters
    ----------
    sdata
        Object whose ``table.obs`` receives the new annotation column.
    matrix_score_genes
        Feature matrix handed to ``AgglomerativeClustering`` (one row per observation).
    N_clusters
        Number of clusters; also the number of leaves of the plotted dendrogram.
    suffix_name
        Optional suffix appended to the name of the annotation column.
    levels_dendrogram
        Currently unused; kept so that existing calls keep working.

    Returns
    -------
    dict
        The value returned by ``plot_dendrogram`` for the fitted model.
    """
    hier = AgglomerativeClustering(n_clusters=N_clusters,compute_distances=True)
    # Fit once: `labels_` of the fitted model is exactly what `fit_predict` would return,
    # so a second fit on the same data only costs time and memory.
    hierarchical = hier.fit(matrix_score_genes)
    hierarchical_annotation = hierarchical.labels_.astype(str)
    sdata.table.obs["Hierarchical"+str(N_clusters)+suffix_name] = hierarchical_annotation
    R = plot_dendrogram(hierarchical,N_clusters=N_clusters,labels=hierarchical_annotation)
    return R

In [13]:
def correlation_matrix_expression_marker_genes_of_2_cell_types(anndata,path_mg,type1,type2):
    """Plot the Pearson correlation between the expression of the marker genes of two cell types.

    Marker genes are the rows of the csv file at ``path_mg`` with a value > 0 in the column of
    the cell type. Columns of the correlation matrix are ordered as: markers of ``type1`` only
    (labelled ``"<gene> <type1>"``), markers of both types (``"<gene> both"``), markers of
    ``type2`` only (``"<gene> <type2>"``).

    Parameters
    ----------
    anndata
        Object whose ``to_df()`` returns the expression matrix with one column per gene.
    path_mg
        Path to the marker gene csv file (genes in the first column, one column per cell type).
    type1, type2
        Names of the two cell type columns in the marker gene file.
    """
    df_mg = pd.read_csv(path_mg,index_col=0)
    gene_set1 = df_mg.index[df_mg[type1]>0].tolist()
    gene_set2 = df_mg.index[df_mg[type2]>0].tolist()
    markers1, markers2 = set(gene_set1), set(gene_set2)

    overlap = [g for g in gene_set1 if g in markers2]
    only_type1 = [g for g in gene_set1 if g not in markers2]
    only_type2 = [g for g in gene_set2 if g not in markers1]

    # Convert the expression matrix once instead of once per gene
    expression = anndata.to_df()

    labelled_columns = {}
    for g in only_type1:
        labelled_columns[g+' '+type1] = expression[g]
    for g in overlap:
        labelled_columns[g+' both'] = expression[g]
    for g in only_type2:
        labelled_columns[g+' '+type2] = expression[g]

    df_genes = pd.DataFrame(labelled_columns)
    sns.heatmap(df_genes.corr(method='pearson'))

In [14]:
def DEGs_between_2_sets_leiden_clusters_compared_to_markers(adata,name_cell_type1, putative_leiden_clusters_cell_type1, name_cell_type2, putative_leiden_clusters_cell_type2, path_mg)->dict:
    """Compare two sets of leiden clusters with each other and with their shared marker genes.

    Every cluster of a set is relabelled to the first cluster id of that set and the result is
    stored in ``adata.obs['leiden_mod']``. Two Wilcoxon ``rank_genes_groups`` analyses are run
    on that column (``adata.uns['rank_genes_groups']`` is overwritten by each of them):

    1. both merged groups without an explicit reference; genes with adjusted p-value < 0.01 and
       a positive log fold change are kept per group (the '..._vs_rest' tables),
    2. the first group with the second group as reference; genes with adjusted p-value < 0.01
       are kept (the 'DEGs' table).

    Marker genes shared by both cell types (value > 0 in both columns of the csv file at
    ``path_mg``) that are differentially expressed between the two groups are proposed to be
    dropped for the cell type with the lower expression, unless that gene is in the
    '..._vs_rest' table of that cell type.

    Parameters
    ----------
    adata
        Annotated data with a ``'leiden'`` column in ``obs``.
    name_cell_type1, name_cell_type2
        Names of the cell types; must be columns of the marker gene file.
    putative_leiden_clusters_cell_type1, putative_leiden_clusters_cell_type2
        Sequences whose items carry the leiden cluster id as first element.
    path_mg
        Path to the marker gene csv file (genes in the first column).

    Returns
    -------
    dict
        Keys: 'DEGs', 'DEGs_<ct1>_vs_rest', 'DEGs_<ct2>_vs_rest', 'markers_<ct1>',
        'markers_<ct2>', 'overlap_markers', 'drop_<ct1>', 'drop_<ct2>'.
    """
    ids_ct1 = [str(entry[0]) for entry in putative_leiden_clusters_cell_type1]
    ids_ct2 = [str(entry[0]) for entry in putative_leiden_clusters_cell_type2]

    # Collapse every cluster of a set onto the first cluster id of that set
    merged_leiden = adata.obs['leiden']
    for cluster_id in ids_ct1:
        merged_leiden = merged_leiden.replace(cluster_id, ids_ct1[0])
    for cluster_id in ids_ct2:
        merged_leiden = merged_leiden.replace(cluster_id, ids_ct2[0])
    adata.obs['leiden_mod'] = merged_leiden

    group1 = ids_ct1[0]
    group2 = ids_ct2[0]

    def ranked_table(field, renames):
        # One column per tested group of the most recent rank_genes_groups run, renamed
        table = pd.DataFrame(adata.uns['rank_genes_groups'][field])
        table.rename(columns=renames, inplace=True)
        return table

    gene_col1, gene_col2 = 'gene_'+name_cell_type1, 'gene_'+name_cell_type2
    pval_col1, pval_col2 = 'pval_adj_'+name_cell_type1, 'pval_adj_'+name_cell_type2
    lfc_col1, lfc_col2 = 'logf2_'+name_cell_type1, 'logf2_'+name_cell_type2

    # --- analysis 1: both merged groups, no explicit reference ---
    sc.tl.rank_genes_groups(adata, groupby='leiden_mod', groups = [group1, group2], method = 'wilcoxon')
    both_groups = pd.concat(
        [
            ranked_table('names', {group1: gene_col1, group2: gene_col2}),
            ranked_table('pvals_adj', {group1: pval_col1, group2: pval_col2}),
            ranked_table('logfoldchanges', {group1: lfc_col1, group2: lfc_col2}),
        ],
        axis=1,
    )

    df_ct1_vs_rest = both_groups[[gene_col1, pval_col1, lfc_col1]]
    keep_ct1 = (df_ct1_vs_rest[pval_col1] < 0.01) & (df_ct1_vs_rest[lfc_col1] > 0)
    df_ct1_vs_rest = df_ct1_vs_rest[keep_ct1]

    df_ct2_vs_rest = both_groups[[gene_col2, pval_col2, lfc_col2]]
    keep_ct2 = (df_ct2_vs_rest[pval_col2] < 0.01) & (df_ct2_vs_rest[lfc_col2] > 0)
    df_ct2_vs_rest = df_ct2_vs_rest[keep_ct2]

    # --- analysis 2: first group with the second group as reference ---
    sc.tl.rank_genes_groups(adata, groupby='leiden_mod', groups = [group1], reference = group2, method = 'wilcoxon')
    df_ct1_vs_ct2 = pd.concat(
        [
            ranked_table('names', {group1: 'gene'}),
            ranked_table('pvals_adj', {group1: 'pval_adj'}),
            ranked_table('logfoldchanges', {group1: 'logf2'}),
        ],
        axis=1,
    )
    df_ct1_vs_ct2 = df_ct1_vs_ct2[df_ct1_vs_ct2['pval_adj'] < 0.01]

    # --- compare with the marker genes of both cell types ---
    df_mg = pd.read_csv(path_mg,index_col=0)
    mg_ct1 = df_mg.index[df_mg[name_cell_type1]>0].tolist()
    mg_ct2 = df_mg.index[df_mg[name_cell_type2]>0].tolist()
    mg_overlap = [gene for gene in mg_ct1 if gene in mg_ct2]

    overlap_hits = df_ct1_vs_ct2[df_ct1_vs_ct2['gene'].isin(mg_overlap)]
    higher_in_ct1 = overlap_hits.loc[overlap_hits['logf2'] > 0, 'gene'].to_list()
    higher_in_ct2 = overlap_hits.loc[overlap_hits['logf2'] < 0, 'gene'].to_list()

    def without_vs_rest_hits(candidates, vs_rest_table, gene_column):
        # Candidates that do not appear in the given '..._vs_rest' table
        if len(candidates) == 0:
            return []
        rejected = vs_rest_table[vs_rest_table[gene_column].isin(candidates)][gene_column].to_list()
        return [gene for gene in candidates if gene not in rejected]

    # A shared marker that is higher in cell type 1 is a drop candidate for cell type 2, and vice versa
    drop_ct2 = without_vs_rest_hits(higher_in_ct1, df_ct2_vs_rest, gene_col2)
    drop_ct1 = without_vs_rest_hits(higher_in_ct2, df_ct1_vs_rest, gene_col1)

    results = {
        'DEGs': df_ct1_vs_ct2,
        'DEGs_'+name_cell_type1+'_vs_rest': df_ct1_vs_rest,
        'DEGs_'+name_cell_type2+'_vs_rest': df_ct2_vs_rest,
        'markers_'+name_cell_type1: mg_ct1,
        'markers_'+name_cell_type2: mg_ct2,
        'overlap_markers': mg_overlap,
        'drop_'+name_cell_type1: drop_ct1,
        'drop_'+name_cell_type2: drop_ct2,
    }
    return results

In [15]:
def DEGs_between_each_leiden_cluster_and_rest_compared_to_markers(adata,name_cell_types, putative_leiden_clusters_per_cell_type, path_mg)->dict:
    """Rank genes per cell type and list positive DEGs that are not yet marker genes.

    For every entry of ``putative_leiden_clusters_per_cell_type`` all clusters are relabelled to
    the first cluster id of that entry; the result is stored in ``adata.obs['leiden_mod']`` and
    used as grouping for a Wilcoxon ``rank_genes_groups`` run (which overwrites
    ``adata.uns['rank_genes_groups']``). The i-th entry is reported under ``name_cell_types[i]``.

    Parameters
    ----------
    adata
        Annotated data with a ``'leiden'`` column in ``obs``.
    name_cell_types
        Cell type names, in the same order as ``putative_leiden_clusters_per_cell_type``.
        Every name except 'Unknown' must be a column of the marker gene file.
    putative_leiden_clusters_per_cell_type
        Per cell type, a sequence whose items carry the leiden cluster id as first element.
    path_mg
        Path to the marker gene csv file (genes in the first column).

    Returns
    -------
    dict
        'DEGs': per cell type the genes with adjusted p-value < 0.01,
        'markers': per cell type the genes with a value > 0 in the marker file ([] for 'Unknown'),
        'pos_DEGs_but_not_marker': per cell type the rows of 'DEGs' with a positive log fold
        change whose gene is not a marker of that cell type.
    """
    merged_leiden = adata.obs['leiden']
    representatives = []
    for cluster_entries in putative_leiden_clusters_per_cell_type:
        cluster_ids = [str(entry[0]) for entry in cluster_entries]
        # Collapse all clusters of this cell type onto its first cluster id
        for cluster_id in cluster_ids:
            merged_leiden = merged_leiden.replace(cluster_id, cluster_ids[0])
        representatives.append(cluster_ids[0])
    adata.obs['leiden_mod'] = merged_leiden
    print(representatives)

    sc.tl.rank_genes_groups(adata, groupby='leiden_mod', method = 'wilcoxon')
    ranked = adata.uns['rank_genes_groups']
    genes = pd.DataFrame(ranked['names'])
    pvals_adj = pd.DataFrame(ranked['pvals_adj'])
    logf2 = pd.DataFrame(ranked['logfoldchanges'])

    degs_per_ct = {}
    markers_per_ct = {}
    new_positive_degs_per_ct = {}
    df_mg = pd.read_csv(path_mg,index_col=0)

    for position, group in enumerate(representatives):
        ct_table = pd.concat([genes[[group]], pvals_adj[[group]], logf2[[group]]], axis=1)
        ct_table.columns = ['gene','pvals_adj','logf2']
        ct_table = ct_table[ct_table['pvals_adj'] < 0.01]

        ct_name = name_cell_types[position]
        degs_per_ct[ct_name] = ct_table

        # 'Unknown' has no column in the marker gene file
        if ct_name == 'Unknown':
            markers_ct = []
        else:
            markers_ct = df_mg.index[df_mg[ct_name]>0].tolist()
        markers_per_ct[ct_name] = markers_ct

        is_new_positive = (~ct_table['gene'].isin(markers_ct)) & (ct_table['logf2'] > 0)
        new_positive_degs_per_ct[ct_name] = ct_table[is_new_positive]

    results = {
        'DEGs': degs_per_ct,
        'markers': markers_per_ct,
        'pos_DEGs_but_not_marker': new_positive_degs_per_ct,
    }
    return results

In [1]:
def Jaccard(list1,list2):
    """Jaccard similarity of two equally long score lists, rounded to 3 decimals.

    An element counts as present when its value is > 0. The similarity is the number of
    positions present in both lists divided by the number of positions present in at least
    one of them. The input lists are not modified.

    Parameters
    ----------
    list1, list2
        Sequences of numbers of the same length.

    Returns
    -------
    numpy.float64
        The rounded similarity, or NaN when no position is present in either list
        (the similarity is undefined in that case).

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(list1) != len(list2):
        raise ValueError(
            "Jaccard needs two sequences of equal length, got "
            + str(len(list1)) + " and " + str(len(list2))
        )

    # Work on boolean copies so the caller's lists stay untouched and values <= 0
    # (including negative ones) never leak into the sums.
    present1 = np.asarray(list1) > 0
    present2 = np.asarray(list2) > 0

    union_size = np.sum(present1 | present2)
    if union_size == 0:
        return np.float64(np.nan)
    intersection_size = np.sum(present1 & present2)
    return np.round(intersection_size / union_size, 3)

In [17]:
def Jaccard_similarity_matrix(path_mg,name):
    """Compute and plot the pairwise Jaccard similarity between the marker gene sets.

    Every column of the csv file at ``path_mg`` is compared with every column (itself included)
    through ``Jaccard``. The matrix is drawn as an annotated heatmap titled
    ``'Jaccard similarity matrix ' + name`` and the column sums of the marker file are printed.

    Parameters
    ----------
    path_mg
        Path to the marker gene csv file (genes in the first column, one column per cell type).
    name
        Text appended to the plot title.

    Returns
    -------
    pandas.DataFrame
        Float matrix with the columns of the marker file as both index and columns.
    """
    marker_table = pd.read_csv(path_mg,index_col=0)
    cell_type_names = marker_table.columns

    similarity_rows = [
        [
            Jaccard(marker_table[row_name].to_list(), marker_table[col_name].to_list())
            for col_name in cell_type_names
        ]
        for row_name in cell_type_names
    ]
    similarity = pd.DataFrame(
        similarity_rows, index=cell_type_names, columns=cell_type_names
    ).astype(float)

    sns.heatmap(similarity,annot=True,fmt='.2f')
    plt.title('Jaccard similarity matrix '+name)
    print(marker_table.sum(axis=0))
    return similarity

In [18]:
def Apply_strategy_1(adata,cell_types,leiden_clusters,path_mg)->Tuple[dict, dict]:
    """Strategy 1: propose shared marker genes that could be dropped for a cell type.

    Every pair of cell types is analysed with
    ``DEGs_between_2_sets_leiden_clusters_compared_to_markers``. A gene from the 'drop_<ct>'
    list of a pair is only proposed when its value in the marker file for that cell type is
    greater than or equal to its value for the other cell type of the pair. A summary is printed.

    Parameters
    ----------
    adata
        Annotated data passed on to the pairwise DEG analysis.
    cell_types
        Names of the cell types; must be columns of the marker gene file.
    leiden_clusters
        Per cell type (same order as ``cell_types``) its putative leiden clusters.
    path_mg
        Path to the marker gene csv file (genes in the first column).

    Returns
    -------
    (dict, dict)
        ``marker_gene_drop``: cell type -> list of ``[gene, other_cell_type]`` proposals.
        ``details_DEGs``: ``'<ct_i>_<ct_j>'`` -> result dict of the pairwise DEG analysis.
    """
    df_mg = pd.read_csv(path_mg,index_col=0)
    n_ct = len(cell_types)
    marker_gene_drop = {}
    details_DEGs = {}
    for i in range(n_ct):
        for j in range(i+1,n_ct):
            ct_i, ct_j = cell_types[i], cell_types[j]
            d = DEGs_between_2_sets_leiden_clusters_compared_to_markers(adata,ct_i,leiden_clusters[i],ct_j,leiden_clusters[j],path_mg)
            details_DEGs[ct_i+'_'+ct_j] = d
            # Same check in both directions of the pair
            for own, other in ((ct_i, ct_j), (ct_j, ct_i)):
                for g in d['drop_'+own]:
                    if df_mg.loc[g,own] >= df_mg.loc[g,other]:
                        marker_gene_drop.setdefault(own, []).append([g,other])
    print('Summary:')
    for key in marker_gene_drop.keys():
        print(key)
        print('Maybe drop:'+str(marker_gene_drop[key]))
    return marker_gene_drop, details_DEGs

In [1]:
def Apply_strategy_2(adata,cell_types,leiden_clusters,path_mg)->Tuple[dict, dict]:
    """Strategy 2: propose genes that could be added as marker gene of a cell type.

    ``DEGs_between_each_leiden_cluster_and_rest_compared_to_markers`` is run once. Every gene
    of ``adata.var_names`` that is listed in its 'pos_DEGs_but_not_marker' table of a cell type
    is proposed for that cell type. For every proposed gene the current and the candidate cell
    types are printed, followed by a summary.

    Parameters
    ----------
    adata
        Annotated data; ``var_names`` provides the genes that are checked.
    cell_types
        Names of the cell types, in the same order as ``leiden_clusters``.
    leiden_clusters
        Per cell type its putative leiden clusters.
    path_mg
        Path to the marker gene csv file (genes in the first column).

    Returns
    -------
    (dict, dict)
        ``marker_gene_add``: cell type -> list of proposed genes.
        ``dict_DEGs``: the result dict of the DEG analysis.
    """
    dict_DEGs = DEGs_between_each_leiden_cluster_and_rest_compared_to_markers(adata,cell_types,leiden_clusters,path_mg)
    df_mg = pd.read_csv(path_mg,index_col=0)
    marker_genes = set(df_mg.index)

    # Build the lookup once per cell type instead of once per (gene, cell type) pair
    candidate_genes = {
        ct: set(dict_DEGs['pos_DEGs_but_not_marker'][ct]['gene'])
        for ct in cell_types
    }

    marker_gene_add = {}
    for gene in adata.var_names:
        candidates = [ct for ct in cell_types if gene in candidate_genes[ct]]
        if len(candidates) == 0:
            continue
        for ct in candidates:
            marker_gene_add.setdefault(ct, []).append(gene)

        print(gene)
        if gene in marker_genes:
            marker_row = df_mg.loc[gene,:]
            current_cell_types = marker_row[marker_row>0].index.values.tolist()
        else:
            current_cell_types = []
        print('Is marker gene of: '+str(current_cell_types))
        print('Could also be a marker gene of: '+str(candidates))
    print('Summary:')
    for key in marker_gene_add.keys():
        print(key)
        print('Maybe add:'+str(marker_gene_add[key]))
    return marker_gene_add, dict_DEGs

In [20]:
def _finish_deg_scatter(ax, title, legend_outside=True, legend_ncols=2, xlim=None, ylim=None):
    """Apply the legend/axis formatting shared by all DEG scatter plots and show the figure.

    ax: matplotlib Axes holding the scatter points (adjusted p-value on x, log fold change on y).
    title: title of the plot.
    legend_outside: if True the axis is shrunk to 80% of its width and the legend is placed to
        the right of it, otherwise the legend is drawn inside the axis.
    legend_ncols: number of columns of the legend.
    xlim, ylim: optional ``(min, max)`` axis limits; an axis is left untouched when None.
    """
    if legend_outside:
        # Shrink current axis by 20% and put the legend to the right of it
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncols=legend_ncols)
    else:
        ax.legend(ncols=legend_ncols)
    if ylim is not None:
        ax.set_ylim(ylim[0], ylim[1])
    if xlim is not None:
        ax.set_xlim(xlim[0], xlim[1])
    ax.set_xscale('log')
    ax.set_xlabel('pvals_adj')
    ax.set_ylabel('logf2')
    ax.set_title(title)
    plt.show()


def Apply_strategy_multiple_times(adata,annotation,path_mg,N,strategy,output_dir):
    """Run strategy 1 or 2 on ``N`` random, size-balanced subsamples and keep the stable results.

    Steps:

    1. The cell types are the unique values of ``adata.obs[annotation]``; their leiden clusters
       come from ``clusteringVSleiden``. Cell types without clusters and 'Unknown' are ignored.
    2. For every remaining cell type only cells that lie in its clusters *and* carry its
       annotation are used. In each of the ``N`` runs the same number of cells (the size of the
       smallest such group) is sampled at random per cell type, without a fixed seed.
    3. ``Apply_strategy_1`` (``strategy == 1``) or ``Apply_strategy_2`` (``strategy == 2``) is
       applied to the concatenated sample.
    4. Proposals that occur in more than ``N/2`` runs are kept.
    5. Scatter plots of adjusted p-value versus log fold change are shown, the kept DEGs are
       written to ``<output_dir>/DEGs_kept_strategy_<strategy>_all_info.txt`` and two scanpy
       heatmaps (all cells, and all cells whose annotation is not 'Hepa') are saved.

    Parameters
    ----------
    adata
        Annotated data with ``'leiden'`` and ``annotation`` columns in ``obs``.
    annotation
        Name of the ``obs`` column holding the cell type annotation.
    path_mg
        Path to the marker gene csv file.
    N
        Number of subsampling runs.
    strategy
        1 (drop shared marker genes) or 2 (add marker genes); any other value skips the
        strategy, the plots and the files.
    output_dir
        Existing directory for the text output.

    Returns
    -------
    (dict, dict, dict)
        ``final``: cell type -> proposals kept over the runs (strategy 1: ``(gene, other
        cell type)`` tuples, strategy 2: gene names).
        ``results_runs``: run index -> proposals of that run.
        ``DEG_details_runs``: run index -> DEG details of that run. The frames inside gain the
        helper columns 'ct1'/'ct2' (strategy 1) or 'cell_type' (strategy 2).

    Raises
    ------
    ValueError
        If no cell type has both assigned leiden clusters and a name other than 'Unknown'.
    """
    results_runs = {}
    DEG_details_runs = {}
    Not_considered_celltypes = []
    Not_considered_leiden_clusters = []

    cell_types = adata.obs[annotation].unique().tolist()
    leiden_clusters = clusteringVSleiden(adata,annotation,"leiden",cell_types=cell_types,print_results=False)

    # Number of usable cells per considered cell type; the smallest one fixes the sample size
    nr_cells_of_each_cons_ct = []
    for i in range(len(cell_types)):
        if len(leiden_clusters[i])>0 and cell_types[i] != 'Unknown':
            leiden_cl = [str(x[0]) for x in leiden_clusters[i]]
            leiden = adata[(adata.obs['leiden'].isin(leiden_cl))&(adata.obs[annotation]==cell_types[i]),:]
            nr_cells_of_each_cons_ct.append(len(leiden))

    if len(nr_cells_of_each_cons_ct) == 0:
        raise ValueError("No cell type with assigned leiden clusters found in '"+str(annotation)+"'")
    n_cells = min(nr_cells_of_each_cons_ct)

    print(""+str(n_cells)+" cells randomly sampled from each cell type in every iteration to do the DEG analysis")

    for k in range(N):
        list_sub_anndata = []
        for i in range(len(cell_types)):
            if len(leiden_clusters[i])>0 and cell_types[i] != 'Unknown':
                leiden_cl = [str(x[0]) for x in leiden_clusters[i]]
                leiden = adata[(adata.obs['leiden'].isin(leiden_cl))&(adata.obs[annotation]==cell_types[i]),:]
                leiden_cells = leiden.obs.index.to_list()
                leiden_cells_random = sample(leiden_cells,n_cells)
                leiden = leiden[leiden_cells_random,:]
                list_sub_anndata.append(leiden)
            else:
                Not_considered_celltypes.append(cell_types[i])
                if leiden_clusters[i] not in Not_considered_leiden_clusters:
                    Not_considered_leiden_clusters.append(leiden_clusters[i])
        # From here on only the considered cell types (and their clusters) remain
        cell_types = [x for x in cell_types if x not in Not_considered_celltypes]
        leiden_clusters = [x for x in leiden_clusters if x not in Not_considered_leiden_clusters]
        sub_anndata = ad.concat(list_sub_anndata)
        if strategy == 1:
            results_runs[k],DEG_details_runs[k] = Apply_strategy_1(sub_anndata,cell_types,leiden_clusters,path_mg)
        if strategy == 2:
            results_runs[k],DEG_details_runs[k] = Apply_strategy_2(sub_anndata,cell_types,leiden_clusters,path_mg)

    # Pool the proposals of all runs per cell type. A fresh list is built so that the
    # per-run lists in `results_runs` (which are returned) are not modified.
    counts_run = {}
    for run_result in results_runs.values():
        for k, proposals in run_result.items():
            counts_run.setdefault(k, []).extend(proposals)

    final = {}
    for key in counts_run:
        df = pd.DataFrame(counts_run[key]).value_counts()
        print(df)
        # keep rows with value > N/2
        df = df[df > N/2]
        final[key] = df.index.to_list()

    colors = [matplotlib.colors.to_hex(c) for c in plt.cm.tab20.colors]
    colors += [matplotlib.colors.to_hex(c) for c in plt.cm.tab20b.colors]
    n_colors = len(colors)

    if strategy == 1:
        # (cell type, gene, other cell type) for every proposal that was kept
        sign_tuples = [(key, t[0], t[1]) for key in final.keys() for t in final[key]]
        sign_as_ct1 = pd.DataFrame(sign_tuples, columns=['ct1','gene','ct2'])
        sign_as_ct2 = pd.DataFrame(sign_tuples, columns=['ct2','gene','ct1'])

        DEGs_runs = {}
        DEGs_runs_sign = {}
        n_ct = len(cell_types)
        for r in range(N):
            dfs = []
            for i in range(n_ct):
                for j in range(i+1,n_ct):
                    if cell_types[i] != 'Unknown' and cell_types[j] != 'Unknown':
                        pair_details = DEG_details_runs[r][cell_types[i]+'_'+cell_types[j]]
                        # NOTE: the columns are added to the frame stored in DEG_details_runs
                        df = pair_details['DEGs']
                        df['ct1'] = cell_types[i]
                        df['ct2'] = cell_types[j]
                        # only keep rows with value of gene in overlap_markers
                        df = df[df['gene'].isin(pair_details['overlap_markers'])]
                        # only keep rows if gene is not sign expressed vs the rest in cell type with smaller expression (see def strategy 1)
                        ct1_sign_vs_rest = pair_details['DEGs_'+cell_types[i]+'_vs_rest']['gene_'+cell_types[i]].to_list()
                        ct2_sign_vs_rest = pair_details['DEGs_'+cell_types[j]+'_vs_rest']['gene_'+cell_types[j]].to_list()
                        genes = df['gene'].to_list()
                        for g in genes:
                            # get logf2 of row in df with gene equal to g
                            logf2 = df[df['gene']==g]['logf2'].to_list()[0]
                            if logf2 > 0:
                                if g in ct2_sign_vs_rest:
                                    df = df[df['gene'] != g]
                            else:
                                if g in ct1_sign_vs_rest:
                                    df = df[df['gene'] != g]
                        dfs.append(df)
            DEGs_runs[r] = pd.concat(dfs)

            # A kept proposal can match a row in either orientation of the pair
            DEGs_runs_sign[r] = pd.concat([
                DEGs_runs[r].merge(sign_as_ct1),
                DEGs_runs[r].merge(sign_as_ct2),
            ])

        df_all_together = pd.concat([DEGs_runs[r] for r in range(N)])
        df_all_together_sign = pd.concat([
            df_all_together.merge(sign_as_ct1),
            df_all_together.merge(sign_as_ct2),
        ])

        logpadj_min_plot = df_all_together['pval_adj'].min()/10
        logpadj_max_plot = 0.5
        logf2_max_plot = df_all_together['logf2'].max() + 0.2
        logf2_min_plot = df_all_together['logf2'].min() - 0.2
        pval_limits = (logpadj_min_plot, logpadj_max_plot)
        logf2_limits = (logf2_min_plot, logf2_max_plot)

        # All DEGs found, coloured per run
        fig, ax = plt.subplots()
        c = 0
        for run in DEGs_runs.keys():
            ax.scatter(DEGs_runs[run]['pval_adj'].to_list(),DEGs_runs[run]['logf2'].to_list(), label='run '+str(run),c=colors[c])
            c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_found_strategy_1', xlim=pval_limits, ylim=logf2_limits)

        # DEGs kept, coloured per run
        fig, ax = plt.subplots()
        c = 0
        for run in DEGs_runs_sign.keys():
            ax.scatter(DEGs_runs_sign[run]['pval_adj'].to_list(),DEGs_runs_sign[run]['logf2'].to_list(), label='run '+str(run),c=colors[c])
            c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_kept_strategy_1', legend_ncols=1)

        # All DEGs found, coloured per pair of cell types
        fig, ax = plt.subplots()
        c = 0
        for i in range(n_ct):
            for j in range(n_ct):
                df = df_all_together[df_all_together['ct1'] == cell_types[i]]
                df = df[df['ct2'] == cell_types[j]]
                if len(df) > 0:
                    ax.scatter(df['pval_adj'].to_list(),df['logf2'].to_list(), label=cell_types[i]+'/'+cell_types[j],c=colors[c])
                    c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_found_strategy_1', xlim=pval_limits, ylim=logf2_limits)

        # DEGs kept, coloured per pair of cell types
        fig, ax = plt.subplots()
        c = 0
        for i in range(n_ct):
            for j in range(n_ct):
                df = df_all_together_sign[df_all_together_sign['ct1'] == cell_types[i]]
                df = df[df['ct2'] == cell_types[j]]
                if len(df) > 0:
                    ax.scatter(df['pval_adj'].to_list(),df['logf2'].to_list(), label=cell_types[i]+'/'+cell_types[j],c=colors[c])
                    c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_kept_strategy_1')

        # All DEGs found, one figure per 10 genes, coloured per gene and pair
        genes_all = df_all_together['gene'].unique()
        genes_all_sets = [genes_all[i:i+10] for i in range(0,len(genes_all),10)]

        for genes in genes_all_sets:
            c = 0
            fig, ax = plt.subplots()
            for i in range(n_ct):
                for j in range(i+1,n_ct):
                    for g in genes:
                        df = df_all_together[df_all_together['ct1'] == cell_types[i]]
                        df = df[df['ct2'] == cell_types[j]]
                        df = df[df['gene'] == g]
                        if len(df) > 0:
                            ax.scatter(df['pval_adj'].to_list(),df['logf2'].to_list(), label=g + ' ('+cell_types[i]+'/'+cell_types[j]+')',c=colors[c])
                            c = (c + 1)%n_colors
            _finish_deg_scatter(ax, 'DEGs_found_strategy_1', xlim=pval_limits, ylim=logf2_limits)

        # DEGs kept, one figure per 20 genes; every plotted group is also written to file
        genes_all_sign = df_all_together_sign['gene'].unique()
        genes_all_sets_sign = [genes_all_sign[i:i+20] for i in range(0,len(genes_all_sign),20)]

        # The context manager closes the file even when plotting fails half-way
        with open(output_dir+'/DEGs_kept_strategy_1_all_info.txt', 'w+') as g1:
            for genes in genes_all_sets_sign:
                c = 0
                fig, ax = plt.subplots()
                for i in range(n_ct):
                    for j in range(i+1,n_ct):
                        for g in genes:
                            df = df_all_together_sign[df_all_together_sign['ct1'] == cell_types[i]]
                            df = df[df['ct2'] == cell_types[j]]
                            df = df[df['gene'] == g]
                            if len(df) > 0:
                                ax.scatter(df['pval_adj'].to_list(),df['logf2'].to_list(), label=g + ' ('+cell_types[i]+'/'+cell_types[j]+')',c=colors[c])
                                c = (c + 1)%n_colors
                                # relation between the marker values of both cell types, derived from the sign of the mean logf2
                                if np.average(df['logf2'].to_list()) > 0:
                                    relation = '<='
                                else:
                                    relation = '>='
                                g1.write(g + ' ' + cell_types[i]+'/'+ cell_types[j] + ' ' + str(df['logf2'].to_list()) + ' ' + str(df['pval_adj'].to_list()) + ' <->markers: ' + cell_types[i] + ' ' + relation + ' ' + cell_types[j] + '\n')
                _finish_deg_scatter(ax, 'DEGs_kept_strategy_1')

        genes_drop = []
        for keys in final.keys():
            for g in final[keys]:
                if g[0] not in genes_drop:
                    genes_drop.append(g[0])
        sc.pl.heatmap(adata,var_names=genes_drop,groupby=annotation,standard_scale='var',show_gene_labels=True,save='_strategy_1_genes.png')
        # Second heatmap without the cells annotated as 'Hepa'
        adata_sub = adata[~(adata.obs[annotation]=='Hepa'),:]
        sc.pl.heatmap(adata_sub,var_names=genes_drop,groupby=annotation,standard_scale='var',show_gene_labels=True,save='_strategy_1_genes_noHepa.png')

    if strategy == 2:
        # value_counts on a one-column frame yields 1-tuples; unpack them to plain gene names
        for key in final.keys():
            final[key] = [i[0] for i in final[key]]

        sign_tuples = [(g, key) for key in final.keys() for g in final[key]]
        sign_frame = pd.DataFrame(sign_tuples, columns=['gene','cell_type'])

        DEGs_runs_all_cell_types_together = {}
        for i in range(N):
            dfs = []
            for ct in cell_types:
                # NOTE: the column is added to the frame stored in DEG_details_runs
                df = DEG_details_runs[i]['pos_DEGs_but_not_marker'][ct]
                df['cell_type'] = ct
                dfs.append(df)
            DEGs_runs_all_cell_types_together[i] = pd.concat(dfs)

        df_all_together = pd.concat([DEGs_runs_all_cell_types_together[i] for i in range(N)])
        df_all_together_sign = df_all_together.merge(sign_frame)
        genes_all = df_all_together['gene'].unique()
        genes_all_sign = df_all_together_sign['gene'].unique()

        logpadj_min_plot = df_all_together['pvals_adj'].min()/10
        logpadj_max_plot = 0.5
        logf2_max_plot = df_all_together['logf2'].max() + 0.2
        logf2_min_plot = -0.2
        pval_limits = (logpadj_min_plot, logpadj_max_plot)
        logf2_limits = (logf2_min_plot, logf2_max_plot)

        # All DEGs found, coloured per run
        fig, ax = plt.subplots()
        c = 0
        for run in DEGs_runs_all_cell_types_together:
            ax.scatter(DEGs_runs_all_cell_types_together[run]['pvals_adj'].to_list(),DEGs_runs_all_cell_types_together[run]['logf2'].to_list(), label='run '+str(run),c=colors[c])
            c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_found_strategy_2', legend_outside=False, xlim=pval_limits, ylim=logf2_limits)

        # DEGs kept, coloured per run
        fig, ax = plt.subplots()
        c = 0
        for run in range(N):
            df = DEGs_runs_all_cell_types_together[run].merge(sign_frame)
            ax.scatter(df['pvals_adj'].to_list(),df['logf2'].to_list(), label='run '+str(run),c=colors[c])
            c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_kept_strategy_2', legend_outside=False)

        # All DEGs found, coloured per cell type
        fig, ax = plt.subplots()
        c = 0
        for ct in cell_types:
            # select only the rows of the cell type
            df = df_all_together[df_all_together['cell_type'] == ct]
            ax.scatter(df['pvals_adj'].to_list(),df['logf2'].to_list(), label=str(ct),c=colors[c])
            c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_found_strategy_2', legend_outside=False, xlim=pval_limits, ylim=logf2_limits)

        # DEGs kept, coloured per cell type
        fig, ax = plt.subplots()
        c = 0
        for ct in cell_types:
            # select only the rows of the cell type
            df = df_all_together_sign[df_all_together_sign['cell_type'] == ct]
            ax.scatter(df['pvals_adj'].to_list(),df['logf2'].to_list(), label=ct, c = colors[c])
            c = (c + 1)%n_colors
        _finish_deg_scatter(ax, 'DEGs_kept_strategy_2', legend_outside=False)

        # All DEGs found, one figure per 20 genes, coloured per gene and cell type
        genes_all_sets = [genes_all[i:i+20] for i in range(0,len(genes_all),20)]
        for genes in genes_all_sets:
            c = 0
            fig, ax = plt.subplots()
            for gene in genes:
                # select only the rows of the gene
                df = df_all_together[df_all_together['gene'] == gene]
                # cell types in which this gene was found (own name: `cell_types` is left untouched)
                gene_cell_types = df['cell_type'].unique()
                for ct in gene_cell_types:
                    df_ct = df[df['cell_type'] == ct]
                    ax.scatter(df_ct['pvals_adj'].to_list(),df_ct['logf2'].to_list(), label=gene + ' ('+ct+')',c=colors[c])
                    c = (c + 1)%n_colors
            _finish_deg_scatter(ax, 'DEGs_found_strategy_2')

        # DEGs kept, one figure per 20 genes; every plotted group is also written to file
        genes_all_sets = [genes_all_sign[i:i+20] for i in range(0,len(genes_all_sign),20)]
        # The context manager closes the file even when plotting fails half-way
        with open(output_dir+'/DEGs_kept_strategy_2_all_info.txt', 'w+') as g2:
            for genes in genes_all_sets:
                c = 0
                fig, ax = plt.subplots()
                for gene in genes:
                    # select only the rows of the gene
                    df = df_all_together_sign[df_all_together_sign['gene'] == gene]
                    gene_cell_types = df['cell_type'].unique()
                    for ct in gene_cell_types:
                        df_ct = df[df['cell_type'] == ct]
                        ax.scatter(df_ct['pvals_adj'].to_list(),df_ct['logf2'].to_list(), label=gene + ' ('+ct+')',c=colors[c])
                        c = (c + 1)%n_colors
                        g2.write(''+gene + ' ' + ct + ' ' + str(df_ct['logf2'].to_list()) + ' ' + str(df_ct['pvals_adj'].to_list()) + '\n')
                _finish_deg_scatter(ax, 'DEGs_kept_strategy_2')

        genes_add = []
        for keys in final.keys():
            for g in final[keys]:
                if g not in genes_add:
                    genes_add.append(g)

        sc.pl.heatmap(adata,var_names=genes_add,groupby=annotation,standard_scale='var',show_gene_labels=True,save='_strategy_2_genes.png')
        # Second heatmap without the cells annotated as 'Hepa'
        adata_sub = adata[~(adata.obs[annotation]=='Hepa'),:]
        sc.pl.heatmap(adata_sub,var_names=genes_add,groupby=annotation,standard_scale='var',show_gene_labels=True,save='_strategy_2_genes_noHepa.png')

    return final, results_runs, DEG_details_runs

In [1]:
def clusteringVSleiden(adata, celltype_column, leiden_column, cell_types = 'all', print_results=True):
    """Group Leiden clusters by their majority cell type.

    For every cluster in ``adata.obs[leiden_column]`` the fraction of cells per
    label of ``adata.obs[celltype_column]`` is computed. A cluster is filed under
    its most frequent cell type when that fraction is strictly above 0.5, and
    under ``'Unknown'`` otherwise.

    Parameters
    ----------
    adata : object exposing an ``obs`` DataFrame
    celltype_column : str
        Column of ``adata.obs`` holding the cell type annotation.
    leiden_column : str
        Column of ``adata.obs`` holding the cluster labels.
    cell_types : 'all' or list
        Cell types to report. ``'all'`` uses every label found in
        ``celltype_column``. ``'Unknown'`` is added when missing. The list passed
        by the caller is never modified.
    print_results : bool
        When True, print the per-cell-type clusters and draw the summary plots.

    Returns
    -------
    list of lists
        One list per reported cell type (same order as ``cell_types``, with
        ``'Unknown'`` last if it had to be added). Each element is a tuple
        ``(row position of the cluster, majority fraction rounded to 2 decimals,
        number of cells in the cluster, majority cell type,
        fraction of 'Unknown' cells rounded to 2 decimals or 0)``.
    """
    if isinstance(cell_types, str) and cell_types == 'all':
        cell_types = adata.obs[celltype_column].unique().tolist()
    else:
        # Copy: 'Unknown' may be appended below and the caller's list must stay intact.
        cell_types = list(cell_types)

    # Cells per (cluster, cell type), one row per cluster, normalised to fractions.
    # pivot() arguments are passed by keyword because they are keyword-only in pandas >= 2.
    stacked = (
            adata.obs.groupby([leiden_column, celltype_column], as_index=False)
            .size()
            .pivot(index=leiden_column, columns=celltype_column)
            .fillna(0)
        )
    stacked_norm = stacked.div(stacked.sum(axis=1), axis=0)
    # Columns are ('size', cell type) pairs after the pivot; keep only the cell type.
    stacked_norm.columns = [x[1] for x in stacked_norm.columns]

    # Label every row with (cluster label, cluster size); the counts are computed once.
    cluster_labels = list(stacked_norm.index)
    cluster_counts = adata.obs[leiden_column].value_counts()
    cluster_sizes = [cluster_counts[label] for label in cluster_labels]
    stacked_norm.index = list(zip(cluster_labels, cluster_sizes))

    # Positional arrays avoid integer look-ups on a label-based index.
    majority_fraction = stacked_norm.max(axis=1).to_numpy()
    majority_type = stacked_norm.idxmax(axis=1).to_numpy()
    if 'Unknown' in stacked_norm.columns:
        unknown_fraction = np.round(stacked_norm['Unknown'].to_numpy(), 2)
    else:
        unknown_fraction = [0] * len(stacked_norm)

    dict_leiden_clusters_per_ct = {}
    for i in range(len(stacked_norm)):
        entry = (i, np.round(majority_fraction[i], 2), cluster_sizes[i], majority_type[i], unknown_fraction[i])
        # Every cluster is recorded, not only the first one seen for a cell type.
        bucket = majority_type[i] if majority_fraction[i] > 0.5 else 'Unknown'
        dict_leiden_clusters_per_ct.setdefault(bucket, []).append(entry)

    if 'Unknown' not in cell_types:
        cell_types.append('Unknown')

    leiden_clusters_per_cell_type = [dict_leiden_clusters_per_ct.get(ct, []) for ct in cell_types]

    if print_results:
        for ct, clusters in zip(cell_types, leiden_clusters_per_cell_type):
            print(ct)
            print(clusters)

        # Stacked composition bars, 35 clusters per figure to keep them readable.
        for i in range(0,len(stacked_norm),35):
            st_n = stacked_norm.iloc[i:i+35,:]
            fig, ax = plt.subplots(1, 1, figsize=(10, 5))
            st_n.plot(kind="bar", stacked=True, ax=ax,colormap="tab20b")
            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)
            ax.spines["bottom"].set_visible(False)
            ax.spines["left"].set_visible(False)
            ax.get_yaxis().set_ticks([])
            plt.xlabel("Clusters")
            plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5), fontsize="large")
            plt.show()
            plt.close(fig)

        all_values = [entry for clusters in leiden_clusters_per_cell_type for entry in clusters]
        # First-appearance order keeps the histogram colours stable between runs.
        keys = list(dict.fromkeys(x[3] for x in all_values))
        dict_plot = {k: [x[1] for x in all_values if x[3] == k] for k in keys}

        colors = [matplotlib.colors.to_hex(c) for c in plt.cm.tab20.colors]
        data = [dict_plot[k] for k in keys]

        # Histogram of cluster homogeneity, stacked by majority cell type.
        plt.hist(data, color=[colors[k % len(colors)] for k in range(len(keys))], label=keys,bins=30,stacked=True)
        plt.xlabel('Homogeneity')
        plt.ylabel('Cluster Count')
        plt.legend(title='Majority cell type')
        plt.show()
        plt.close()

        # Homogeneity against the 'Unknown' fraction; black marks clusters containing 'Unknown' cells.
        homogeneities = [x[1] for x in all_values]
        fraction_unknown = [x[4] for x in all_values]
        for i in range(len(homogeneities)):
            if fraction_unknown[i] > 0:
                plt.scatter(homogeneities[i], fraction_unknown[i], color='black')
            else:
                plt.scatter(homogeneities[i], fraction_unknown[i], color='blue')
        plt.xlabel('Cluster homogeneity')
        plt.ylabel('Fraction cell type Unknown')

    return leiden_clusters_per_cell_type

In [None]:
def compare_annotations_composition(anndata,sample_name,annotations,names_anno_plot,drop_cell_types=[],plot=True,save=False,saved_as=''):
    """Percentage of cells per cell type for several annotations of one sample.

    Parameters
    ----------
    anndata : object exposing an ``obs`` DataFrame
    sample_name : str
        Used in the plot title.
    annotations : list of str
        Columns of ``anndata.obs`` to compare.
    names_anno_plot : list of str
        Display name of each annotation (y axis of the plot).
    drop_cell_types : list
        Cell types to leave out; entries that do not occur are ignored.
    plot, save, saved_as
        Draw a scatter plot and optionally write it to ``saved_as + '.png'``.

    Returns
    -------
    list of lists
        One row per cell type (sorted, dropped types removed); each row holds the
        percentage of cells with that type in every annotation.
    """
    cell_types = []
    for anno in annotations:
        # .tolist() exists on both the Categorical and the ndarray returned by unique()
        cell_types.extend(anndata.obs[anno].unique().tolist())
    dropped = set(drop_cell_types)
    cell_types = [ct for ct in np.unique(cell_types).tolist() if ct not in dropped]

    # Count each annotation once instead of once per cell type.
    counts_per_annotation = [
        (anndata.obs[annotation].value_counts(), len(anndata.obs[annotation]))
        for annotation in annotations
    ]

    all_fractions_per_cell_type = []
    for ct in cell_types:
        fraction_in_each_annotation = []
        for ct_counts, nr_cells in counts_per_annotation:
            count = ct_counts[ct] if ct in ct_counts.index else 0
            fraction_in_each_annotation.append((count/nr_cells)*100)
        all_fractions_per_cell_type.append(fraction_in_each_annotation)

    if plot:
        colors = [matplotlib.colors.to_hex(c) for c in plt.cm.tab20.colors]
        colors += [matplotlib.colors.to_hex(c) for c in plt.cm.tab20b.colors]
        fig, ax = plt.subplots()
        for i, fractions in enumerate(all_fractions_per_cell_type):
            ax.scatter(fractions,names_anno_plot,label=cell_types[i], color=colors[i % len(colors)])
        ax.set_xlabel("Percentage")
        ax.set_title("Cell type percentages in each annotation of "+sample_name)
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        if save:
            fig.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')

    return all_fractions_per_cell_type

In [None]:
def compare_annotations_samples_cell_type_percentage(anndata_objects,sample_names,annotations,names_anno_plot,chosen_ct,plot=True,save=False,saved_as='',mean=False,suffix=''):
    """Percentage of cells labelled ``chosen_ct`` per sample and per annotation.

    Returns a list with one row per object in ``anndata_objects``; each row holds
    the percentage of ``chosen_ct`` cells for every column in ``annotations``.
    With ``plot=True`` the rows are drawn as scatter series against
    ``names_anno_plot`` (plus their mean as black crosses when ``mean=True``),
    and the figure is written to ``saved_as + '.png'`` when ``save=True``.
    """
    def _percentage(sample, annotation_key):
        # Share of cells carrying chosen_ct in one annotation column, 0 when absent.
        labels = sample.obs[annotation_key]
        total = len(labels)
        tally = labels.value_counts()
        hits = tally[chosen_ct] if chosen_ct in tally.index else 0
        return (hits/total)*100

    all_fractions_per_sample = [
        [_percentage(sample, annotation_key) for annotation_key in annotations]
        for sample in anndata_objects
    ]

    if plot:
        fig, ax = plt.subplots()
        # One scatter series per sample; the mean markers are added on top if requested.
        for idx in range(len(all_fractions_per_sample)):
            ax.scatter(all_fractions_per_sample[idx],names_anno_plot,label = sample_names[idx])
        if mean:
            averaged = np.mean(all_fractions_per_sample,axis=0)
            ax.scatter(averaged.tolist(),names_anno_plot,c='black',label='Mean',marker='x')
        plt.title("Percentage of '" + chosen_ct + "' in each annotation"+suffix)
        plt.xlabel("Percentage")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        # Narrow the axes so the legend fits to the right of the plot.
        frame = ax.get_position()
        ax.set_position([frame.x0, frame.y0, frame.width * 0.8, frame.height])
        if save:
            plt.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')

    return all_fractions_per_sample

In [None]:
def compare_annotations_samples_mean_composition(anndata_objects,annotations,names_anno_plot,drop_cell_types=[],plot=True,save=False,saved_as='',suffix=''):
    """Cell type composition of every annotation, averaged over samples.

    The cell types are the union over all samples and annotations (minus
    ``drop_cell_types``), so every sample contributes a row for every cell type
    (0 % when the type is absent) and rows stay aligned with their labels.

    Parameters
    ----------
    anndata_objects : list of objects exposing an ``obs`` DataFrame
    annotations : list of str
        Columns of ``obs`` to compare.
    names_anno_plot : list of str
        Display name of each annotation (y axis of the plot).
    drop_cell_types : list
        Cell types to leave out; entries that do not occur are ignored.
    plot, save, saved_as, suffix
        Draw a stacked horizontal bar chart, optionally saved to
        ``saved_as + '.png'``; ``suffix`` is appended to the title.

    Returns
    -------
    numpy.ndarray
        Shape (number of cell types, number of annotations): mean percentage of
        each cell type in each annotation.
    """
    dropped = set(drop_cell_types)
    cell_types = []
    for anndata in anndata_objects:
        for anno in annotations:
            cell_types.extend(anndata.obs[anno].unique().tolist())
    cell_types = [ct for ct in np.unique(cell_types).tolist() if ct not in dropped]

    sample_compositions = []
    for anndata in anndata_objects:
        counts_per_annotation = [
            (anndata.obs[anno].value_counts(), len(anndata.obs[anno])) for anno in annotations
        ]
        composition = []
        for ct in cell_types:
            composition.append([
                ((ct_counts[ct] if ct in ct_counts.index else 0)/nr_cells)*100
                for ct_counts, nr_cells in counts_per_annotation
            ])
        sample_compositions.append(composition)
    mean_composition = np.mean(sample_compositions,axis=0)

    if plot:
        colors = [matplotlib.colors.to_hex(c) for c in plt.cm.tab20.colors]
        colors += [matplotlib.colors.to_hex(c) for c in plt.cm.tab20b.colors]
        fig, ax = plt.subplots()
        c = 20
        # Running offset so that each cell type is stacked after the previous ones.
        left = 0
        for i in range(len(mean_composition)):
            ax.barh(names_anno_plot,mean_composition[i],label=cell_types[i], color=colors[c],left=left)
            left = left + mean_composition[i]
            c = (c + 2)%40
        ax.set_title("Composition (no 'Hepa') of each annotation averaged over the samples"+suffix)
        ax.set_xlabel("Percentage")
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        if save:
            fig.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')
    return mean_composition


In [None]:
def compare_annotations_cluster_homogeneity(anndata,sample_name,annotations,names_anno_plot,drop_cell_types=[],plot=True,save=False,saved_as=''):
    """Average Leiden-cluster homogeneity per cell type for several annotations.

    For every annotation, ``clusteringVSleiden`` (on the ``'leiden'`` column) is
    run once; for each cell type the homogeneity values (element 1 of the
    returned cluster tuples) are averaged.

    Parameters
    ----------
    anndata : object exposing an ``obs`` DataFrame
    sample_name : str
        Used in the plot title.
    annotations : list of str
        Columns of ``anndata.obs`` to compare.
    names_anno_plot : list of str
        Display name of each annotation (y axis of the plot).
    drop_cell_types : list
        Cell types to leave out; entries that do not occur are ignored.
    plot, save, saved_as
        Draw a scatter plot and optionally write it to ``saved_as + '.png'``.

    Returns
    -------
    (list, list of lists)
        The cell types (sorted, ``'Unknown'`` appended when missing, dropped
        types removed) and, per cell type, the mean homogeneity in every
        annotation (``nan`` when no cluster is assigned to that cell type).
    """
    cell_types = []
    for anno in annotations:
        cell_types.extend(anndata.obs[anno].unique().tolist())
    cell_types = np.unique(cell_types).tolist()
    if 'Unknown' not in cell_types:
        cell_types.append('Unknown')
    dropped = set(drop_cell_types)
    cell_types = [ct for ct in cell_types if ct not in dropped]

    # One clustering pass per annotation; the result is aligned with cell_types.
    # A copy is passed so the callee can never alter the local list.
    homog_per_annotation = []
    for annotation in annotations:
        clusters_per_ct = clusteringVSleiden(anndata,annotation,'leiden',cell_types=list(cell_types),print_results=False)
        means = []
        for i in range(len(cell_types)):
            all_max = [cluster[1] for cluster in clusters_per_ct[i]]
            means.append(np.mean(all_max) if all_max else np.nan)
        homog_per_annotation.append(means)

    # Transpose to one row per cell type.
    all_homog_per_ct = [
        [means[i] for means in homog_per_annotation] for i in range(len(cell_types))
    ]

    if plot:
        colors = [matplotlib.colors.to_hex(c) for c in plt.cm.tab20.colors]
        colors += [matplotlib.colors.to_hex(c) for c in plt.cm.tab20b.colors]
        fig, ax = plt.subplots()
        for i in range(len(cell_types)):
            ax.scatter(all_homog_per_ct[i],names_anno_plot,label = cell_types[i],color=colors[i % len(colors)])
        ax.set_xlabel("Average homogeneity of leiden clusters")
        ax.set_title("Cluster homogeneity in each annotation of "+sample_name)
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        if save:
            fig.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')

    return cell_types, all_homog_per_ct

In [None]:
def compare_annotations_samples_cluster_homogeneity(anndata_objects,sample_names,annotations,names_anno_plot,drop_cell_types=[],plot=True,save=False,saved_as='',suffix='',mean=False):
    """Average Leiden-cluster homogeneity per sample and per annotation.

    For every sample and annotation, ``clusteringVSleiden`` (on the ``'leiden'``
    column) is run for the union of cell types found in all samples (minus
    ``drop_cell_types``), and the homogeneity values (element 1 of the returned
    cluster tuples) of all reported clusters are averaged.

    Parameters
    ----------
    anndata_objects : list of objects exposing an ``obs`` DataFrame
    sample_names : list of str
        Legend label of each sample.
    annotations : list of str
        Columns of ``obs`` to compare.
    names_anno_plot : list of str
        Display name of each annotation (y axis of the plot).
    drop_cell_types : list
        Cell types to leave out; entries that do not occur are ignored.
    plot, save, saved_as, suffix, mean
        Draw a scatter plot (with the mean over samples as black crosses when
        ``mean=True``), optionally saved to ``saved_as + '.png'``.

    Returns
    -------
    list of lists
        One row per sample with the mean homogeneity for every annotation
        (``nan`` when no cluster is reported).
    """
    cell_types = []
    for anndata in anndata_objects:
        for anno in annotations:
            cell_types.extend(anndata.obs[anno].unique().tolist())
    dropped = set(drop_cell_types)
    cell_types = [ct for ct in np.unique(cell_types).tolist() if ct not in dropped]

    all_homog_per_sample = []
    for annd in anndata_objects:
        homog_in_each_annotation = []
        for annotation in annotations:
            # A copy is passed so the callee can never alter the shared list.
            leiden_clusters_per_ct = clusteringVSleiden(annd,annotation,'leiden',cell_types=list(cell_types),print_results=False)
            all_max = [cluster[1] for clusters in leiden_clusters_per_ct for cluster in clusters]
            homog_in_each_annotation.append(np.mean(all_max) if all_max else np.nan)
        all_homog_per_sample.append(homog_in_each_annotation)

    if plot:
        fig, ax = plt.subplots()
        for i in range(len(all_homog_per_sample)):
            ax.scatter(all_homog_per_sample[i],names_anno_plot,label = sample_names[i])
        if mean:
            m = np.mean(all_homog_per_sample,axis=0)
            ax.scatter(m.tolist(),names_anno_plot,c='black',label='Mean',marker='x')
        ax.set_title("Cluster homogeneity of each annotation" + suffix)
        ax.set_xlabel("Average homogeneity of leiden clusters")
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        # Shrink the axes to leave room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

        if save:
            fig.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')

    return all_homog_per_sample


In [1]:
def percentage_cells_unknown_leiden_cluster(anndata,annotation_column,leiden_column):
    """Percentage of all cells that sit in clusters reported under 'Unknown'.

    Asks ``clusteringVSleiden`` for the 'Unknown' clusters only, adds up their
    cell counts (element 2 of each cluster tuple) and expresses the total as a
    percentage of the number of entries in ``anndata.obs[annotation_column]``.
    """
    unknown_clusters = clusteringVSleiden(
        anndata, annotation_column, leiden_column, cell_types=['Unknown'], print_results=False
    )[0]
    cells_in_unknown_cluster = sum(cluster[2] for cluster in unknown_clusters)
    total_cells = len(anndata.obs[annotation_column])
    return (cells_in_unknown_cluster/total_cells)*100

In [2]:
def compare_annotations_samples_cluster_homogeneity_percentage_unknown(anndata_objects,sample_names,annotations,names_anno_plot,plot=True,save=False,saved_as='',mean=False,suffix=''):
    """Percentage of cells lying in Leiden clusters with a known cell type.

    For every sample and annotation the value is 100 minus the result of
    ``percentage_cells_unknown_leiden_cluster`` on the ``'leiden'`` column.
    Returns one row per sample with one value per annotation. With ``plot=True``
    the rows are drawn as scatter series (plus their mean as black crosses when
    ``mean=True``) and saved to ``saved_as + '.png'`` when ``save=True``.
    """
    all_perc_per_sample = [
        [
            100.0-percentage_cells_unknown_leiden_cluster(sample,annotation_key,'leiden')
            for annotation_key in annotations
        ]
        for sample in anndata_objects
    ]
    if plot:
        fig, ax = plt.subplots()
        # One scatter series per sample; the mean markers are added on top if requested.
        for idx in range(len(all_perc_per_sample)):
            ax.scatter(all_perc_per_sample[idx],names_anno_plot,label = sample_names[idx])
        if mean:
            averaged = np.mean(all_perc_per_sample,axis=0)
            ax.scatter(averaged.tolist(),names_anno_plot,c='black',label='Mean',marker='x')
        plt.title("Percentage of cells in Leiden cluster with known cell type"+suffix)
        plt.xlabel("Percentage of cells")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        # Narrow the axes so the legend fits to the right of the plot.
        frame = ax.get_position()
        ax.set_position([frame.x0, frame.y0, frame.width * 0.8, frame.height])
        if save:
            plt.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')

    return all_perc_per_sample

In [1]:
def similarity_matrix_annotations(anndata,annotations,names_anno_plot,plot=True,save=False,saved_as='',suffix=''):
    """Pairwise agreement between annotations of the same cells.

    Entry ``[i][j]`` is the fraction of cells whose label in ``annotations[i]``
    equals their label in ``annotations[j]``.

    Parameters
    ----------
    anndata : object exposing an ``obs`` DataFrame
    annotations : list of str
        Columns of ``anndata.obs`` to compare.
    names_anno_plot : list of str
        Tick labels for the heatmap.
    plot, save, saved_as, suffix
        Draw an annotated heatmap, optionally saved to ``saved_as + '.png'``;
        ``suffix`` is appended to the title.

    Returns
    -------
    numpy.ndarray
        Square matrix of agreement fractions.
    """
    n_annotations = len(annotations)
    # Convert every column once instead of once per pair of annotations.
    labels = [np.asarray(anndata.obs[anno].tolist(), dtype=object) for anno in annotations]

    array = np.empty((n_annotations, n_annotations))
    for i in range(n_annotations):
        # Agreement is symmetric, so only the upper triangle is computed.
        for j in range(i, n_annotations):
            agreement = np.count_nonzero(labels[i] == labels[j])/len(labels[i])
            array[i, j] = agreement
            array[j, i] = agreement
    if plot:
        svm = sns.heatmap(array, annot=True, annot_kws={'size': 8})
        # Ticks sit in the middle of the heatmap cells.
        plt.xticks(np.arange(n_annotations)+0.5, names_anno_plot, rotation=90)
        plt.yticks(np.arange(n_annotations)+0.5, names_anno_plot, rotation=0)
        plt.tick_params(axis='both', which='both', length=0)
        plt.title("Similarity matrix"+suffix)
        figure = svm.get_figure()
        if save:
            figure.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')
    return array

In [None]:
def similarity_matrix_annotations_avg_samples(anndata_objects,annotations,names_anno_plot,save=False,saved_as='',suffix=''):
    """Average the annotation similarity matrix over several samples and plot it.

    ``similarity_matrix_annotations`` is evaluated (without plotting) for every
    object in ``anndata_objects``; the element-wise mean is shown as an annotated
    heatmap, written to ``saved_as + '.png'`` when ``save=True``, and returned.
    """
    per_sample = [
        similarity_matrix_annotations(sample,annotations,names_anno_plot,plot=False)
        for sample in anndata_objects
    ]
    avg_array = np.mean(per_sample,axis=0)

    heatmap_axes = sns.heatmap(avg_array, annot=True, annot_kws={'size': 8})
    # Ticks sit in the middle of the heatmap cells.
    plt.xticks(np.arange(len(annotations))+0.5, names_anno_plot, rotation=90)
    plt.yticks(np.arange(len(annotations))+0.5, names_anno_plot, rotation=0)
    plt.tick_params(axis='both', which='both', length=0)
    plt.title("Similarity matrix averaged over the samples"+suffix)
    figure = heatmap_axes.get_figure()
    if save:
        figure.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')
    return avg_array

In [1]:
def co_occurrence(anndata,annotation,plot=True):
    """Run squidpy's neighbourhood enrichment for ``annotation`` on ``anndata``.

    Calls ``sq.gr.spatial_neighbors`` (generic coordinates) and
    ``sq.gr.nhood_enrichment``, then replaces NaN entries of the stored z-score
    matrix in ``anndata.uns[annotation + '_nhood_enrichment']`` by 0. With
    ``plot=True`` the enrichment is drawn with ``sq.pl.nhood_enrichment``.
    """
    sq.gr.spatial_neighbors(anndata, coord_type="generic")
    sq.gr.nhood_enrichment(anndata, cluster_key=annotation)
    enrichment = anndata.uns[annotation + '_nhood_enrichment']
    enrichment['zscore'] = np.nan_to_num(enrichment['zscore'], nan=0)
    if not plot:
        return
    sq.pl.nhood_enrichment(anndata, cluster_key=annotation,mode='zscore',method='ward',optimal_ordering=True)

In [None]:
def compare_co_occurrence(anndata,annotation1,annotation2,plot=True):
    """Compare the neighbourhood-enrichment z-scores of two annotations.

    ``co_occurrence`` is run for both annotations and the stored z-score
    matrices are wrapped in DataFrames labelled with the sorted unique labels of
    the respective ``obs`` column. Cell types present in only one annotation are
    printed and removed before the difference (annotation1 - annotation2) is
    taken. Where all are present, the groups 'Chol' / 'Portal vein' / 'Oth im'
    and then 'Kupf' / 'Stel' / 'LSEC' are moved to the front of the frames.

    NOTE(review): labelling with ``np.unique`` assumes the z-score rows follow
    the sorted label order -- confirm against squidpy's ordering.

    Returns the two full (reordered) frames and the difference frame.
    """
    def _bring_to_front(frame, leading):
        # Put `leading` first on both axes, but only if every one of them is present.
        if not all(ct in frame.columns for ct in leading):
            return frame
        ordering = leading + [ct for ct in frame.columns if ct not in leading]
        return frame.reindex(ordering, axis=1).reindex(ordering, axis=0)

    for annotation_key in (annotation1, annotation2):
        co_occurrence(anndata,annotation_key,plot=False)
    zscores_1 = anndata.uns[annotation1 + '_nhood_enrichment']['zscore']
    zscores_2 = anndata.uns[annotation2 + '_nhood_enrichment']['zscore']
    cell_types_1 = np.unique(anndata.obs[annotation1]).tolist()
    cell_types_2 = np.unique(anndata.obs[annotation2]).tolist()
    df_1_original = pd.DataFrame(zscores_1, columns=cell_types_1, index=cell_types_1)
    df_2_original = pd.DataFrame(zscores_2, columns=cell_types_2, index=cell_types_2)
    df_1 = df_1_original.copy(deep=True)
    df_2 = df_2_original.copy(deep=True)

    # Restrict the working copies to the cell types shared by both annotations.
    cell_types_not_in_both = list(set(cell_types_1) ^ set(cell_types_2))
    print(cell_types_not_in_both)
    for cellt in cell_types_not_in_both:
        if cellt in cell_types_1:
            df_1 = df_1.drop(index=cellt, columns=cellt)
        else:
            df_2 = df_2.drop(index=cellt, columns=cellt)

    # The second group is applied last, so it ends up in front of the first one.
    for leading in (['Chol','Portal vein','Oth im'], ['Kupf','Stel','LSEC']):
        df_1 = _bring_to_front(df_1, leading)
        df_1_original = _bring_to_front(df_1_original, leading)
        df_2_original = _bring_to_front(df_2_original, leading)

    df_2 = df_2.reindex(columns=df_1.columns, index=df_1.index)
    df_diff = df_1 - df_2
    if plot:
        panels = [
            (df_1_original, annotation1),
            (df_2_original, annotation2),
            (df_diff, annotation1 + ' - ' + annotation2),
        ]
        for frame, heading in panels:
            fig, ax = plt.subplots()
            sns.heatmap(frame, cmap='coolwarm', center=0, annot=True, fmt=".1f",cbar_kws={'label': 'nhood_enrichm z_score'})
            plt.title(heading)
    return df_1_original, df_2_original, df_diff

In [None]:
def calculate_average_df(dfs,title):
    """Average several square DataFrames over their shared cell types and plot the result.

    Every frame is restricted (rows and columns) to the labels common to the
    index of all frames, the element-wise mean is rounded to one decimal and
    shown as an annotated heatmap titled ``title``.

    Parameters
    ----------
    dfs : list of pandas.DataFrame
        Frames to average; neither the list nor its frames are modified.
    title : str
        Title of the heatmap.

    Returns
    -------
    pandas.DataFrame
        The rounded mean frame.
    """
    cell_types = dfs[0].index
    for df in dfs[1:]:
        cell_types = cell_types.intersection(df.index)
    # Build aligned copies rather than overwriting the entries of the caller's list.
    aligned = [df.loc[cell_types, cell_types] for df in dfs]
    dfs_mean = (sum(aligned)/len(aligned)).round(1)
    sns.heatmap(dfs_mean, cmap='coolwarm', center=0, annot=True, fmt=".1f",cbar_kws={'label': 'nhood_enrichm z_score'})
    plt.title(title)
    return dfs_mean

In [None]:
def changing_cell_types(anndata,annotation1,annotation2,cut_off=0.05):
    """List the label changes between two annotations of the same cells.

    Parameters
    ----------
    anndata : object exposing an ``obs`` DataFrame
    annotation1, annotation2 : str
        Columns of ``anndata.obs`` holding the original and the new labels.
    cut_off : float
        Minimum percentage of all cells a change must reach to be reported.

    Returns
    -------
    list
        Entries ``[(old label, new label), percentage of all cells]`` (rounded
        to 2 decimals), sorted by decreasing percentage.
    """
    # Plain lists give positional access regardless of the obs index
    # (cell names, or integers that do not start at 0).
    labels_1 = anndata.obs[annotation1].tolist()
    labels_2 = anndata.obs[annotation2].tolist()
    n_cells = len(labels_1)

    # old label -> new labels of the cells that changed, in order of appearance
    changes = {}
    for old, new in zip(labels_1, labels_2):
        if old != new:
            changes.setdefault(old, []).append(new)

    changes_output = []
    for old, new_labels in changes.items():
        targets, counts = np.unique(new_labels,return_counts=True)
        for target, count in zip(targets.tolist(), counts.tolist()):
            changes_output.append([(old,target),round(100*count/n_cells,2)])
    changes_output.sort(key=lambda x: x[1], reverse=True)
    return [x for x in changes_output if x[1] >= cut_off]

In [None]:
def differences_composition_annotation_methods(anndata_objects,original_annotations,new_annotations,original_name,new_name,marker_names,cut_off=0.5,save=False,saved_as='',suffix=''):
    """Plot the average label changes between two annotation methods per marker set.

    For marker set ``marker_names[i]`` the changes between
    ``original_annotations[i]`` and ``new_annotations[i]`` are obtained with
    ``changing_cell_types`` for every object in ``anndata_objects`` and averaged
    over the objects. Changes whose average reaches ``cut_off`` (in percent) for
    at least one marker set are drawn as a stacked horizontal bar chart, ordered
    by their mean over the marker sets. ``original_name`` / ``new_name`` and
    ``suffix`` are used in the title; with ``save=True`` the figure is written to
    ``saved_as + '.png'``.

    NOTE(review): ``cut_off`` is not forwarded to ``changing_cell_types``, which
    therefore applies its own default threshold per object -- confirm intended.
    NOTE(review): no return statement is visible in this part of the file.
    """
    # changes[marker set] -> one list of [(old, new), percentage] entries per object
    changes = {}
    for i in range(len(marker_names)):
        changes[marker_names[i]] = []
        for anndata in anndata_objects:
            changes[marker_names[i]].append(changing_cell_types(anndata,original_annotations[i],new_annotations[i]))
    # changes_slices_together[marker set][(old, new)] -> percentage averaged over the objects
    changes_slices_together = {}
    changes_kept_cut_off = []
    for markers in marker_names:
        changes_slices_together[markers] = {}
        # Sum the percentages of each (old, new) change over all objects
        for i in range(len(anndata_objects)):
            for j in range(len(changes[markers][i])):
                if changes[markers][i][j][0] in changes_slices_together[markers].keys():
                    changes_slices_together[markers][changes[markers][i][j][0]] += changes[markers][i][j][1]
                else:
                    changes_slices_together[markers][changes[markers][i][j][0]] = changes[markers][i][j][1]
        # Turn the sums into averages and remember the changes that reach the cut-off
        for key in changes_slices_together[markers].keys():
            changes_slices_together[markers][key] = changes_slices_together[markers][key]/len(anndata_objects)  
            if changes_slices_together[markers][key] >= cut_off:
                changes_kept_cut_off.append(key) 
    # A change is kept for every marker set once it reaches the cut-off in any of them
    changes_kept_cut_off = list(set(changes_kept_cut_off))
    for markers in marker_names:
        filtered_keys = [key for key in changes_slices_together[markers].keys() if key in changes_kept_cut_off]
        changes_slices_together[markers] = {key: changes_slices_together[markers][key] for key in filtered_keys}
    # One row per kept change with its average percentage in every marker set (0 when absent)
    values_of_changes_kept = []
    for ch in changes_kept_cut_off:
        values = []
        for markers in marker_names:
            if ch in changes_slices_together[markers].keys():
                values.append(changes_slices_together[markers][ch])
            else:
                values.append(0)
        values_of_changes_kept.append(values)
    # Bare expression statement: has no effect inside a function
    values_of_changes_kept
    # Order the kept changes by decreasing mean over the marker sets
    mean_values = np.mean(values_of_changes_kept,axis=1)
    order = np.argsort(mean_values)
    order = order[::-1]
    values_of_changes_kept = np.array(values_of_changes_kept)[order].tolist()
    changes_kept_cut_off = np.array(changes_kept_cut_off)[order].tolist()
    # 40-colour palette: tab20 followed by tab20b
    colors = []
    for c in plt.cm.tab20.colors: colors.append(matplotlib.colors.to_hex(c))
    for c in plt.cm.tab20b.colors: colors.append(matplotlib.colors.to_hex(c))
    fig, ax = plt.subplots()
    c = 20
    # b holds the running left offset of the stacked bars, one value per marker set
    b = [0]*len(marker_names)
    for i in range(len(changes_kept_cut_off)):    
        ax.barh(marker_names,values_of_changes_kept[i],label=str(changes_kept_cut_off[i][0])+' -> '+str(changes_kept_cut_off[i][1]), color=colors[c],left=b)
        b = [values_of_changes_kept[i][x]+b[x] for x in range(len(values_of_changes_kept[0]))]
        c = (c + 2)%40
    plt.title("Differences in annotation ("+original_name+' -> '+new_name+") averaged over slices"+suffix)
    plt.xlabel("Percentage") 
    ax.legend()
    # add title to legend
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.45),title='Cut-off = '+str(cut_off)+'%')        
    # Shrink the axes to 80% width so the legend fits to the right
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) 
    if save:
            plt.savefig(saved_as+'.png',dpi=300,bbox_inches='tight')