In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import napari_sparrow as nas
from spatialdata import read_zarr
import os
import scanpy as sc
from spatialdata import SpatialData
from typing import Dict, List, Optional, Tuple
from napari_sparrow.table._table import _back_sdata_table_to_zarr
from napari_sparrow.table._annotation import _annotate_celltype

In [5]:
def visualize_classification(sdata,classification_name):
    """
    Produce a series of diagnostic plots for one classification stored in ``sdata.table.obs``.

    In order, the function:
      1. plots the UMAP coloured by ``classification_name``;
      2. prints the normalised value counts of that column;
      3. reads a marker gene CSV from a hard-coded local path, builds a per-gene label
         (gene name plus a cell type postfix for every column flagged with 1) and draws
         a dot plot of the marker genes grouped by ``classification_name``;
      4. runs and plots ``rank_genes_groups`` for the classification;
      5. plots the cell shapes on the 'clahe' image layer.

    Parameters
    ----------
    sdata
        Object exposing an AnnData-like ``table`` attribute.
        NOTE(review): presumably a SpatialData object - confirm against callers.
    classification_name
        Name of the column in ``sdata.table.obs`` holding the classification.

    Notes
    -----
    ``sdata.table`` is modified in place: a 'label' column is joined onto ``var``,
    ``var_names`` is replaced by those labels, ``uns['log1p']['base']`` is set to None
    and the rank_genes_groups result is stored under ``classification_name + '_rank_genes'``.
    """
    # UMAP coloured by the classification
    sc.pl.umap(sdata.table, color=[classification_name])

    # Relative abundance of every class
    proportions = sdata.table.obs[classification_name].value_counts(normalize=True)
    print(proportions)

    # Marker gene table: one column per cell type, genes on the index
    marker_table = pd.read_csv(
        "/home/wout/Documents/Thesis_lokaal/Mouse_Liver_Resolve_Data/markerGeneListBasic.csv",
        sep=',',
        index_col=0,
    )
    celltype_columns = marker_table.columns
    suffix_by_celltype = dict(
        zip(
            celltype_columns,
            ['_Hep', '_LSEC', '_End', '_Cho', '_Kup', '_Ste', '_Fib', '_B', '_Oth'],
        )
    )

    # Label = gene name, extended with the postfix of every cell type it marks
    marker_table['label'] = marker_table.index
    for celltype in celltype_columns:
        suffix = suffix_by_celltype[celltype]
        is_marker = marker_table[celltype] == 1
        marker_table.loc[is_marker, 'label'] = marker_table['label'] + suffix

    # Genes measured in the table but absent from the marker file get an all-zero row
    # and keep their plain name as label
    for gene in sdata.table.var.index:
        if gene in marker_table.index:
            continue
        marker_table.loc[gene] = [0] * len(marker_table.columns)
        marker_table.loc[gene, 'label'] = gene

    marker_table = marker_table.sort_index()
    n_celltypes = len(celltype_columns)
    marker_table['sum'] = marker_table.iloc[:, 0:n_celltypes].sum(axis=1)
    # Only genes that mark at least one cell type are shown in the dot plot
    useful_markers = marker_table.loc[marker_table['sum'] > 0].index

    # Attach the labels to the table and use them as variable names
    sdata.table.var = sdata.table.var.join(marker_table['label'])
    table = sdata.table
    table.var_names = table.var['label']
    sc.pl.dotplot(
        sdata.table,
        useful_markers,
        groupby=classification_name,
        dendrogram=True,
        gene_symbols='label',
    )

    # Differentially expressed genes per class
    # NOTE(review): resetting 'base' looks like a workaround needed before
    # rank_genes_groups - confirm it is still required.
    sdata.table.uns['log1p']["base"] = None
    sc.tl.rank_genes_groups(
        sdata.table,
        groupby=classification_name,
        key=classification_name + '_rank_genes',
    )
    sc.pl.rank_genes_groups(sdata.table, n_genes=8, sharey=False, show=False)

    # Cells drawn on top of the image, coloured by class
    nas.pl.plot_shapes(
        sdata,
        column=classification_name,
        img_layer='clahe',
        shapes_layer="segmentation_mask_boundaries",
    )

In [6]:
def score_genes_bins(
    sdata: SpatialData,
    path_marker_genes: str,
    bins: int = 25,
    delimiter=",",
    row_norm: bool = False,
    repl_columns: Optional[Dict[str, str]] = None,
    del_celltypes: Optional[List[str]] = None,
    input_dict=False,
) -> Tuple[dict, pd.DataFrame]:
    """
    Score cells for every cell type listed in a marker gene file and annotate them.

    The function loads marker genes from a CSV file, scores cells for each cell type with
    scanpy's score_genes function (using `bins` expression bins) and then assigns a cell type
    to every cell via `_annotate_celltype`.
    Marker genes can be provided as a one-hot encoded matrix with cell types listed in the first row,
    and marker genes in the first column; or in dictionary format.

    Parameters
    ----------
    sdata : SpatialData
        Data containing spatial information. `sdata.table` is modified in place (see Notes).
    path_marker_genes : str
        Path to the CSV file containing the marker genes.
        CSV file should be a one-hot encoded matrix with cell types listed in the first row, and marker genes
        in the first column (or the dictionary layout described under `input_dict`).
    bins : int, optional
        Number of bins to use for the sc.tl.score_genes function, default is 25.
    delimiter : str, optional
        Delimiter used in the CSV file, default is ','.
    row_norm : bool, optional
        Flag forwarded to `_annotate_celltype` to determine if row normalization is applied, default is False.
    repl_columns : dict, optional
        Dictionary containing cell types to be replaced. The keys are the original cell type names and
        the values are their replacements. Only applied when `input_dict` is False.
    del_celltypes : list, optional
        List of cell types to be deleted from the list of possible cell type candidates.
        Cells are scored for these cell types, but will not be assigned a cell type from this list.
    input_dict : bool, optional
        If True, the marker gene list from the CSV file is treated as a dictionary with the first column being
        the cell type names and the subsequent columns being the marker genes for those cell types.
        Default is False.

    Returns
    -------
    dict
        Dictionary with cell types as keys and their respective marker genes as values.
    pd.DataFrame
        Index:
            cells: The index corresponds to individual cell IDs.
        Columns:
            celltypes (as provided via the markers file).
        Values:
            Score obtained using scanpy's score_genes function for each celltype and for each cell.

    Raises
    ------
    AssertionError
        If the marker file defines a cell type named 'unknown_celltype'.

    Notes
    -----
    The cell type 'unknown_celltype' is reserved for cells that could not be assigned a specific cell type.

    Side effects on `sdata.table`:
      - the scores are stored in `uns['score_genes_<bins>']`;
      - the 'annotation' and 'Cleanliness' obs columns are renamed to
        'annotation_score_genes_<bins>' and 'cleanliness_score_genes_<bins>';
      - the per cell type score columns of the retained cell types are removed from `obs`;
      - the last two obs columns are swapped;
      - the table is written back via `_back_sdata_table_to_zarr`.
    """
    # Local import: the notebook never defines a module-level logger, so the
    # warning below would otherwise fail with a NameError.
    import logging

    log = logging.getLogger(__name__)

    if input_dict:
        # Dictionary layout: first column holds the cell type, remaining columns its markers.
        df_markers = pd.read_csv(
            path_marker_genes, header=None, index_col=0, delimiter=delimiter
        )
        df_markers = df_markers.T
        # Rows are padded with NaN when cell types have different numbers of markers.
        genes_dict = {
            celltype: [gene for gene in genes if str(gene) != "nan"]
            for celltype, genes in df_markers.to_dict("list").items()
        }
    else:
        df_markers = pd.read_csv(path_marker_genes, index_col=0, delimiter=delimiter)
        # Replace column names in marker genes
        if repl_columns:
            for column, replace in repl_columns.items():
                df_markers.columns = df_markers.columns.str.replace(column, replace)

        # Create genes dict with all marker genes for every celltype
        genes_dict = {
            celltype: [
                gene
                for gene, flag in zip(df_markers.index, df_markers[celltype])
                if flag > 0
            ]
            for celltype in df_markers
        }

    assert (
        "unknown_celltype" not in genes_dict
    ), "Cell type 'unknown_celltype' is reserved for cells that could not be assigned a specific cell type"

    # Score all cells for all celltypes; each score is stored in an obs column named after the celltype
    for celltype, markers in genes_dict.items():
        try:
            sc.tl.score_genes(sdata.table, markers, score_name=celltype, n_bins=bins)
        except ValueError:
            log.warning(
                f"Markergenes {markers} not present in region, celltype {celltype} not found"
            )

    # Remove the excluded cell types from the candidates (they have been scored, but cannot be assigned)
    if del_celltypes:
        for celltype in del_celltypes:
            if celltype in df_markers.columns:
                del df_markers[celltype]
            genes_dict.pop(celltype, None)

    sdata, scoresper_cluster = _annotate_celltype(
        sdata=sdata,
        celltypes=df_markers.columns,
        row_norm=row_norm,
        celltype_column="annotation",
    )

    # add 'unknown_celltype' to the list of celltypes if it is detected.
    if "unknown_celltype" in sdata.table.obs["annotation"].cat.categories:
        genes_dict["unknown_celltype"] = []

    name_clustering = f"score_genes_{bins}"
    sdata.table.uns[name_clustering] = scoresper_cluster
    sdata.table.obs.rename(
        columns={
            "annotation": "annotation_" + name_clustering,
            "Cleanliness": "cleanliness_" + name_clustering,
        },
        inplace=True,
    )
    # errors="ignore": a cell type whose scoring failed above, and 'unknown_celltype',
    # may have no score column in obs; dropping them must not raise a KeyError.
    sdata.table.obs.drop(
        columns=list(genes_dict), inplace=True, errors="ignore"
    )

    # Swap the last two obs columns
    cols = sdata.table.obs.columns.to_list()
    cols_new = cols[:-2] + [cols[-1], cols[-2]]
    sdata.table.obs = sdata.table.obs.reindex(columns=cols_new)

    _back_sdata_table_to_zarr(sdata)

    return genes_dict, scoresper_cluster

    