# 1. Library & Data

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import squidpy as sq
from squidpy.im import ImageContainer
import pickle
import copy
import seaborn as sns
import matplotlib.colors
import matplotlib as mpl
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import stlearn as st
from anndata import AnnData
from sklearn.model_selection import (train_test_split,) 
import tensorflow as tf
from tensorflow.keras.layers.experimental import (preprocessing,)
import locale
import matplotlib.font_manager as fm
import plotly.express as px

# Global scanpy configuration: log verbosity level 3, and figure defaults of
# 100 dpi on screen, 600 dpi when saved, font size 10, saved as PDF.
sc.settings.verbosity = 3
sc.settings.set_figure_params(
    dpi=100,
    dpi_save=600,
    fontsize=10,
    format='pdf',
)

# Root directory of the data set; later cells build file paths from it.
path = '/path/data/brca'

In [None]:
# Read the stored .h5ad object located under <path>/object/.
save_file = f"{path}/object/adata.h5ad"
adata = sc.read_h5ad(save_file)

# 2. Gene Cluster Assignment

In [None]:
# Data filtering & default setting
# Keep only the observations annotated as 'CD8T'.
# NOTE(review): 'cell_tyepe' looks like a misspelling of 'cell_type'. The key is
# left unchanged because it has to match the column actually stored in
# adata.obs — confirm against the data.
# .copy() turns the subset into an independent AnnData: boolean indexing returns
# a view, and assigning into a view's .obs (next line) would otherwise trigger an
# implicit copy together with an ImplicitModificationWarning.
cd8_cells = adata[adata.obs['cell_tyepe'].isin(['CD8T'])].copy()
cd8_cells.obs['cell_id'] = cd8_cells.obs['cell_id'].astype('category')

# Gene panel load
# The first spreadsheet column becomes the index. The first data row is dropped
# (presumably a sub-header row — confirm) and only the first remaining column is
# kept, renamed to 'Function'.
gene_function_path = '/path/marker/cd8_gene_annot_panel.xlsx'
gene_function_panel = (
    pd.read_excel(gene_function_path, index_col=0)
    .iloc[1:, :1]
    .set_axis(['Function'], axis=1)
)

# Marker gene
# Panel rows whose 'Function' text contains the literal substring 'marker'.
# na=False: a missing 'Function' cell would otherwise put NaN into the mask, and
# pandas refuses to index with a boolean mask containing NaN.
# regex=False: 'marker' is meant as plain text, not as a pattern.
is_marker_row = gene_function_panel['Function'].str.contains('marker', regex=False, na=False)
marker_genes = gene_function_panel[is_marker_row].index.tolist()

# Work on an independent copy of the gene table so cd8_cells.var is not modified.
gene_metadata = cd8_cells.var.copy()

# Marker genes that are actually present in the data; sorted so the order is
# reproducible between runs (set iteration order is not).
common_marker_genes = sorted(set(gene_metadata.index).intersection(marker_genes))

# Copy the panel annotation for those genes; every gene whose 'Markers' value
# does not contain 'marker' (including genes left unset) is labelled 'N.A.'.
gene_metadata.loc[common_marker_genes, 'Markers'] = gene_function_panel.loc[common_marker_genes, 'Function']
gene_metadata['Markers'] = gene_metadata['Markers'].apply(lambda x: x if 'marker' in str(x) else 'N.A.')

# Cell id
# Number of CD8 cells per 'cell_id' category, stored as the single column
# 'cell counts' of a new per-cluster metadata table.
cell_counts = cd8_cells.obs['cell_id'].value_counts().rename('cell counts')
cluster_metadata = cell_counts.to_frame()

# Signature matrix
# Mean expression of every gene within each 'cell_id' category, laid out as
# genes (rows) x categories (columns).
cluster_ids = cd8_cells.obs['cell_id'].cat.categories
mean_expression = {}
for cluster_id in cluster_ids:
    cluster_cells = cd8_cells[cd8_cells.obs['cell_id'].isin([cluster_id]), :]
    # With a scipy sparse .X, mean(0) returns a 2-D (1, n_genes) np.matrix;
    # np.asarray(...).ravel() gives a flat float vector for dense and sparse alike.
    mean_expression[cluster_id] = np.asarray(cluster_cells.X.mean(0), dtype=float).ravel()

# Building the frame from float vectors keeps it numeric. Filling an empty
# DataFrame row by row leaves it with object dtype.
signature_matrix = pd.DataFrame(mean_expression, index=cd8_cells.var_names, columns=cluster_ids)

# Meta data
# Expose the 'cell_id' category labels (currently the index of cluster_metadata)
# as a regular column too, so they can be converted, concatenated and used as a
# lookup key further down. An earlier variant that relabelled the index as
# 'cell_id' + <label> is no longer used.
cluster_metadata['cell_id'] = cluster_metadata.index

# Assign marker per cell
# For each cluster, take the num_top_genes genes with the highest mean expression
# and label the cluster with the marker annotation that occurs most often among
# them. Clusters with no annotated gene among the top genes become 'Unknown'.
num_top_genes = 30
for cluster in signature_matrix.columns:
    ranked_expression = signature_matrix[cluster].sort_values(ascending=False)
    top_genes = ranked_expression.head(num_top_genes).index

    # Annotations of the top genes, ignoring genes without a marker annotation.
    marker_series = gene_metadata.loc[top_genes, 'Markers']
    marker_series = marker_series[marker_series != 'N.A.']
    marker_counts = marker_series.value_counts()

    if marker_counts.empty:
        cluster_metadata.loc[cluster, 'Cell_Type'] = 'Unknown'
        continue

    # Turn the annotation into a label: every ' marker' becomes '_' and the
    # remaining spaces become '-' (e.g. 'Foo bar marker' -> 'Foo-bar_').
    most_common_marker = marker_counts.idxmax().replace(' marker', '_').replace(' ', '-')
    cluster_metadata.loc[cluster, 'Cell_Type'] = most_common_marker

# Name cluster
# Convert both label columns to plain strings (this covers every Cell_Type value)
# so they can be concatenated.
for text_column in ('cell_id', 'Cell_Type'):
    cluster_metadata[text_column] = cluster_metadata[text_column].astype(str)

# Display name '<cell_id>_<Cell_Type>', also used as the new index of the table.
cluster_metadata['name'] = cluster_metadata['cell_id'] + '_' + cluster_metadata['Cell_Type']
cluster_names = cluster_metadata['name'].tolist()
cluster_metadata.index = cluster_names

# Transfer
# Copy the per-cluster labels back onto every cell in cd8_cells.obs.
metadata_by_id = cluster_metadata.set_index('cell_id')
cell_type_mapping = metadata_by_id['Cell_Type']
cluster_name_mapping = metadata_by_id['name']

# cluster_metadata['cell_id'] was converted to str earlier, so both lookup Series
# are keyed by the string form of each id. Mapping obs['cell_id'] through them
# directly silently yields NaN whenever the categories are not strings (e.g.
# integer ids). Look each category up via str(category) instead; the result is
# unchanged when the categories already are strings.
obs_categories = cd8_cells.obs['cell_id'].cat.categories
cell_type_by_category = {
    category: cell_type_mapping.get(str(category), np.nan) for category in obs_categories
}
cluster_name_by_category = {
    category: cluster_name_mapping.get(str(category), np.nan) for category in obs_categories
}

cd8_cells.obs['Cell_Type'] = cd8_cells.obs['cell_id'].map(cell_type_by_category)
cd8_cells.obs['Cluster'] = cd8_cells.obs['cell_id'].map(cluster_name_by_category)

# Text before the first '_' of the label (e.g. 'Foo-bar_' -> 'Foo-bar').
cd8_cells.obs['New_cluster'] = cd8_cells.obs['Cell_Type'].str.split('_').str[0]

