In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad

In [None]:
# Let scanpy use all available CPU cores for routines that parallelise.
# The option is set on the public settings object so that the n_jobs setter
# runs; assigning to the ScanpyConfig class through the private module
# bypasses it and does not go through the documented configuration API.
sc.settings.n_jobs = -1

In [None]:
def z_score_normalization(data):
    """
    Standardise ``data`` along axis 0 (column-wise for 2-D input).

    Each column has its mean subtracted and is divided by its population
    standard deviation (numpy default, ddof=0).

    Columns with zero standard deviation (constant columns) are only centred,
    so they come out as all zeros instead of NaN/inf from a division by zero.

    Parameters
    ----------
    data : array-like
        Values to standardise; statistics are computed along axis 0.

    Returns
    -------
    Same shape as ``data``, with z-scored values.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # Replace a zero std by 1 so that constant columns map to 0 rather than NaN.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

def reorder_columns(df, col_changed, col_position) -> pd.DataFrame:
    """
    Move one column of a DataFrame to a new position.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are reordered. It is not modified.
    col_changed : str
        Name of the column to move.
    col_position : int or str
        If an integer, the index at which the column is inserted after it has
        been removed from the column list (``list.insert`` semantics, so an
        index past the end appends).
        If a string, the name of a reference column; ``col_changed`` is placed
        directly after it.

    Returns
    -------
    pd.DataFrame
        A frame with the new column order. The input frame is returned
        unchanged when ``col_changed`` is not a column, when the reference
        column is not a column, when the integer position is larger than
        ``len(columns) + 1``, or when ``col_changed`` is its own reference.

    Raises
    ------
    ValueError
        If an argument has the wrong type.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The first argument must be a pandas DataFrame")
    if not isinstance(col_changed, str):
        raise ValueError("The second argument must be a string representing a column name")
    if not isinstance(col_position, (str, int)):
        raise ValueError("The third argument must be either a string representing a column name or an integer representing a column index")

    cols = df.columns.tolist()
    if col_changed not in cols:
        return df

    if isinstance(col_position, int):
        if col_position > len(cols) + 1:
            return df
        cols.remove(col_changed)
        target = col_position
    else:
        if col_position not in cols:
            return df
        # A column cannot be placed after itself; without this guard the
        # reference would already be removed when its index is looked up.
        if col_position == col_changed:
            return df
        cols.remove(col_changed)
        target = cols.index(col_position) + 1

    cols.insert(target, col_changed)
    return df[cols]

In [None]:
# Load the cHL1_MIBI quantification CSV into a DataFrame.
# NOTE(review): absolute path on an external volume; the notebook only runs
# on a machine where this location exists.
df = pd.read_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets2/cHL_1_MIBI/quantification/cHL1_MIBI.csv')

In [None]:
# Column layout, expressed as one pipeline:
#   1. cellLabel goes directly after identifier, Annotation directly after cellLabel;
#   2. Annotation / centroidX / centroidY get their final names;
#   3. y and then x are inserted directly after cellSize, which yields
#      cellSize, x, y (the later insertion of x pushes y one slot to the right).
df = (
    df
    .pipe(reorder_columns, 'cellLabel', 'identifier')
    .pipe(reorder_columns, 'Annotation', 'cellLabel')
    .rename(columns={'Annotation': 'cell_type', 'centroidX': 'x', 'centroidY': 'y'})
    .pipe(reorder_columns, 'y', 'cellSize')
    .pipe(reorder_columns, 'x', 'cellSize')
)

In [None]:
# Display the column index to check the resulting column order and names.
df.columns

In [None]:

# Write the reordered/renamed table to disk without the row index.
# NOTE(review): the target lives under 'datasets/Maps_data/...' while the input
# was read from 'datasets2/...' — confirm the two locations are intentional.
df.to_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/Maps_data/cHL_1_MIBI/quantification/processed/cHL1_MIBI_cleaned.csv', index=False)

In [None]:
# Display the DataFrame for a visual check.
df

In [None]:
# Split the table at the x-coordinate column: every column to its left becomes
# a feature in X, the coordinate column and everything to its right go to obs.
# The column is looked up as 'x' because the earlier rename cell turned
# 'centroidX' into 'x'; looking up 'centroidX' raises KeyError on a
# top-to-bottom run.
# NOTE(review): if cellSize sits directly before x at this point it is part of
# X — confirm that this is intended.
split_at = df.columns.get_loc('x')
X_columns = df.columns[:split_at]
obs_columns = df.columns[split_at:]
adata = ad.AnnData(
    X=df[X_columns],
    obs=df[obs_columns],
    var=pd.DataFrame(index=X_columns)
)

In [None]:
# Store the current (untransformed) state in adata.raw before X is overwritten.
adata.raw = adata
# Replace X by its element-wise inverse hyperbolic sine (no cofactor is applied).
adata.X = np.arcsinh(adata.X)
# Keep per-column z-scores of the transformed values in a separate layer.
adata.layers['zscore'] = z_score_normalization(adata.X)

In [None]:
# Compute the neighbourhood graph with 10 neighbours per observation ...
sc.pp.neighbors(adata, n_neighbors=10)
# ... and the UMAP embedding of adata.
sc.tl.umap(adata)

In [None]:
# Plot the UMAP embedding, coloured by the 'cell_type' annotation in obs.
sc.pl.umap(adata, color='cell_type')

In [None]:
# Matrix plot of all variables per cell type, read from the 'zscore' layer,
# additionally scaled per variable and ordered by a dendrogram.
sc.pl.matrixplot(
    adata,
    var_names=adata.var_names,
    groupby='cell_type',
    cmap='vlag',
    dendrogram=True,
    standard_scale='var',
    layer='zscore',
)

In [None]:
# Rank variables per cell type with the Wilcoxon method; scanpy stores the
# result in adata.uns under its default key.
sc.tl.rank_genes_groups(adata, groupby='cell_type' , method='wilcoxon')

In [None]:
# Filter the ranking stored under 'rank_genes_groups' and save the filtered
# version under 'rank_genes_groups_filtered'.
sc.tl.filter_rank_genes_groups(
    adata,
    min_in_group_fraction=0.2,
    max_out_group_fraction=1,
    key='rank_genes_groups',
    key_added='rank_genes_groups_filtered',
)

In [None]:
# Dot plot of the top 5 ranked variables per group from the unfiltered ranking.
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, standard_scale='var', key='rank_genes_groups')

In [None]:
# Same dot plot, based on the filtered ranking stored under 'rank_genes_groups_filtered'.
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, standard_scale='var', key='rank_genes_groups_filtered')

## Harmonize cell type labels

In [None]:
# Load the quantification table from the benchmark datasets folder and display it.
# NOTE(review): this is a different file than the cleaned CSV written earlier in
# the notebook; how it was produced is not shown here. The absolute user path
# makes the cell machine-specific.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv')
df

In [None]:
# Compare the number of rows of two tables.
# NOTE(review): df2 is not defined in any cell of this notebook, so this cell
# raises NameError on a fresh kernel — define df2 or drop the comparison.
len(df2), len(df)

In [None]:
# Count the rows per cell type label.
df['cell_type'].value_counts()

In [None]:
# Rename the annotation labels to the harmonised vocabulary. Labels that are
# not listed as keys are left unchanged by replace().
# 'CD8' maps to 'CD8+_T_cell' (underscore after the '+'), following the same
# pattern as every other T-cell label in this mapping.
df['cell_type'] = df['cell_type'].replace({
    'CD4': 'CD4+_T_cell',
    'CD8': 'CD8+_T_cell',
    'M2': 'M2_Macrophage',
    'B': 'B_cell',
    'Other': 'undefined',
    'DC': 'Dendritic_cell',
    'Cytotoxic CD4': 'Cytotoxic_CD4+_T_cell',
    'Tumor': 'Cancer',
    'NK': 'NK_cell',
    'M1': 'M1_Macrophage',
    'Cytotoxic CD8': 'Cytotoxic_CD8+_T_cell',
})

In [None]:
# Write the table back to the file it was loaded from (overwrites the input), without the row index.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv', index=False)

# Implement different levels of granularity

In [2]:
# Reload the quantification table and show the number of rows per cell type.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv')
df['cell_type'].value_counts()

cell_type
CD4+_T_cell              457232
CD8+_T_cell              254045
M2_Macrophage            223290
B_cell                   203862
undefined                131420
Dendritic_cell           113181
Cytotoxic_CD4+_T_cell     70305
Treg                      63262
Cancer                    57917
NK_cell                   29376
Endothelial               24260
Neutrophil                18490
M1_Macrophage             13417
Cytotoxic_CD8+_T_cell      9796
Name: count, dtype: int64

In [4]:
# Show the dtype of every column.
df.dtypes

CD45                 float64
CD20                 float64
CD163                float64
Histone H3           float64
CD45RO               float64
CD28                 float64
CD153 (CD30L)        float64
Lag3                 float64
CD4                  float64
CD11c                float64
CD56                 float64
FoxP3                float64
GATA3                float64
Granzyme B           float64
PD-L1                float64
CD16                 float64
Ki-67                float64
PD-1                 float64
Pax-5                float64
Tox                  float64
CD161                float64
CD68                 float64
B2-Microglobulin     float64
CD8                  float64
CD3                  float64
HLA1                 float64
CD15                 float64
Tbet                 float64
CD14                 float64
CXCR5                float64
CD45RA               float64
HLA-DR               float64
CD57                 float64
IL-10                float64
CD30          

In [6]:
# Display the two identifier columns side by side.
df[['cellLabel', 'identifier']]

Unnamed: 0,cellLabel,identifier
0,26325,31
1,5,31
2,7,31
3,8,31
4,9,31
...,...,...
1669848,72620,6
1669849,73041,6
1669850,73508,6
1669851,73541,6


In [None]:
# Correct the 'CD8+T_cell' label (missing underscore) to 'CD8+_T_cell';
# replace() leaves the column untouched if the misspelled label is absent.
df['cell_type'] = df['cell_type'].replace({'CD8+T_cell':'CD8+_T_cell'})
# Show the label counts after the correction.
df['cell_type'].value_counts()

In [None]:
# Intermediate granularity: collapse the fine-grained labels into lineage
# groups. Labels that are not listed (e.g. 'undefined', 'Cancer') are carried
# over unchanged by replace().
level_2_by_cell_type = {
    **dict.fromkeys(
        ['CD4+_T_cell', 'CD8+_T_cell', 'B_cell', 'Cytotoxic_CD4+_T_cell',
         'Treg', 'NK_cell', 'Cytotoxic_CD8+_T_cell'],
        'Lymphoid_immune',
    ),
    **dict.fromkeys(
        ['M2_Macrophage', 'Dendritic_cell', 'Neutrophil', 'M1_Macrophage'],
        'Myeloid_immune',
    ),
    'Endothelial': 'Vascular',
}
df['level_2_cell_type'] = df['cell_type'].replace(level_2_by_cell_type)
df['level_2_cell_type'].value_counts()

In [None]:
# Coarsest granularity: derive level 1 from level 2 by merging both immune
# groups into 'Immune' and mapping 'Vascular' to 'Stromal'; all other labels
# are carried over unchanged.
df['level_1_cell_type'] = df['level_2_cell_type'].replace(
    {'Lymphoid_immune': 'Immune', 'Myeloid_immune': 'Immune', 'Vascular': 'Stromal'}
)
df['level_1_cell_type'].value_counts()

In [None]:
# Order the label columns from coarse to fine: cell_type is placed after
# level_1_cell_type, then level_2_cell_type is inserted between the two,
# giving level_1_cell_type, level_2_cell_type, cell_type.
df = reorder_columns(df, 'cell_type', 'level_1_cell_type')
df = reorder_columns(df, 'level_2_cell_type', 'level_1_cell_type')
# Display the result.
df 

In [None]:
# Overwrite the quantification CSV with the table that now carries all three label columns.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv', index=False) 

# Encode uninformative variables

In [2]:
# Reload the quantification table and list its columns.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv')
df.columns

Index(['CD45', 'CD20', 'CD163', 'Histone H3', 'CD45RO', 'CD28',
       'CD153 (CD30L)', 'Lag3', 'CD4', 'CD11c', 'CD56', 'FoxP3', 'GATA3',
       'Granzyme B', 'PD-L1', 'CD16', 'Ki-67', 'PD-1', 'Pax-5', 'Tox', 'CD161',
       'CD68', 'B2-Microglobulin', 'CD8', 'CD3', 'HLA1', 'CD15', 'Tbet',
       'CD14', 'CXCR5', 'CD45RA', 'HLA-DR', 'CD57', 'IL-10', 'CD30', 'TIM3',
       'RORgT', 'TCRgd', 'CD86', 'CD25', 'Na-K ATPase', 'cellSize', 'x', 'y',
       'sample_id', 'cell_id', 'level_1_cell_type', 'level_2_cell_type',
       'cell_type'],
      dtype='object')

In [7]:
# List the distinct sample ids.
# NOTE(review): 'sample_id' is created by the rename cell further down in this
# section; this cell only works if the loaded CSV already contains that column
# or the rename cell was executed first (execution counts are out of order).
df['sample_id'].unique()

array([31, 15, 19, 14,  3, 10, 18, 22, 11,  5, 20, 29, 23, 30,  9, 28,  8,
       27,  7, 32, 26, 17,  4,  1, 24, 21, 25, 13, 16,  2, 12,  6])

In [7]:
# Show the dtype of every column.
df.dtypes

CD45                 float64
CD20                 float64
CD163                float64
Histone H3           float64
CD45RO               float64
CD28                 float64
CD153 (CD30L)        float64
Lag3                 float64
CD4                  float64
CD11c                float64
CD56                 float64
FoxP3                float64
GATA3                float64
Granzyme B           float64
PD-L1                float64
CD16                 float64
Ki-67                float64
PD-1                 float64
Pax-5                float64
Tox                  float64
CD161                float64
CD68                 float64
B2-Microglobulin     float64
CD8                  float64
CD3                  float64
HLA1                 float64
CD15                 float64
Tbet                 float64
CD14                 float64
CXCR5                float64
CD45RA               float64
HLA-DR               float64
CD57                 float64
IL-10                float64
CD30          

In [5]:
# List the distinct values of the 'identifier' column.
# NOTE(review): this requires the column to still carry its original name,
# i.e. a CSV saved before the rename to 'sample_id' — it raises KeyError on a
# file that was already written with the renamed columns.
df['identifier'].unique()

array([31, 15, 19, 14,  3, 10, 18, 22, 11,  5, 20, 29, 23, 30,  9, 28,  8,
       27,  7, 32, 26, 17,  4,  1, 24, 21, 25, 13, 16,  2, 12,  6])

In [6]:
# Give the identifier columns their final names: cellLabel -> cell_id, identifier -> sample_id.
# rename() silently ignores names that are not present, so re-running this on an
# already renamed table is a no-op.
df = df.rename(columns={'cellLabel': 'cell_id', 'identifier': 'sample_id'})

In [8]:
# Overwrite the quantification CSV with the renamed identifier columns, without the row index.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv', index=False)