In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad

In [None]:
# Let scanpy use all available CPU cores for parallel steps.
# Set this on the public settings object so the n_jobs property setter runs;
# assigning to the ScanpyConfig class attribute would replace the property itself.
sc.settings.n_jobs = -1

In [None]:
def z_score_normalization(data):
    """
    Standardize `data` along axis 0.

    Each value has the axis-0 mean subtracted and is then divided by the
    axis-0 standard deviation (as computed by `np.std`).
    """
    axis0_mean = np.mean(data, axis=0)
    axis0_std = np.std(data, axis=0)
    centered = data - axis0_mean
    return centered / axis0_std

def reorder_columns(df, col_changed, col_position) -> pd.DataFrame:
    """
    Move a single column of a DataFrame to a new position.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns should be reordered.
    col_changed : str
        Name of the column to move.
    col_position : int or str
        If an integer, the list index at which `col_changed` is inserted
        (after it has been removed from the column list). If a string, the
        name of a reference column; `col_changed` is placed directly after it.

    Returns
    -------
    pd.DataFrame
        A frame with the new column order. The input frame is returned
        unchanged when `col_changed` is not a column, when a string reference
        column does not exist, when the reference column is `col_changed`
        itself, or when an integer position is greater than
        ``len(columns) + 1``.

    Raises
    ------
    ValueError
        If an argument has the wrong type.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The first argument must be a pandas DataFrame")
    if not isinstance(col_changed, str):
        raise ValueError("The second argument must be a string representing a column name")
    if not isinstance(col_position, (str, int)):
        raise ValueError("The third argument must be either a string representing a column name or an integer representing a column index")

    cols = df.columns.tolist()
    if col_changed not in cols:
        # Nothing to move: keep the original best-effort behaviour and return as is.
        return df

    if isinstance(col_position, int):
        # Upper bound is evaluated against the full column list (before removal),
        # matching the original check; list.insert clamps indices past the end.
        if col_position > len(cols) + 1:
            return df
        cols.remove(col_changed)
        cols.insert(col_position, col_changed)
    else:
        if col_position not in cols:
            return df
        if col_position == col_changed:
            # A column cannot be placed after itself. Without this guard the
            # reference would be removed from the list before being looked up,
            # and cols.index() would raise ValueError.
            return df
        cols.remove(col_changed)
        cols.insert(cols.index(col_position) + 1, col_changed)

    return df[cols]

In [None]:
# Load the cHL CODEX annotation table into `df`.
# NOTE(review): absolute path on an external volume; this cell fails on any machine without it mounted.
df = pd.read_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets2/cHL_CODEX/quantification/cHL_CODEX_annotation.csv')

In [None]:
# Show the column names of the freshly loaded table.
df.columns

In [None]:
# Rotate the column order so the first five columns end up at the back of the table,
# and switch the cell-type column to snake_case.
columns = list(df.columns)
new_order = columns[5:] + columns[:5]
df = df[new_order].rename(columns={'cellType': 'cell_type'})

# Place the centroid columns directly after 'cellSize'. Y is moved first so that,
# after X is inserted behind 'cellSize' as well, the order is cellSize, X_cent, Y_cent.
for centroid_col in ('Y_cent', 'X_cent'):
    df = reorder_columns(df, centroid_col, 'cellSize')

# Short coordinate names.
df = df.rename(columns={'X_cent': 'x', 'Y_cent': 'y'})

In [None]:
# Display the table after reordering and renaming its columns.
df

In [None]:
# Display the table again (same expression as the preceding display cell).
df

In [None]:

# Write the cleaned table to disk without the row index.
# NOTE(review): absolute path on an external volume; the target directory must already exist.
df.to_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/Maps_data/cHL_CODEX/quantification/processed/cHL_CODEX_cleaned.csv', index=False)

In [None]:
# Split the table at 'cellSize': every column before it goes into the data matrix X,
# while 'cellSize' and every column after it become per-cell annotations (obs).
cell_size_pos = df.columns.get_loc('cellSize')
X_columns = df.columns[:cell_size_pos]
obs_columns = df.columns[cell_size_pos:]

expression_frame = df[X_columns]
annotation_frame = df[obs_columns]
marker_frame = pd.DataFrame(index=X_columns)

adata = ad.AnnData(X=expression_frame, obs=annotation_frame, var=marker_frame)

In [None]:
# Largest value in the data matrix (presumably a quick check of the intensity range).
adata.X.max()

In [None]:
# Build a neighbourhood graph with 10 neighbours per cell, then compute a UMAP embedding.
# NOTE(review): no scaling/normalization of adata.X is visible before this step
# (z_score_normalization is defined above but never called) — confirm this is intended.
sc.pp.neighbors(adata, n_neighbors=10)
sc.tl.umap(adata)

In [None]:
# UMAP embedding coloured by the annotated cell type.
sc.pl.umap(adata, color='cell_type')

In [None]:
# Matrix plot of all markers grouped by cell type. standard_scale='var' rescales each
# marker to the 0-1 range across groups; dendrogram=True adds a dendrogram of the groups.
sc.pl.matrixplot(adata, var_names=adata.var_names, groupby='cell_type', cmap='vlag', dendrogram=True, standard_scale='var')

Observation: the CD11b and IDO1 expression shown for NK cells does not make much sense.

In [None]:
# Number of cells per annotated cell type.
adata.obs['cell_type'].value_counts()

In [None]:
# Rank markers per cell type with the Wilcoxon method; the results are read back
# below under the key 'rank_genes_groups'.
sc.tl.rank_genes_groups(adata, groupby='cell_type' , method='wilcoxon')

In [None]:
# Filter the ranked markers and store the result under 'rank_genes_groups_filtered'.
# min_in_group_fraction=0.2 sets the in-group threshold; max_out_group_fraction=1 is the
# loosest possible out-group threshold. Other thresholds keep the library defaults.
sc.tl.filter_rank_genes_groups(adata, min_in_group_fraction=0.2, max_out_group_fraction = 1, key = 'rank_genes_groups', key_added='rank_genes_groups_filtered')

In [None]:
# Dot plot of the top 5 ranked markers per cell type (unfiltered ranking).
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, standard_scale='var', key='rank_genes_groups')

In [None]:
# Same dot plot, using the filtered ranking for comparison.
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, standard_scale='var', key='rank_genes_groups_filtered')

## Harmonize cell type labels

In [None]:
# Reload the data from the project's datasets folder: `df` is the quantification table,
# `df2` the annotation table.
# NOTE(review): `df2` is not used in any of the cells visible here — confirm it is still needed.
# NOTE(review): absolute, user-specific paths; these cells only run on the author's machine.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_CODEX_quantification.csv')
df2 = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_CODEX_annotation.csv')

In [None]:
# Display the quantification table.
df

In [None]:
# Cell counts per label before harmonization.
df['cell_type'].value_counts()

In [None]:
# Map the dataset's original cell-type labels to the harmonized naming scheme.
# Labels that are not listed here are left untouched by .replace().
# The catch-all label was previously misspelled 'unedfined'; it is now 'undefined'.
# NOTE(review): files already written with the misspelled label, and any code that
# matches on it, need to be updated accordingly.
harmonized_labels = {
    'CD4': 'CD4+_T_cell',
    'TReg': 'Treg',
    'B': 'B_cell',
    'M2': 'M2_Macrophage',
    'DC': 'Dendritic_cell',
    'CD8': 'CD8+_T_cell',
    'Tumor': 'Cancer',
    'Cytotoxic CD8': 'Cytotoxic_CD8+_T_cell',
    'Mast': 'Mast_cell',
    'M1': 'M1_Macrophage',
    'NK': 'NK_cell',
    'Other': 'undefined',
}
df['cell_type'] = df['cell_type'].replace(harmonized_labels)

In [None]:
# Save the harmonized table without the row index.
# NOTE(review): this overwrites the same cHL_CODEX_quantification.csv that `df` was read from,
# so the original labels are no longer on disk after this cell runs.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_CODEX_quantification.csv', index=False)

# Implement different levels of granularity

In [None]:
# Reload the quantification table from disk and show the cell counts per label.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_CODEX_quantification.csv')
df['cell_type'].value_counts()

In [None]:
# Intermediate granularity: collapse the fine-grained labels into broader lineages.
# Labels not listed below are carried over unchanged.
level_2_groups = {
    'Lymphoid_immune': [
        'CD4+_T_cell', 'CD8+_T_cell', 'B_cell', 'NK_cell', 'Treg', 'Cytotoxic_CD8+_T_cell',
    ],
    'Myeloid_immune': [
        'Dendritic_cell', 'M2_Macrophage', 'Monocyte', 'Neutrophil', 'Mast_cell', 'M1_Macrophage',
    ],
    'Vascular': [
        'Endothelial', 'Lymphatic',
    ],
}
level_2_mapping = {
    fine_label: coarse_label
    for coarse_label, fine_labels in level_2_groups.items()
    for fine_label in fine_labels
}
df['level_2_cell_type'] = df['cell_type'].replace(level_2_mapping)
df['level_2_cell_type'].value_counts()

In [None]:
# Coarsest granularity: derive level 1 from level 2. Both immune lineages become
# 'Immune' and 'Vascular' becomes 'Stromal'; all other labels are carried over unchanged.
level_1_mapping = {
    'Lymphoid_immune': 'Immune',
    'Myeloid_immune': 'Immune',
    'Vascular': 'Stromal',
}
df['level_1_cell_type'] = df['level_2_cell_type'].replace(level_1_mapping)
df['level_1_cell_type'].value_counts()

In [None]:
# Arrange the label columns from coarse to fine: level_1, level_2, cell_type.
# Each column is inserted directly after 'level_1_cell_type', so the one moved last
# ('level_2_cell_type') ends up closest to it.
for label_col in ('cell_type', 'level_2_cell_type'):
    df = reorder_columns(df, label_col, 'level_1_cell_type')
df

In [None]:
# Save the table with the added granularity columns, without the row index.
# NOTE(review): this overwrites the same cHL_CODEX_quantification.csv the section started from.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_CODEX_quantification.csv', index=False)

# Encode uninformative variables with different dtypes

In [3]:
# Reload the quantification table from disk and list its columns.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_CODEX_quantification.csv')
df.columns

Index(['BCL.2', 'CCR6', 'CD11b', 'CD11c', 'CD15', 'CD16', 'CD162', 'CD163',
       'CD2', 'CD20', 'CD206', 'CD25', 'CD30', 'CD31', 'CD4', 'CD44', 'CD45RA',
       'CD45RO', 'CD45', 'CD5', 'CD56', 'CD57', 'CD68', 'CD69', 'CD7', 'CD8',
       'Collagen.4', 'Cytokeratin', 'DAPI.01', 'EGFR', 'FoxP3', 'Granzyme.B',
       'HLA.DR', 'IDO.1', 'LAG.3', 'MCT', 'MMP.9', 'MUC.1', 'PD.1', 'PD.L1',
       'Podoplanin', 'T.bet', 'TCR.g.d', 'TCRb', 'Tim.3', 'VISA', 'Vimentin',
       'a.SMA', 'b.Catenin', 'cellLabel', 'cellSize', 'x', 'y',
       'level_1_cell_type', 'level_2_cell_type', 'cell_type'],
      dtype='object')

In [4]:
# Show the dtype pandas inferred for each column.
df.dtypes

BCL.2                float64
CCR6                 float64
CD11b                float64
CD11c                float64
CD15                 float64
CD16                 float64
CD162                float64
CD163                float64
CD2                  float64
CD20                 float64
CD206                float64
CD25                 float64
CD30                 float64
CD31                 float64
CD4                  float64
CD44                 float64
CD45RA               float64
CD45RO               float64
CD45                 float64
CD5                  float64
CD56                 float64
CD57                 float64
CD68                 float64
CD69                 float64
CD7                  float64
CD8                  float64
Collagen.4           float64
Cytokeratin          float64
DAPI.01              float64
EGFR                 float64
FoxP3                float64
Granzyme.B           float64
HLA.DR               float64
IDO.1                float64
LAG.3         

In [5]:
# Cast the cell identifier and coordinate columns to the generic 'object' dtype.
# NOTE(review): a CSV file stores no dtype information, so this cast will not survive
# a to_csv / read_csv round-trip — confirm how the dtype is meant to be preserved.
uninformative_cols = ['cellLabel', 'x', 'y']
df[uninformative_cols] = df[uninformative_cols].astype('object')

In [6]:
# Save the table without the row index.
# NOTE(review): this overwrites the same cHL_CODEX_quantification.csv that was loaded above.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_CODEX_quantification.csv', index=False)