In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad

In [None]:
# Configure scanpy's parallelism through the public settings instance.
# Assigning to the private `sc._settings.ScanpyConfig` class would replace the
# `n_jobs` property on the class itself and bypass its setter/validation.
# -1 is intended to mean "use all available cores".
sc.settings.n_jobs = -1

In [None]:
def z_score_normalization(data):
    """
    Standardize `data` along axis 0 (subtract the mean, divide by the standard deviation).

    Parameters
    ----------
    data : array-like
        Values to standardize; mean and standard deviation are computed with
        ``np.mean`` / ``np.std`` over ``axis=0``.

    Returns
    -------
    Same shape as `data`, with each axis-0 slice centred and scaled.
    Slices whose standard deviation is exactly 0 are only centred (they become
    all zeros) instead of producing NaN/inf through a division by zero.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # Constant columns have std == 0; divide by 1 there so the result stays finite.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

def reorder_columns(df, col_changed, col_position) -> pd.DataFrame:
    """
    Move one column of a DataFrame to a new position.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are reordered.
    col_changed : str
        Name of the column to move.
    col_position : str or int
        If an int, the target index in the column list (applied after
        `col_changed` has been removed, with ``list.insert`` semantics).
        If a str, the name of a reference column; `col_changed` is placed
        directly after it.

    Returns
    -------
    pd.DataFrame
        A frame with the columns in the new order. `df` is returned unchanged
        when the request cannot be applied: `col_changed` is not a column, the
        reference column is not a column, the integer position is greater than
        ``len(columns) + 1``, or `col_changed` is its own reference column.

    Raises
    ------
    ValueError
        If an argument has the wrong type.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The first argument must be a pandas DataFrame")
    if not isinstance(col_changed, str):
        raise ValueError("The second argument must be a string representing a column name")
    if not isinstance(col_position, (str, int)):
        raise ValueError("The third argument must be either a string representing a column name or an integer representing a column index")

    cols = df.columns.tolist()
    if col_changed not in cols:
        return df

    if isinstance(col_position, int):
        # Bound is checked against the column count before removal.
        if col_position > len(cols) + 1:
            return df
        cols.remove(col_changed)
        cols.insert(col_position, col_changed)
    else:
        # A column cannot be placed "after itself": once removed, looking up the
        # reference would raise ValueError, so treat it as a no-op.
        if col_position not in cols or col_position == col_changed:
            return df
        cols.remove(col_changed)
        cols.insert(cols.index(col_position) + 1, col_changed)

    return df[cols]

In [None]:
# Load the cHL_1 MIBI quantification table from disk.
raw_quantification_csv = '/Volumes/Lukas_SSD/phenotyping_benchmark/datasets2/cHL_1_MIBI/quantification/cHL1_MIBI.csv'
df = pd.read_csv(raw_quantification_csv)

In [None]:
# Reorder the metadata columns and rename them in one chain.
# The rename is done by reassignment rather than `inplace=True`, so every step
# produces a new frame and `df` is rebound exactly once.
df = (
    df
    .pipe(reorder_columns, 'cellLabel', 'identifier')   # cellLabel directly after identifier
    .pipe(reorder_columns, 'Annotation', 'cellLabel')   # Annotation directly after cellLabel
    .rename(columns={'Annotation': 'cell_type', 'centroidX': 'x', 'centroidY': 'y'})
    # 'y' is placed after cellSize first, then 'x' is inserted between them,
    # giving the order: cellSize, x, y.
    .pipe(reorder_columns, 'y', 'cellSize')
    .pipe(reorder_columns, 'x', 'cellSize')
)

In [None]:
# Display the table after the column reordering/renaming above.
df

In [None]:

# Save the table to the processed folder; the DataFrame index is not written.
cleaned_csv = '/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/Maps_data/cHL_1_MIBI/quantification/processed/cHL1_MIBI_cleaned.csv'
df.to_csv(cleaned_csv, index=False)

In [None]:
# Display the table that was just written to disk.
df

In [None]:
# 'centroidX'/'centroidY' were renamed to 'x'/'y' in an earlier cell, so looking
# up 'centroidX' here raises KeyError. Split the table at the first of the known
# metadata columns instead: everything before it goes to X, the rest to obs.
# NOTE(review): assumes the marker-intensity columns come first and all metadata
# columns trail them — confirm against the CSV layout.
metadata_columns = ['identifier', 'cellLabel', 'cell_type', 'cellSize', 'x', 'y']
first_obs_idx = min(
    df.columns.get_loc(col) for col in metadata_columns if col in df.columns
)
X_columns = df.columns[:first_obs_idx]
obs_columns = df.columns[first_obs_idx:]
adata = ad.AnnData(
    X=df[X_columns],
    obs=df[obs_columns],
    var=pd.DataFrame(index=X_columns)
)

In [None]:
# Register the untransformed AnnData as `.raw` before X is overwritten.
adata.raw = adata

# arcsinh-transform the expression matrix and store it as the new X.
arcsinh_X = np.arcsinh(adata.X)
adata.X = arcsinh_X

# Keep a z-scored version of the transformed matrix in its own layer.
adata.layers['zscore'] = z_score_normalization(adata.X)

In [None]:
# Build the neighbourhood graph, then compute the UMAP embedding from it.
N_NEIGHBORS = 10
sc.pp.neighbors(adata, n_neighbors=N_NEIGHBORS)
sc.tl.umap(adata)

In [None]:
# UMAP scatter plot coloured by the `cell_type` annotation.
sc.pl.umap(adata, color='cell_type')

In [None]:
# Matrix plot of all variables grouped by cell type, drawn from the 'zscore'
# layer with per-variable scaling and a dendrogram.
sc.pl.matrixplot(
    adata,
    var_names=adata.var_names,
    groupby='cell_type',
    layer='zscore',
    standard_scale='var',
    dendrogram=True,
    cmap='vlag',
)

In [None]:
# Rank variables per `cell_type` group using method='wilcoxon'.
sc.tl.rank_genes_groups(
    adata,
    groupby='cell_type',
    method='wilcoxon',
)

In [None]:
# Filter the ranking with min_in_group_fraction=0.2 and max_out_group_fraction=1.
# The result is stored under a separate key, so the unfiltered ranking in
# 'rank_genes_groups' stays available.
sc.tl.filter_rank_genes_groups(
    adata,
    key='rank_genes_groups',
    key_added='rank_genes_groups_filtered',
    min_in_group_fraction=0.2,
    max_out_group_fraction=1,
)

In [None]:
# Dot plot of the top 5 ranked variables per group from the unfiltered ranking.
sc.pl.rank_genes_groups_dotplot(
    adata,
    key='rank_genes_groups',
    n_genes=5,
    standard_scale='var',
)

In [None]:
# Dot plot of the top 5 ranked variables per group from the filtered ranking.
sc.pl.rank_genes_groups_dotplot(
    adata,
    key='rank_genes_groups_filtered',
    n_genes=5,
    standard_scale='var',
)

## Harmonize cell-type labels

In [None]:
# Load the quantification table whose `cell_type` labels are harmonized below.
# Note that this rebinds `df`.
quantification_csv = '/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv'
df = pd.read_csv(quantification_csv)
df

In [None]:
# Compare the row counts of two tables.
# NOTE(review): `df2` is not defined in any cell visible in this notebook, so
# this raises NameError on a fresh kernel — confirm which frame was meant and
# define it explicitly.
len(df2), len(df)

In [None]:
# Count how many rows carry each `cell_type` label.
df['cell_type'].value_counts()

In [None]:
# Map the dataset's short labels to the harmonized names.
# Labels not listed here are left unchanged by `replace`.
cell_type_mapping = {
    'CD4': 'CD4+_T_cell',
    # Was 'CD8+T_cell'; the underscore now matches 'CD4+_T_cell' and
    # 'Cytotoxic_CD8+_T_cell'.
    'CD8': 'CD8+_T_cell',
    # This table is written back to the file it was read from, so an earlier run
    # may already have stored the misspelled label; repair it as well.
    'CD8+T_cell': 'CD8+_T_cell',
    'M2': 'M2_Macrophage',
    'B': 'B_cell',
    'Other': 'undefined',
    'DC': 'Dendritic_cell',
    'Cytotoxic CD4': 'Cytotoxic_CD4+_T_cell',
    'Tumor': 'Cancer',
    'NK': 'NK_cell',
    'M1': 'M1_Macrophage',
    'Cytotoxic CD8': 'Cytotoxic_CD8+_T_cell',
}
df['cell_type'] = df['cell_type'].replace(cell_type_mapping)

In [None]:
# Write the relabelled table back to the path it was loaded from
# (this overwrites that file); the DataFrame index is not written.
df.to_csv(
    '/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/cHL_1_MIBI_quantification.csv',
    index=False,
)