In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad

In [None]:
# Use all available cores for scanpy routines that support parallelism.
# Set through the public settings object so the value goes through scanpy's
# own setter instead of overwriting an attribute on the private config class.
sc.settings.n_jobs = -1

In [None]:
def z_score_normalization(data):
    """
    Standardise ``data`` column-wise: subtract the mean and divide by the
    population standard deviation (``ddof=0``), both computed along axis 0.

    Columns whose standard deviation is exactly zero are divided by 1 instead,
    so they come out as all zeros rather than NaN/inf.

    Parameters
    ----------
    data : array-like
        Values to standardise; statistics are taken over axis 0.

    Returns
    -------
    Same kind of object that ``data - mean`` produces (e.g. ``np.ndarray`` for
    an array input), holding the z-scored values.
    """
    center = np.mean(data, axis=0)
    spread = np.std(data, axis=0)
    # A zero spread would turn the whole column into NaN (0 / 0); use a unit
    # scale there so the centred (all-zero) column is returned unchanged.
    safe_spread = np.where(spread == 0, 1.0, spread)
    return (data - center) / safe_spread

def reorder_columns(df, col_changed, col_position) -> pd.DataFrame:
    """
    Return ``df`` with the column ``col_changed`` moved to a new position.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are re-ordered. It is not modified.
    col_changed : str
        Name of the column to move.
    col_position : int or str
        If an integer, the index at which ``col_changed`` is inserted into the
        column list after it has been removed from it (``list.insert``
        semantics). If a string, the name of a reference column;
        ``col_changed`` is placed directly after that column.

    Returns
    -------
    pd.DataFrame
        The re-ordered frame. ``df`` is returned unchanged when
        ``col_changed`` is not a column, when the reference column is not a
        column, when the integer position is larger than ``len(columns) + 1``,
        or when ``col_changed`` and the reference column are the same.

    Raises
    ------
    ValueError
        If an argument has the wrong type.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The first argument must be a pandas DataFrame")
    if not isinstance(col_changed, str):
        raise ValueError("The second argument must be a string representing a column name")
    if not isinstance(col_position, (str, int)):
        raise ValueError("The third argument must be either a string representing a column name or an integer representing a column index")

    cols = df.columns.tolist()
    if col_changed not in cols:
        # Unknown column: nothing to move.
        return df

    if isinstance(col_position, int):
        if col_position > len(cols) + 1:
            return df
        cols.remove(col_changed)
        cols.insert(col_position, col_changed)
    else:
        if col_position not in cols:
            return df
        if col_position == col_changed:
            # Moving a column "after itself" is a no-op. Without this guard the
            # reference lookup below fails, because the column has already
            # been removed from the list.
            return df
        cols.remove(col_changed)
        cols.insert(cols.index(col_position) + 1, col_changed)

    return df[cols]

In [None]:
# Load the single-cell table (Supplementary table 3) from the external drive.
# NOTE(review): absolute, machine-specific path — the cell only runs where this volume is mounted.
df = pd.read_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets2/feto_maternal/quantification/Supplementary_table_3_single_cells_updated.csv')

In [None]:
# List the column names of the freshly loaded table.
df.columns

In [None]:
# Display the loaded table.
df

In [None]:
# Count the missing values in the 'point_mean_delta' column.
df['point_mean_delta'].isna().sum()

In [None]:
# Columns that are not carried over into the cleaned table.
columns_to_drop = [
    'Ki67_divided_by_H3', 'Ki67_binary', 'TIGIT_binary', 'TIM-3_binary', 'GrB_binary',
    'iNOS_binary', 'Lck_binary', 'PD-1_binary', 'HO-1_binary', 'Galectin-9_binary',
    'PD-L1_binary', 'IDO-1_binary',
    'overlap_arteries', 'overlap_vessels', 'overlap_decidua', 'overlap_cell_column', 'overlap_gland',
    'FlowSom_cluster', 'label_nuclear', 'microenvironment', 'microenvironment_figure',
]

# Drop and rename in one chain instead of mutating the loaded frame in place.
df = (
    df
    .drop(columns=columns_to_drop)
    .rename(columns={'lineage': 'cell_type', 'centroid0': 'y', 'centroid1': 'x'})
)

# Place the annotation next to the per-point cell identifier, and 'cell_size' right after 'area'.
df = reorder_columns(df, 'cell_type', 'cell_ID_in_Point')
df = reorder_columns(df, 'cell_size', 'area')

# Encode the yes/no flag as 1/0 integers.
df['nucleated'] = df['nucleated'].replace({'yes': 1, 'no': 0}).astype(int)

# Put the coordinates in (x, y) order.
df = reorder_columns(df, 'y', 'x')

In [None]:
# Display the table after cleaning.
df

In [None]:
# Check the column dtypes after cleaning (e.g. that 'nucleated' is now an integer).
df.dtypes

In [None]:
# Write the cleaned table (without the index column) to the 'processed' folder.
# NOTE(review): absolute, machine-specific path; the target directory must already exist.
df.to_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets2/feto_maternal/quantification/processed/decidua_cleaned.csv', index=False)

In [None]:
# Split the table at the 'area' column: everything to its left becomes the
# expression matrix (X / var), 'area' and everything to its right becomes the
# per-cell annotation (obs).
area_pos = df.columns.get_loc('area')
X_columns, obs_columns = df.columns[:area_pos], df.columns[area_pos:]

adata = ad.AnnData(X=df[X_columns], obs=df[obs_columns], var=pd.DataFrame(index=X_columns))

In [None]:
# Largest value in the expression matrix, inspected before any transformation is applied.
adata.X.max()

In [None]:
# Keep the untransformed data in .raw; this has to happen before .X is overwritten below.
adata.raw = adata
# Replace .X with its arcsinh transform (no cofactor is applied).
adata.X = np.arcsinh(adata.X)
# Column-wise z-score of the arcsinh values, stored as a separate layer.
adata.layers['zscore'] = z_score_normalization(adata.X)

In [None]:
# Build the neighbourhood graph with 10 neighbours per cell, then compute the UMAP embedding from it.
sc.pp.neighbors(adata, n_neighbors=10)
sc.tl.umap(adata)

In [None]:
# UMAP coloured by the cell-type annotation.
sc.pl.umap(adata, color='cell_type')

In [None]:
# Per-cell-type summary of every marker, taken from the 'zscore' layer (not .raw), with a dendrogram over the groups.
# NOTE(review): standard_scale='var' rescales each marker again on top of the z-scoring — confirm this is intended.
sc.pl.matrixplot(adata, var_names=adata.var_names, groupby='cell_type', cmap='vlag', dendrogram=True, use_raw=False, layer='zscore', standard_scale='var')

The myeloid compartment is probably not clearly resolved here: the DC markers, compared with those of the macrophages, do not make much sense. It is also unclear whether four distinct NK cell clusters are justified, and there appear to be no B cells.

In [None]:
# Rank markers for each cell type with a Wilcoxon test.
# NOTE(review): .raw was set earlier; confirm whether this test is meant to run on .raw or on the arcsinh values in .X.
sc.tl.rank_genes_groups(adata, groupby='cell_type' , method='wilcoxon')

In [None]:
# Filter the ranking stored under 'rank_genes_groups' and save the result under 'rank_genes_groups_filtered'.
# Thresholds used: min_in_group_fraction=0.2 and max_out_group_fraction=1.
sc.tl.filter_rank_genes_groups(adata, min_in_group_fraction=0.2, max_out_group_fraction = 1, key = 'rank_genes_groups', key_added='rank_genes_groups_filtered')

In [None]:
# Dot plot of the top 5 markers per cell type from the unfiltered ranking.
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, standard_scale='var', key='rank_genes_groups')

In [None]:
# Same dot plot, using the filtered ranking for comparison.
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, standard_scale='var', key='rank_genes_groups_filtered')

## Harmonize cell type labels

In [None]:
# Load the benchmark copy of the quantification table and display it.
# NOTE(review): this is a different file from the decidua_cleaned.csv written earlier in the notebook;
# the step that produced it is not shown here. The path is absolute and machine-specific.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/feto_maternal_quantification.csv')
df

In [None]:
# Number of cells per original cell-type label.
df['cell_type'].value_counts()

In [None]:
# Map the original cell-type abbreviations onto the harmonised label vocabulary.
# Labels without an entry in the mapping are left unchanged.
cell_type_harmonization = {
    'CD4T': 'CD4+_T_cell',
    'DC': 'Dendritic_cell',
    'CD8T': 'CD8+_T_cell',
    'Fibroblasts': 'Fibroblast',
    'Mac2a': 'M2a_Macrophage',
    'Tumor': 'Cancer',
    'Cytotoxic CD8': 'Cytotoxic_CD8+_T_cell',
    'Mast': 'Mast_cell',
    'M1': 'M1_Macrophage',
    'NK': 'NK_cell',
    'Mac1b': 'M1b_Macrophage',
    'Mac2c': 'M2c_Macrophage',
    'Mac1a': 'M1a_Macrophage',
    'NKT': 'NK_T_cell',
    'Mac2b': 'M2b_Macrophage',
    # Previously misspelled as 'unedfined', which would have been written to the dataset.
    'other': 'undefined',
}
df['cell_type'] = df['cell_type'].replace(cell_type_harmonization)

In [None]:
# Write the harmonised labels back to disk.
# This overwrites the same file that was read at the start of this section.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/feto_maternal_quantification.csv', index=False)

# Implement different levels of granularity

In [None]:
# Re-read the table from disk and show the number of cells per cell-type label.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/feto_maternal_quantification.csv')
df['cell_type'].value_counts()

In [None]:
# Intermediate granularity: collapse the fine-grained labels into broader groups.
# Labels that are not listed in any group keep their original name.
level_2_groups = {
    'Myeloid_immune': ['M2a_Macrophage', 'M1b_Macrophage', 'M2c_Macrophage', 'M1a_Macrophage',
                       'Placental_Mac', 'Dendritic_cell', 'M2b_Macrophage', 'Mast_cell'],
    'Lymphoid_immune': ['NK1', 'NK2', 'NK3', 'NK4', 'NK_T_cell', 'CD8+_T_cell', 'CD4+_T_cell', 'Treg'],
    'Trophoblast': ['EVT1a', 'EVT1b', 'EVT1c', 'EVT2'],
    'Vascular': ['Endothelial'],
}
# Invert group -> members into the member -> group mapping used by replace().
level_2_mapping = {
    fine_label: group
    for group, members in level_2_groups.items()
    for fine_label in members
}
df['level_2_cell_type'] = df['cell_type'].replace(level_2_mapping)
df['level_2_cell_type'].value_counts()

In [None]:
# Coarsest granularity: collapse the level-2 groups further.
# Level-2 labels that are not listed in any group keep their name.
level_1_groups = {
    'Stromal': ['Fibroblast', 'Myofibroblasts', 'Vascular', 'Glandular', 'muscle'],
    'Immune': ['Myeloid_immune', 'Lymphoid_immune'],
}
# Invert group -> members into the member -> group mapping used by replace().
level_1_mapping = {
    level_2_label: group
    for group, members in level_1_groups.items()
    for level_2_label in members
}
df['level_1_cell_type'] = df['level_2_cell_type'].replace(level_1_mapping)
df['level_1_cell_type'].value_counts()

In [None]:
# Arrange the three annotation columns from coarse to fine:
# level_1_cell_type, level_2_cell_type, cell_type.
# The second call inserts level_2 directly after level_1, i.e. in front of cell_type.
df = reorder_columns(df, 'cell_type', 'level_1_cell_type')
df = reorder_columns(df, 'level_2_cell_type', 'level_1_cell_type')
# Display the result.
df 

In [None]:
# Write the table with the added granularity columns back to the file it was read from (overwrites it).
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/feto_maternal_quantification.csv', index=False)

# Encode uninformative variables

In [3]:
# Re-read the table from disk and list its columns.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/feto_maternal_quantification.csv')
df.columns

Index(['CD11c', 'CD14', 'CD16', 'CD163', 'CD20', 'CD206', 'CD3', 'CD31', 'CD4',
       'CD44', 'CD45', 'CD56', 'CD57', 'CD68', 'CD8', 'CD80', 'CK7', 'DC-SIGN',
       'Ecad', 'FoxP3', 'Galectin-9', 'GrB', 'H3', 'HLA-DR', 'HLA-G', 'HO-1',
       'IDO-1', 'Ki67_raw', 'Lck', 'PD-1', 'PD-L1', 'SMA', 'TIGIT', 'TIM-3',
       'Tryptase', 'VIM', 'background', 'iNOS', 'area', 'cell_size',
       'eccentricity', 'major_axis_length', 'minor_axis_length', 'perimeter',
       'sample_id', 'x', 'y', 'area_nuclear', 'nucleated',
       'overlap_decidua_only', 'cell_id', 'level_1_cell_type',
       'level_2_cell_type', 'cell_type'],
      dtype='object')

In [10]:
# Inspect the dtype of every column.
df.dtypes

CD11c                   float64
CD14                    float64
CD16                    float64
CD163                   float64
CD20                    float64
CD206                   float64
CD3                     float64
CD31                    float64
CD4                     float64
CD44                    float64
CD45                    float64
CD56                    float64
CD57                    float64
CD68                    float64
CD8                     float64
CD80                    float64
CK7                     float64
DC-SIGN                 float64
Ecad                    float64
FoxP3                   float64
Galectin-9              float64
GrB                     float64
H3                      float64
HLA-DR                  float64
HLA-G                   float64
HO-1                    float64
IDO-1                   float64
Ki67_raw                float64
Lck                     float64
PD-1                    float64
PD-L1                   float64
SMA     

In [7]:
# Standardise the identifier column names.
# NOTE(review): the column listing shown after the load above already contains 'cell_id' and 'sample_id',
# so on the current file this looks like a no-op (rename ignores missing keys by default) — confirm.
df = df.rename(columns={'cell_ID_in_Point': 'cell_id', 'Point': 'sample_id'})

In [8]:
# Count the missing values in every column.
df.isna().sum()

CD11c                   0
CD14                    0
CD16                    0
CD163                   0
CD20                    0
CD206                   0
CD3                     0
CD31                    0
CD4                     0
CD44                    0
CD45                    0
CD56                    0
CD57                    0
CD68                    0
CD8                     0
CD80                    0
CK7                     0
DC-SIGN                 0
Ecad                    0
FoxP3                   0
Galectin-9              0
GrB                     0
H3                      0
HLA-DR                  0
HLA-G                   0
HO-1                    0
IDO-1                   0
Ki67_raw                0
Lck                     0
PD-1                    0
PD-L1                   0
SMA                     0
TIGIT                   0
TIM-3                   0
Tryptase                0
VIM                     0
background              0
iNOS                    0
area        

In [12]:
# Write the final table back to the file it was read from (overwrites it).
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/feto_maternal_quantification.csv', index=False)