In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad

In [None]:
def z_score_normalization(data):
    """
    Standardize `data` column-wise to zero mean and unit standard deviation.

    Mean and standard deviation are computed along axis 0 with NumPy's
    defaults (``ddof=0``).

    Parameters
    ----------
    data : array-like
        Values to standardize, e.g. a 2-D array with one column per feature.

    Returns
    -------
    The centered values divided by the per-column standard deviation. Columns
    whose standard deviation is exactly zero are divided by 1 instead, so they
    come back as zeros rather than NaN/inf.
    """
    centered = data - np.mean(data, axis=0)
    spread = np.std(data, axis=0)
    # A constant column has zero spread; dividing by it would produce NaN/inf
    # and poison downstream steps. Use 1 as the divisor for those columns.
    safe_spread = np.where(spread == 0, 1.0, spread)
    return centered / safe_spread

def reorder_columns(df, col_changed, col_position) -> pd.DataFrame:
    """
    Move one column of a DataFrame to a new position.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are reordered. The input frame is not modified.
    col_changed : str
        Name of the column to move.
    col_position : int or str
        If an integer, the index at which `col_changed` is inserted after it
        has been removed from the column list. If a string, the name of a
        reference column; `col_changed` is placed directly after it.

    Returns
    -------
    pd.DataFrame
        A frame with the new column order. `df` itself is returned unchanged
        when `col_changed` (or the reference column) is not a column of `df`,
        when the integer position is greater than ``len(df.columns) + 1``, or
        when `col_changed` is its own reference column.

    Raises
    ------
    ValueError
        If any argument has the wrong type.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The first argument must be a pandas DataFrame")
    if not isinstance(col_changed, str):
        raise ValueError("The second argument must be a string representing a column name")
    if not isinstance(col_position, (str, int)):
        raise ValueError("The third argument must be either a string representing a column name or an integer representing a column index")

    cols = df.columns.tolist()
    if col_changed not in cols:
        return df

    if isinstance(col_position, int):
        # Bound is checked against the column count *before* removal.
        if col_position > len(cols) + 1:
            return df
        cols.remove(col_changed)
        cols.insert(col_position, col_changed)
    else:
        # Moving a column "after itself" is a no-op. Without this guard the
        # reference lookup below fails because the column was just removed.
        if col_position not in cols or col_position == col_changed:
            return df
        cols.remove(col_changed)
        cols.insert(cols.index(col_position) + 1, col_changed)

    return df[cols]

In [None]:
# Load the quantification CSV for the breast_fibro_IMC dataset.
# NOTE(review): absolute, machine-specific path; this cell only runs where that volume is mounted.
df = pd.read_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets2/breast_fibro_IMC/quantification/breast_fibro_IMC.csv')

In [None]:
# Inspect the 'celltype' and 'cell_subtype' columns side by side.
df[['celltype', 'cell_subtype']]

In [None]:
# List the column names of the freshly loaded table.
df.columns

In [None]:
# Look at the values of a single column ('FSP1').
df.FSP1

In [None]:
# Remove five columns and rename the label/coordinate columns
# ('cell_subtype2' -> 'cell_type', 'Center_X' -> 'x', 'Center_Y' -> 'y').
df = (
    df
    .drop(columns=['Panel', 'acID', 'Compartment', 'ImageInfo', 'tumour_nontumour'])
    .rename(columns={'cell_subtype2': 'cell_type', 'Center_X': 'x', 'Center_Y': 'y'})
)

In [None]:
# Check the column names after dropping and renaming.
df.columns

In [None]:
# Move the first 15 columns to the end of the frame (joined back on the row index).
# NOTE(review): 15 is a hard-coded position; presumably these are the metadata columns — confirm.
df = df.iloc[:, 15:].join(df.iloc[:, :15])

In [None]:
# Display the frame to check the new column order.
df

In [None]:
# Write the same table (row index omitted) to both output locations.
cleaned_csv_targets = (
    '/Volumes/Lukas_SSD/phenotyping_benchmark/datasets2/breast_fibro_IMC/quantification/processed/breast_fibro_IMC_cleaned.csv',
    '/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv',
)
for target in cleaned_csv_targets:
    df.to_csv(target, index=False)

In [None]:
# Drop the 'Histone H3', 'Iridium191' and 'Iridium193' columns from the in-memory frame.
# NOTE(review): the next cell re-reads the CSV that was written before this drop, so the
# drop is discarded and these columns are still present downstream — confirm the intent.
df.drop(columns=['Histone H3', 'Iridium191', 'Iridium193'], inplace=True)

In [None]:
# Reload the saved quantification table from disk; this replaces the in-memory `df`.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv')

In [None]:
# List the columns of the reloaded table.
df.columns

In [None]:
# Split the table at 'ImageNumber': everything before it becomes the expression
# matrix, 'ImageNumber' and everything after it becomes per-observation metadata.
first_obs_position = df.columns.get_loc('ImageNumber')
X_columns = df.columns[:first_obs_position]
obs_columns = df.columns[first_obs_position:]

adata = ad.AnnData(
    X=df[X_columns],
    obs=df[obs_columns],
    var=pd.DataFrame(index=X_columns),
)

In [None]:

# Keep an untransformed snapshot in .raw before X is overwritten below.
adata.raw = adata.copy()
# arcsinh transform; dividing by 1 means no rescaling is applied before the transform.
adata.X = np.arcsinh(adata.X/1)
# Column-wise z-scores of the arcsinh values, stored as a separate layer (X keeps the arcsinh values).
adata.layers['zscore'] = z_score_normalization(adata.X)
# Categorical dtype for the labels used to color and group the plots below.
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')


In [None]:
# Compute the neighborhood graph and the UMAP embedding on the same AnnData
# object; sc.tl.umap relies on the graph stored by sc.pp.neighbors.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Plot the UMAP embedding, colored by the 'cell_type' annotation.
sc.pl.umap(adata, color='cell_type')

In [None]:
# Matrix plot of every variable grouped by 'cell_type', taken from adata.X
# (not .raw) and scaled per variable, with a dendrogram over the groups.
sc.pl.matrixplot(
    adata,
    var_names=adata.var_names,
    groupby='cell_type',
    cmap='vlag',
    dendrogram=True,
    use_raw=False,
    standard_scale='var',
)

Hypoxic and normal should most likely be epithelial cells. Lymphatic vessel cells show quite low PDPN. We should think about binning the T cell subsets.

## Harmonize celltype labels

In [None]:
# Reload the saved quantification table and display it before relabeling.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv')
df

In [None]:
# Old -> harmonized names for the 'cell_type' column; labels not listed are kept as they are.
cell_type_renames = {
    'normal': 'Epithelial',
    'hypoxic': 'hypoxic_Epithelial',
    'SMA_Fibro': 'SMA+_Fibroblast',
    'CD4': 'CD4+_T_cell',
    'CD8': 'CD8+_T_cell',
    'Bcell': 'B_cell',
    'T cell': 'T_cell',
    'other': 'undefined',
}
df['cell_type'] = df['cell_type'].replace(cell_type_renames)




In [None]:
# Old -> harmonized names for the 'cell_subtype' column; labels not listed are kept as they are.
cell_subtype_renames = {
    'SMA_Fibro': 'SMA+_Fibroblast',
    'Bcell': 'B_cell',
    'Tcell': 'T_cell',
    'other': 'undefined',
}
df['cell_subtype'] = df['cell_subtype'].replace(cell_subtype_renames)

In [None]:
# Old -> harmonized names for the 'celltype' column; labels not listed are kept as they are.
celltype_renames = {
    'Tumour': 'Cancer',
    'other': 'undefined',
}
df['celltype'] = df['celltype'].replace(celltype_renames)

In [None]:
# Save the relabeled table; this overwrites the same file that was loaded at the start of this section.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv', index=False)

# Implement different levels of granularity

In [None]:
# Reload the saved quantification table and list its columns.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv')
df.columns

In [None]:
# Number of distinct values in BatchID, SampleID and RoiID (shown as a tuple, in that order).
df.BatchID.nunique(), df.SampleID.nunique(), df.RoiID.nunique()

In [None]:
# Check the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Inspect the identifier columns side by side.
df[['BatchID', 'SampleID', 'RoiID', 'CellID', 'CellNumber', 'ImageNumber']]

In [None]:
# Count how many rows carry each 'cell_type' label.
df['cell_type'].value_counts()

In [None]:
# Fine-grained 'cell_type' label -> intermediate (level 2) group.
# Labels that are not listed are carried over unchanged.
level_2_mapping = {
    'SMA+_Fibroblast': 'Fibroblast',
    'CD4+_T_cell': 'Lymphoid_immune',
    'Lymphatic': 'Vascular',
    'CD8+_T_cell': 'Lymphoid_immune',
    'Myeloid': 'Myeloid_immune',
    'Macrophage': 'Myeloid_immune',
    'FN_Cdh11_mCAF': 'Fibroblast',
    'Blood': 'Vascular',
    'Neutrophil': 'Myeloid_immune',
    'vCAF': 'Fibroblast',
    'B_cell': 'Lymphoid_immune',
    'CD10_CAF': 'Fibroblast',
    'CD4_Treg': 'Lymphoid_immune',
    'CD8_Treg': 'Lymphoid_immune',
    'T_cell': 'Lymphoid_immune',
    'HEV': 'Vascular',
    'IDO_CD4': 'Lymphoid_immune',
    'ki67_CD4': 'Lymphoid_immune',
    'CXCL13_CAF': 'Fibroblast',
    'IDO_CD8': 'Lymphoid_immune',
    'CD34_CAF': 'Fibroblast',
    'CD4_CXCL13': 'Lymphoid_immune',
    'CD73_CAF': 'Fibroblast',
    'ki67_CD8': 'Lymphoid_immune',
    'CD8_CXCL13': 'Lymphoid_immune',
    'IDO_CAF': 'Fibroblast',
    'CCL21_CAF': 'Fibroblast',
    'CA9_CD10_CAF': 'Fibroblast',
}
df['level_2_cell_type'] = df['cell_type'].replace(level_2_mapping)
df['level_2_cell_type'].value_counts()

In [None]:
# Level 2 group -> coarsest (level 1) group. Labels that are not listed are
# carried over unchanged. Each key appears once (the original literal listed
# 'Vascular' twice with the same value).
# NOTE(review): the harmonization step renames 'hypoxic' to 'hypoxic_Epithelial';
# confirm that 'hypoxic Cancer' and 'Vimentin high' still occur in the data.
level_1_mapping = {
    'Fibroblast': 'Stromal',
    'Lymphoid_immune': 'Immune',
    'Vascular': 'Stromal',
    'hypoxic Cancer': 'Cancer',
    'Vimentin high': 'Stromal',
    'Myeloid_immune': 'Immune',
}
df['level_1_cell_type'] = df['level_2_cell_type'].replace(level_1_mapping)
df['level_1_cell_type'].value_counts()

In [None]:
# Group the three label columns: each call moves a column to directly after
# 'level_1_cell_type', so the resulting order is
# level_1_cell_type, level_2_cell_type, cell_type.
df = reorder_columns(df, 'cell_type', 'level_1_cell_type')
df = reorder_columns(df, 'level_2_cell_type', 'level_1_cell_type')
df

In [None]:
# Save the table with the added level_1/level_2 columns, overwriting the file it was loaded from.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv', index=False)

# Encode uninformative variables

In [4]:
# Reload the saved quantification table and list its columns.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv')
df.columns

Index(['FSP1', 'aSMA', 'Histone H3', 'CD11b_1', 'HLA-DR', 'CD146',
       'Cadherin-11', 'FAP', 'CD11b', 'VCAM1', 'CD20', 'CD68', 'IDO', 'CD3',
       'Podoplanin', 'CD11c', 'Carbonic Anhydrase IX', 'CD73', 'MMP9',
       'p75 (CD271)', 'CD10', 'Vimentin', 'FOXP3', 'CXCL13', 'PNAd', 'CD8',
       'Fibronectin', 'LYVE-1', 'PDGFR-b', 'CD34', 'CD4', 'vWF', 'CXCL-12',
       'CCL21', 'Keratin Epithelial', 'Iridium191', 'Iridium193', 'Ki-67',
       'CD45', 'Myeloperoxidase MPO', 'ImageNumber', 'CellNumber', 'x', 'y',
       'Area', 'MajorAxisLength', 'MinorAxisLength', 'Area_Description',
       'BatchID', 'SampleID', 'RoiID', 'CellID', 'celltype', 'cell_subtype',
       'level_1_cell_type', 'level_2_cell_type', 'cell_type'],
      dtype='object')

In [5]:
# Inspect the identifier columns before they are renamed in the next cell.
df[['ImageNumber', 'CellNumber','BatchID', 'SampleID', 'RoiID', 'CellID']]

Unnamed: 0,ImageNumber,CellNumber,BatchID,SampleID,RoiID,CellID
0,1,56,20201013,TBB075,TBB075_1,TBB075_1_56
1,1,96,20201013,TBB075,TBB075_1,TBB075_1_96
2,1,101,20201013,TBB075,TBB075_1,TBB075_1_101
3,1,114,20201013,TBB075,TBB075_1,TBB075_1_114
4,1,138,20201013,TBB075,TBB075_1,TBB075_1_138
...,...,...,...,...,...,...
553116,108,5061,20201023,TBB226,TBB226_8,TBB226_8_5061
553117,108,5763,20201023,TBB226,TBB226_8,TBB226_8_5763
553118,109,318,20201023,TBB226,TBB226_9,TBB226_9_318
553119,109,1123,20201023,TBB226,TBB226_9,TBB226_9_1123


In [None]:
# Give the identifier columns their new names and discard 'BatchID'.
# NOTE(review): once this is saved, cells that look up 'ImageNumber' in the
# reloaded file (the AnnData construction above) will no longer find it.
id_column_renames = {
    'ImageNumber': 'sample_id',
    'CellNumber': 'cell_id',
    'SampleID': 'PatientID',
    'CellID': 'unique_cell_id',
}
df = df.rename(columns=id_column_renames).drop(columns=['BatchID'])

In [10]:
# Confirm the renamed identifier columns and that 'BatchID' is gone.
df.columns

Index(['FSP1', 'aSMA', 'Histone H3', 'CD11b_1', 'HLA-DR', 'CD146',
       'Cadherin-11', 'FAP', 'CD11b', 'VCAM1', 'CD20', 'CD68', 'IDO', 'CD3',
       'Podoplanin', 'CD11c', 'Carbonic Anhydrase IX', 'CD73', 'MMP9',
       'p75 (CD271)', 'CD10', 'Vimentin', 'FOXP3', 'CXCL13', 'PNAd', 'CD8',
       'Fibronectin', 'LYVE-1', 'PDGFR-b', 'CD34', 'CD4', 'vWF', 'CXCL-12',
       'CCL21', 'Keratin Epithelial', 'Iridium191', 'Iridium193', 'Ki-67',
       'CD45', 'Myeloperoxidase MPO', 'sample_id', 'cell_id', 'x', 'y', 'Area',
       'MajorAxisLength', 'MinorAxisLength', 'Area_Description', 'PatientID',
       'RoiID', 'unique_cell_id', 'celltype', 'cell_subtype',
       'level_1_cell_type', 'level_2_cell_type', 'cell_type'],
      dtype='object')

In [11]:
# Save the table with the renamed identifier columns, overwriting the file it was loaded from.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/breast_fibro_IMC_quantification.csv', index=False)