In [8]:
import numpy as np
import pandas as pd
import scimap as sm
import scanpy as sc
import anndata as ad
import os
import shutil

In [2]:
def reorder_columns(df, col_changed, col_position) -> pd.DataFrame:
    """
    Move one column of a DataFrame to a new position.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are reordered. The frame itself is not modified.
    col_changed : str
        Name of the column to move.
    col_position : str or int
        * int: index at which ``col_changed`` is inserted once it has been
          removed from the column list (``list.insert`` semantics, so
          negative and slightly too large values are clamped by the list).
        * str: name of a reference column; ``col_changed`` is placed
          directly after it.

    Returns
    -------
    pd.DataFrame
        A new frame with the reordered columns. ``df`` is returned unchanged
        when the request cannot be applied: ``col_changed`` is not a column,
        the reference column does not exist, the integer position is larger
        than ``len(df.columns) + 1``, or the reference column is
        ``col_changed`` itself.

    Raises
    ------
    ValueError
        If any argument has the wrong type.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The first argument must be a pandas DataFrame")
    if not isinstance(col_changed, str):
        raise ValueError("The second argument must be a string representing a column name")
    if not isinstance(col_position, (str, int)):
        raise ValueError("The third argument must be either a string representing a column name or an integer representing a column index")

    cols = df.columns.tolist()
    if col_changed not in cols:
        return df

    if isinstance(col_position, int):
        # Upper bound is evaluated against the full column list, before removal.
        if col_position > len(cols) + 1:
            return df
        cols.remove(col_changed)
        cols.insert(col_position, col_changed)
    else:
        if col_position not in cols:
            return df
        if col_position == col_changed:
            # A column cannot be placed after itself. Without this guard the
            # reference lookup below fails with "... is not in list" because
            # the column has already been removed from `cols`.
            return df
        cols.remove(col_changed)
        # Look up the reference only after removal so the index is not shifted.
        cols.insert(cols.index(col_position) + 1, col_changed)

    return df[cols]

In [None]:
# Image selected for inspection; later cells reuse `img_name` to build file paths.
img_name = "TS-373_IMC21_UB_001"
# Phenotyping output for the selected image.
# NOTE(review): the file name is built as f"{img_name}standard.h5ad", i.e. with no
# separator between the image name and "standard" - confirm this matches the file on disk.
adata = ad.read_h5ad(f"/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/phenotyping/phenotyping_outputs/uncorrected/standard/{img_name}standard.h5ad")
# The older preprocessed object is loaded here only to read its variable (marker) names.
adata_old = ad.read_h5ad("/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/old_preprocessed/cells.h5ad")
channel_names = list(adata_old.var_names)
# Store the marker list under .uns['all_markers'];
# presumably required by the scimap image viewer - TODO confirm.
adata.uns['all_markers'] = channel_names

In [None]:
# Display the per-cell annotation table (.obs) of `adata`.
adata.obs

In [None]:
# Number of cells per label in the 'phenotype' column.
adata.obs['phenotype'].value_counts()

In [None]:
# Load the annotated object and attach the marker list held in `channel_names`.
adata2 = ad.read_h5ad("/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned_nbh.h5ad")
adata2.uns['all_markers'] = channel_names
# Keep only the cells of the selected image; the 'image_ID' values compared here end in '.csv'.
# NOTE(review): boolean indexing an AnnData usually yields a view - add .copy() if the
# subset is going to be modified; confirm against the anndata version in use.
adata2 = adata2[adata2.obs['image_ID'] == f'{img_name}.csv']
# Number of cells per label in the 'Phenotype3' column for this image.
adata2.obs['Phenotype3'].value_counts()

In [None]:
# Build the image and segmentation-mask paths for the selected image.
subset = img_name
image_path = f'/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/img/{subset}.tiff'
mask_path = f'/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/masks/{subset}.tiff'
# Open the scimap image viewer with the 'phenotype' labels of `adata` overlaid on the image.
# NOTE(review): `imageid='imageid'` is passed here while other cells filter on an
# 'image_ID' column - confirm that `adata.obs` really has an 'imageid' column.
sm.pl.image_viewer(image_path = image_path, adata = adata, overlay = 'phenotype', point_color='white', imageid = 'imageid', point_size=5, seg_mask = mask_path)

In [None]:
# Same viewer call on `adata2`, overlaying its 'Phenotype3' labels for comparison.
# NOTE(review): `imageid='imageid'` is passed although `adata2` is filtered on
# 'image_ID' elsewhere - confirm the column name expected by the viewer.
sm.pl.image_viewer(image_path = image_path, adata = adata2, overlay = 'Phenotype3', point_color='white', imageid = 'imageid', point_size=5, seg_mask = mask_path)

In [None]:
# Read the merged quantification CSV of the selected image; its first column becomes the index.
df = pd.read_csv(f'/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/merged_csv/{img_name}.csv', index_col=0)

In [None]:
# Display the summary representation of `adata2`.
adata2

In [None]:
# Rename the centroid columns to y/x and tag every row with the image name.
df = (
    df
    .rename(columns={'centroid-0': 'y', 'centroid-1': 'x'})
    .assign(image_id=img_name)
)

In [None]:
# Extract the label and object-id columns as a standalone table with a fresh
# 0..n-1 index. `reset_index(drop=True)` discards the old index in one step:
# it avoids mutating a column slice of `adata2.obs` in place (which triggers
# pandas' SettingWithCopyWarning) and does not depend on the old index being
# unnamed (a named index would not produce an 'index' column to drop).
phenotypes = adata2.obs[['Phenotype3', 'Object']].reset_index(drop=True)
phenotypes

In [None]:
# Attach the labels to the quantification table (inner join on 'Object')
# and switch to the benchmark column names.
df2 = (
    df
    .merge(phenotypes, on='Object', how='inner')
    .rename(columns={'Phenotype3': 'phenotype', 'Object': 'Cell_ID'})
)
# Re-insert 'Cell_ID' as the second-to-last column, directly before the last one.
cell_id_col = df2.pop('Cell_ID')
second_to_last = len(df2.columns) - 1
df2.insert(second_to_last, 'Cell_ID', cell_id_col)
df2

In [None]:
# Export the labelled table for this image to the benchmark dataset folder;
# index=True writes the DataFrame index as the first CSV column.
df2.to_csv(f'/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/myeloma_IMC/quantification/{img_name}.csv', index=True)
# Copy the matching image stack and segmentation mask next to it, named after the image.
shutil.copy(image_path, f'/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/myeloma_IMC/raw_images/multistack_tiffs/{img_name}.tiff')
shutil.copy(mask_path, f'/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/myeloma_IMC/segmentation/{img_name}.tiff')

In [None]:
# Image quality and phenotypes have been checked; next, build the combined CSV
# from the per-image CSV names found in the benchmark quantification folder.
# Note: this rebinds `adata` to a different annotated object than the one loaded earlier.
adata = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned_nbh_macs.h5ad')


In [None]:
# Build one combined table from `adata`, restricted to the images whose CSV is
# present in the benchmark quantification folder.
quant_dir = '/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/myeloma_IMC/quantification/'
lst = []
# os.listdir returns entries in arbitrary order; sort so the row order of the
# combined table is reproducible between runs and machines.
for csv in sorted(os.listdir(quant_dir)):
    # Skip hidden files, previously written combined files ('myeloma*') and non-CSV entries.
    if not csv.endswith('.csv') or csv.startswith(('.', 'myeloma')):
        continue
    # 'image_ID' holds the CSV file name, so it can be matched directly; subset once.
    image_adata = adata[adata.obs['image_ID'] == csv]
    # Expression matrix and per-cell annotations share the same index.
    merged = pd.merge(image_adata.to_df(), image_adata.obs, left_index=True, right_index=True)
    merged = merged.rename(columns={'Phenotype4': 'phenotype', 'Object': 'CellID', 'X_centroid': 'x', 'Y_centroid': 'y'})
    merged = merged.drop(columns=['Phenotype', 'Phenotype2', 'Phenotype3', 'HLA-DR_counts', 'distance_to_bone'])
    lst.append(merged)
if not lst:
    # pd.concat on an empty list raises a generic ValueError; say what went wrong instead.
    raise ValueError(f"No per-image CSV files found in {quant_dir}")
comb = pd.concat(lst)
comb

In [None]:
# Inspect the column names of the combined table.
comb.columns

In [None]:
# Write the combined table; index=False means the DataFrame index is not written to the file.
comb.to_csv('/Volumes/Lukas_SSD/phenotyping_benchmark/datasets/myeloma_IMC/quantification/myeloma_IMC_combined.csv', index=False)

## Harmonize cell type labels

In [None]:
# Load the benchmark quantification table.
# NOTE(review): this path differs from the combined CSV written to /Volumes/... and the
# cells below use a 'cell_type' column rather than 'phenotype' - presumably the file was
# copied and renamed outside this notebook; confirm and record that step.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/myeloma_IMC_quantification.csv')
df

In [None]:
# Inspect the column names of the loaded table.
df.columns

In [None]:
# Number of cells per label before harmonisation.
df['cell_type'].value_counts()

In [None]:
# Map the original annotation labels onto the harmonised benchmark vocabulary.
# Labels that are not listed are left as they are.
df['cell_type'] = df['cell_type'].replace(
    {
        'Plasma Cells/MM cells': 'Plasma_cell',
        'Neutrophils': 'Neutrophil',
        'Unknown': 'undefined',
        'CD8+Tcells': 'CD8+_T_cell',
        'CD4+Tcells': 'CD4+_T_cell',
        # Both macrophage/monocyte labels collapse into a single class.
        'activated Macrophages/Monocytes': 'Macrophage',
        'Macrophages/Monocytes': 'Macrophage',
        'Endothelial cells': 'Endothelial',
        'Adipocytes': 'Adipocyte',
        'Dendritic Cells': 'Dendritic_cell',
        'Osteocytes': 'Osteocyte',
        'Osteoclasts': 'Osteoclast',
        'Osteoblasts': 'Osteoblast',
        'MPO+': 'undefined',
    }
)

In [None]:
# Save the harmonised table; this overwrites the same file it was read from.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/myeloma_IMC_quantification.csv', index=False)

# Implement different levels of granularity

In [7]:
# Reload the saved table and show the number of cells per 'cell_type' label.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/myeloma_IMC_quantification.csv')
df.value_counts('cell_type')

cell_type
Plasma_cell       45244
Neutrophil         5936
undefined          5566
Macrophage         5315
CD8+_T_cell        3824
MPO+               2737
CD4+_T_cell        2383
Endothelial        2143
HSCs               1038
Adipocyte           889
Dendritic_cell      477
Osteoblast          421
Osteocyte           236
Osteoclast          178
Name: count, dtype: int64

In [8]:
# The counts of the reloaded file still list 'MPO+', so fold it into 'undefined' here.
df['cell_type'] = df['cell_type'].replace({'MPO+':'undefined'})

In [9]:
# Level 2: group the fine labels into lineage-level classes.
# Labels that are not listed in the mapping are carried over unchanged.
df['level_2_cell_type'] = df['cell_type'].replace(
    {
        'Plasma_cell': 'Lymphoid_immune',
        'Neutrophil': 'Myeloid_immune',
        'Macrophage': 'Myeloid_immune',
        'CD8+_T_cell': 'Lymphoid_immune',
        'CD4+_T_cell': 'Lymphoid_immune',
        'Endothelial': 'Vascular',
        'Dendritic_cell': 'Myeloid_immune',
        'Osteocyte': 'Bone',
        'Osteoclast': 'Bone',
        'Osteoblast': 'Bone',
    }
)
df['level_2_cell_type'].value_counts()


level_2_cell_type
Lymphoid_immune    51451
Myeloid_immune     11728
undefined           8303
Vascular            2143
HSCs                1038
Adipocyte            889
Bone                 835
Name: count, dtype: int64

In [10]:
# Level 1: collapse the level-2 classes into the coarsest groups.
# Labels that are not listed in the mapping are carried over unchanged.
df['level_1_cell_type'] = df['level_2_cell_type'].replace(
    {
        'Lymphoid_immune': 'Immune',
        'Myeloid_immune': 'Immune',
        'Vascular': 'Stromal',
        'Bone': 'Stromal',
        'Adipocyte': 'Stromal',
    }
)
df['level_1_cell_type'].value_counts()


level_1_cell_type
Immune       63179
undefined     8303
Stromal       3867
HSCs          1038
Name: count, dtype: int64

In [11]:
# Order the label columns from coarse to fine at the end of the table:
# first move 'cell_type' directly after 'level_1_cell_type', then move
# 'level_2_cell_type' directly after 'level_1_cell_type' (i.e. in between),
# giving level_1_cell_type, level_2_cell_type, cell_type.
df = reorder_columns(df, 'cell_type', 'level_1_cell_type')
df = reorder_columns(df, 'level_2_cell_type', 'level_1_cell_type')
df 

Unnamed: 0,CD38,Perilipin,Vimentin,B4GALT1,MPO,CathepsinK,ATP5A,RUNX2,HIF1A,CD11b,...,image_ID,disease,patient_ID,ROI,disease2,distance_to_bone_corrected,cellcharter_CN,level_1_cell_type,level_2_cell_type,cell_type
0,3.081162,2.445452,4.914248,3.262659,0.998546,1.111457,3.863021,1.563304,1.941053,2.574781,...,TS-373_IMC38_B_002.csv,MM_BD,IMC38,2,MM_BD,119.816526,adaptive_immune,Immune,Myeloid_immune,Macrophage
1,1.449686,1.445057,3.864532,1.809677,0.230983,0.228769,2.065104,0.610242,1.205404,1.476939,...,TS-373_IMC38_B_002.csv,MM_BD,IMC38,2,MM_BD,460.908885,stroma_adipocyte,Immune,Lymphoid_immune,Plasma_cell
2,2.286256,2.080325,4.303116,2.380829,0.659623,0.742665,3.227469,0.850710,1.449479,1.801367,...,TS-373_IMC38_B_002.csv,MM_BD,IMC38,2,MM_BD,259.138959,stroma_adipocyte,Immune,Lymphoid_immune,Plasma_cell
3,1.698429,2.508642,4.809707,2.022134,0.814226,0.279389,2.740041,0.786641,1.405506,0.985527,...,TS-373_IMC38_B_002.csv,MM_BD,IMC38,2,MM_BD,73.109507,stroma_adipocyte,Immune,Lymphoid_immune,Plasma_cell
4,3.502561,2.346567,4.019302,2.681798,4.687031,1.981153,3.708386,0.752709,4.973332,3.171923,...,TS-373_IMC38_B_002.csv,MM_BD,IMC38,2,MM_BD,61.911227,bone_myeloid,Immune,Myeloid_immune,Neutrophil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76382,0.397160,0.247466,1.228546,0.633130,0.000000,0.000000,0.285756,0.000000,0.000000,0.000000,...,TS-373_IMC07_UB_001.csv,MM_noBD,IMC07,1,MM_noBD,104.000000,bone,undefined,undefined,undefined
76383,0.956848,0.835091,1.290564,1.018363,0.090651,0.000000,0.649017,0.130068,0.058803,0.762014,...,TS-373_IMC07_UB_001.csv,MM_noBD,IMC07,1,MM_noBD,157.000000,stroma_adipocyte,Immune,Lymphoid_immune,CD8+_T_cell
76384,1.796810,1.089425,1.211743,1.395367,0.221954,0.138360,1.473655,0.531174,0.095839,0.945384,...,TS-373_IMC07_UB_001.csv,MM_noBD,IMC07,1,MM_noBD,313.000000,focal_pc_oxphos,Immune,Lymphoid_immune,Plasma_cell
76385,0.289430,0.099494,0.076847,0.235142,0.000000,0.000000,0.580869,0.000000,0.000000,0.106578,...,TS-373_IMC07_UB_001.csv,MM_noBD,IMC07,1,MM_noBD,487.935447,stroma_adipocyte,undefined,undefined,undefined


In [12]:
# Save the table with the added granularity columns; this overwrites the input file.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/myeloma_IMC_quantification.csv', index=False)

In [3]:
# Reload the saved table and list its columns to check the written result.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/myeloma_IMC_quantification.csv')
df.columns

Index(['CD38', 'Perilipin', 'Vimentin', 'B4GALT1', 'MPO', 'CathepsinK',
       'ATP5A', 'RUNX2', 'HIF1A', 'CD11b', 'CD45', 'CS', 'CD11c', 'CD36',
       'CD4', 'CD34', 'CD68', 'IL32', 'IDO', 'CD8', 'GranzymeK', 'PKM2',
       'IRF4', 'GLUT1', 'GranzymeB', 'Ki67', 'CollagenTypeI', 'CD3', 'CPT1A',
       'CD98', 'HLA-DR', 'ST6GAL1', 'CD138', 'CellID', 'area', 'y', 'x',
       'axis_major_length', 'axis_minor_length', 'eccentricity', 'image_ID',
       'disease', 'patient_ID', 'ROI', 'disease2',
       'distance_to_bone_corrected', 'cellcharter_CN', 'level_1_cell_type',
       'level_2_cell_type', 'cell_type'],
      dtype='object')

# Encode uninformative variables

In [9]:
# Reload the saved table and inspect the dtype of every column.
df = pd.read_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/myeloma_IMC_quantification.csv')
df.dtypes

CD38                          float64
Perilipin                     float64
Vimentin                      float64
B4GALT1                       float64
MPO                           float64
CathepsinK                    float64
ATP5A                         float64
RUNX2                         float64
HIF1A                         float64
CD11b                         float64
CD45                          float64
CS                            float64
CD11c                         float64
CD36                          float64
CD4                           float64
CD34                          float64
CD68                          float64
IL32                          float64
IDO                           float64
CD8                           float64
GranzymeK                     float64
PKM2                          float64
IRF4                          float64
GLUT1                         float64
GranzymeB                     float64
Ki67                          float64
CollagenType

In [10]:
# Rename the identifier columns: 'CellID' -> 'cell_id', 'image_ID' -> 'sample_id'.
df = df.rename(columns={'CellID': 'cell_id', 'image_ID': 'sample_id'})
df.dtypes

CD38                          float64
Perilipin                     float64
Vimentin                      float64
B4GALT1                       float64
MPO                           float64
CathepsinK                    float64
ATP5A                         float64
RUNX2                         float64
HIF1A                         float64
CD11b                         float64
CD45                          float64
CS                            float64
CD11c                         float64
CD36                          float64
CD4                           float64
CD34                          float64
CD68                          float64
IL32                          float64
IDO                           float64
CD8                           float64
GranzymeK                     float64
PKM2                          float64
IRF4                          float64
GLUT1                         float64
GranzymeB                     float64
Ki67                          float64
CollagenType

In [11]:
# Remove the 'ROI' column, then report the number of missing values per column.
df = df.drop(columns=['ROI'])
df.isna().sum()

CD38                          0
Perilipin                     0
Vimentin                      0
B4GALT1                       0
MPO                           0
CathepsinK                    0
ATP5A                         0
RUNX2                         0
HIF1A                         0
CD11b                         0
CD45                          0
CS                            0
CD11c                         0
CD36                          0
CD4                           0
CD34                          0
CD68                          0
IL32                          0
IDO                           0
CD8                           0
GranzymeK                     0
PKM2                          0
IRF4                          0
GLUT1                         0
GranzymeB                     0
Ki67                          0
CollagenTypeI                 0
CD3                           0
CPT1A                         0
CD98                          0
HLA-DR                        0
ST6GAL1 

In [12]:
# Save the table with renamed id columns and without 'ROI'; this overwrites the input file.
df.to_csv('/Users/lukashat/Documents/PhD_Schapiro/Projects/phenotype_benchmark/datasets/myeloma_IMC_quantification.csv', index=False)