In [1]:
import numpy as np
import anndata as ad
import scimap as sm
import scanpy as sc
import pandas as pd
import anndata as ad
import os

Running SCIMAP  2.1.3


# In this NB we are preprocessing the assembled anndata object for further downstream analyses

In [3]:
# Load the assembled, annotated AnnData object that the rest of the notebook preprocesses.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
adata = ad.read_h5ad("/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/adatas/oldstandard/cells_annotated.h5ad")

## Filtering and censoring

In [4]:
# Filter out unused IMC channels or non-important channels
columns_to_remove = [' 1', ' 2', ' 3', ' 4', ' 5', 'HistoneH3', '191Ir', '193Ir', ' 6']
mask = ~adata.var_names.isin(columns_to_remove)
adata = adata[:, mask].copy()

# Censor every channel at its own 99th percentile. The percentiles are computed
# over all cells, i.e. before the area filter below, exactly as in the per-column loop
# this replaces. Assumes adata.X is a dense array — TODO confirm.
p99 = np.percentile(adata.X, 99, axis=0)
# Cast back so that clipping never changes the dtype of X.
adata.X = np.minimum(adata.X, p99).astype(adata.X.dtype, copy=False)

# Filter out cells with less than 4 pixels. The .copy() turns the subset into a real
# AnnData object instead of a view, so the assignments to .raw and .X below do not
# rely on an implicit view-to-copy conversion.
adata = adata[adata.obs["area"] >= 4, :].copy()

# Keep the censored but untransformed values in .raw, then normalize with arcsinh (cofactor 1)
adata.raw = adata
adata.X = np.arcsinh(adata.X)

# Rename the disease groups. Labels that are not keys of rename_dict are passed
# through unchanged (a plain .map(rename_dict) would turn them into NaN, which also
# wipes the column when this cell is executed a second time).
rename_dict = {'B': 'MM_BD', 'UB': 'MM_noBD', 'MGUS': 'MGUS/SMM'}
adata.obs['disease'] = adata.obs['disease'].map(lambda label: rename_dict.get(label, label))

## Mapping disease subsets

In [5]:
# Map a refined disease subgrouping: patients of the combined 'MGUS/SMM' group are
# split into 'MGUS' and 'SMM'; every other cell keeps its label from 'disease'.
refined_labels = {
    'MGUS': ['IMC02', 'IMC03', 'IMC04', 'IMC72', 'IMC82'],
    'SMM': ['IMC05', 'IMC16-1', 'IMC18-1', 'IMC37-1', 'IMC41', 'IMC59'],
}
# Work on plain strings so that the new labels can be assigned freely,
# then convert back to a categorical column at the end.
adata.obs['disease2'] = adata.obs['disease'].astype(str)
for label, patients in refined_labels.items():
    adata.obs.loc[adata.obs['patient_ID'].isin(patients), 'disease2'] = label
adata.obs['disease2'] = adata.obs['disease2'].astype('category')

## Integrating manually phenotyped images

4 images have been manually phenotyped, and these phenotypes need to be integrated into the existing anndata object

In [None]:
# Map the manual phenotyping of 4 images onto the main anndata object
phe_path = '/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/github/myeloma_standal/phenotyping/manual_rephenotyping'
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# processing order (and the printed report) reproducible.
for file in sorted(os.listdir(phe_path)):
    if not file.endswith('.h5ad'):
        continue
    # Everything before 'standard' in the file name is used as the image name
    name = file.split('standard')[0]
    adp = ad.read_h5ad(os.path.join(phe_path, file))

    # Rebuild the obs names as 'Object <CellID> in <image>.csv' so that they can be
    # matched against the obs names of the main object
    image_ids_cleaned = adp.obs['imageid'].str.replace('phenotyping_', '', regex=False)
    adp.obs_names = [
        'Object {} in {}.csv'.format(cell_id, image_id)
        for cell_id, image_id in zip(adp.obs['CellID'], image_ids_cleaned)
    ]

    # Adhere to the naming convention used when creating the original adata
    adp.obs['phenotype'] = adp.obs['phenotype'].astype(str).replace({
        'mDCs': 'Dendritic Cells',
        'Others': 'Unknown',
        'CD45+': 'Unknown',
    })
    # Plain strings, so that labels which are not yet known categories can be assigned
    adata.obs['Phenotype'] = adata.obs['Phenotype'].astype(str)

    # Print every cell of this image that has no manual phenotype. The obs names are
    # taken straight from the index instead of building an AnnData view twice.
    image_obs_names = adata.obs_names[(adata.obs['image_ID'] == name + '.csv').to_numpy()]
    if not adp.obs_names.equals(image_obs_names):
        for entry in image_obs_names[~image_obs_names.isin(adp.obs_names)]:
            print(entry)

    # Overwrite the phenotype of all cells present in both objects
    common_indices = adp.obs_names.intersection(adata.obs_names)
    adata.obs.loc[common_indices, 'Phenotype'] = adp.obs.loc[common_indices, 'phenotype']
    adata.obs['Phenotype'] = adata.obs['Phenotype'].astype('category')
    print(file)
    print(adata.obs.loc[common_indices, 'Phenotype'].value_counts())
        
            

In [6]:
# Save the preprocessed object (filtered, censored, arcsinh-transformed, disease labels mapped).
adata.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/QC/standard/cells_annotated_pp.h5ad')

## Creating Osteocyte Phenotype

In [3]:
# Load a previously saved object that already contains the osteocyte annotation.
# NOTE(review): the next code cell rebinds adata_wo to adata.copy(), so on a top-to-bottom
# run this load has no effect; it also reads from a 'standard/old' directory — confirm
# whether this cell is still needed.
adata_wo = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/old/cells_annotated_pp_osteocytes.h5ad')

In [20]:
# Add osteocytes based on the distance to bone: every cell lying directly on bone
# (distance 0) that is not an osteoclast gets the label 'Osteocyte' in 'Phenotype2';
# all other cells keep their label from 'Phenotype'.
adata_wo = adata.copy()
is_osteocyte = (adata_wo.obs['Phenotype'] != 'Osteoclasts') & (adata_wo.obs['distance_to_bone'] == 0)
adata_wo.obs['Phenotype2'] = (
    adata_wo.obs['Phenotype']
    .astype(str)
    .mask(is_osteocyte, 'Osteocyte')
    .astype('category')
)

In [23]:
# Save the object that carries the additional 'Phenotype2' column with osteocytes.
adata_wo.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/cells_annotated_pp_osteocytes.h5ad')

# Some patients need to be excluded as they received treatment

In [4]:
# Values of obs['patient_ID'] to remove (patients that received treatment, see the heading above).
excluded_patients = ['IMC10', 'IMC14', 'IMC22', 'IMC76', 'IMC79', 'IMC88']

In [None]:
# Display the summary of the object before excluding patients.
adata_wo

In [8]:
# Drop all cells of the excluded patients. The .copy() materialises the subset as an
# independent AnnData object; without it adata_wo_cleaned is only a view that keeps
# referencing (and depending on) adata_wo.
adata_wo_cleaned = adata_wo[~adata_wo.obs['patient_ID'].isin(excluded_patients), :].copy()


In [None]:
# Display the summary of the object after excluding patients.
adata_wo_cleaned

In [10]:
# Save the object without the excluded patients.
adata_wo_cleaned.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/cells_annotated_pp_osteocytes_cleaned.h5ad')

## There had been some problem with osteocyte creation; that's why we will redefine them using a dataframe and a for loop rather than directly in the adata object

In [28]:
# Reload the cleaned object to redo the osteocyte annotation.
# NOTE(review): this reads from '.../results/standard/adatas/', whereas the cleaned object
# was written to '.../results/standard/' above — presumably the file was moved by hand; confirm.
adata = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned.h5ad')


In [29]:
# Remove the existing osteocyte annotation so that it can be recreated below.
# Raises KeyError if the column is already gone (e.g. when this cell is run twice).
del adata.obs['Phenotype2']

In [31]:
# Recreate 'Phenotype2' on a plain DataFrame copy of the two relevant obs columns.
df = adata.obs[['Phenotype', 'distance_to_bone']].copy()
df['Phenotype2'] = df['Phenotype'].astype(str)
# A cell is an osteocyte when it lies directly on bone (distance 0) and is not an
# osteoclast. This boolean mask applies the same rule as a row-by-row loop, but in
# one vectorised step instead of one .loc assignment per cell.
osteocyte_mask = (df['distance_to_bone'] == 0) & (df['Phenotype2'] != 'Osteoclasts')
df.loc[osteocyte_mask, 'Phenotype2'] = 'Osteocyte'
df['Phenotype2'] = df['Phenotype2'].astype('category')

In [32]:
# Write the recreated annotation back into the AnnData object
# (df was built from adata.obs, so both share the same index).
adata.obs['Phenotype2'] = df['Phenotype2']

In [None]:
# Check the dtypes of all obs columns (e.g. that 'Phenotype2' is categorical).
adata.obs.dtypes

In [35]:
# Save the object with the recreated 'Phenotype2'. This overwrites the file the
# object was loaded from at the start of this section.
adata.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned.h5ad')

In [None]:
# Cell counts per label after adding osteocytes.
df['Phenotype2'].value_counts()

In [None]:
# Cell counts per original label, for comparison with 'Phenotype2'.
df['Phenotype'].value_counts()

## Some images need to be excluded and relabeled

IMC37: IMC37.2 and onwards excluded because they received treatment
IMC79-1: Received treatment
IMC08: BD instead of noBD
IMC18-2: BD instead of noBD

In [36]:
# Reload the cleaned object before relabelling and excluding further images.
adata = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned.h5ad')

In [None]:
# Cell counts per disease group before relabelling.
adata.obs['disease'].value_counts()

In [None]:
# Inspect the subset of cells whose patient_ID is 'IMC18-2'.
adata[adata.obs['patient_ID'] == 'IMC18-2']

In [38]:
# Patients listed as BD instead of noBD: set both disease columns to 'MM_BD'.
relabel = ['IMC08', 'IMC18-2']
mask = adata.obs['patient_ID'].isin(relabel)
for disease_column in ('disease', 'disease2'):
    adata.obs.loc[mask, disease_column] = 'MM_BD'

In [40]:
# Save a dated snapshot (suffix '_241029') with the relabelled disease groups
# before dropping more patients in the next step.
adata.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned_241029.h5ad')

In [41]:
# Remove the entries that received treatment (IMC37-2 and onwards, IMC79-1).
to_drop = ['IMC37-2', 'IMC37-3', 'IMC37-4', 'IMC79-1']
# .copy() makes the result an independent AnnData object rather than a view
# that still references the full object it was sliced from.
adata = adata[~adata.obs['patient_ID'].isin(to_drop), :].copy()

In [43]:
# Overwrite the cleaned file with the relabelled and further reduced object.
adata.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned.h5ad')

## Producing a new cohort with just Myeloma vs MGUS/SMM

In [3]:
# Reload the cleaned object to derive the two-group cohort labels.
adata = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned.h5ad')

In [None]:
# Collapse both myeloma subgroups into a single 'MM' label; all other labels are kept.
# The replacement is done on an object-dtype copy and converted back to categorical
# afterwards, because Series.replace on a categorical column is deprecated in pandas 2.2.
adata.obs['disease3'] = (
    adata.obs['disease2']
    .astype(object)
    .replace(['MM_noBD', 'MM_BD'], 'MM')
    .astype('category')
)

In [None]:
# Cell counts per group of the collapsed disease labelling.
adata.obs['disease3'].value_counts()

In [11]:
# Save the object with the additional 'disease3' column as the myeloma cohort.
adata.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/myeloma_cohort.h5ad')

## Using classifier determined positivity to correct macrophage labels in tumor nests

In [23]:
# Two thresholded versions of the data plus the object that receives the corrected labels
adata = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_thresholds_scanorama.h5ad') #less strict
adata2 = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_thresholds_scanorama2.h5ad') #stricter
adata_f = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned_nbh.h5ad')
# Scales adata_f.X in place; later comparisons on adata_f.X therefore act on scaled values
sc.pp.scale(adata_f)

# The single image for which the stricter thresholds (adata2) are used
strict_image = 'TS-373_IMC21_UB_002.csv'

# Start 'Phenotype5' as a plain-string copy of 'Phenotype3' in all three objects
for annotated in (adata, adata2, adata_f):
    annotated.obs['Phenotype5'] = annotated.obs['Phenotype3'].astype(str)

# For adata2 only the one image: plasma cells that are CD68 positive and IRF4 negative
mask2 = (
    (adata2.obs['CD68_counts'] == 1)
    & (adata2.obs['IRF4_counts'] == 0)
    & (adata2.obs['image_ID'] == strict_image)
    & (adata2.obs['Phenotype5'] == 'Plasma Cells/MM cells')
)
adata2.obs.loc[mask2, 'Phenotype5'] = 'Plasma Cells/MM cells_CD68_2'

# Same rule with the less strict thresholds for every other image. The image name
# used to be misspelled here ('...002.cs'), which made the exclusion always true; the
# final labels are unchanged because that image is overwritten from adata2 below.
mask1 = (
    (adata.obs['CD68_counts'] == 1)
    & (adata.obs['IRF4_counts'] == 0)
    & (adata.obs['image_ID'] != strict_image)
    & (adata.obs['Phenotype5'] == 'Plasma Cells/MM cells')
)
adata.obs.loc[mask1, 'Phenotype5'] = 'Plasma Cells/MM cells_CD68'

# Take the labels of the strict image from adata2 (the assignment is aligned on the obs index)
adata.obs.loc[adata.obs['image_ID'] == strict_image, 'Phenotype5'] = adata2.obs.loc[adata2.obs['image_ID'] == strict_image, 'Phenotype5']

In [24]:
# Label counts in adata2 after flagging CD68-positive plasma cells in the one image.
adata2.obs['Phenotype5'].value_counts()

Phenotype5
Plasma Cells/MM cells              352618
Neutrophils                        213151
Unknown                            142719
MPO+                                60363
CD8+Tcells                          47374
Macrophages/Monocytes               32662
CD4+Tcells                          32509
Endothelial cells                   27625
activated Macrophages/Monocytes     26916
CD68+                               25101
HSCs                                18576
Adipocytes                          15427
Dendritic Cells                     10355
Osteoblasts                          5884
Osteocyte                            5679
Osteoclasts                          1088
Plasma Cells/MM cells_CD68_2          533
Name: count, dtype: int64

In [25]:
# Copy the corrected labels from adata onto adata_f and show the resulting counts.
# The assignment is aligned on the obs index — assumes both objects contain the same cells; TODO confirm.
adata_f.obs['Phenotype5'] = adata.obs['Phenotype5']
adata_f.obs['Phenotype5'].value_counts()

Phenotype5
Plasma Cells/MM cells              336820
Neutrophils                        213151
Unknown                            142719
MPO+                                60363
CD8+Tcells                          47374
Macrophages/Monocytes               32662
CD4+Tcells                          32509
Endothelial cells                   27625
activated Macrophages/Monocytes     26916
CD68+                               25101
HSCs                                18576
Plasma Cells/MM cells_CD68          15798
Adipocytes                          15427
Dendritic Cells                     10355
Osteoblasts                          5884
Osteocyte                            5679
Osteoclasts                          1088
Plasma Cells/MM cells_CD68_2          533
Name: count, dtype: int64

In [26]:
# Bring the classifier-determined HLA-DR positivity over from adata
adata_f.obs['HLA-DR_counts'] = adata.obs['HLA-DR_counts']

# Merge every macrophage-like label, including the CD68-positive former plasma cells,
# into the single label 'Macrophages/Monocytes'
macrophage_like_labels = [
    'CD68+',
    'Macrophages/Monocytes',
    'activated Macrophages/Monocytes',
    'Plasma Cells/MM cells_CD68_2',
    'Plasma Cells/MM cells_CD68',
]
adata_f.obs['Phenotype5'] = adata_f.obs['Phenotype5'].replace(macrophage_like_labels, 'Macrophages/Monocytes')
adata_f.obs['Phenotype5'].value_counts()

Phenotype5
Plasma Cells/MM cells    336820
Neutrophils              213151
Unknown                  142719
Macrophages/Monocytes    101010
MPO+                      60363
CD8+Tcells                47374
CD4+Tcells                32509
Endothelial cells         27625
HSCs                      18576
Adipocytes                15427
Dendritic Cells           10355
Osteoblasts                5884
Osteocyte                  5679
Osteoclasts                1088
Name: count, dtype: int64

In [27]:
# Macrophages/monocytes that are HLA-DR positive become 'activated Macrophages/Monocytes'
is_hla_dr_positive = adata_f.obs['HLA-DR_counts'] == 1
is_macrophage = adata_f.obs['Phenotype5'] == 'Macrophages/Monocytes'
mask1 = is_hla_dr_positive & is_macrophage
adata_f.obs.loc[mask1, 'Phenotype5'] = 'activated Macrophages/Monocytes'
adata_f.obs['Phenotype5'].value_counts()

Phenotype5
Plasma Cells/MM cells              336820
Neutrophils                        213151
Unknown                            142719
MPO+                                60363
activated Macrophages/Monocytes     56819
CD8+Tcells                          47374
Macrophages/Monocytes               44191
CD4+Tcells                          32509
Endothelial cells                   27625
HSCs                                18576
Adipocytes                          15427
Dendritic Cells                     10355
Osteoblasts                          5884
Osteocyte                            5679
Osteoclasts                          1088
Name: count, dtype: int64

In [28]:
# Macrophage-labelled cells whose IRF4 value in adata_f.X is above 1 are set back to
# plasma cells (adata_f.X was scaled with sc.pp.scale when the object was loaded)
macrophage_labels = ['Macrophages/Monocytes', 'activated Macrophages/Monocytes']
irf4_values = adata_f[:, 'IRF4'].X.flatten()
mask3 = adata_f.obs['Phenotype5'].isin(macrophage_labels) & (irf4_values > 1)
adata_f.obs.loc[mask3, 'Phenotype5'] = 'Plasma Cells/MM cells'
adata_f.obs['Phenotype5'].value_counts()

Phenotype5
Plasma Cells/MM cells              346268
Neutrophils                        213151
Unknown                            142719
MPO+                                60363
activated Macrophages/Monocytes     52872
CD8+Tcells                          47374
Macrophages/Monocytes               38690
CD4+Tcells                          32509
Endothelial cells                   27625
HSCs                                18576
Adipocytes                          15427
Dendritic Cells                     10355
Osteoblasts                          5884
Osteocyte                            5679
Osteoclasts                          1088
Name: count, dtype: int64

In [29]:
# Store the corrected labels as a categorical column named 'Phenotype4'
adata_f.obs = (
    adata_f.obs
    .astype({'Phenotype5': 'category'})
    .rename(columns={'Phenotype5': 'Phenotype4'})
)

In [30]:
# Save the object with the corrected macrophage labels ('Phenotype4').
# NOTE(review): adata_f.X was scaled in place with sc.pp.scale right after loading, so the
# file written here contains the scaled values — confirm this is intended.
adata_f.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned_nbh_macs.h5ad')

In [128]:
## Only once 
# adata.obs['Phenotype4'] = adata_f.obs['Phenotype4']
# adata2.obs['Phenotype4'] = adata_f.obs['Phenotype4']
# adata.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_thresholds_scanorama.h5ad')
# adata2.write_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_thresholds_scanorama2.h5ad')

## Writing updated csv files based on the current anndata

In [33]:
# Reload the cleaned object that the per-image CSV files are exported from.
adata = ad.read_h5ad('/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/adatas/cells_annotated_pp_osteocytes_cleaned.h5ad')

In [34]:
# Output directories: one CSV per image with the marker intensities (adata.X) and
# one CSV per image with the per-cell metadata (adata.obs).
intensity_path = '/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/updated_csvs/intensities'
regionprops_path = '/Users/lukashat/Documents/PhD_Schapiro/Projects/Myeloma_Standal/results/standard/updated_csvs/regionprops'

In [35]:
# Keep only the columns that belong into the exported CSV files and expose
# 'Phenotype3' under the name 'Phenotype'
columns_not_exported = [
    'Phenotype',
    'Phenotype2',
    'disease',
    'distance_to_bone_corrected',
    'disease2',
    'patient_ID',
    'ROI',
]
adata.obs = (
    adata.obs
    .drop(columns=columns_not_exported)
    .rename(columns={'Phenotype3': 'Phenotype'})
)

In [37]:
# Make sure both output directories exist before writing into them
os.makedirs(regionprops_path, exist_ok=True)
os.makedirs(intensity_path, exist_ok=True)

# Write two CSV files per image, both named after the image_ID value
for image in adata.obs['image_ID'].unique():
    # Subset (and copy) the object once per image and derive both tables from it
    adata_subset = adata[adata.obs['image_ID'] == image].copy()
    csv_name = image

    # Per-cell metadata without the image column; the index (obs names) is not written
    regionprops = adata_subset.obs.drop(columns=['image_ID'])
    regionprops.to_csv(os.path.join(regionprops_path, csv_name), index=False)

    # Marker intensities with the object number as first column. The number is the
    # second space-separated token of the obs name.
    intensities = adata_subset.to_df()
    intensities.insert(0, 'Object', intensities.index.str.split(' ').str[1])
    intensities.to_csv(os.path.join(intensity_path, csv_name), index=False)