# Align Metadata

In this notebook, we align the metadata between the SERPENTINE and Chen et al. datasets. Where required, we rename columns so that they match across the two datasets. The result of this notebook is a merged, unprocessed AnnData object containing both the SERPENTINE and Chen et al. datasets.

I anticipate that the metadata will occasionally need to be updated when new information becomes available, so any updates to existing objects should also be made in this notebook.

We also re-label the Mast cells here: in our other notebook they were labelled as Myeloid, which is not correct.

In [1]:
# basic python packages
import numpy as np
import pandas as pd
import os
import sys 

# singlecell packages
import scanpy as sc
import anndata


In [2]:
# work from the project root; the two directories below are relative to it
os.chdir('/home/groups/singlecell/smorabito/analysis/SERPENTINE/')
data_dir, fig_dir = 'data/', 'figures/'

In [3]:
# load the SERPENTINE dataset
# The path is built from data_dir (relative to the project directory the
# notebook has already changed into) instead of repeating the absolute path,
# so it stays consistent with the path used when saving the merged object.
adata_met = sc.read_h5ad(
    os.path.join(data_dir, 'SERPENTINE_PCA_27-11-24_annotated.h5ad')
)

In [4]:
# load the Chen et al dataset
# The path is built from data_dir (relative to the project directory the
# notebook has already changed into) instead of repeating the absolute path,
# so it stays consistent with the path used when saving the merged object.
adata_pri = sc.read_h5ad(
    os.path.join(data_dir, 'Chen2024_Tumor_processed_201124.h5ad')
)

In [5]:
# list the metadata (.obs) columns of the SERPENTINE object before harmonising them
adata_met.obs.columns

Index(['Subproject_CNAG', 'Replicate', 'Patient', 'bc', 'batch', 'DOB',
       'Tumor_Type', 'Primary_tumor', 'Histology', 'Microsatellite_status',
       'Molecular_profile', 'N_pre_lines', 'Previous_therapies',
       'Metastases_location', 'Cohort', 'Treatment', 'Biopsy_pre-treat',
       'Biopsy_pre-C2', 'Biopsy', 'BiopsyEOT', 'Best_response', 'Tissue',
       'Timepoint', 'Sample', 'doublet_score', 'predicted_doublet',
       'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
       'leiden', 'AIFI_L1_prediction', 'AIFI_L1_score', 'AIFI_L2_prediction',
       'AIFI_L2_score', 'AIFI_L3_prediction', 'AIFI_L3_score',
       'Immune_All_Low_prediction', 'Immune_All_Low_score',
       'Immune_All_High_prediction', 'Immune_All_High_score', 'cell_type',
       'annotation'],
      dtype='object')

## Fix stuff in the primary tumor dataset

In [6]:
#---------------------------------------------------------#
# Chen et al (primary tumor) dataset: rename .obs columns
# to the harmonised names used for the merge.
#
# 'Treatment' becomes 'Timepoint' while 'Treatment Regimen'
# takes over the 'Treatment' name; both renames are applied
# by the same rename call.
#---------------------------------------------------------#

new_cols = dict([
    ('Ident', 'Replicate'),
    ('MajorCellType', 'cell_type'),
    ('SubCellType', 'orig_annotation'),
    ('Treatment', 'Timepoint'),
    ('MSI/MSS', 'Microsatellite_status'),
    ('Gender', 'Sex'),
    ('Treatment Regimen', 'Treatment'),
    ('Cancer Type', 'Tumor_Type'),
])

adata_pri.obs = adata_pri.obs.rename(new_cols, axis='columns')


In [7]:
# recode the Chen et al 'Timepoint' values from Roman numerals ('I'..'IV') to integers (1..4)
# NOTE(review): the recorded run shows warnings raised on this .replace call —
# check them against the installed pandas version.
roman_to_int = dict(zip(['I', 'II', 'III', 'IV'], range(1, 5)))
adata_pri.obs['Timepoint'] = adata_pri.obs['Timepoint'].replace(roman_to_int)

  adata_pri.obs.Timepoint = adata_pri.obs.Timepoint.replace(
  adata_pri.obs.Timepoint = adata_pri.obs.Timepoint.replace(


In [8]:
# spell out the response abbreviations in the Chen et al dataset
# NOTE(review): the recorded run shows a warning raised on this .replace call.
response_labels = {
    'CR': 'Complete Response',
    'PR': 'Partial Response',
    'SD': 'Stable Disease',
}
adata_pri.obs['Response'] = adata_pri.obs['Response'].replace(response_labels)

  adata_pri.obs.Response = adata_pri.obs.Response.replace({


In [9]:
# store the Chen et al 'Age' values as strings
# NOTE(review): presumably so they can share a column with the SERPENTINE values — confirm
adata_pri.obs['Age'] = adata_pri.obs['Age'].astype(str)

In [10]:
# constant metadata applied to every cell of the Chen et al dataset
# Bracket assignment is used for all three columns: attribute-style assignment
# (obs.Tissue = ...) only updates a column that already exists; if it does not,
# pandas sets a plain Python attribute and no column is created.
adata_pri.obs['dataset'] = 'Chen2024'
adata_pri.obs['Tissue'] = 'Primary'
adata_pri.obs['Molecular_profile'] = 'Missing'

## Fix stuff in the SERPENTINE dataset

In [11]:
# cluster 33 is Mast cell not myeloid
# Register the new category only if it is missing: add_categories raises a
# ValueError when the category already exists, which made this cell fail
# when it was run a second time.
if 'Mast cells' not in adata_met.obs['cell_type'].cat.categories:
    adata_met.obs['cell_type'] = adata_met.obs['cell_type'].cat.add_categories('Mast cells')
adata_met.obs.loc[adata_met.obs['leiden'] == '33', 'cell_type'] = 'Mast cells'

In [12]:
#---------------------------------------------------------#
# SERPENTINE dataset: rename .obs columns to the
# harmonised names used for the merge.
#
# NOTE(review): 'DOB' is renamed to 'Age'. If DOB holds
# dates of birth, the merged 'Age' column will mix them
# with the Chen et al ages — confirm this is intended.
#---------------------------------------------------------#

new_cols = {'Best_response': 'Response', 'DOB': 'Age'}

adata_met.obs = adata_met.obs.rename(new_cols, axis='columns')


In [13]:
# harmonise the SERPENTINE response labels
# The exact-match fixes and the two substring rules are applied in one pass
# over plain Python values. This replaces three passes over the column and
# avoids the Series.replace call that raised a warning in the recorded run.
response_fixes = {
    'Progression disease': 'Progression Disease',
    'Clinical progression': 'Progression Disease',
    'Not revaluated': 'Missing',
    'Screening failure': 'Missing'
}


def harmonize_response(label):
    """Return the harmonised response label for one cell.

    Exact matches in ``response_fixes`` are substituted first. Any label that
    then contains "Stable" becomes "Stable Disease", and any remaining label
    containing "Partial" becomes "Partial Response". Other labels are returned
    unchanged.
    """
    label = response_fixes.get(label, label)
    if "Stable" in label:
        return "Stable Disease"
    if "Partial" in label:
        return "Partial Response"
    return label


adata_met.obs['Response'] = [
    harmonize_response(x) for x in adata_met.obs['Response'].to_list()
]


  adata_met.obs.Response = adata_met.obs.Response.replace({


In [14]:
# check the harmonised Response labels in the SERPENTINE dataset
adata_met.obs['Response'].value_counts()

Response
Progression Disease    196468
Stable Disease          55414
Partial Response        30866
Missing                  9535
Name: count, dtype: int64

In [15]:
# collapse the site-specific SERPENTINE tissue labels into broader tissue names
# NOTE(review): the recorded run shows a warning raised on this .replace call.
tissue_labels = {
    'Lymphnode (right axilary)': 'Lymph node',
    'Lymphnode (iliac)': 'Lymph node',
    'Lymphnode (hepatic iliar)': 'Lymph node',
    'Peritoneum (right iliac fossa)': 'Peritoneum',
    'Peritoneal implant (right iliac fossa)': 'Peritoneum',
}
adata_met.obs['Tissue'] = adata_met.obs['Tissue'].replace(tissue_labels)

  adata_met.obs.Tissue = adata_met.obs.Tissue.replace(


In [16]:
# any tissue label containing "Subcutaneous" (one has a stray character) is
# normalised to "Subcutaneous tissue"; all other labels are left untouched
adata_met.obs['Tissue'] = [
    "Subcutaneous tissue" if "Subcutaneous" in tissue else tissue
    for tissue in adata_met.obs['Tissue'].to_list()
]

In [17]:
# constant metadata applied to every SERPENTINE cell
# ('Sex' is filled with the placeholder 'Missing')
for column, value in (('dataset', 'SERPENTINE'), ('Sex', 'Missing')):
    adata_met.obs[column] = value

In [18]:

# Attach the original SERPENTINE annotations (from Gerard) as 'orig_annotation'.
# The Chen et al counterpart comes from the renamed 'SubCellType' column.

# barcode = cell name up to the first '-'
adata_met.obs['bc'] = [x.split('-')[0] for x in adata_met.obs.index.to_list()]

# load the annotations; the first (unnamed) CSV column holds the cell name
anno_df = pd.read_csv(os.path.join(data_dir, 'SP_annotations.csv'))
anno_df = anno_df.rename(columns={'Unnamed: 0': 'barcode'})
anno_df['bc'] = anno_df['barcode'].str.split('_').str[-1]
anno_df['bc_sample'] = anno_df['sample'].astype(str) + '-' + anno_df['bc'].astype(str)

# build the same "<replicate>-<barcode>" key on the AnnData side
adata_met.obs['bc_sample'] = (
    adata_met.obs['Replicate'].astype(str) + '-' + adata_met.obs['bc'].astype(str)
)

# Look the annotation up by key instead of assigning the output of a left merge
# by position: a duplicated key in the CSV would add rows to the merge result
# and break (or misalign) the assignment. Keeping the first entry per key
# guarantees exactly one value per cell; cells without a match get NaN.
anno_lookup = (
    anno_df
    .drop_duplicates(subset='bc_sample')
    .set_index('bc_sample')['Annotation_2.0']
)
adata_met.obs['orig_annotation'] = adata_met.obs['bc_sample'].map(anno_lookup).to_numpy()


## Combine anndatas

In [19]:
# columns present in both datasets, kept in the SERPENTINE column order
shared_cols = set(adata_pri.obs.columns) & set(adata_met.obs.columns)

# drop the CellTypist "prediction" and "score" columns (not needed anymore)
columns_keep = [
    col for col in adata_met.obs.columns
    if col in shared_cols and 'score' not in col and 'prediction' not in col
]

# The substring filter above also removes 'doublet_score'. Add it back only
# when both datasets actually have it, so the subsetting step cannot fail on
# a column that one dataset lacks.
if 'doublet_score' in shared_cols:
    columns_keep.append('doublet_score')

columns_keep

['Replicate',
 'Patient',
 'Age',
 'Tumor_Type',
 'Microsatellite_status',
 'Molecular_profile',
 'Treatment',
 'Response',
 'Tissue',
 'Timepoint',
 'n_genes_by_counts',
 'total_counts',
 'total_counts_mt',
 'pct_counts_mt',
 'leiden',
 'cell_type',
 'dataset',
 'Sex',
 'orig_annotation',
 'doublet_score']

In [20]:
# keep only the shared columns, in the same order, in both datasets
adata_pri.obs = adata_pri.obs.loc[:, columns_keep]
adata_met.obs = adata_met.obs.loc[:, columns_keep]

In [21]:
# fix the gene ids before merging: use the var index (gene names) as gene_ids
# Bracket assignment creates the column if it is missing; attribute-style
# assignment (var.gene_ids = ...) would only set a Python attribute in that case.
adata_met.var['gene_ids'] = adata_met.var.index.to_list()

In [22]:
# merge the two datasets (Chen et al first, then SERPENTINE); the keyword
# arguments spell out the defaults of AnnData.concatenate
# NOTE(review): the recorded run shows a warning on this call. anndata documents
# AnnData.concatenate as deprecated in favour of anndata.concat; the two differ
# in how .var columns are merged, so verify the result before migrating.
adata = adata_pri.concatenate(
    adata_met,
    join='inner',
    batch_key='batch',
    index_unique='-',
)
adata.shape

  adata = adata_pri.concatenate(adata_met)


(572169, 36017)

## Save the final result

In [23]:
# delete unused .obsm entries
# Each key is removed only if present, so the cell can be re-run without
# raising a KeyError on keys that were already deleted.
unused_obsm = [
    'AIFI_L1',
    'AIFI_L2',
    'AIFI_L3',
    'Immune_All_High',
    'Immune_All_Low',
    'X_pca',
    'X_umap',
]
for key in unused_obsm:
    if key in adata.obsm:
        del adata.obsm[key]


In [24]:
# make X an independent copy of the 'counts' layer
counts_layer = adata.layers['counts']
adata.X = counts_layer.copy()

In [25]:
# write the merged, unprocessed object into the data directory
merged_path = data_dir + 'SERPENTINE_Chen2024_merged_unprocessed_171224.h5ad'
adata.write_h5ad(merged_path)
