In [1]:
import os
import sys
import warnings

import anndata
import harmonypy as hm
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet

# new packages 
import muon as mu # multi-modal anndatas
import celltypist # automated label predictions
import scirpy # immune cell repertoire analysis

#sc.settings.set_figure_params(dpi=500, dpi_save=1000, figsize=(5,5), facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Work from the SERPENTINE analysis root; the two directory prefixes below are
# relative paths and therefore resolve against this working directory.
os.chdir('/home/groups/singlecell/smorabito/analysis/SERPENTINE/')
data_dir, fig_dir = 'data/', 'figures/'


In [7]:
# Directory containing the counts downloaded from GEO for the Chen 2024 dataset.
# The `counts/` folder and `metadata.txt` are both read from here.
project_dir = "/home/groups/singlecell/smorabito/analysis/SERPENTINE/Chen_2024"

In [9]:
# Load the 10x count matrix from `<project_dir>/counts/` into an AnnData object.
# Parsing the matrix-market files is very slow for a dataset of this size, so
# let scanpy keep an h5ad cache of the parsed matrix and reuse it on later runs.
# NOTE: the cache is not invalidated automatically; delete the cached file if
# the files under `counts/` ever change.
adata = sc.read_10x_mtx("{}/counts/".format(project_dir), cache=True)

In [19]:
# Inspect the dimensions (n_obs, n_vars) of the loaded object before the
# metadata is attached; nothing is modified in this cell.
adata.shape

(975275, 36027)

In [16]:
# Read the space-delimited metadata table that sits next to the counts.
meta = pd.read_table(f"{project_dir}/metadata.txt", sep=' ')

In [20]:
# Attach the metadata table as the per-observation annotations.
# Assigning to `.obs` does not align on labels: rows are taken in the order
# given and the frame's index becomes the observation names. Flag the case
# where the metadata index differs from the barcodes read with the matrix,
# because a reordered table would silently mislabel cells.
if not meta.index.equals(adata.obs_names):
    warnings.warn(
        "metadata index does not match adata.obs_names; "
        "metadata rows are being assigned to cells by position"
    )
adata.obs = meta

In [33]:
# Read the tab-separated, UTF-16 encoded patient annotation table.
# The path is relative, so the file is looked up in the current working
# directory (not in `project_dir`).
patient_meta = pd.read_table(
    "Chen2024_patient_metadata.txt",
    sep='\t',
    encoding='utf-16',
)
# Preview the first rows.
patient_meta.head()

Unnamed: 0,Patient ID,Age,Gender,Cancer Type,Tumor Location,TNM,Tumor stage,dMMR/pMMR,MSI/MSS,POLE Mutation,TMB (Muts/Mb),Tumor Regression Ratio,Response,TRG status,Treatment Regimen
0,P01,51,Male,CRC,Descending colon,T4bN0M0,II,dMMR,MSS,Yes,Not avaliable,0.5648,CR,TRG0,Pembrolizumab
1,P02,56,Male,CRC,Ascending colon,T4bN2M1,IV,pMMR,MSS,No,3.58,-0.0196,SD,TRG3,Pembrolizumab
2,P03,66,Male,CRC,Low rectum,T3bN2aM0,III,pMMR,MSS,No,Not avaliable,0.4541,PR,TRG2,Pembrolizumab + CapeOx
3,P04,47,Female,CRC,Ascending colon,T4aN+M0,III,dMMR,MSI,No,487,0.918,CR,TRG0,Pembrolizumab
4,P05,63,Female,CRC,Low rectum,T3N+M0,III,pMMR,MSS,No,3.05,0.3333,PR,TRG2,Sintilimab+ CapeOx


In [28]:
# Keep the barcodes as a regular column: merging on columns discards the
# left-hand index, so this column is what lets us restore it afterwards.
adata.obs['barcode'] = adata.obs.index.to_list()

# merge with anndata obs (left join: every cell is kept, patient columns are
# filled in where 'Patient' matches 'Patient ID')
temp = adata.obs.merge(patient_meta, how='left', left_on='Patient', right_on='Patient ID')

# A left join adds rows when a 'Patient ID' occurs more than once in
# patient_meta. Fail here with an explicit message rather than with a length
# mismatch at the `.obs` assignment below.
if len(temp) != adata.n_obs:
    raise ValueError(
        "merge produced {} rows for {} cells; check patient_meta for duplicated "
        "'Patient ID' values".format(len(temp), adata.n_obs)
    )

# Restore the barcodes as an *unnamed* index. An index that shares its name
# with the 'barcode' column makes label-based pandas calls (sort_values,
# groupby, merge on 'barcode') ambiguous.
temp.index = temp['barcode'].to_list()
adata.obs = temp

In [34]:
# Persist the annotated object (all tissues) under the data directory.
adata.write(f'{data_dir}Chen2024_full.h5ad')

... storing 'orig.ident' as categorical
... storing 'Ident' as categorical
... storing 'Patient' as categorical
... storing 'Treatment' as categorical
... storing 'Tissue' as categorical
... storing 'MajorCellType' as categorical
... storing 'SubCellType' as categorical
... storing 'Patient ID' as categorical
... storing 'Gender' as categorical
... storing 'Cancer Type' as categorical
... storing 'Tumor Location' as categorical
... storing 'TNM' as categorical
... storing 'Tumor stage' as categorical
... storing 'dMMR/pMMR' as categorical
... storing 'MSI/MSS' as categorical
... storing 'POLE Mutation' as categorical
... storing 'TMB (Muts/Mb)' as categorical
... storing 'Response' as categorical
... storing 'TRG status' as categorical
... storing 'Treatment Regimen' as categorical
... storing 'feature_types' as categorical


In [35]:
# Tally the cells per tissue label; the Tumor / Normal / Blood groups are
# split into individual objects and saved separately.
adata.obs['Tissue'].value_counts()

Tissue
Blood     417162
Tumor     279886
Normal    260294
LN         11353
TN          6580
Name: count, dtype: int64

In [37]:
# Materialise one independent copy of the object per tissue of interest
# (order: Tumor, Blood, Normal).
adata_t, adata_b, adata_n = (
    adata[adata.obs['Tissue'] == tissue].copy()
    for tissue in ('Tumor', 'Blood', 'Normal')
)

In [38]:
# Write each tissue-specific object to its own h5ad file in the data directory.
for subset, template in (
    (adata_t, '{}Chen2024_Tumor.h5ad'),
    (adata_b, '{}Chen2024_Blood.h5ad'),
    (adata_n, '{}Chen2024_Normal.h5ad'),
):
    subset.write(template.format(data_dir))

In [None]:
# save the anndata object as-is