## Notebook for the Integrated Healthy dataset and Colon Cancer Atlas
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 15 May 2023

#### Load required packages

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import scipy as sci

#### Setup Cells

In [3]:
%matplotlib inline

In [4]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


In [5]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

#### Upload Data

In [6]:
input_path_healthy = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Datasets Integration/Integrated datasets/All_cells_5000_HVGs_scvi_scanvi.h5ad'
Healthy_adata = sc.read_h5ad(input_path_healthy)

In [7]:
X_is_raw(Healthy_adata)

True

In [8]:
input_path_cancer = '/Users/anna.maguza/Desktop/Data/Gut_project/Human_Colon_Cancer_Atlas/anndata/Colon_cancer_atlas_anndata.h5ad'
Cancer_adata = sc.read_h5ad(input_path_cancer)

In [9]:
X_is_raw(Cancer_adata)

False

In [10]:
# Upload unprocessed dataset (before HVGs extraction)
input_path_healthy_all_counts = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Healthy_reference/GCA_Kong_Smillie_Wang_unprocessed/GCA_Kong_Wang_Smillie_raw.h5ad'
healthy_all_counts = sc.read_h5ad(input_path_healthy_all_counts)

In [11]:
X_is_raw(healthy_all_counts)

True

### Preprocess X in Healthy data

In [12]:
healthy_all_counts.X

<557099x23616 sparse matrix of type '<class 'numpy.float32'>'
	with 794816608 stored elements in Compressed Sparse Row format>

In [13]:
Healthy_adata

AnnData object with n_obs × n_vars = 557099 × 5000
    obs: 'Sample_ID', 'Cell Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Gender', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'Age_group_colors', 'Cell Type_colors', 'Diagnosis_colors', 'Donor_ID_colors', 'Gender_colors', 'Library_Preparation_Protocol_colors', 'Location_colors', 'Study_name_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'neighb

In [14]:
healthy_all_counts

AnnData object with n_obs × n_vars = 557099 × 23616
    obs: 'Cell Type', 'batch', 'Sample_ID', 'n_genes', 'n_counts', 'Chem', 'Location', 'Donor_ID', 'Layer', 'Cell States', 'Gender', 'library_preparation_protocol__ontology_label', 'Diagnosis', 'n_genes_by_counts', 'total_counts_mito', 'percent_mito', 'total_counts_ribo', 'percent_ribo', 'doublet_scores', 'predicted_doublets', 'Study_name', 'Age_group', 'Cell States Kong', 'UniqueCell_ID', 'Age', 'Region code', 'Fraction', 'Cell States GCA'
    var: 'gene_id-Kong', 'gene_name-Kong', 'n_cells_by_counts-Kong', 'mean_counts-Kong', 'log1p_mean_counts-Kong', 'pct_dropout_by_counts-Kong', 'total_counts-Kong', 'log1p_total_counts-Kong', 'mito-Kong', 'ribo-Kong', 'highly_variable-Kong', 'highly_variable_rank-Kong', 'means-Kong', 'variances-Kong', 'variances_norm-Kong'

In [15]:
# Transfer var and X from unprocessed dataset to processed one
healthy_all_counts.obs = Healthy_adata.obs
healthy_all_counts.obsm = Healthy_adata.obsm
healthy_all_counts.uns = Healthy_adata.uns
healthy_all_counts.obsp = Healthy_adata.obsp

In [16]:
del Healthy_adata

### Preprocess obs in Cancer data

In [17]:
Cancer_adata.obs_keys()

['biosample_id',
 'donor_id',
 'SpecimenType',
 'TissueSource',
 'ProcessingMethod',
 'PatientTypeID',
 'sex',
 'Site',
 'Grade',
 'TumorStage',
 'LymphNodeStatus',
 'MMRStatusTumor',
 'MMRMLH1Tumor',
 'qc_geneCount',
 'qc_logMappedReads',
 'qc_meanReadsPerUmi',
 'qc_totalReads',
 'qc_logUmiCount',
 'qc_bcSwapFraction',
 'qc_geneSatFraction',
 'qc_seqDupEst',
 'qc_umiSatFraction',
 'qc_emptyDropPval',
 'qc_mitoFraction',
 'species',
 'species__ontology_label',
 'disease',
 'disease__ontology_label',
 'organ',
 'organ__ontology_label',
 'library_preparation_protocol',
 'library_preparation_protocol__ontology_label',
 'ClusterFull',
 'ClusterMidway',
 'ClusterTop']

In [18]:
Cancer_adata.obs['disease__ontology_label'].value_counts()

colon adenocarcinoma    258359
normal                  112864
Name: disease__ontology_label, dtype: int64

In [19]:
# Filter only cancer cells
Cancer_adata = Cancer_adata[Cancer_adata.obs['disease__ontology_label'] == 'colon adenocarcinoma', :]

In [20]:
Cancer_adata.obs_keys

<bound method AnnData.obs_keys of View of AnnData object with n_obs × n_vars = 258359 × 43282
    obs: 'biosample_id', 'donor_id', 'SpecimenType', 'TissueSource', 'ProcessingMethod', 'PatientTypeID', 'sex', 'Site', 'Grade', 'TumorStage', 'LymphNodeStatus', 'MMRStatusTumor', 'MMRMLH1Tumor', 'qc_geneCount', 'qc_logMappedReads', 'qc_meanReadsPerUmi', 'qc_totalReads', 'qc_logUmiCount', 'qc_bcSwapFraction', 'qc_geneSatFraction', 'qc_seqDupEst', 'qc_umiSatFraction', 'qc_emptyDropPval', 'qc_mitoFraction', 'species', 'species__ontology_label', 'disease', 'disease__ontology_label', 'organ', 'organ__ontology_label', 'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'ClusterFull', 'ClusterMidway', 'ClusterTop'
    var: 'gene_name'>

In [21]:
# Delete unnecessary columns
del Cancer_adata.obs['disease'], Cancer_adata.obs['species'], Cancer_adata.obs['species__ontology_label'], Cancer_adata.obs['organ'], Cancer_adata.obs['library_preparation_protocol']
del Cancer_adata.obs['qc_geneCount']
del Cancer_adata.obs['qc_logMappedReads']
del Cancer_adata.obs['qc_meanReadsPerUmi']
del Cancer_adata.obs['qc_totalReads']
del Cancer_adata.obs['qc_logUmiCount']
del Cancer_adata.obs['qc_bcSwapFraction']
del Cancer_adata.obs['qc_geneSatFraction']
del Cancer_adata.obs['qc_seqDupEst']
del Cancer_adata.obs['qc_umiSatFraction']
del Cancer_adata.obs['qc_emptyDropPval']
del Cancer_adata.obs['qc_mitoFraction']

In [22]:
Cancer_adata.obs['Study_name'] = 'Colon_cancer_atlas'

  Cancer_adata.obs['Study_name'] = 'Colon_cancer_atlas'


In [23]:
# Rename columns in Cancer_adata as in Healthy_adata
Cancer_adata.obs.rename(columns={'biosample_id': 'Sample_ID',
                                  'donor_id': 'Donor_ID',
                                  'disease__ontology_label': 'Diagnosis',
                                  'library_preparation_protocol__ontology_label': 'Library_Preparation_Protocol',
                                  'sex': 'Gender',
                                  'organ': 'Location',
                                  'ClusterTop': 'Cell Type'}, inplace=True)

In [24]:
# Rename values in Cancer_adata.obs['ClusterTop'] as in Healthy_adata.obs['Cell Type']
Cancer_adata.obs['Cell Type'].replace({'Epi': 'Epithelial', 
                                        'Plasma': 'Plasma cells',
                                        'TNKILC': 'T cells',
                                        'B': 'B cells',
                                        'Strom': 'Mesenchymal',
                                        'Mast': 'Myeloid'}, inplace=True)

In [25]:
# Rename values in Cancer_adata.obs['ClusterTop'] as in Healthy_adata.obs['Cell Type']
healthy_all_counts.obs['Cell Type'].replace({'Stem Cell': 'Epithelial'}, inplace=True)

In [26]:
# Rename values in Cancer_adata.obs as in Healthy_adata.obs
Cancer_adata.obs['Gender'].replace({'male': 'Male', 
                                        'female': 'Female'}, inplace=True)

Cancer_adata.obs['Library_Preparation_Protocol'].replace({"10X 3' v2 sequencing": "10x 3' v2", 
                                        "10X 3' v3 sequencing": "10x 3' v3"}, inplace=True)

In [27]:
# Rename values in Cancer_adata.obs['ClusterTop'] as in Healthy_adata.obs['Cell Type']
healthy_all_counts.obs['Location'].replace({'SmallInt' : 'Small Intestine',
                                            'Small Bowel' : 'Small Intestine',
                                            'LargeInt': 'Large Intestine',
                                            'Colon': 'Large Intestine',
                                            'REC' : 'Rectum',
                                            'Epi': 'Epithelium',
                                            'LP': 'Lamina Propria'}, inplace=True)

### Make the QC the same as in the Healthy dataset

In [28]:
# Calculate quality metrics for cancer dataset
sc.pp.calculate_qc_metrics(Cancer_adata, inplace=True)

In [29]:
# Make gene_id as a first column in adata.var
Cancer_adata.var.insert(0, 'gene_id', Cancer_adata.var.index)
# Make a gene name as index 
Cancer_adata.var.index = Cancer_adata.var['gene_name']

In [30]:
# Calculate mitochondrial fraction
Cancer_adata.var['mito'] = Cancer_adata.var_names.str.startswith(("MT-"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(Cancer_adata, qc_vars=['mito'], percent_top=None, log1p=False, inplace=True)
Cancer_adata.var['ribo'] = Cancer_adata.var_names.str.startswith(("RPS","RPL"))  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(Cancer_adata, qc_vars=['ribo'], percent_top=None, log1p=False, inplace=True)

In [31]:
Cancer_adata.obs

Unnamed: 0_level_0,Sample_ID,Donor_ID,SpecimenType,TissueSource,ProcessingMethod,PatientTypeID,Gender,Site,Grade,TumorStage,...,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,total_counts_mito,pct_counts_mito,total_counts_ribo,pct_counts_ribo
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,Male,left,low,notT4,...,4603.801270,8.434855,14.948589,21.813485,32.439550,48.347822,68.937042,1.497394,445.594940,9.678848
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,Male,left,low,notT4,...,2108.768555,7.654334,26.236284,38.976408,57.089805,99.936938,94.047325,4.459822,371.848663,17.633450
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,Male,left,low,notT4,...,5887.731934,8.680796,12.383847,17.824891,26.005590,39.622050,76.699623,1.302702,400.767029,6.806816
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,Male,left,low,notT4,...,4969.738281,8.511324,13.649248,20.067856,29.738970,45.172732,75.586861,1.520943,429.411682,8.640530
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC,C103_T_1_1_0_c1_v2,C103,T,MGH,unsorted,C103_T,Male,left,low,notT4,...,5981.045410,8.696518,11.909785,17.189286,25.343654,39.579511,78.871780,1.318696,373.194397,6.239618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,Female,left,high,T4,...,5487.610352,8.610431,11.153262,16.671019,24.364308,39.993964,80.156494,1.460681,246.709534,4.495755
C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,Female,left,high,T4,...,5541.674316,8.620233,13.388294,18.677148,26.559739,40.406723,94.976021,1.713851,358.535309,6.469801
C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,Female,left,high,T4,...,2635.017578,7.877025,15.402242,22.468265,34.015013,65.510289,103.340820,3.921827,67.731804,2.570450
C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT,C173_T_0_0_0_c1_v3,C173,T,DFCI,unsorted,C173_T,Female,left,high,T4,...,2512.186279,7.829307,22.150078,33.266912,49.832953,87.480684,78.763542,3.135259,368.961121,14.686854


In [32]:
# Delete unnecessary columns
del Cancer_adata.obs['log1p_total_counts'], Cancer_adata.obs['pct_counts_in_top_50_genes'], Cancer_adata.obs['log1p_n_genes_by_counts']
del Cancer_adata.obs['pct_counts_in_top_100_genes'], Cancer_adata.obs['pct_counts_in_top_200_genes'], Cancer_adata.obs['pct_counts_in_top_500_genes']

In [33]:
# Write objects
Cancer_adata.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Colon_cancer_atlas_anndata.h5ad')
healthy_all_counts.write_h5ad('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad')

# Prepare Epithelial cells

In [27]:
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Healthy_integrated_data_all_genes.h5ad'
healthy_all_counts = sc.read_h5ad(input)
input = '/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/all_cells/Colon_cancer_atlas_anndata.h5ad'
Cancer_adata = sc.read_h5ad(input)



In [28]:
healthy_all_counts.obs['Diagnosis'].value_counts()

Healthy adult                            295158
Fetal Healthy                            231646
Pediatric healthy                         29265
Adult Ulcerative Colitis Non-inflamed      1030
Name: Diagnosis, dtype: int64

In [29]:
# Filter out fetal and pediatric samples
healthy_all_counts = healthy_all_counts[healthy_all_counts.obs['Diagnosis'] != 'Fetal Healthy', :]
healthy_all_counts = healthy_all_counts[healthy_all_counts.obs['Diagnosis'] != 'Pediatric healthy', :]

In [17]:
healthy_all_counts.obs

Unnamed: 0_level_0,Sample_ID,Cell Type,Study_name,Donor_ID,Diagnosis,Age,Region code,Fraction,Gender,Library_Preparation_Protocol,...,dataset,n_genes_by_counts,total_counts,total_counts_mito,pct_counts_mito,total_counts_ribo,pct_counts_ribo,Cell_ID,_scvi_batch,_scvi_labels
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCATGATAGTCAAGGC-1-WTDAtest7770718,A30-SCL-6-SC-45P-2,B cells,Gut Cell Atlas,A30 (398B),Healthy adult,20-25,SCL,SC-45P,Female,3',...,reference,1356,4981.0,219.0,4.396708,1788.0,35.896404,GCATGATAGTCAAGGC-1-WTDAtest7770718,0,0
H158108_N1-GAACGGACACTTAACG,H158108_N1,Mesenchymal,Kong 2023,158108,Healthy adult,,,,Male,10x 3' v2,...,query,883,1628.0,17.0,1.044226,251.0,15.417690,H158108_N1-GAACGGACACTTAACG,0,3
H197396_N1-TATGCCCAGGGTGTGT,H197396_N1,T cells,Kong 2023,197396,Healthy adult,,,,Male,10x 3' v2,...,query,414,678.0,51.0,7.522124,100.0,14.749263,H197396_N1-TATGCCCAGGGTGTGT,0,9
N17_LP_A-AGCGAACTTATGGC,N17_LP_A,Plasma cells,Kong 2023,N17,Healthy adult,,,,Male,10x 3' v1,...,query,231,1526.0,2.0,0.131062,159.0,10.419397,N17_LP_A-AGCGAACTTATGGC,0,6
H158108_N1-GTTAAGCAGAGGTAGA,H158108_N1,Epithelial,Kong 2023,158108,Healthy adult,,,,Male,10x 3' v2,...,query,806,2072.0,15.0,0.723938,185.0,8.928572,H158108_N1-GTTAAGCAGAGGTAGA,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H180844_N1-ATAGACCTCTAACTGG,H180844_N1,Epithelial,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,...,query,379,752.0,79.0,10.505320,207.0,27.526596,H180844_N1-ATAGACCTCTAACTGG,0,2
H180844_N4-TTTGCGCCATGCGCAC,H180844_N4,Epithelial,Kong 2023,180844,Healthy adult,,,,Male,10x 3' v2,...,query,407,807.0,132.0,16.356878,49.0,6.071871,H180844_N4-TTTGCGCCATGCGCAC,0,2
N51_LP_A-ACCTTTAGTTTGTTTC,N51_LP_A,Plasma cells,Kong 2023,N51,Healthy adult,,,,Male,10x 3' v2,...,query,1470,9510.0,175.0,1.840168,2198.0,23.112513,N51_LP_A-ACCTTTAGTTTGTTTC,0,6
N21_LP_B-AGAAGATGGTAAGA,N21_LP_B,Myeloid,Kong 2023,N21,Healthy adult,,,,Female,10x 3' v1,...,query,1273,4985.0,115.0,2.306921,900.0,18.054161,N21_LP_B-AGAAGATGGTAAGA,0,4


### Unify Cell States in Healthy reference

In [30]:
# Subset only epitheleal cells
Cancer_adata = Cancer_adata[Cancer_adata.obs['Cell Type'] == 'Epithelial', :]
healthy_all_counts = healthy_all_counts[healthy_all_counts.obs['Cell Type'] == 'Epithelial', :]


In [31]:
healthy_all_counts.obs['Unified Cell States'] = healthy_all_counts.obs['Cell States']

  healthy_all_counts.obs['Unified Cell States'] = healthy_all_counts.obs['Cell States']


In [32]:
healthy_all_counts.obs['Unified Cell States'].replace({"Enterocytes TMIGD1 MEP1A": 'Enterocyte',
                                               'Enterocytes CA1 CA2 CA4-': 'Enterocyte',
                                               'Enterocytes TMIGD1 MEP1A GSTA1': 'Enterocyte',
                                               'Stem cells OLFM4': 'Stem cells',
                                               'Stem cells OLFM4 LGR5': 'Stem cells',
                                               'Stem_Cells_GCA': 'Stem cells',
                                               'Stem cells OLFM4 PCNA': 'Stem cells',
                                               'Stem_Cells_ext': 'Stem cells',
                                               'Stem cells OLFM4 GSTA1': 'Stem cells',
                                               'Tuft': 'Tuft cells',
                                               'Paneth': 'Paneth cells',
                                               'Goblet cells SPINK4': 'Goblet cells',
                                               'Goblet cell': 'Goblet cells',
                                               'Goblet cells MUC2 TFF1-': 'Goblet cells',
                                               'Goblet cells MUC2 TFF1': 'Goblet cells',
                                               'BEST2+ Goblet cell': 'Goblet cells',
                                               'L cells (PYY+)': 'L cells',
                                               'EC cells (TAC1+)': 'Enterochromaffin cells',
                                               'EC cells (NPW+)': 'Enterochromaffin cells',
                                               'EECs': 'Enteroendocrine cells',
                                               'BEST4+ epithelial': 'Enterocytes BEST4'}, inplace=True)

### Counts Normalization

In [33]:
sc.pp.log1p(Cancer_adata)
sc.pp.log1p(healthy_all_counts)

### Extract Highly Variable Genes

In [34]:
healthy_all_counts.layers['raw_counts'] = healthy_all_counts.X.copy()

### HVGs selection
# Calculate HVGs for cancer dataset
sc.pp.highly_variable_genes(
    healthy_all_counts,
    flavor = "seurat_v3",
    n_top_genes = 3000,
    layer = "raw_counts",
    batch_key = "Library_Preparation_Protocol",
    subset = True,
    span = 1
)

In [37]:
df = healthy_all_counts.obs['Cell States'].value_counts()

: 

In [24]:
Cancer_adata.layers['raw_counts'] = Cancer_adata.X.copy()

In [25]:
# Extract same HVGs in the cancer dataset as in the healthy dataset

#Make indexes as string
Cancer_adata.var.index = Cancer_adata.var.index.astype(str)

# Ensure indexes are unique
Cancer_adata.var_names_make_unique()

# Identify common genes
common_genes = list(set(healthy_all_counts.var_names) & set(Cancer_adata.var_names))

# Filter genes
healthy_all_counts = healthy_all_counts[:, common_genes]
Cancer_adata = Cancer_adata[:, common_genes]

#Ensure the same order of the genes
Cancer_adata = Cancer_adata[:, healthy_all_counts.var_names]



In [26]:
# Save anndata objects
Cancer_adata.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Colon_cancer_atlas_normalized_3KA.h5ad')
healthy_all_counts.write('/Users/anna.maguza/Desktop/Data/Processed_datasets/Cancer_dataset_integration/input_files/Epithelial_cells/Epithelial_Healthy_anndata_normalized_3K.h5ad')

  df[key] = c
