## Notebook for Joanito-2022 data processing 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 18 October 2022

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py

In [2]:
def X_is_raw(adata):
    return np.array_equal(adata.X.sum(axis=0).astype(int), adata.X.sum(axis=0))

#### Setup Cells

In [3]:
%matplotlib inline

In [4]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.9.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


#### Upload Data

In [5]:
#Data Upload (csv)
Epithelial_meta = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/Joanito_cancer_dataset/Epithelial_metadata.csv', index_col=[1])
Nonepithelial_meta = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/Joanito_cancer_dataset/NonEpithelial_metadata.csv', index_col=[1])

In [6]:
#Data Upload (csv)
Patientmeta = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/Joanito_cancer_dataset/patient_clinical_information.csv', index_col=[1], encoding='cp1252')

### For anndata==0.8.0 and higher

In [7]:
def read_v3_10x_h5_mod(filename, *, start=None):
    """
    Read hdf5 file from Cell Ranger v3 or later versions.
    """
    with h5py.File(str(filename), 'r') as f:
        try:
            dsets = {}
            sc.readwrite._collect_datasets(dsets, f["matrix"])
            from scipy.sparse import csr_matrix

            M, N = dsets['shape']
            data = dsets['data']
            if dsets['data'].dtype == np.dtype('int32'):
                data = dsets['data'].view('float32')
                data[:] = dsets['data']
            matrix = csr_matrix(
                (data, dsets['indices'], dsets['indptr']),
                shape=(N, M),
            )
            adata = an.AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['name'].astype(str),
                    feature_types=dsets['feature_type'].astype(str),
                    genome=dsets['genome'].astype(str),
                ),
            )
            return adata
        except KeyError:
            raise Exception('File is missing one or more required datasets.')

### For scanpy==1.6.0 and anndata==0.7.4 

In [9]:
def _collect_datasets(dsets: dict, group: h5py.Group):
    for k, v in group.items():
        if isinstance(v, h5py.Dataset):
            dsets[k] = v[()]
        else:
            _collect_datasets(dsets, v)

In [10]:
def read_v3_10x_h5_mod2(filename, *, start=None):
    """
    Read hdf5 file from Cell Ranger v3 or later versions.
    """
    with h5py.File(str(filename), 'r') as f:
        try:
            dsets = {}
            _collect_datasets(dsets, f["matrix"])

            from scipy.sparse import csr_matrix

            M, N = dsets['shape']
            data = dsets['data']
            if dsets['data'].dtype == np.dtype('int32'):
                data = dsets['data'].view('float32')
                data[:] = dsets['data']
            matrix = csr_matrix(
                (data, dsets['indices'], dsets['indptr']),
                shape=(N, M),
            )
            adata = an.AnnData(
                matrix,
                obs=dict(obs_names=dsets['barcodes'].astype(str)),
                var=dict(
                    var_names=dsets['name'].astype(str),
                    feature_types=dsets['feature_type'].astype(str),
                    genome=dsets['genome'].astype(str),
                ),
            )
            return adata
        except KeyError:
            raise Exception('File is missing one or more required datasets.')

In [8]:
# Please use read_v3_10x_h5_mod2 for earlier scanpy and anndata versions and read_v3_10x_h5_mod for anndata later than 0.8.0
Epithelian_data = read_v3_10x_h5_mod('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/Joanito_cancer_dataset/Epithelial_Count_matrix.h5')

  adata = an.AnnData(


In [9]:
# Please use read_v3_10x_h5_mod2 for earlier scanpy and anndata versions and read_v3_10x_h5_mod for anndata later than 0.8.0
Nonepithelian_data = read_v3_10x_h5_mod('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/Joanito_cancer_dataset/NonEpithelial_Count_matrix.h5')

  adata = an.AnnData(


#### Preparing anndata file (obs)

In [10]:
#Rename Nonepithelial_meta
Nonepithelial_meta = Nonepithelial_meta.rename(columns={"nCount_RNA": "cell.ID", "msi": "MSS/MSI"})


In [11]:
Nonepithelial_meta

Unnamed: 0_level_0,cell.ID,nFeature_RNA,percent.mt,sample.ID,patient.ID,sample.origin,dataset,cell.type
nCount_RNA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
29514,CRC16_MUX8563_AAACCTGCAAGCCGCT-1,2577,1.235932,MUX8563,CRC2794,Tumor,CRC-SG1,PlasmaB
3259,CRC16_MUX8563_AAACCTGTCTCGATGA-1,1531,6.171890,MUX8563,CRC2794,Tumor,CRC-SG1,T_NK
71212,CRC16_MUX8563_AAACCTGTCTCTGTCG-1,3083,0.891850,MUX8563,CRC2794,Tumor,CRC-SG1,PlasmaB
3771,CRC16_MUX8563_AAACGGGCAAGTTAAG-1,1686,3.639915,MUX8563,CRC2794,Tumor,CRC-SG1,T_NK
16948,CRC16_MUX8563_AAACGGGCAGTTCATG-1,4304,3.975425,MUX8563,CRC2794,Tumor,CRC-SG1,Fibroblast
...,...,...,...,...,...,...,...,...
10870,KUL5_EXT129_TTTGTCAGTTGGACCC-1,1230,1.356960,EXT129,SC044,Normal,KUL5,PlasmaB
1615,KUL5_EXT129_TTTGTCATCAGCATGT-1,983,9.032534,EXT129,SC044,Normal,KUL5,T_NK
1279,KUL5_EXT129_TTTGTCATCCATGAGT-1,850,7.793522,EXT129,SC044,Normal,KUL5,T_NK
25255,KUL5_EXT129_TTTGTCATCGGTTCGG-1,1595,1.074276,EXT129,SC044,Normal,KUL5,PlasmaB


In [12]:
#Merging cells and patients
merged_data_final = pd.concat([Epithelial_meta, Nonepithelial_meta])
merged_data_final = pd.merge(merged_data_final, Patientmeta, 
                   on='patient.ID', how='left')

In [13]:
print(merged_data_final.loc[49150:49160, ])

                                cell.ID  nFeature_RNA  percent.mt sample.ID  \
49150    KUL5_EXT129_CTGTGCTTCGCTTGTC-1          2926   15.401077    EXT129   
49151    KUL5_EXT129_GAAATGAGTTCCTCCA-1          5779   13.561872    EXT129   
49152    KUL5_EXT129_GATCGCGTCTGCTGTC-1          3934   19.530070    EXT129   
49153    KUL5_EXT129_GGAACTTCAGGAATGC-1          5551   11.547377    EXT129   
49154    KUL5_EXT129_TCACAAGGTTTGGGCC-1          3276    9.897333    EXT129   
49155  CRC16_MUX8563_AAACCTGCAAGCCGCT-1          2577    1.235932   MUX8563   
49156  CRC16_MUX8563_AAACCTGTCTCGATGA-1          1531    6.171890   MUX8563   
49157  CRC16_MUX8563_AAACCTGTCTCTGTCG-1          3083    0.891850   MUX8563   
49158  CRC16_MUX8563_AAACGGGCAAGTTAAG-1          1686    3.639915   MUX8563   
49159  CRC16_MUX8563_AAACGGGCAGTTCATG-1          4304    3.975425   MUX8563   
49160  CRC16_MUX8563_AAAGATGAGTTAACGA-1          3080    6.754530   MUX8563   

      patient.ID sample.origin dataset_x   cell.typ

#### Preparing anndata file (var and X)

In [14]:
#Concatenate 2 anndata files (nonepithelial and epithelian)
Joanito_2022 = Nonepithelian_data.concatenate(Epithelian_data)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [15]:
del Nonepithelian_data, Epithelian_data

#### Merge obs, vars and X

In [16]:
#Create anndata file 
Joanito_2022_anndata = an.AnnData(X=Joanito_2022.X,
                        obs=merged_data_final,
                        var=Joanito_2022.var)



In [17]:
#Chacking final anndata
Joanito_2022_anndata.X

<373058x33287 sparse matrix of type '<class 'numpy.float32'>'
	with 741331835 stored elements in Compressed Sparse Row format>

In [18]:
X_is_raw(Joanito_2022_anndata)

True

#### Rename obs

In [19]:
# Make cell.ID as index in obs
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.set_index('cell.ID')

In [20]:
# Rename columns in obs 
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"sample.ID": "Sample_ID"})
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"patient.ID": "Donor_ID"})
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"cell.type": "Cell Type"})
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"sample.origin": "Diagnosis"})
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"Sidedness": "Side"})
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"Site": "Location"})
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"Age at recruitment": "Tumor Stage"})
Joanito_2022_anndata.obs['Age_group'] = 'Adult'
Joanito_2022_anndata.obs['Study_name'] = 'Joanito_cancer'
Joanito_2022_anndata.obs = Joanito_2022_anndata.obs.rename(columns={"percent.mt": "pct_counts_mito"})
#del Joanito_2022_anndata.obs['nFeature_RNA']

In [21]:
Joanito_2022_anndata.obs['Gender'].replace({'M': 'Male', 
                                        'F': 'Female'}, inplace=True)

In [22]:
Joanito_2022_anndata.obs['Cell Type'].replace({'T_NK': 'T cells', 
                                        'PlasmaB': 'Plasma cells',
                                        'Fibroblast': 'Mesenchymal',
                                        'B': 'B cells'}, inplace=True)

In [23]:
# Calculate quality metrics for cancer dataset
sc.pp.calculate_qc_metrics(Joanito_2022_anndata, inplace=True)

In [24]:
Joanito_2022_anndata.obs

Unnamed: 0_level_0,nFeature_RNA,pct_counts_mito,Sample_ID,Donor_ID,Diagnosis,dataset_x,Cell Type,iCMS,msi,dataset_y,...,Age_group,Study_name,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes
cell.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CRC16_MUX8563_AAACGGGGTCGATTGT-1,5099,17.969349,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,CRC-SG1,...,Adult,Joanito_cancer,2577,7.854769,29514.0,10.292654,82.221996,84.563258,87.172189,91.044928
CRC16_MUX8563_AAAGATGCAGAAGCAC-1,4759,23.734351,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,CRC-SG1,...,Adult,Joanito_cancer,1531,7.334329,3259.0,8.089482,34.581160,42.405646,52.347346,68.364529
CRC16_MUX8563_AAAGCAATCTAACGGT-1,2580,24.403016,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,Normal,MSS,CRC-SG1,...,Adult,Joanito_cancer,3083,8.033983,71212.0,11.173430,89.725046,91.269730,92.697860,94.829523
CRC16_MUX8563_ACAGCCGGTCTCTTAT-1,2499,11.020450,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,CRC-SG1,...,Adult,Joanito_cancer,1686,7.430707,3771.0,8.235361,32.962079,41.500928,51.259613,68.363829
CRC16_MUX8563_ACAGCTATCCGTCATC-1,4937,23.243570,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,CRC-SG1,...,Adult,Joanito_cancer,4304,8.367532,16948.0,9.737964,31.602549,39.951617,49.032334,62.030918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KUL5_EXT129_TTTGTCAGTTGGACCC-1,1230,1.356960,EXT129,SC044,Normal,KUL5,Plasma cells,,,KUL5,...,Adult,Joanito_cancer,2926,7.981733,8000.0,8.987322,26.037500,35.625000,46.200000,61.250000
KUL5_EXT129_TTTGTCATCAGCATGT-1,983,9.032534,EXT129,SC044,Normal,KUL5,T cells,,,KUL5,...,Adult,Joanito_cancer,5779,8.662159,28255.0,10.249061,23.542736,32.503982,42.456202,57.165104
KUL5_EXT129_TTTGTCATCCATGAGT-1,850,7.793522,EXT129,SC044,Normal,KUL5,T cells,,,KUL5,...,Adult,Joanito_cancer,3934,8.277666,11016.0,9.307195,23.638344,32.452796,41.485113,55.637255
KUL5_EXT129_TTTGTCATCGGTTCGG-1,1595,1.074276,EXT129,SC044,Normal,KUL5,Plasma cells,,,KUL5,...,Adult,Joanito_cancer,5551,8.621914,26544.0,10.186597,27.946052,36.765371,46.470012,59.877939


In [25]:
# Delete unnecessary columns
del Joanito_2022_anndata.obs['log1p_total_counts'], Joanito_2022_anndata.obs['pct_counts_in_top_50_genes'], Joanito_2022_anndata.obs['log1p_n_genes_by_counts']
del Joanito_2022_anndata.obs['pct_counts_in_top_100_genes'], Joanito_2022_anndata.obs['pct_counts_in_top_200_genes'], Joanito_2022_anndata.obs['pct_counts_in_top_500_genes']

### Filter by quality metrics

In [26]:
#Filtering by genes
Joanito_2022_anndata = Joanito_2022_anndata[Joanito_2022_anndata.obs.n_genes_by_counts < 4000, :]
Joanito_2022_anndata = Joanito_2022_anndata[Joanito_2022_anndata.obs.n_genes_by_counts > 200, :]

#Filtering by counts
Joanito_2022_anndata = Joanito_2022_anndata[Joanito_2022_anndata.obs.total_counts < 20000, :]

### Save datasets

In [27]:
#Saving the final anndata file
Joanito_2022_anndata.write('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_all_cells.h5ad')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c


In [28]:
# Filter only tumor cells
Joanito_2022_anndata = Joanito_2022_anndata[Joanito_2022_anndata.obs['Diagnosis'] == 'Tumor']

#Saving the final anndata file
Joanito_2022_anndata.write('/Users/anna.maguza/Desktop/Data/Gut_project/Joanito_cancer/anndata/Joanito_raw_anndata_tumor_cells.h5ad')