## Notebook for Joanito-2022 data processing 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 18 October 2022

#### Load packages

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import scrublet
import h5py

#### Setup Cells

In [None]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [None]:
# Logging verbosity levels: errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Print the versions of scanpy and its main dependencies for provenance.
sc.logging.print_header()
# Default figure look for every scanpy plot produced in this notebook.
sc.settings.set_figure_params(facecolor='white', dpi=80)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.8.1 pandas==1.4.2 scikit-learn==1.1.1 statsmodels==0.13.2 python-igraph==0.9.11 louvain==0.7.1 pynndescent==0.5.7


#### Upload Data

In [None]:
#Data Upload (csv): per-cell metadata for the epithelial and non-epithelial cells
# index_col=[1] turns the second CSV column into the row index of each table.
# NOTE(review): the index values shown further down look like per-cell count
# totals rather than identifiers; an index is dropped by a column-based merge,
# so confirm that the second column is really meant to be the index.
Epithelial_meta = pd.read_csv(
    '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/Joanito-Epithelial_metadata.csv',
    index_col=[1],
)
Nonepithelial_meta = pd.read_csv(
    '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/Joanito-Nonepithelial_metadata.csv',
    index_col=[1],
)

In [None]:
#Data Upload (csv): per-patient clinical information
# The file is decoded as Windows-1252 (cp1252); the second CSV column becomes
# the row index.
Patientmeta = pd.read_csv(
    '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/Joanito-patient_clinical_information.csv',
    index_col=[1],
    encoding='cp1252',
)

In [None]:
def read_v3_10x_h5_mod(filename, *, start=None):
    """
    Load a Cell Ranger (v3 or later) HDF5 count matrix as an AnnData object.

    Every dataset below the file's ``matrix`` group is gathered, the sparse
    triplet (``data``, ``indices``, ``indptr``) is assembled into a CSR matrix
    with one row per barcode, and the result is wrapped in an AnnData whose
    ``var`` table carries ``feature_types`` and ``genome``.

    Parameters
    ----------
    filename
        Path of the ``.h5`` file; passed to ``h5py.File`` as ``str(filename)``.
    start
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    AnnData
        Observations are named after ``barcodes`` and variables after ``name``.

    Raises
    ------
    Exception
        If a ``KeyError`` is raised while collecting the datasets or building
        the object, i.e. a required dataset is missing from the file.

    NOTE(review): the ``_mod`` suffix suggests this is an adapted copy of
    scanpy's internal v3 reader -- confirm which fields were intentionally
    left out.
    """
    with h5py.File(str(filename), 'r') as handle:
        from scipy.sparse import csr_matrix

        fields = {}
        try:
            sc.readwrite._collect_datasets(fields, handle["matrix"])

            # The stored shape is unpacked as (columns, rows) of the matrix
            # built below: rows correspond to barcodes.
            n_cols, n_rows = fields['shape']

            values = fields['data']
            if values.dtype == np.dtype('int32'):
                # Take a float32 view over the same buffer, then assign the
                # integer counts into it so they are stored as float32 values.
                values = fields['data'].view('float32')
                values[:] = fields['data']

            counts = csr_matrix(
                (values, fields['indices'], fields['indptr']),
                shape=(n_rows, n_cols),
            )

            cell_table = dict(obs_names=fields['barcodes'].astype(str))
            gene_table = dict(
                var_names=fields['name'].astype(str),
                feature_types=fields['feature_type'].astype(str),
                genome=fields['genome'].astype(str),
            )
            return an.AnnData(counts, obs=cell_table, var=gene_table)
        except KeyError:
            raise Exception('File is missing one or more required datasets.')

In [None]:
# Count matrix of the epithelial cells, read with the HDF5 reader defined above.
Epithelian_data = read_v3_10x_h5_mod(
    '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/Joanito-Epithelial_Count_matrix.h5'
)

  adata = an.AnnData(


In [None]:
# Count matrix of the non-epithelial cells, read with the same HDF5 reader.
Nonepithelian_data = read_v3_10x_h5_mod(
    '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/Joanito-Nonepithelial_Count_matrix.h5'
)

  adata = an.AnnData(


#### Preparing anndata file (obs)

In [None]:
#Rename Nonepithelial_meta columns
# DataFrame.rename skips labels that are not present among the columns.
# NOTE(review): the table displayed below has neither an "nCount_RNA" nor an
# "msi" column header, so this call may currently change nothing -- verify
# against the CSV header.
Nonepithelial_meta = Nonepithelial_meta.rename(
    columns={
        "nCount_RNA": "cell.ID",
        "msi": "MSS/MSI",
    }
)


In [None]:
# Display the non-epithelial metadata table after the rename step.
Nonepithelial_meta

Unnamed: 0,cell.ID,nFeature_RNA,percent.mt,sample.ID,patient.ID,sample.origin,dataset,cell.type
29514,CRC16_MUX8563_AAACCTGCAAGCCGCT-1,2577,1.235932,MUX8563,CRC2794,Tumor,CRC-SG1,PlasmaB
3259,CRC16_MUX8563_AAACCTGTCTCGATGA-1,1531,6.171890,MUX8563,CRC2794,Tumor,CRC-SG1,T_NK
71212,CRC16_MUX8563_AAACCTGTCTCTGTCG-1,3083,0.891850,MUX8563,CRC2794,Tumor,CRC-SG1,PlasmaB
3771,CRC16_MUX8563_AAACGGGCAAGTTAAG-1,1686,3.639915,MUX8563,CRC2794,Tumor,CRC-SG1,T_NK
16948,CRC16_MUX8563_AAACGGGCAGTTCATG-1,4304,3.975425,MUX8563,CRC2794,Tumor,CRC-SG1,Fibroblast
...,...,...,...,...,...,...,...,...
10870,KUL5_EXT129_TTTGTCAGTTGGACCC-1,1230,1.356960,EXT129,SC044,Normal,KUL5,PlasmaB
1615,KUL5_EXT129_TTTGTCATCAGCATGT-1,983,9.032534,EXT129,SC044,Normal,KUL5,T_NK
1279,KUL5_EXT129_TTTGTCATCCATGAGT-1,850,7.793522,EXT129,SC044,Normal,KUL5,T_NK
25255,KUL5_EXT129_TTTGTCATCGGTTCGG-1,1595,1.074276,EXT129,SC044,Normal,KUL5,PlasmaB


In [None]:
#Merging cells and patients
# Stack the epithelial rows on top of the non-epithelial rows, then attach the
# clinical record of each cell's patient. The left join keeps every cell row,
# including cells whose patient has no clinical record.
merged_data_final = (
    pd.concat([Epithelial_meta, Nonepithelial_meta])
    .merge(Patientmeta, on='patient.ID', how='left')
)

In [None]:
# Print the rows labelled 49150 through 49160 (label-based, both ends included).
# NOTE(review): presumably chosen to inspect the seam between the two stacked
# metadata tables -- confirm.
print(merged_data_final.loc[49150:49160])

                                cell.ID  nFeature_RNA  percent.mt sample.ID  \
49150    KUL5_EXT129_CTGTGCTTCGCTTGTC-1          2926   15.401077    EXT129   
49151    KUL5_EXT129_GAAATGAGTTCCTCCA-1          5779   13.561872    EXT129   
49152    KUL5_EXT129_GATCGCGTCTGCTGTC-1          3934   19.530070    EXT129   
49153    KUL5_EXT129_GGAACTTCAGGAATGC-1          5551   11.547377    EXT129   
49154    KUL5_EXT129_TCACAAGGTTTGGGCC-1          3276    9.897333    EXT129   
49155  CRC16_MUX8563_AAACCTGCAAGCCGCT-1          2577    1.235932   MUX8563   
49156  CRC16_MUX8563_AAACCTGTCTCGATGA-1          1531    6.171890   MUX8563   
49157  CRC16_MUX8563_AAACCTGTCTCTGTCG-1          3083    0.891850   MUX8563   
49158  CRC16_MUX8563_AAACGGGCAAGTTAAG-1          1686    3.639915   MUX8563   
49159  CRC16_MUX8563_AAACGGGCAGTTCATG-1          4304    3.975425   MUX8563   
49160  CRC16_MUX8563_AAAGATGAGTTAACGA-1          3080    6.754530   MUX8563   

      patient.ID sample.origin dataset_x   cell.typ

#### Preparing anndata file (var and X)

In [None]:
#Concatenate the 2 anndata files (non-epithelial and epithelial)
# Nonepithelian_data is the object the method is called on and Epithelian_data
# is the one appended to it.
Joanito_2022 = Nonepithelian_data.concatenate(
    Epithelian_data,
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


#### Final Part in preparing anndata file: Merging obs (clinical and cell data) with X and var (gene counts and genes)

In [None]:
#Create anndata file
# AnnData pairs `obs` with `X` strictly by row position. `merged_data_final`
# stacks the epithelial cells first, whereas `Joanito_2022` was built as
# Nonepithelian_data.concatenate(Epithelian_data), i.e. non-epithelial first.
# Attaching the table as-is would therefore give every cell the annotation of
# a different cell, so the metadata is re-ordered by barcode beforehand.

# Metadata keyed by barcode. The `cell.ID` column is kept for downstream use;
# the index is left unnamed so it cannot clash with that column on write.
meta_by_barcode = merged_data_final.set_index('cell.ID', drop=False)
meta_by_barcode.index.name = None
if meta_by_barcode.index.has_duplicates:
    raise ValueError(
        'cell.ID is not unique in merged_data_final; '
        'cannot align the metadata to the count matrix.'
    )

# Barcode of every matrix row. Names that are not found verbatim in the
# metadata have their last "-<suffix>" removed, which undoes the per-batch
# suffix that `concatenate` appends to observation names by default.
matrix_names = pd.Index(Joanito_2022.obs_names)
names_without_suffix = matrix_names.str.rsplit('-', n=1).str[0]
matrix_barcodes = matrix_names.where(
    matrix_names.isin(meta_by_barcode.index),
    names_without_suffix,
)

# Fail loudly instead of silently producing NaN-filled or shifted annotations.
unmatched = matrix_barcodes.difference(meta_by_barcode.index)
if len(unmatched) > 0:
    raise ValueError(
        f'{len(unmatched)} cells of the count matrix have no metadata row, '
        f'e.g. {list(unmatched[:5])}'
    )

# One metadata row per matrix row, in matrix order; metadata rows whose
# barcode does not occur in the matrix are left out.
obs_aligned = meta_by_barcode.loc[matrix_barcodes]

Joanito_2022_anndata = an.AnnData(X=Joanito_2022.X,
                        obs=obs_aligned,
                        var=Joanito_2022.var)



In [None]:
#Checking final anndata: expression matrix (X)
Joanito_2022_anndata.X

<373058x33287 sparse matrix of type '<class 'numpy.float32'>'
	with 741331835 stored elements in Compressed Sparse Row format>

In [None]:
#Checking final anndata: gene annotation table (var)
Joanito_2022_anndata.var

Unnamed: 0,feature_types,genome
MIR1302-2HG,Gene Expression,GRCh38_ensembl93
FAM138A,Gene Expression,GRCh38_ensembl93
OR4F5,Gene Expression,GRCh38_ensembl93
AL627309.1,Gene Expression,GRCh38_ensembl93
AL627309.3,Gene Expression,GRCh38_ensembl93
...,...,...
AC233755.2,Gene Expression,GRCh38_ensembl93
AC233755.1,Gene Expression,GRCh38_ensembl93
AC240274.1,Gene Expression,GRCh38_ensembl93
AC213203.1,Gene Expression,GRCh38_ensembl93


In [None]:
#Checking final anndata: cell and patient annotation table (obs)
Joanito_2022_anndata.obs

Unnamed: 0,cell.ID,nFeature_RNA,percent.mt,sample.ID,patient.ID,sample.origin,dataset_x,cell.type,iCMS,msi,...,iCMS.transcriptomic,iCMS.inferCNV,KRAS,BRAF,TP53,APC,PIK3CA,LymphNode,Normal,Tumor
0,CRC16_MUX8563_AAACGGGGTCGATTGT-1,5099,17.969349,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,...,iCMS2,iCMS2,wt,wt,mut,mut,wt,0.0,1.0,4.0
1,CRC16_MUX8563_AAAGATGCAGAAGCAC-1,4759,23.734351,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,...,iCMS2,iCMS2,wt,wt,mut,mut,wt,0.0,1.0,4.0
2,CRC16_MUX8563_AAAGCAATCTAACGGT-1,2580,24.403016,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,Normal,MSS,...,iCMS2,iCMS2,wt,wt,mut,mut,wt,0.0,1.0,4.0
3,CRC16_MUX8563_ACAGCCGGTCTCTTAT-1,2499,11.020450,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,...,iCMS2,iCMS2,wt,wt,mut,mut,wt,0.0,1.0,4.0
4,CRC16_MUX8563_ACAGCTATCCGTCATC-1,4937,23.243570,MUX8563,CRC2794,Tumor,CRC-SG1,Epithelial,iCMS2,MSS,...,iCMS2,iCMS2,wt,wt,mut,mut,wt,0.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373053,KUL5_EXT129_TTTGTCAGTTGGACCC-1,1230,1.356960,EXT129,SC044,Normal,KUL5,PlasmaB,,,...,iCMS3,iCMS3,wt,wt,wt,wt,,0.0,2.0,3.0
373054,KUL5_EXT129_TTTGTCATCAGCATGT-1,983,9.032534,EXT129,SC044,Normal,KUL5,T_NK,,,...,iCMS3,iCMS3,wt,wt,wt,wt,,0.0,2.0,3.0
373055,KUL5_EXT129_TTTGTCATCCATGAGT-1,850,7.793522,EXT129,SC044,Normal,KUL5,T_NK,,,...,iCMS3,iCMS3,wt,wt,wt,wt,,0.0,2.0,3.0
373056,KUL5_EXT129_TTTGTCATCGGTTCGG-1,1595,1.074276,EXT129,SC044,Normal,KUL5,PlasmaB,,,...,iCMS3,iCMS3,wt,wt,wt,wt,,0.0,2.0,3.0


In [None]:
#Saving the final anndata file
# Written as a gzip-compressed .h5ad file.
Joanito_2022_anndata.write(
    '/Users/annamaguza/Desktop/Desktop-Anna/LMU/Master-Thesis/Anna-Master-Project/joanito_2022_anndata_raw.h5ad',
    compression='gzip',
)