### Notebook for preprocessing of adult healthy gut data

- **Developed by:** Anna Maguza
- **Place:** Wuerzburg Institute of Systems Immunology
- **Created date:** 10th April 2024
- **Last modified date:** 10th April 2024

#### Import packages

In [1]:
import numpy as np
import scanpy as sc
import pandas as pd
import numpy as np

#### Upload data

In [2]:
# Location of the integrated .h5ad file whose `obs` annotations are cleaned below.
input_h5ad = '/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Healthy_reference/Integrated/Integrated_4_datasets_05042024.h5ad'
adata = sc.read_h5ad(input_h5ad)

#### Change obs

In [3]:
# Fold the 'Stem Cell' label into 'Epithelial'.
# The result is assigned back to the column on purpose: calling an inplace
# method on `adata.obs[...]` operates on an intermediate Series, which pandas
# treats as a copy under Copy-on-Write, so `adata.obs` would not be updated.
adata.obs['Cell Type'] = adata.obs['Cell Type'].replace({'Stem Cell': 'Epithelial'})

In [4]:
# Collapse the per-study location labels onto one shared vocabulary.
location_labels = {
    'SmallInt': 'Small Intestine',
    'Small Bowel': 'Small Intestine',
    'LargeInt': 'Large Intestine',
    'Colon': 'Large Intestine',
    'REC': 'Rectum',
    'Epi': 'Epithelium',
    'LP': 'Lamina Propria',
}
# Assign the result back instead of using `inplace=True` on the selected column:
# an inplace call on `adata.obs[...]` acts on an intermediate Series and does not
# update `adata.obs` under pandas Copy-on-Write.
adata.obs['Location'] = adata.obs['Location'].replace(location_labels)

In [5]:
# Rewrite the study labels in "Author, year" form.
# NOTE(review): 'Smilie' may be a misspelling of "Smillie"; the strings are kept
# unchanged because the key has to match the stored labels and the new value may
# already be used downstream - confirm before changing.
study_labels = {
    'Wang': 'Wang, 2020',
    'Kong 2023': 'Kong, 2023',
    'Smilie': 'Smilie, 2019',
    'Gut Cell Atlas': 'Elmentaite, 2021',
}
# Assign the result back instead of using `inplace=True` on the selected column:
# an inplace call on `adata.obs[...]` acts on an intermediate Series and does not
# update `adata.obs` under pandas Copy-on-Write.
adata.obs['Study_name'] = adata.obs['Study_name'].replace(study_labels)

In [6]:
# Expand the short protocol labels to full kit names. Entries stored as the
# literal string "nan" are mapped to "10x 3' v1" as well.
protocol_labels = {
    "3'": "10x 3' v1",
    "5'": "10x 5' v1",
    "nan": "10x 3' v1",
}
# Assign the result back instead of using `inplace=True` on the selected column:
# an inplace call on `adata.obs[...]` acts on an intermediate Series and does not
# update `adata.obs` under pandas Copy-on-Write.
adata.obs['Library_Preparation_Protocol'] = (
    adata.obs['Library_Preparation_Protocol'].replace(protocol_labels)
)

In [7]:
# Rename both annotation columns in a single pass: 'Gender' becomes 'Sex' and
# 'Cell Type' becomes 'Cell_Type'.
# NOTE(review): the displayed AnnData summary still lists `uns['Cell Type_colors']`
# under the old column name - confirm whether that entry should be renamed too.
obs_column_names = {'Gender': 'Sex', 'Cell Type': 'Cell_Type'}
adata.obs.rename(columns=obs_column_names, inplace=True)

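+ Assign sex to donors whose sex is missing: the listed donors are set to female, and all remaining donors with missing sex are set to male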

In [8]:
# Donors whose missing sex annotation is filled in as female.
female_donors = [
    'Wang_Donor_2',
    'N7', 'N8', 'N10', 'N13', 'N14', 'N18', 'N19', 'N20', 'N21', 'N23', 'N24',
    'N44', 'N50', 'N106', 'N110', 'N111', 'N539',
]

# Only cells whose 'Sex' is the literal string 'nan' are touched; existing
# annotations are left alone.
sex_missing = adata.obs['Sex'] == 'nan'
from_female_donor = adata.obs['Donor_ID'].isin(female_donors)

adata.obs.loc[sex_missing & from_female_donor, 'Sex'] = 'Female'
# Every other cell with a missing value is labelled male.
adata.obs.loc[sex_missing & ~from_female_donor, 'Sex'] = 'Male'

In [9]:
# Display the AnnData summary to check the `obs` columns after the edits above.
adata

AnnData object with n_obs × n_vars = 557099 × 23616
    obs: 'Sample_ID', 'Cell_Type', 'Study_name', 'Donor_ID', 'Diagnosis', 'Age', 'Region code', 'Fraction', 'Sex', 'Library_Preparation_Protocol', 'batch', 'Age_group', 'Location', 'Cell States', 'Cell States GCA', 'Chem', 'Layer', 'Cell States Kong', 'dataset', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'total_counts_ribo', 'pct_counts_ribo', 'Cell_ID', '_scvi_batch', '_scvi_labels', 'C_scANVI'
    var: 'gene_id-query', 'gene_name-query', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'mito', 'ribo'
    uns: 'Cell Type_colors', 'Donor_ID_colors', 'Library_Preparation_Protocol_colors', 'Study_name_colors', '_scvi_manager_uuid', '_scvi_uuid', 'hvg', 'neighbors', 'umap'
    obsm: 'X_scANVI', 'X_scvi', 'X_umap', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
    obsp: 'connectivities', 'distances'

In [10]:
# Save the object with the cleaned annotations.
# NOTE(review): this is the same path the notebook reads from, so the input file
# is overwritten. After one run the stored object no longer has a 'Cell Type'
# column, so re-running the notebook from the top would fail at the first
# replace cell. Consider writing to a new file name.
output_h5ad = '/mnt/LaCIE/annaM/gut_project/Processed_data/Gut_data/Healthy_reference/Integrated/Integrated_4_datasets_05042024.h5ad'
adata.write_h5ad(output_h5ad)