In [1]:
import scanpy as sc
import numpy as np
import anndata
import sys

sys.path.append('../')
import preprocessing_tools as pt

# Read and preprocess data

In [2]:
data_path = '../../original_datasets/Intestine/f34d2b82-9265-4a73-bda4-852933bf2a8d.h5ad'

In [3]:
adata = sc.read_h5ad(data_path)

# Subset to Epithelial cells

In [4]:
adata = adata[adata.obs['category'] == 'Epithelial']

# Subset to mature epithelial fates

In [5]:
mature_epithelial_fates = [
    "Enterocyte",
    "Colonocyte",
    "Goblet cell",
    "BEST4+ epithelial",
    "Paneth",
    "BEST2+ Goblet cell",
    "Tuft",
    "EECs",
    "Microfold cell",
    "EC cells (TAC1+)",
    "M/X cells (MLN/GHRL+)",
    "CLDN10+ cells",
    "I cells (CCK+)",
    "D cells (SST+)",
    "L cells (PYY+)",
    "K cells (GIP+)",
    "EC cells (NPW+)",
    "N cells (NTS+)",
    "Œ≤ cells (INS+)"
]

adata = adata[adata.obs['author_cell_type'].isin(mature_epithelial_fates)]

In [6]:
adata.obs['development_stage'].value_counts()

development_stage
seventh decade stage                  11940
10th week post-fertilization stage     9569
16th week post-fertilization stage     6525
sixth decade stage                     6359
15th week post-fertilization stage     6287
third decade stage                     5679
fifth decade stage                     4578
eighth decade stage                    4284
4-year-old stage                       3544
9-year-old stage                       2516
11th week post-fertilization stage     2044
12-year-old stage                      1710
17th week post-fertilization stage     1446
10-year-old stage                      1146
6-year-old stage                        968
14-year-old stage                       608
Carnegie stage 22                       457
12th week post-fertilization stage      438
13-year-old stage                       300
Carnegie stage 20                       160
11-year-old stage                       146
Carnegie stage 19                       110
Carnegie stage

# Subset to Adult stage

In [7]:
adult_stages = ['third decade stage', 'fifth decade stage', 'sixth decade stage', 'seventh decade stage', 'eighth decade stage']
adata = adata[adata.obs['development_stage'].isin(adult_stages)].copy()

# Subset assay

In [8]:
adata = adata[adata.obs['assay'] == "10x 5' v2"].copy()

# Set X to counts

In [9]:
adata.X = adata.raw.X.copy()

In [10]:
print(adata.X.max(), adata.X.min())

adata.layers['counts'] = adata.X.copy()
adata.raw = adata.copy()
adata.uns = {}

sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=5)

adata.shape, adata.X.max(), adata.X.min()

27192.0 0.0


((29184, 21626), 27192.0, 0.0)

In [11]:
sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=True)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=True)



# Create cell ids

In [12]:
adata.obs['sc_cell_ids'] = list(range(adata.shape[0]))

# Write

In [13]:
adata.X = adata.layers['counts'].copy()

In [14]:
adata.obs['author_cell_type'].value_counts()

author_cell_type
Colonocyte               9728
Enterocyte               9565
BEST4+ epithelial        3256
Paneth                   2623
BEST2+ Goblet cell       2470
Goblet cell               671
Tuft                      550
Microfold cell            223
EC cells (TAC1+)           37
L cells (PYY+)             26
EECs                       14
I cells (CCK+)              6
N cells (NTS+)              5
M/X cells (MLN/GHRL+)       4
D cells (SST+)              3
K cells (GIP+)              3
Name: count, dtype: int64

# Remove low count cell types

In [15]:
adata = adata[~adata.obs['author_cell_type'].isin(['I cells (CCK+)', 'D cells (SST+)', 'N cells (NTS+)', 'M/X cells (MLN/GHRL+)', 'K cells (GIP+)'])]

In [17]:
adata.obs['donor_id']

barcodes
AACCGCGTCAACACCA-1-Human_colon_16S8000511    A32 (411C)
ACACCCTGTTCGGCAC-1-Human_colon_16S8000511    A32 (411C)
ACCAGTAAGAGTCGGT-1-Human_colon_16S8000511    A32 (411C)
ACGGGTCCACCAGGTC-1-Human_colon_16S8000511    A32 (411C)
ACTGATGTCAGGCGAA-1-Human_colon_16S8000511    A32 (411C)
                                                ...    
ACGAGGAAGCGTTCCG-1-Human_colon_16S8117828    A26 (386C)
CCGTTCACAACGATGG-1-Human_colon_16S8117828    A26 (386C)
TACGGGCTCATGCAAC-1-Human_colon_16S8117828    A26 (386C)
TAGTTGGAGGATGGTC-1-Human_colon_16S8117828    A26 (386C)
TATGCCCAGACAATAC-1-Human_colon_16S8117828    A26 (386C)
Name: donor_id, Length: 29163, dtype: category
Categories (5, object): ['A26 (386C)', 'A32 (411C)', 'A34 (417C)', 'A38 (432C)', 'A39 (440C)']

In [18]:
adata.write_h5ad('../../preprocessed_datasets/intestine.h5ad')

In [19]:
adata.X.max(), adata.X.min()

(27192.0, 0.0)

In [20]:
adata

View of AnnData object with n_obs × n_vars = 29163 × 5000
    obs: 'donor_id', 'Diagnosis', 'Fraction', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'doublet_scores', 'predicted_doublets', 'category', 'Age_group', 'author_cell_type', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'sex_ontology_term_id', 'suspension_type', 'hca_data_portal_donor_uuid', 'hca_data_portal_cellsuspension_uuid', 'tissue_type', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'n_counts', 'sc_cell_ids'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'feature_types', 'feature_is_filtered', 'gene_symbols', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'featu

In [21]:
adata.obs['assay'].value_counts()

assay
10x 5' v2    29163
Name: count, dtype: int64

In [22]:
adata.obs['donor_id'].value_counts()

donor_id
A34 (417C)    11939
A39 (440C)     4561
A38 (432C)     4346
A26 (386C)     4282
A32 (411C)     4035
Name: count, dtype: int64