In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import sys
sys.path.append('../')
import preprocessing_tools as pt

# Read data

In [2]:
h5_path = '/home/SE/Downloads/GSE146811_mmProstate10x_timecourse_rawCount.h5'
metadata_path = '/home/SE/Downloads/GSE146811_mmProstate10x_full_sample_final.tsv.gz'

metadata_df = pd.read_csv(metadata_path, sep ='\t')
metadata_df.set_index('sampleID', inplace=True)
adata = sc.read_10x_h5(h5_path)

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [3]:
metadata_df = metadata_df.loc[adata.obs_names]

In [4]:
adata.obs_names.tolist() == metadata_df.index.tolist()

True

In [5]:
adata.obs = metadata_df

# Create time column

In [6]:
batch_ids = adata.obs['batchID'].tolist()
p_batches = [x.replace('_1', '').replace('_2', '').replace('_Epi', '').replace('_NonEpi', '').replace('_Unsorted', '').replace('_intact', '') for x in batch_ids]

adata.obs['time'] = p_batches
adata.obs['time'] = adata.obs['time'].astype('category')

In [7]:
# ly6a, Psca, tacstd2: Prostate stem cell markers
# nkx3-1, Pbsn, dpp4, prom1, cd59a: Differentiated Cells / Canonical androgen recetor target genes
# Egf (Epidermal growth factor ligand), expressed by L1 within 24h of androgen addback, peaked after full regeneration
# Trp63, Krt5, and Krt14: Basal cells
# cd24a, krt8, krt18: Canonical luminal
# foxi1: L3
# Nrg2, Rspo3: androgen-driven growth factors expressed by mesenchymal cells
# Krt13: distinguishes between clusters of basal cells
# Wnt2, Wnt6, Wnt10a, rorb: Ligands/receptors associated with epithelial growth and differentiation (expressed in M1)
# Rspo1, Fgf10, Sult1e1: // (expressed in M2)
# Acta2, MyH11: Myofibroblasts.
# Notch3: separates them
# Fgfr2, Igf1, Lgr4: Annotated ligand/receptor
# Pla2g2a not found (though some related to it)

# Relevant genes mentioned by the original Autohors

In [8]:
org_genes = adata.var_names.tolist()

In [9]:
genes = ['ly6a', 'psca', 'tacstd2', 'nkx3-1', 'pbsn', 'dpp4', 'prom1', 'cd59a', 'egf', 'trp63', 'krt5', 'krt14', 'cd24a', 'krt8', 'krt18',\
         'foxi1', 'nrg2', 'rspo3', 'krt13', 'wnt2', 'wnt6', 'wnt10a', 'rorb',\
         'rspo1', 'fgf10', 'sult1e1', 'acta2', 'myh11', 'notch3', 'fgfr2', 'igf1', 'lgr4', 'mki67']

claudin_family = [x.lower() for x in org_genes if 'cldn' in x.lower()]
genes.extend(claudin_family)

In [10]:
matched_genes = []
matched_genes = [x for x in org_genes if x.lower() in genes]

len(matched_genes), len(genes)

(68, 68)

In [11]:
adata.X.max(), adata.X.min()

(37493.0, 0.0)

In [12]:
adata.layers['counts'] = adata.X.copy()

# Filter cells and genes

In [13]:
sc.pp.filter_cells(adata, min_counts=100)
gene_filter, _ = sc.pp.filter_genes(
    adata, 
    min_counts=5, 
    inplace=False
    )

  utils.warn_names_duplicates("var")


In [14]:
gene_filter

array([ True, False, False, ...,  True, False,  True])

In [15]:
matched_genes_indices = [adata.var_names.get_loc(x) for x in matched_genes]

In [16]:
for i in matched_genes_indices:
    gene_filter[i] = True

In [17]:
adata = adata[:, gene_filter]

# Norm and log1p

In [18]:
adata.X.max(), adata.X.min()

(37493.0, 0.0)

In [19]:
sc.pp.normalize_total(
    adata, 
    target_sum=1e4, 
    exclude_highly_expressed=True
    )
sc.pp.log1p(adata)

  view_to_actual(adata)


In [20]:
adata

AnnData object with n_obs × n_vars = 87187 × 20739
    obs: 'barcode', 'barcodeInt', 'batchID', 'highLevelPred', 'highLevelPredAmbig', 'predType', 'predTypeAmbig', 'predTypeInt', 'predTypeIntAmbig', 'time', 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'log1p'
    layers: 'counts'

In [21]:
sc.pp.highly_variable_genes(adata, n_top_genes=5000)
adata.var['keep'] = adata.var['highly_variable'] | adata.var_names.isin(matched_genes)
adata = adata[:, adata.var['keep']]

In [22]:
adata

View of AnnData object with n_obs × n_vars = 87187 × 5027
    obs: 'barcode', 'barcodeInt', 'batchID', 'highLevelPred', 'highLevelPredAmbig', 'predType', 'predTypeAmbig', 'predTypeInt', 'predTypeIntAmbig', 'time', 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'keep'
    uns: 'log1p', 'hvg'
    layers: 'counts'

In [23]:
adata.var_names_make_unique()

  utils.warn_names_duplicates("var")


In [24]:
len(np.unique(adata.var_names))

5027

In [25]:
adata.uns = {}
adata.obs['placeholder'] = ['1'] * adata.shape[0]

In [26]:
len([x for x in matched_genes if x in adata.var_names])

68

In [27]:
unique_timepoints = np.unique(adata.obs['time'])

In [28]:
adata.uns['rank_genes_groups_time'] = {}

for u_t in unique_timepoints:
    if u_t == 'T00':
        continue
        
    time_degs = pt.compute_degs(
        adata,
        cov_key='predType', 
        cond_key='time', 
        stim_name=u_t, 
        control_name='T00',
        condition_names=[u_t, 'T00'],
        synergy=False,
        method='wilcoxon'
        )
    
    adata.uns['rank_genes_groups_time'][u_t] = time_degs

ERROR in  Epi_Luminal_2Psca SKIPPING.. Could not calculate statistics for groups T01_Cast_Day1 since they only contain one sample.
ERROR in  SymDoublet_Epi_Imm SKIPPING.. reference = T00 needs to be one of groupby = ['T01_Cast_Day1'].
ERROR in  Epi_SV_Ionocyte SKIPPING.. 'NoneType' object has no attribute 'columns'
ERROR in  SymDoublet_Epi_Imm SKIPPING.. reference = T00 needs to be one of groupby = ['T02_Cast_Day7'].
ERROR in  SymDoublet_Str_Epi SKIPPING.. 'NoneType' object has no attribute 'columns'
ERROR in  SymDoublet_Str_Imm SKIPPING.. 'NoneType' object has no attribute 'columns'
ERROR in  SymDoublet_Epi_Imm SKIPPING.. reference = T00 needs to be one of groupby = [].
ERROR in  SymDoublet_Epi_Imm SKIPPING.. reference = T00 needs to be one of groupby = ['T04_Cast_Day28'].
ERROR in  Epi_SV_Ionocyte SKIPPING.. 'NoneType' object has no attribute 'columns'
ERROR in  SymDoublet_Epi_Imm SKIPPING.. reference = T00 needs to be one of groupby = ['T05_Regen_Day1'].
ERROR in  SymDoublet_Epi_Imm

In [29]:
adata.X.max(), adata.X.min()

(10.346954087444171, 0.0)

In [30]:
adata

AnnData object with n_obs × n_vars = 87187 × 5027
    obs: 'barcode', 'barcodeInt', 'batchID', 'highLevelPred', 'highLevelPredAmbig', 'predType', 'predTypeAmbig', 'predTypeInt', 'predTypeIntAmbig', 'time', 'n_counts', 'placeholder'
    var: 'gene_ids', 'feature_types', 'genome', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'keep'
    uns: 'rank_genes_groups_time'
    layers: 'counts'

In [31]:
adata.X = adata.layers['counts'].copy()

In [32]:
adata.X.max()

37493.0

In [33]:
adata.obs['sc_cell_ids'] = list(range(adata.shape[0]))
adata.obs['sc_cell_ids'] = adata.obs['sc_cell_ids'].astype('category')

In [34]:
adata.obs['sc_cell_ids']

T00_Epi_1_id-AAACCTGCAAACCTAC                0
T00_Epi_1_id-AAACCTGCAATGGAGC                1
T00_Epi_1_id-AAACCTGCAGCTGGCT                2
T00_Epi_1_id-AAACCTGCATGTCCTC                3
T00_Epi_1_id-AAACGGGAGCGGCTTC                4
                                         ...  
T10_Regen_Day28_2_id-TTTGTCAAGACGACGT    87182
T10_Regen_Day28_2_id-TTTGTCACAGACGCTC    87183
T10_Regen_Day28_2_id-TTTGTCAGTCTAGTGT    87184
T10_Regen_Day28_2_id-TTTGTCATCGCAAGCC    87185
T10_Regen_Day28_2_id-TTTGTCATCTGGGCCA    87186
Name: sc_cell_ids, Length: 87187, dtype: category
Categories (87187, int64): [0, 1, 2, 3, ..., 87183, 87184, 87185, 87186]

In [35]:
adata.write_h5ad('../../preprocessed_datasets/prostate.h5ad')

In [36]:
adata

AnnData object with n_obs × n_vars = 87187 × 5027
    obs: 'barcode', 'barcodeInt', 'batchID', 'highLevelPred', 'highLevelPredAmbig', 'predType', 'predTypeAmbig', 'predTypeInt', 'predTypeIntAmbig', 'time', 'n_counts', 'placeholder', 'sc_cell_ids'
    var: 'gene_ids', 'feature_types', 'genome', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'keep'
    uns: 'rank_genes_groups_time'
    layers: 'counts'

In [37]:
adata.X.max(), adata.X.min()

(37493.0, 0.0)