# Initial processing and QC of CABG snRNA-seq Data
## Samples CABG
## Analysis date 2022/02/15

Each single cell/nuclei RNA-seq dataset needs to be loaded individually such that any metadata (sample information such as sample name, replicate number, treatment group etc) can be added before they are all merged together.

__Overview of steps__
1. Create object for each sample
    1. Load matrix data
    2. Calculate QC metrics for each barcode/nuclei
    3. Adding metadata to the .obs field of the anndata object
2. Merge all samples
3. Filter using QC metrics
4. Write anndata object to file (.h5ad)


## Initalise packages 

In [1]:
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
import seaborn as sns
import scrublet as scr

sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=120, color_map='viridis')
sc.logging.print_header()

%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [5, 5]

scanpy==1.8.2 anndata==0.7.6 umap==0.5.2 numpy==1.20.1 scipy==1.7.1 pandas==1.3.4 scikit-learn==1.0.1 statsmodels==0.13.0 python-igraph==0.9.8 pynndescent==0.5.4


In [2]:
INDIR='/home/mlee/RDS/projects/ambition/live/CellRanger_output/GENEWIZ_Project_40-541317003'
INDIR2='/home/mlee/RDS/projects/ambition/live/CellRanger_output/GENEWIZ_Project_40-596030144'
INDIR3='/home/mlee/RDS/projects/ambition/live/CellRanger_output/GENEWIZ_Project_40-615194263'
OUTDIR='/home/mlee/RDS/projects/ambition/live/Analysis'

In [3]:
DATE='2022-02-15'

## 1. Create object for each sample

## Patient_85_Ischaemic

In [4]:
Patient_85_Ischaemic = sc.read_10x_mtx(
    INDIR + '/Patient_85_Ischaemic' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_85_Ischaemic

In [5]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [7]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [9]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [10]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.65
Detected doublet rate = 0.5%
Estimated detectable doublet fraction = 7.2%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 6.5%
Elapsed time: 2.8 seconds


In [11]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [12]:
adata.obs['Sample'] = 'Patient_85_Ischaemic'
adata.obs['Patient'] = 'Patient_85'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Ischaemic'

## Patient_85_Remote

In [17]:
Patient_85_Remote = sc.read_10x_mtx(
    INDIR + '/Patient_85_Remote' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_85_Remote

In [18]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [20]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [22]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [23]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.39
Detected doublet rate = 3.2%
Estimated detectable doublet fraction = 41.9%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 7.7%
Elapsed time: 3.5 seconds


In [24]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [25]:
adata.obs['Sample'] = 'Patient_85_Remote'
adata.obs['Patient'] = 'Patient_85'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Remote'

## Patient_80_Ischaemic

In [30]:
Patient_80_Ischaemic = sc.read_10x_mtx(
    INDIR2 + '/Patient_80_Ischaemic' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_80_Ischaemic

In [31]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [33]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [35]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [36]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.45
Detected doublet rate = 1.1%
Estimated detectable doublet fraction = 10.9%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 10.1%
Elapsed time: 0.2 seconds


In [37]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [38]:
adata.obs['Sample'] = 'Patient_80_Ischaemic'
adata.obs['Patient'] = 'Patient_80'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Ischaemic'

## Patient_80_Remote

In [43]:
Patient_80_Remote = sc.read_10x_mtx(
    INDIR2 + '/Patient_80_Remote' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata =Patient_80_Remote

In [44]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [46]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [48]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [49]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.41
Detected doublet rate = 0.8%
Estimated detectable doublet fraction = 7.1%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 11.3%
Elapsed time: 0.1 seconds


In [50]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [51]:
adata.obs['Sample'] = 'Patient_80_Remote'
adata.obs['Patient'] = 'Patient_80'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Remote'

## Patient_83_Ischaemic

In [56]:
Patient_80_Remote = sc.read_10x_mtx(
    INDIR2 + '/Patient_80_Remote' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_80_Remote

In [57]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [59]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [61]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [62]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.41
Detected doublet rate = 0.8%
Estimated detectable doublet fraction = 7.1%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 11.3%
Elapsed time: 0.2 seconds


In [63]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [64]:
adata.obs['Sample'] = 'Patient_83_Ischaemic'
adata.obs['Patient'] = 'Patient_83'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Ischaemic'

## Patient_83_Remote

In [69]:
Patient_83_Remote = sc.read_10x_mtx(
    INDIR2 + '/Patient_83_Remote' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata =Patient_83_Remote

In [70]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [72]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [74]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [75]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.60
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 0.0%
Elapsed time: 1.4 seconds


In [76]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [77]:
adata.obs['Sample'] = 'Patient_83_Remote'
adata.obs['Patient'] = 'Patient_83'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Remote'

## Patient_94_Ischaemic

In [82]:
Patient_94_Ischaemic = sc.read_10x_mtx(
    INDIR2 + '/Patient_94_Ischaemic' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_94_Ischaemic

In [83]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [84]:
adata

AnnData object with n_obs × n_vars = 393 × 36601
    var: 'gene_ids', 'feature_types'

In [85]:
sc.pp.filter_cells(adata, min_genes=200)

In [86]:
adata

AnnData object with n_obs × n_vars = 393 × 36601
    obs: 'n_genes'
    var: 'gene_ids', 'feature_types'

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [87]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [88]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.39
Detected doublet rate = 1.0%
Estimated detectable doublet fraction = 18.1%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 5.6%
Elapsed time: 0.1 seconds


In [89]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [90]:
adata.obs['Sample'] = 'Patient_94_Ischaemic'
adata.obs['Patient'] = 'Patient_94'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Ischaemic'

## Patient_94_Remote

In [95]:
Patient_94_Remote = sc.read_10x_mtx(
    INDIR2 + '/Patient_94_Remote' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_94_Remote

In [96]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [98]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [100]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [101]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.28
Detected doublet rate = 5.3%
Estimated detectable doublet fraction = 41.9%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 12.8%
Elapsed time: 0.4 seconds


In [102]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [103]:
adata.obs['Sample'] = 'Patient_94_Remote'
adata.obs['Patient'] = 'Patient_94'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Remote'

## Patient_102_Ischaemic

In [108]:
Patient_102_Ischaemic = sc.read_10x_mtx(
    INDIR2 + '/Patient_102_Ischaemic' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_102_Ischaemic

In [109]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [111]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [113]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [114]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.57
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 6.7%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 3.9%
Elapsed time: 0.7 seconds


In [115]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [116]:
adata.obs['Sample'] = 'Patient_102_Ischaemic'
adata.obs['Patient'] = 'Patient_102'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Ischaemic'

## Patient_102_Remote

In [121]:
Patient_102_Remote = sc.read_10x_mtx(
    INDIR2 + '/Patient_102_Remote' + '/outs/filtered_feature_bc_matrix',  # the directory with the `.mtx` file
    var_names='gene_symbols',                  # use gene symbols for the variable names (variables-axis index)
    cache=True)                                # write a cache file for faster subsequent reading
adata = Patient_102_Remote

In [122]:
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [124]:
sc.pp.filter_cells(adata, min_genes=200)

### Calculate QC metrics
May need to adjust for naming convertion for mito and ribo genes for zebra fish

In [126]:
mito_genes = adata.var_names.str.startswith('MT-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
    adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1).A1
ribo_genes = adata.var_names.str.startswith(('RPS','RPL')) # Seaches for all genes that start with 'RPS' or 'RPL'
adata.obs['percent_ribo'] = np.sum(
    adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1

In [127]:
scrub = scr.Scrublet(adata.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.38
Detected doublet rate = 1.7%
Estimated detectable doublet fraction = 24.6%
Overall doublet rate:
	Expected   = 10.0%
	Estimated  = 7.0%
Elapsed time: 0.1 seconds


In [128]:
adata.obs['scrublet_score'] = doublet_scores
adata.obs['predicted_doublets'] = predicted_doublets.astype(str)

### Add metadata for the sample
You can add any information you want that might be useful to look at later. You can always add things later but it is easier if you do it now

In [129]:
adata.obs['Sample'] = 'Patient_102_Remote'
adata.obs['Patient'] = 'Patient_102'
adata.obs['Source'] = 'Nuclei'
adata.obs['Region'] = 'LV'
adata.obs['Group'] = 'IHD'
adata.obs['Sample_type'] = 'CABG'
adata.obs['Location'] = 'Remote'

## Merge all samples

In [239]:
adata_merged = anndata.AnnData.concatenate(Patient_80_Ischaemic, Patient_80_Remote, Patient_83_Ischaemic, Patient_83_Remote, Patient_85_Ischaemic, Patient_85_Remote, Patient_94_Ischaemic, Patient_94_Remote, Patient_102_Ischaemic, Patient_102_Remote, join = 'outer')

In [243]:
# Save the unfiltered merged anndata object to .h5ad file
adata_merged.write(OUTDIR + '/CABG_merged_unfiltered_' + DATE +'.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'predicted_doublets' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Sample' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Patient' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'Location' as categorical


## Filter using QC metrics

In [245]:
adata_merged_filtered = adata_merged.copy()
# Filter by UMI counts
sc.pp.filter_cells(adata_merged_filtered, min_counts=500)
sc.pp.filter_cells(adata_merged_filtered, max_counts=15000)

# Filter by number of genes expressedf
sc.pp.filter_cells(adata_merged_filtered, min_genes=300)
sc.pp.filter_cells(adata_merged_filtered, max_genes=6000)

# Filter by %mitochondrial genes, %ribosomal genes, and scublet score (doublet probability)
adata_merged_filtered = adata_merged_filtered[adata_merged_filtered.obs.percent_mito < 0.05, :]
adata_merged_filtered = adata_merged_filtered[adata_merged_filtered.obs.percent_ribo < 0.05, :]
adata_merged_filtered = adata_merged_filtered[adata_merged_filtered.obs.scrublet_score < 0.30, :]

### Write anndata object to file (.h5ad)

In [255]:
adata_merged_filtered.write(OUTDIR + '/CABG_merged_filtered_' + DATE +'.h5ad')

In [260]:
(OUTDIR + '/CABG_merged_filtered_' + DATE +'.h5ad')

'/home/mlee/RDS/projects/ambition/live/Analysis/CABG_merged_filtered_2022-02-15.h5ad'