# Notebook to filter out cells from the expression matrix that have an abnormal number of detected genes or counts, or an abnormal mitochondrial content

Suitable for any dataset where we have multiple GSMs and an expression matrix for each GSM

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
import argparse
import matplotlib.pyplot as plt

In [None]:
# --- User-adjustable parameters for this notebook ---
dataType = '10x'   # presumably the format of the input expression matrices -- TODO confirm
mitoName = 'MT-'   # Prefix on mitochondrial genes (MT- for human, Mt- for Rat)
minGene, minCells = 200, 3   # presumably the per-cell gene and per-gene cell minimums used for filtering

In [None]:
# Read the GSM accessions from gsm.txt (no header row); they are taken from
# the first column of the resulting DataFrame.
GSM_list = pd.read_csv('gsm.txt', header=None)

# Iterating over a DataFrame directly yields its column labels (here just 0),
# not its rows, so the accessions must be taken from column 0 explicitly.
filenames = [f'./Data/Expression/{gsm}/raw' for gsm in GSM_list[0]]

# Reads in expression matrices for every GSM in your dataset
adatas = [sc.read_10x_mtx(filename) for filename in filenames]

In [None]:
# To account for the possibility of barcodes appearing in multiple GSMs we have to
# prepend the GSM to the barcode for all GSMs.
# GSM_list is a DataFrame, so iterate over its first column: iterating over the
# DataFrame itself would yield the column label (0) instead of the GSM accessions.
for i, GSM in enumerate(GSM_list[0]):
    adatas[i].obs.index = f'{GSM}_' + adatas[i].obs.index

# Creates one giant expression matrix from the whole experiment to do quality control on.
# index_unique=None keeps the (already GSM-prefixed) barcodes unchanged.
adata = adatas[0].concatenate(adatas[1:], index_unique=None)
adata.var_names_make_unique()

In [None]:
# Filtering criteria: use the thresholds defined in the parameters cell
# (minGene / minCells); the names min_genes / min_cells are not defined in this notebook.
sc.pp.filter_cells(adata, min_genes=minGene)
sc.pp.filter_genes(adata, min_cells=minCells)

# Annotate the group of mitochondrial genes as 'mt' (genes whose name starts with mitoName)
adata.var['mt'] = adata.var_names.str.startswith(mitoName)
# Compute QC metrics in place; the later cells read n_genes_by_counts,
# total_counts and pct_counts_mt from adata.obs.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Violin plots of the three per-cell QC metrics, one panel per metric
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)

In [None]:
# Set the thresholds based off the violin plots above (some iteration may be required).
# A cell is kept only if it satisfies all three conditions:
#   total_counts  < 10000  (generally set in order to filter out possible doublets)
#   pct_counts_mt < 10
#   pct_counts_mt > 0
adata = adata[
    (adata.obs['total_counts'] < 10000)
    & (adata.obs['pct_counts_mt'] < 10)
    & (adata.obs['pct_counts_mt'] > 0),
    :
]

In [None]:
# Pairwise scatter plots of the QC metrics for the cells that remain in adata
for x_metric, y_metric in (
    ('total_counts', 'pct_counts_mt'),
    ('total_counts', 'n_genes_by_counts'),
    ('n_genes_by_counts', 'pct_counts_mt'),
):
    sc.pl.scatter(adata, x=x_metric, y=y_metric)

Save the list of all barcodes that passed quality control

In [None]:
# Observation names of the cells left in adata, converted to strings
filtered_barcodes = adata.obs_names.astype("str")
# Write them out as plain text, one barcode per line
np.savetxt(fname='allFilteredBarcodes.txt', X=filtered_barcodes, fmt="%s")

In [None]:
# Creates separate barcode list files for each GSM for use by the variant caller.
# Each barcode is cut at its first '-' (only the text before it is kept), and the
# remainder is split on '_': column 0 is used as the GSM and column 1 as the
# barcode that gets written out.
# NOTE(review): assumes GSM names contain neither '-' nor '_' -- confirm.
barcode_series = pd.DataFrame(filtered_barcodes)[0]
before_dash = barcode_series.str.split('-', expand=True)[0]
barcodeSplit = before_dash.str.split('_', expand=True)

for gsm in barcodeSplit[0].unique():
    gsm_rows = barcodeSplit[0] == gsm
    np.savetxt(f'{gsm}_filtered_barcodes.txt', barcodeSplit.loc[gsm_rows, 1], fmt="%s")