In [None]:
import scanpy as sc
import sys
import os
import matplotlib.pyplot as plt
import pandas as pd
import cellbender 
import scipy
%matplotlib inline
import scrublet as scr
from scipy import io
import matplotlib.pyplot as plt
import numpy as np

In [None]:
import sys

# Compatibility shim: bind `metadata` to the standard-library module when it can
# be imported, otherwise to the `importlib_metadata` package, then register that
# module under the standard-library name so later `importlib.metadata` imports
# resolve to the same object.
try:
    import importlib.metadata as metadata
except ImportError:
    import importlib_metadata as metadata

sys.modules['importlib.metadata'] = metadata

# Remove doublets for each sample using scrublet 
### documentation: https://github.com/swolock/scrublet

# <span style="color:green"> Read in cellbender output and inspect raw data </span>

In [None]:
input_dir = "cellbender_data/WT2_gex_cellbender_filtered"

# Count matrix: read the Matrix Market file, transpose it, and convert to CSC.
count_matrix = (
    scipy.io.mmread(input_dir + '/matrix.mtx')
    .T
    .tocsc()
)

# Gene names: column index 1 of the tab-separated genes file, read through
# scrublet's loader and wrapped in a NumPy array.
genes = np.array(
    scr.load_genes(input_dir + '/genes.tsv', delimiter='\t', column=1)
)

# Barcodes: tab-separated file without a header row.
barcodes = pd.read_csv(input_dir+"/barcodes.tsv", sep='\t', header=None)

### making sure everything looks good

In [None]:
# Print the row count and column count of the loaded matrix, space-separated.
print(*count_matrix.shape)

In [None]:
# Number of rows in the barcodes table; compare against the matrix shape printed above.
barcodes.shape[0]

In [None]:
# Number of entries in the gene-name array.
print(genes.shape[0])

# <span style="color:green"> Initialize scrublet object with default parameters </span>

In [None]:
# Build the scrublet object on the loaded count matrix, leaving every other
# constructor option at its default.
scrub = scr.Scrublet(count_matrix)

# Run doublet detection and unpack its two return values.
scrub_output = scrub.scrub_doublets()
doublet_scores, predicted_doublets = scrub_output


In [None]:
# Plot scrublet's doublet-score histogram for the run above; the trailing
# semicolon suppresses the textual repr of the returned object.
scrub.plot_histogram();

# <span style="color:green"> Adjust threshold if needed </span>

In [None]:
# Manually chosen doublet-score cutoff, named so the value is easy to spot and tune.
manual_threshold = 0.27
scrub.call_doublets(threshold=manual_threshold)

In [None]:
# Re-plot the histogram after the threshold was set manually
# (presumably the plot now reflects the new cutoff -- confirm visually).
scrub.plot_histogram();

In [None]:
# Compute UMAP coordinates from scrublet's `manifold_obs_` attribute. The 10 is
# passed positionally as get_umap's second argument (presumably the neighbour
# count -- confirm against the scrublet helper's signature).
umap_coords = scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3)

# Register the coordinates under the name 'UMAP', then draw that embedding.
scrub.set_embedding('UMAP', umap_coords)
scrub.plot_embedding('UMAP', order_points=True);

# <span style="color:green"> Filter data based on the threshold </span>

In [None]:
# Reuse the cutoff that scrublet recorded on the object (`threshold_`) so this
# filter cannot drift from the threshold used for the doublet calls and plots
# when that value is tuned. 0.27, the value previously hard-coded here, is kept
# as the fallback when the attribute is absent.
threshold = getattr(scrub, 'threshold_', 0.27)

# Boolean masks over the observed doublet scores: scores strictly below the
# threshold are kept as singlets; scores at or above it are flagged as doublets.
singlet = scrub.doublet_scores_obs_ < threshold
doublet = scrub.doublet_scores_obs_ >= threshold

In [None]:
# Count of entries flagged in the doublet mask (True values sum as 1).
doublet.sum()

In [None]:
# Keep only the barcode rows whose mask entry is True.
new_barcodes = barcodes.loc[singlet]

# Sanity check: the two printed numbers should be equal.
print(len(new_barcodes), singlet.sum())

In [None]:
# Select the matrix rows marked True in the singlet mask, keeping all columns.
doublet_free_data = count_matrix[singlet, :]
doublet_free_data

In [None]:
# Swap rows and columns, undoing the transpose applied when the matrix was loaded.
transposed_matrix = doublet_free_data.T
transposed_matrix

# <span style="color:green"> Export doublet-free data </span>

In [None]:
# !mkdir doublet_free_matrix 

In [None]:
# Create the output directory together with its parent. Unlike a bare `mkdir`,
# this works when `doublet_free_matrix` does not exist yet and does not fail
# when the notebook is re-run after the directory has already been created.
os.makedirs('doublet_free_matrix/WT2_matrix', exist_ok=True)

In [None]:
# Write the transposed, doublet-free matrix in Matrix Market format.
io.mmwrite(
    target='doublet_free_matrix/WT2_matrix/matrix',
    a=transposed_matrix,
)

# Write the retained barcodes as a tab-separated file with no index and no header.
new_barcodes.to_csv(
    path_or_buf='doublet_free_matrix/WT2_matrix/barcodes.tsv',
    sep='\t',
    index=False,
    header=False,
)

#### Convert genes (array) to genes_df (DataFrame) and write features.tsv. Alternatively, simply copy the genes.tsv file into the "doublet_free_matrix/WT2_matrix" directory and rename it to features.tsv.

In [None]:
# Wrap the gene-name array in a DataFrame so it can be written with `to_csv`.
genes_df = pd.DataFrame(data=genes)

In [None]:
# Write the gene table into the export directory, next to the matrix and
# barcodes files, so the gzip and listing steps that follow include it. Writing
# to a bare "features.tsv" would leave the file in the current working directory
# and the exported directory incomplete.
# NOTE(review): genes_df appears to hold a single column of gene names (loaded
# with column=1); confirm that the downstream reader accepts a one-column
# features file, or copy the original genes.tsv instead.
genes_df.to_csv('doublet_free_matrix/WT2_matrix/features.tsv', sep='\t', index = False, header = False)

In [None]:
# Shell out to gzip to compress every file matched by the glob in the export directory.
# NOTE(review): re-running this cell when .gz files already exist may warn or
# skip files -- confirm the behaviour before relying on a re-run.
!gzip doublet_free_matrix/WT2_matrix/*

In [None]:
# List the export directory to check which files were produced.
!ls doublet_free_matrix/WT2_matrix