# QC analysis of a single-cell barcode dataset: This notebook will guide you through the process of quality-control analysis on a single-cell RNA sequencing barcode dataset.

In [1]:
# Parameters
# Notebook-wide settings read by the cells below.
# DEV is the second positional argument of every DB.DATASET call; it is
# switched to True when DS_test is set.
DEV = False
# Not read by any cell visible in this notebook.
test_DatasetID = ""
# False loads dataset Version '4'; True loads Version '2' and turns DEV on.
DS_test = False
# Dataset metadata (title through organism) forwarded to DB.DATASET.
title = "Recursion DLD1 Library4 Screen Day 5- combination of NGS 5570 and NGS 5704"
description = "Production scale screen in DLD-1 cells with 1/4 of the genome-wide CRISPR library (sub-lib-4, ~22K guides). 1 time point: Day-5. We performed 12 rxns of 10x 3' HT kit with an estimated loading of 52K cells.We performed a pilot study to test the library quality before the production-level sequencing(NGS5703). We will submit 36 libraries (12 GEX, 12 HTO and 12 sgRNA) We estimate needing - 600M per GEX, - 20M per HTO and - 50M per sgRNA libraries now that we have analyzed the QC run. This run is to add additional cells needed for analysis."
name_space = [{"id": "GRCh38", "type": "genome"}]
sources = [{"id": "sublib4", "name": "Recursion DLD1"}]
tech_name = "scRNA-seq"
author = "SG"
organism = "human"
# "hashing" or anything else: picks the demux column pairs and the ALT label,
# and is passed to bar.adata_cleaner.
experiment = "crispr"
# Neither value is read by any cell visible in this notebook.
topBarcodesToPlot = 5
bottomBarcodesToPlot = 5
# Forwarded to bar.adata_cleaner.
fix_barcodes = False
# Not read by any cell visible in this notebook.
valid_assignments = None
# Forwarded to bar.adata_cleaner.
alt_experiments = ["crispr", "hashing"]


In [2]:
# Choose which dataset version to load: the original dataset by default, or
# the test dataset when DS_test is set.
if not DS_test:
    Version = '4'   # version of the original dataset
else:
    Version = '2'   # version of the test dataset
    DEV = True      # the test scenario always forces DEV on

# Experiment name handed to DS.load_dataset by the loading loops below.
your_experiment = "crispr_hashing_harmonized"

In [3]:
# columns_2 holds the demultiplexing column names of the modality selected by
# `experiment`; columns_1 holds those of the other modality. ALT is the short
# label that ends up in the output file names.
if experiment != "hashing":
    columns_1, columns_2 = (
        ["DemuxType_hashing", "DemuxAssignment_hashing"],
        ["DemuxType_crispr", "DemuxAssignment_crispr"],
    )
    ALT = "gRNA"
else:
    columns_1, columns_2 = (
        ["DemuxType_crispr", "DemuxAssignment_crispr"],
        ["DemuxType_hashing", "DemuxAssignment_hashing"],
    )
    ALT = "HTO"

In [51]:
# Scratch location of the concatenated AnnData; it is written right after the
# concatenation and read back from the same path further down.
out_file = f"/gstore/scratch/u/ghaffars/Dataset/sublib4/{ALT}_complete.h5ad"
# Project location where the AnnData is written once the combined obs metadata
# has been attached. Both names depend on ALT from the previous cell.
OUT = f"/gstore/project/crc_recursion_gw/DLD1_Sublib4/DS000016763/gRNA_HTO_rawcounts/{ALT}_complete.h5ad"

In [5]:
## Import Libraries
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('talk')
%matplotlib inline 

#single cell libraries
import anndata as ad 
import pandas as pd
import numpy as np
import scanpy as sc


In [6]:
import sys
sys.path.append("..")
import Templates.tools.DatasetDB as DB
import Templates.tools.scBarcodeProc as Bproc
import Templates.tools.Barcode_Count_QC as bar

In [7]:
%load_ext autoreload
%autoreload 2

## Instantiate a dataset object given our dataset info

In [8]:
# Dataset identifiers that are loaded one by one and then concatenated.
# The order here is the order in which cells appear in the combined objects.
dsdbs = [
    'DS000016289',
    'DS000016291',
    'DS000016294',
    'DS000016296',
    'DS000016293',
    'DS000016295',
    'DS000016632',
    'DS000016634',
    'DS000016633',
    'DS000016631',
]

In [14]:
# Load every dataset, clean its barcode counts for `experiment`, and keep the
# other modality's demultiplexing calls (columns_1) next to the cleaned ones.
adata_s = []

for DSID in dsdbs:
    DS = DB.DATASET(DSID, DEV, title=title, description=description, name_space=name_space, organism=organism,
                       sources=sources, tech_name=tech_name, author=author)
    adata, adatas = DS.load_dataset(Version, your_experiment, Corr=False)
    # Only the columns_1 obs columns are needed after cleaning, so snapshot
    # just those instead of deep-copying the whole AnnData (matrix included).
    other_calls = adata.obs[columns_1].copy()
    adata = bar.adata_cleaner(adata, adatas, experiment=experiment, fix_barcodes=fix_barcodes,
                              alt_experiments=alt_experiments)
    # Re-attach the snapshot by obs name, then give the 'demux_type' /
    # 'assignment' columns (presumably produced by bar.adata_cleaner — confirm)
    # their modality-specific names from columns_2.
    adata.obs = (
        adata.obs
        .join(other_calls)
        .rename(columns={"demux_type": columns_2[0], "assignment": columns_2[1]})
    )
    adata_s.append(adata)



10:21:37 -> Collating dataset: 'DS000016289' metadata.
10:21:37 -> Retrieving all project: 'DS000016289@4' metadata.

10:21:38 -> Reading: 'DS000016289'.

10:21:38 -> Reading: 'DS000016289 -> Experiment: main'.

10:21:38 -> Reading: 'DS000016289 -> Experiment: main -> Alt. Exp.: crispr'.

10:21:38 -> Reading: 'DS000016289 -> Experiment: main -> Alt. Exp.: hashing'.

10:21:41 -> Reading: 'DS000016289 -> Experiment: crispr_hashing_harmonized'.

10:21:41 -> Reading: 'DS000016289 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

10:21:42 -> Reading: 'DS000016289 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.


  data_frame = cast(DataFrame, read_csv(path, **kwargs))



10:21:52 -> Collating dataset: 'DS000016291' metadata.
10:21:52 -> Retrieving all project: 'DS000016291@4' metadata.

10:21:52 -> Reading: 'DS000016291'.

10:21:52 -> Reading: 'DS000016291 -> Experiment: main'.

10:21:52 -> Reading: 'DS000016291 -> Experiment: main -> Alt. Exp.: crispr'.

10:21:53 -> Reading: 'DS000016291 -> Experiment: main -> Alt. Exp.: hashing'.

10:21:56 -> Reading: 'DS000016291 -> Experiment: crispr_hashing_harmonized'.

10:21:56 -> Reading: 'DS000016291 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

10:21:56 -> Reading: 'DS000016291 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.


  data_frame = cast(DataFrame, read_csv(path, **kwargs))



10:22:07 -> Collating dataset: 'DS000016294' metadata.
10:22:07 -> Retrieving all project: 'DS000016294@4' metadata.

10:22:07 -> Reading: 'DS000016294'.

10:22:07 -> Reading: 'DS000016294 -> Experiment: main'.

10:22:07 -> Reading: 'DS000016294 -> Experiment: main -> Alt. Exp.: crispr'.

10:22:07 -> Reading: 'DS000016294 -> Experiment: main -> Alt. Exp.: hashing'.

10:22:11 -> Reading: 'DS000016294 -> Experiment: crispr_hashing_harmonized'.

10:22:11 -> Reading: 'DS000016294 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

10:22:11 -> Reading: 'DS000016294 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.


  data_frame = cast(DataFrame, read_csv(path, **kwargs))



10:22:21 -> Collating dataset: 'DS000016296' metadata.
10:22:21 -> Retrieving all project: 'DS000016296@4' metadata.

10:22:22 -> Reading: 'DS000016296'.

10:22:22 -> Reading: 'DS000016296 -> Experiment: main'.

10:22:22 -> Reading: 'DS000016296 -> Experiment: main -> Alt. Exp.: crispr'.

10:22:22 -> Reading: 'DS000016296 -> Experiment: main -> Alt. Exp.: hashing'.

10:22:27 -> Reading: 'DS000016296 -> Experiment: crispr_hashing_harmonized'.

10:22:27 -> Reading: 'DS000016296 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

10:22:27 -> Reading: 'DS000016296 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.

10:22:37 -> Collating dataset: 'DS000016293' metadata.
10:22:37 -> Retrieving all project: 'DS000016293@4' metadata.

10:22:37 -> Reading: 'DS000016293'.

10:22:37 -> Reading: 'DS000016293 -> Experiment: main'.

10:22:37 -> Reading: 'DS000016293 -> Experiment: main -> Alt. Exp.: crispr'.

10:22:37 -> Reading: 'DS000016293 -> Experiment: main -> Alt. E

  data_frame = cast(DataFrame, read_csv(path, **kwargs))



10:23:20 -> Collating dataset: 'DS000016634' metadata.
10:23:20 -> Retrieving all project: 'DS000016634@4' metadata.

10:23:20 -> Reading: 'DS000016634'.

10:23:20 -> Reading: 'DS000016634 -> Experiment: main'.

10:23:20 -> Reading: 'DS000016634 -> Experiment: main -> Alt. Exp.: crispr'.

10:23:20 -> Reading: 'DS000016634 -> Experiment: main -> Alt. Exp.: hashing'.

10:23:24 -> Reading: 'DS000016634 -> Experiment: crispr_hashing_harmonized'.

10:23:24 -> Reading: 'DS000016634 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

10:23:24 -> Reading: 'DS000016634 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.

10:23:34 -> Collating dataset: 'DS000016633' metadata.
10:23:34 -> Retrieving all project: 'DS000016633@4' metadata.

10:23:34 -> Reading: 'DS000016633'.

10:23:34 -> Reading: 'DS000016633 -> Experiment: main'.

10:23:34 -> Reading: 'DS000016633 -> Experiment: main -> Alt. Exp.: crispr'.

10:23:35 -> Reading: 'DS000016633 -> Experiment: main -> Alt. E

  data_frame = cast(DataFrame, read_csv(path, **kwargs))


In [15]:
adata_s

[AnnData object with n_obs × n_vars = 132704 × 20872
     obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing'
     uns: '.internal'
     layers: 'counts',
 AnnData object with n_obs × n_vars = 135567 × 20872
     obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing'
     uns: '.internal'
     layers: 'counts',
 AnnData object with n_obs × n_vars = 136545 × 20872
     obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing'
     uns: '.internal'
     layers: 'counts',
 AnnData object with n_obs × n_vars = 121068 × 20872
     obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing'
     uns: '.internal'
     layers: 'counts',
 AnnData object with n_obs × n_vars = 109486 × 20872
     obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAs

In [16]:
# Stack all cleaned per-dataset AnnData objects into a single object.
# The repr printed below shows the combined cell count and that the
# per-dataset `uns` entries are not carried over.
concat = ad.concat(adata_s)

In [18]:
concat

AnnData object with n_obs × n_vars = 1289274 × 20872
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing'
    layers: 'counts'

In [19]:
# Save the concatenated object to the scratch path; the next cell reads it
# back from the same file (presumably so the loading loop can be skipped on
# later runs — confirm).
concat.write_h5ad(out_file)

In [9]:
# Reload the concatenated object from the scratch file written above.
# NOTE(review): the execution counter restarts here, so the notebook appears
# to have been resumed from this cell in a fresh kernel — confirm.
concat = sc.read(out_file)

In [10]:
concat

AnnData object with n_obs × n_vars = 1289274 × 20872
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing'
    layers: 'counts'

## Combine all obs

In [11]:
# Collect the obs (per-cell metadata) table of every dataset, in `dsdbs` order.
# NOTE(review): this loads each dataset again, exactly as the earlier loop did.
df_s = []

for DSID in dsdbs:
    DS = DB.DATASET(
        DSID,
        DEV,
        title=title,
        description=description,
        name_space=name_space,
        organism=organism,
        sources=sources,
        tech_name=tech_name,
        author=author,
    )
    adata, adatas = DS.load_dataset(Version, your_experiment, Corr=False)
    # Copy so the collected table is independent of the loaded AnnData.
    df_s.append(adata.obs.copy())


10:59:37 -> Collating dataset: 'DS000016289' metadata.
10:59:37 -> Retrieving all project: 'DS000016289@4' metadata.

10:59:38 -> Reading: 'DS000016289'.

10:59:38 -> Reading: 'DS000016289 -> Experiment: main'.

10:59:38 -> Reading: 'DS000016289 -> Experiment: main -> Alt. Exp.: crispr'.

10:59:38 -> Reading: 'DS000016289 -> Experiment: main -> Alt. Exp.: hashing'.

10:59:41 -> Reading: 'DS000016289 -> Experiment: crispr_hashing_harmonized'.

10:59:41 -> Reading: 'DS000016289 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

10:59:41 -> Reading: 'DS000016289 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.


  data_frame = cast(DataFrame, read_csv(path, **kwargs))



10:59:47 -> Collating dataset: 'DS000016291' metadata.
10:59:47 -> Retrieving all project: 'DS000016291@4' metadata.

10:59:47 -> Reading: 'DS000016291'.

10:59:47 -> Reading: 'DS000016291 -> Experiment: main'.

10:59:47 -> Reading: 'DS000016291 -> Experiment: main -> Alt. Exp.: crispr'.

10:59:48 -> Reading: 'DS000016291 -> Experiment: main -> Alt. Exp.: hashing'.

10:59:51 -> Reading: 'DS000016291 -> Experiment: crispr_hashing_harmonized'.

10:59:51 -> Reading: 'DS000016291 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

10:59:51 -> Reading: 'DS000016291 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.


  data_frame = cast(DataFrame, read_csv(path, **kwargs))



10:59:57 -> Collating dataset: 'DS000016294' metadata.
10:59:57 -> Retrieving all project: 'DS000016294@4' metadata.

10:59:57 -> Reading: 'DS000016294'.

10:59:57 -> Reading: 'DS000016294 -> Experiment: main'.

10:59:57 -> Reading: 'DS000016294 -> Experiment: main -> Alt. Exp.: crispr'.

10:59:57 -> Reading: 'DS000016294 -> Experiment: main -> Alt. Exp.: hashing'.

11:00:01 -> Reading: 'DS000016294 -> Experiment: crispr_hashing_harmonized'.

11:00:01 -> Reading: 'DS000016294 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

11:00:01 -> Reading: 'DS000016294 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.


  data_frame = cast(DataFrame, read_csv(path, **kwargs))



11:00:07 -> Collating dataset: 'DS000016296' metadata.
11:00:07 -> Retrieving all project: 'DS000016296@4' metadata.

11:00:07 -> Reading: 'DS000016296'.

11:00:07 -> Reading: 'DS000016296 -> Experiment: main'.

11:00:07 -> Reading: 'DS000016296 -> Experiment: main -> Alt. Exp.: crispr'.

11:00:08 -> Reading: 'DS000016296 -> Experiment: main -> Alt. Exp.: hashing'.

11:00:11 -> Reading: 'DS000016296 -> Experiment: crispr_hashing_harmonized'.

11:00:11 -> Reading: 'DS000016296 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

11:00:11 -> Reading: 'DS000016296 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.

11:00:16 -> Collating dataset: 'DS000016293' metadata.
11:00:16 -> Retrieving all project: 'DS000016293@4' metadata.

11:00:17 -> Reading: 'DS000016293'.

11:00:17 -> Reading: 'DS000016293 -> Experiment: main'.

11:00:17 -> Reading: 'DS000016293 -> Experiment: main -> Alt. Exp.: crispr'.

11:00:17 -> Reading: 'DS000016293 -> Experiment: main -> Alt. E

  data_frame = cast(DataFrame, read_csv(path, **kwargs))



11:00:46 -> Collating dataset: 'DS000016634' metadata.
11:00:46 -> Retrieving all project: 'DS000016634@4' metadata.

11:00:47 -> Reading: 'DS000016634'.

11:00:47 -> Reading: 'DS000016634 -> Experiment: main'.

11:00:47 -> Reading: 'DS000016634 -> Experiment: main -> Alt. Exp.: crispr'.

11:00:47 -> Reading: 'DS000016634 -> Experiment: main -> Alt. Exp.: hashing'.

11:00:50 -> Reading: 'DS000016634 -> Experiment: crispr_hashing_harmonized'.

11:00:50 -> Reading: 'DS000016634 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: crispr'.

11:00:50 -> Reading: 'DS000016634 -> Experiment: crispr_hashing_harmonized -> Alt. Exp.: hashing'.

11:00:56 -> Collating dataset: 'DS000016633' metadata.
11:00:56 -> Retrieving all project: 'DS000016633@4' metadata.

11:00:56 -> Reading: 'DS000016633'.

11:00:56 -> Reading: 'DS000016633 -> Experiment: main'.

11:00:56 -> Reading: 'DS000016633 -> Experiment: main -> Alt. Exp.: crispr'.

11:00:56 -> Reading: 'DS000016633 -> Experiment: main -> Alt. E

  data_frame = cast(DataFrame, read_csv(path, **kwargs))


In [12]:
# Stack the per-dataset obs tables row-wise into one table covering all cells.
combine_obs=pd.concat(df_s)

In [13]:
combine_obs

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing,cellline,timepoint,HTO,NGS_ID,Biological_replicate,10Xrun,sublibrary,gRNA_library_MOI,gene_symbol,class
SAM24443678_rep1-AAACCCAAGAATTTGG,SAM24443678,AAACCCAAGAATTTGG,doublet,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",singlet,SAM24439044,DLD1,day5,HTO-14,NGS5569,REPLICATE_2,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...","ENSG00000104936_3,ENSG00000184545_1,ENSG000001..."
SAM24443678_rep1-AAACCCAAGATACCAA,SAM24443678,AAACCCAAGATACCAA,doublet,"ENSG00000065320_1,ENSG00000144824_1",singlet,SAM24439043,DLD1,day5,HTO-10,NGS5569,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000065320_1,ENSG00000144824_1","ENSG00000065320_1,ENSG00000144824_1"
SAM24443678_rep1-AAACCCAAGCATTTCG,SAM24443678,AAACCCAAGCATTTCG,doublet,"ENSG00000104290_1,ENSG00000128218_1",singlet,SAM24439043,DLD1,day5,HTO-3,NGS5569,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104290_1,ENSG00000128218_1","ENSG00000104290_1,ENSG00000128218_1"
SAM24443678_rep1-AAACCCAAGCGACCCT,SAM24443678,AAACCCAAGCGACCCT,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-10,NGS5569,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown
SAM24443678_rep1-AAACCCAAGCGCTGAA,SAM24443678,AAACCCAAGCGCTGAA,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-2,NGS5569,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAM24449114_rep12-TTTGTTGTCTAGAGCT,SAM24449114,TTTGTTGTCTAGAGCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-15,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown
SAM24449114_rep12-TTTGTTGTCTAGTTCT,SAM24449114,TTTGTTGTCTAGTTCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-14,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown
SAM24449114_rep12-TTTGTTGTCTCAACCC,SAM24449114,TTTGTTGTCTCAACCC,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,12,unknown,unknown,unknown,unknown
SAM24449114_rep12-TTTGTTGTCTGGACCG,SAM24449114,TTTGTTGTCTGGACCG,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-5,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown


In [14]:
# AnnData whose obs index is used below to flag cells as QC-passing
# (presumably the output of a separate expression-QC step — confirm).
QC = sc.read("/gstore/scratch/u/ghaffars/Dataset/sublib4/raw_qc.h5ad")

In [15]:
QC

AnnData object with n_obs × n_vars = 446413 × 36603
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'qc_pass', 'S_score', 'G2M_score', 'phase', 'SCN_class'
    var: 'Symbol'
    layers: 'counts'

In [16]:
# Mark a cell as passing QC when its obs name is also present in the QC object.
combine_obs["qc_pass"] = combine_obs.index.isin(QC.obs.index)

In [23]:
# Relabel sequencing run NGS5569 as NGS5570; all other NGS_ID values are kept.
# NOTE(review): the notebook title refers to NGS 5570 and NGS 5704, so NGS5569
# is presumably a mislabel in the source metadata — confirm.
combine_obs["NGS_ID"]=combine_obs["NGS_ID"].replace("NGS5569","NGS5570")

In [28]:
# Where gene_symbol is the placeholder 'unknown', fall back to the part of the
# crispr assignment before its first '_'; every other row keeps its value.
# An assignment that is itself 'unknown' therefore still yields 'unknown'.
# The vectorised .str accessor replaces a per-row Python lambda and lets a
# missing assignment propagate as NaN instead of raising AttributeError.
gene_symbol_missing = combine_obs['gene_symbol'] == 'unknown'
assignment_prefix = combine_obs['DemuxAssignment_crispr'].str.split('_').str[0]
combine_obs['gene_symbol'] = np.where(gene_symbol_missing, assignment_prefix, combine_obs['gene_symbol'])

In [32]:
combine_obs

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing,cellline,timepoint,HTO,NGS_ID,Biological_replicate,10Xrun,sublibrary,gRNA_library_MOI,gene_symbol,class,qc_pass
SAM24443678_rep1-AAACCCAAGAATTTGG,SAM24443678,AAACCCAAGAATTTGG,doublet,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",singlet,SAM24439044,DLD1,day5,HTO-14,NGS5570,REPLICATE_2,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...","ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",False
SAM24443678_rep1-AAACCCAAGATACCAA,SAM24443678,AAACCCAAGATACCAA,doublet,"ENSG00000065320_1,ENSG00000144824_1",singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000065320_1,ENSG00000144824_1","ENSG00000065320_1,ENSG00000144824_1",False
SAM24443678_rep1-AAACCCAAGCATTTCG,SAM24443678,AAACCCAAGCATTTCG,doublet,"ENSG00000104290_1,ENSG00000128218_1",singlet,SAM24439043,DLD1,day5,HTO-3,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104290_1,ENSG00000128218_1","ENSG00000104290_1,ENSG00000128218_1",False
SAM24443678_rep1-AAACCCAAGCGACCCT,SAM24443678,AAACCCAAGCGACCCT,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24443678_rep1-AAACCCAAGCGCTGAA,SAM24443678,AAACCCAAGCGCTGAA,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-2,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAM24449114_rep12-TTTGTTGTCTAGAGCT,SAM24449114,TTTGTTGTCTAGAGCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-15,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTAGTTCT,SAM24449114,TTTGTTGTCTAGTTCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-14,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTCAACCC,SAM24449114,TTTGTTGTCTCAACCC,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,12,unknown,unknown,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTGGACCG,SAM24449114,TTTGTTGTCTGGACCG,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-5,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False


In [33]:
# Persist the combined per-cell metadata table as CSV, with the obs names
# written as the first column.
combine_obs.to_csv("/gstore/project/crc_recursion_2/CellInfo_DLD1_day5/DLD1_sublib4_allCells_info.csv")

In [34]:
# Read the saved metadata table back, using its first column as the index.
# Column dtypes are re-inferred by read_csv at this point.
obs = pd.read_csv("/gstore/project/crc_recursion_2/CellInfo_DLD1_day5/DLD1_sublib4_allCells_info.csv",index_col=0)

In [35]:
concat.obs

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing
SAM24443678_rep1-AAACCCAAGAATTTGG,SAM24443678,AAACCCAAGAATTTGG,doublet,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",singlet,SAM24439044
SAM24443678_rep1-AAACCCAAGATACCAA,SAM24443678,AAACCCAAGATACCAA,doublet,"ENSG00000065320_1,ENSG00000144824_1",singlet,SAM24439043
SAM24443678_rep1-AAACCCAAGCATTTCG,SAM24443678,AAACCCAAGCATTTCG,doublet,"ENSG00000104290_1,ENSG00000128218_1",singlet,SAM24439043
SAM24443678_rep1-AAACCCAAGCGACCCT,SAM24443678,AAACCCAAGCGACCCT,unknown,unknown,singlet,SAM24439043
SAM24443678_rep1-AAACCCAAGCGCTGAA,SAM24443678,AAACCCAAGCGCTGAA,unknown,unknown,singlet,SAM24439043
...,...,...,...,...,...,...
SAM24449114_rep12-TTTGTTGTCTAGAGCT,SAM24449114,TTTGTTGTCTAGAGCT,unknown,unknown,singlet,SAM24439044
SAM24449114_rep12-TTTGTTGTCTAGTTCT,SAM24449114,TTTGTTGTCTAGTTCT,unknown,unknown,singlet,SAM24439044
SAM24449114_rep12-TTTGTTGTCTCAACCC,SAM24449114,TTTGTTGTCTCAACCC,unknown,unknown,unknown,unknown
SAM24449114_rep12-TTTGTTGTCTGGACCG,SAM24449114,TTTGTTGTCTGGACCG,unknown,unknown,singlet,SAM24439044


In [36]:
obs

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing,cellline,timepoint,HTO,NGS_ID,Biological_replicate,10Xrun,sublibrary,gRNA_library_MOI,gene_symbol,class,qc_pass
SAM24443678_rep1-AAACCCAAGAATTTGG,SAM24443678,AAACCCAAGAATTTGG,doublet,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",singlet,SAM24439044,DLD1,day5,HTO-14,NGS5570,REPLICATE_2,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...","ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",False
SAM24443678_rep1-AAACCCAAGATACCAA,SAM24443678,AAACCCAAGATACCAA,doublet,"ENSG00000065320_1,ENSG00000144824_1",singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000065320_1,ENSG00000144824_1","ENSG00000065320_1,ENSG00000144824_1",False
SAM24443678_rep1-AAACCCAAGCATTTCG,SAM24443678,AAACCCAAGCATTTCG,doublet,"ENSG00000104290_1,ENSG00000128218_1",singlet,SAM24439043,DLD1,day5,HTO-3,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104290_1,ENSG00000128218_1","ENSG00000104290_1,ENSG00000128218_1",False
SAM24443678_rep1-AAACCCAAGCGACCCT,SAM24443678,AAACCCAAGCGACCCT,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24443678_rep1-AAACCCAAGCGCTGAA,SAM24443678,AAACCCAAGCGCTGAA,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-2,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAM24449114_rep12-TTTGTTGTCTAGAGCT,SAM24449114,TTTGTTGTCTAGAGCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-15,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTAGTTCT,SAM24449114,TTTGTTGTCTAGTTCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-14,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTCAACCC,SAM24449114,TTTGTTGTCTCAACCC,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,12,unknown,unknown,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTGGACCG,SAM24449114,TTTGTTGTCTGGACCG,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-5,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False


In [37]:
# Work on a copy of the obs table so concat.obs itself stays unchanged until
# the explicit assignment further down.
df = concat.obs.copy()

In [38]:
df.index

Index(['SAM24443678_rep1-AAACCCAAGAATTTGG',
       'SAM24443678_rep1-AAACCCAAGATACCAA',
       'SAM24443678_rep1-AAACCCAAGCATTTCG',
       'SAM24443678_rep1-AAACCCAAGCGACCCT',
       'SAM24443678_rep1-AAACCCAAGCGCTGAA',
       'SAM24443678_rep1-AAACCCAAGCTTCGTA',
       'SAM24443678_rep1-AAACCCAAGGAATCGC',
       'SAM24443678_rep1-AAACCCAAGGATACAT',
       'SAM24443678_rep1-AAACCCAAGTCTTCCC',
       'SAM24443678_rep1-AAACCCACACAGCTGC',
       ...
       'SAM24449114_rep12-TTTGTTGTCCAAGCAT',
       'SAM24449114_rep12-TTTGTTGTCCATCTCG',
       'SAM24449114_rep12-TTTGTTGTCCGGCTTT',
       'SAM24449114_rep12-TTTGTTGTCCGTGGCA',
       'SAM24449114_rep12-TTTGTTGTCTAGACCA',
       'SAM24449114_rep12-TTTGTTGTCTAGAGCT',
       'SAM24449114_rep12-TTTGTTGTCTAGTTCT',
       'SAM24449114_rep12-TTTGTTGTCTCAACCC',
       'SAM24449114_rep12-TTTGTTGTCTGGACCG',
       'SAM24449114_rep12-TTTGTTGTCTTAATCC'],
      dtype='object', length=1289274)

In [39]:
df.columns

Index(['Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr',
       'DemuxType_hashing', 'DemuxAssignment_hashing'],
      dtype='object')

In [40]:
# Strip the existing obs columns so only the cell index remains; the columns
# come back from `obs` in the join that follows.
# errors="ignore" keeps this cell re-runnable: executing it again, when the
# columns are already gone, is a no-op instead of a KeyError.
df = df.drop(columns=['Sample', 'Barcode', 'DemuxType_hashing', 'DemuxAssignment_hashing',
       'DemuxType_crispr', 'DemuxAssignment_crispr'], errors="ignore")

In [41]:
df

SAM24443678_rep1-AAACCCAAGAATTTGG
SAM24443678_rep1-AAACCCAAGATACCAA
SAM24443678_rep1-AAACCCAAGCATTTCG
SAM24443678_rep1-AAACCCAAGCGACCCT
SAM24443678_rep1-AAACCCAAGCGCTGAA
...
SAM24449114_rep12-TTTGTTGTCTAGAGCT
SAM24449114_rep12-TTTGTTGTCTAGTTCT
SAM24449114_rep12-TTTGTTGTCTCAACCC
SAM24449114_rep12-TTTGTTGTCTGGACCG
SAM24449114_rep12-TTTGTTGTCTTAATCC


In [42]:
# Attach the columns of `obs` to the index-only frame, matching rows by obs
# name so the result follows the row order of the concatenated AnnData.
tmp= df.join(obs)

In [43]:
tmp

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing,cellline,timepoint,HTO,NGS_ID,Biological_replicate,10Xrun,sublibrary,gRNA_library_MOI,gene_symbol,class,qc_pass
SAM24443678_rep1-AAACCCAAGAATTTGG,SAM24443678,AAACCCAAGAATTTGG,doublet,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",singlet,SAM24439044,DLD1,day5,HTO-14,NGS5570,REPLICATE_2,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...","ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",False
SAM24443678_rep1-AAACCCAAGATACCAA,SAM24443678,AAACCCAAGATACCAA,doublet,"ENSG00000065320_1,ENSG00000144824_1",singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000065320_1,ENSG00000144824_1","ENSG00000065320_1,ENSG00000144824_1",False
SAM24443678_rep1-AAACCCAAGCATTTCG,SAM24443678,AAACCCAAGCATTTCG,doublet,"ENSG00000104290_1,ENSG00000128218_1",singlet,SAM24439043,DLD1,day5,HTO-3,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104290_1,ENSG00000128218_1","ENSG00000104290_1,ENSG00000128218_1",False
SAM24443678_rep1-AAACCCAAGCGACCCT,SAM24443678,AAACCCAAGCGACCCT,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24443678_rep1-AAACCCAAGCGCTGAA,SAM24443678,AAACCCAAGCGCTGAA,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-2,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAM24449114_rep12-TTTGTTGTCTAGAGCT,SAM24449114,TTTGTTGTCTAGAGCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-15,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTAGTTCT,SAM24449114,TTTGTTGTCTAGTTCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-14,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTCAACCC,SAM24449114,TTTGTTGTCTCAACCC,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,12,unknown,unknown,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTGGACCG,SAM24449114,TTTGTTGTCTGGACCG,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-5,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False


In [44]:
concat.obs

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing
SAM24443678_rep1-AAACCCAAGAATTTGG,SAM24443678,AAACCCAAGAATTTGG,doublet,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",singlet,SAM24439044
SAM24443678_rep1-AAACCCAAGATACCAA,SAM24443678,AAACCCAAGATACCAA,doublet,"ENSG00000065320_1,ENSG00000144824_1",singlet,SAM24439043
SAM24443678_rep1-AAACCCAAGCATTTCG,SAM24443678,AAACCCAAGCATTTCG,doublet,"ENSG00000104290_1,ENSG00000128218_1",singlet,SAM24439043
SAM24443678_rep1-AAACCCAAGCGACCCT,SAM24443678,AAACCCAAGCGACCCT,unknown,unknown,singlet,SAM24439043
SAM24443678_rep1-AAACCCAAGCGCTGAA,SAM24443678,AAACCCAAGCGCTGAA,unknown,unknown,singlet,SAM24439043
...,...,...,...,...,...,...
SAM24449114_rep12-TTTGTTGTCTAGAGCT,SAM24449114,TTTGTTGTCTAGAGCT,unknown,unknown,singlet,SAM24439044
SAM24449114_rep12-TTTGTTGTCTAGTTCT,SAM24449114,TTTGTTGTCTAGTTCT,unknown,unknown,singlet,SAM24439044
SAM24449114_rep12-TTTGTTGTCTCAACCC,SAM24449114,TTTGTTGTCTCAACCC,unknown,unknown,unknown,unknown
SAM24449114_rep12-TTTGTTGTCTGGACCG,SAM24449114,TTTGTTGTCTGGACCG,unknown,unknown,singlet,SAM24439044


In [45]:
# Replace the obs table of the concatenated AnnData with the enriched table.
concat.obs = tmp.copy()

In [46]:
concat.obs

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing,cellline,timepoint,HTO,NGS_ID,Biological_replicate,10Xrun,sublibrary,gRNA_library_MOI,gene_symbol,class,qc_pass
SAM24443678_rep1-AAACCCAAGAATTTGG,SAM24443678,AAACCCAAGAATTTGG,doublet,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",singlet,SAM24439044,DLD1,day5,HTO-14,NGS5570,REPLICATE_2,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104936_3,ENSG00000184545_1,ENSG000001...","ENSG00000104936_3,ENSG00000184545_1,ENSG000001...",False
SAM24443678_rep1-AAACCCAAGATACCAA,SAM24443678,AAACCCAAGATACCAA,doublet,"ENSG00000065320_1,ENSG00000144824_1",singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000065320_1,ENSG00000144824_1","ENSG00000065320_1,ENSG00000144824_1",False
SAM24443678_rep1-AAACCCAAGCATTTCG,SAM24443678,AAACCCAAGCATTTCG,doublet,"ENSG00000104290_1,ENSG00000128218_1",singlet,SAM24439043,DLD1,day5,HTO-3,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,"ENSG00000104290_1,ENSG00000128218_1","ENSG00000104290_1,ENSG00000128218_1",False
SAM24443678_rep1-AAACCCAAGCGACCCT,SAM24443678,AAACCCAAGCGACCCT,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-10,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24443678_rep1-AAACCCAAGCGCTGAA,SAM24443678,AAACCCAAGCGCTGAA,unknown,unknown,singlet,SAM24439043,DLD1,day5,HTO-2,NGS5570,REPLICATE_1,1,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAM24449114_rep12-TTTGTTGTCTAGAGCT,SAM24449114,TTTGTTGTCTAGAGCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-15,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTAGTTCT,SAM24449114,TTTGTTGTCTAGTTCT,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-14,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTCAACCC,SAM24449114,TTTGTTGTCTCAACCC,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,12,unknown,unknown,unknown,unknown,False
SAM24449114_rep12-TTTGTTGTCTGGACCG,SAM24449114,TTTGTTGTCTGGACCG,unknown,unknown,singlet,SAM24439044,DLD1,day5,HTO-5,NGS5704,REPLICATE_2,12,GMTY210:cropseq.crisprko.cas9.human.lib4.conce...,0.4,unknown,unknown,False


In [47]:
concat

AnnData object with n_obs × n_vars = 1289274 × 20872
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'qc_pass'
    layers: 'counts'

In [49]:
# Store gRNA_library_MOI as strings before the object is written to disk
# (presumably to give the column one uniform type for h5ad — TODO confirm).
concat.obs['gRNA_library_MOI'] = concat.obs['gRNA_library_MOI'].astype("str")

In [52]:
# Write the annotated AnnData to its project location, using the same
# write_h5ad spelling as the scratch write earlier in the notebook.
concat.write_h5ad(OUT)

In [53]:
# Keep only the cells flagged as QC-passing, as an independent copy.
gRNA_qc = concat[concat.obs['qc_pass'].eq(True)].copy()

In [54]:
gRNA_qc

AnnData object with n_obs × n_vars = 446413 × 20872
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'qc_pass'
    layers: 'counts'

In [55]:
# Save the QC-passing subset to scratch as an h5ad file.
gRNA_qc.write_h5ad("/gstore/scratch/u/ghaffars/Dataset/sublib4/gRNA_qc.h5ad")