## Step 1: Load Data and Libraries

In [1]:
## Load Libraries
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad

In [2]:
## Read the sublibrary 1 AnnData object from its h5ad file and show its summary
sublib1_h5ad_path = "/gstore/scratch/u/ghaffars/Dataset/sublib1/raw_qc.h5ad"
adata = sc.read_h5ad(sublib1_h5ad_path)
adata

AnnData object with n_obs × n_vars = 595603 × 36603
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'qc_pass', 'S_score', 'G2M_score', 'phase'
    var: 'Symbol'
    layers: 'counts'

In [3]:
## Count the distinct values in the gene_symbol column of adata.obs
adata.obs["gene_symbol"].nunique()

4975

## Step 2: Obtain Guides Labeled as Positive Controls in Sublibrary 1

In [4]:
## Subset to the cells whose Biological_replicate annotation is POSITIVE_CONTROL
is_positive_control = adata.obs["Biological_replicate"] == "POSITIVE_CONTROL"
positive_controls = adata[is_positive_control, :]

In [5]:
## Display the summary of the POSITIVE_CONTROL subset
positive_controls

View of AnnData object with n_obs × n_vars = 1319 × 36603
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'qc_pass', 'S_score', 'G2M_score', 'phase'
    var: 'Symbol'
    layers: 'counts'

In [6]:
## Tabulate how many POSITIVE_CONTROL cells carry each gene_symbol
pos_control_gene_symbols = positive_controls.obs["gene_symbol"]
table_pos_cont = pd.crosstab(index=pos_control_gene_symbols, columns="count")
table_pos_cont

col_0,count
gene_symbol,Unnamed: 1_level_1
ABCB6,1
ABCC8,1
ABCD1,1
ABCG2,1
ABHD6,1
...,...
ZNF804B,1
ZNF805,1
ZNF814,1
ZNRF4,1


Per earlier discussion, targets that appear only a few times among the cells labeled POSITIVE_CONTROL are likely mislabeled. The true electroporation positive-control guides are the ones that occur at high frequency, so we will separate out only those.

## Step 3: Clean the Positive Controls to Remove Samples That Are Likely Mislabeled

In [7]:
## A target must occur in strictly more than this many POSITIVE_CONTROL cells
## to be treated as a genuine positive control; rarer targets are assumed to
## be mislabeled. Defined once so the printout and the saved list cannot diverge.
POS_CONTROL_COUNT_CUTOFF = 3

## Targets with counts greater than the cutoff (computed once, reused below)
target_counts = table_pos_cont["count"]
frequent_targets = target_counts[target_counts > POS_CONTROL_COUNT_CUTOFF]
print(frequent_targets)

## Save names of target
pos_control_targets = frequent_targets.index.tolist()



gene_symbol
CTNNB1     71
MYC       106
NTC        22
TCF7L2    384
Name: count, dtype: int64


In [8]:
## Display the target names that passed the count filter
pos_control_targets

['CTNNB1', 'MYC', 'NTC', 'TCF7L2']

NTCs are likely not true positive controls either, so we remove NTC from the list of positive-control targets; these cells are therefore kept in the dataset as well.

In [9]:
## Drop NTC from the target list (nothing to do when it is not present)
if "NTC" in pos_control_targets:
    pos_control_targets.remove("NTC")

## Restrict the positive-control cells to those whose target is still listed
has_listed_target = positive_controls.obs["gene_symbol"].isin(pos_control_targets)
positive_controls_clean = positive_controls[has_listed_target, :]


In [10]:
## Display the summary of the cleaned positive-control subset
positive_controls_clean

View of AnnData object with n_obs × n_vars = 561 × 36603
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'qc_pass', 'S_score', 'G2M_score', 'phase'
    var: 'Symbol'
    layers: 'counts'

## Step 4: Remove positive control samples and save data

In [11]:
## Observation names (obs index) of every cell in the full dataset
all_samples = adata.obs_names

## Observation names of the cells retained as cleaned positive controls
pos_control_samples = positive_controls_clean.obs_names

In [12]:
## Display the index of positive-control sample names
pos_control_samples

Index(['SAM24451293_rep7-GTTACGAGTTCAGCGC',
       'SAM24451293_rep7-GTTCGCTCAAACTCTG',
       'SAM24449110_rep13-AAACCCATCCATTGTT',
       'SAM24449110_rep13-AAAGAACGTGTACATC',
       'SAM24449110_rep13-AACAAGAAGTCATCGT',
       'SAM24449110_rep13-AACAGGGTCTTCGATT',
       'SAM24449110_rep13-AACCACAGTGCGTGCT',
       'SAM24449110_rep13-AACCATGAGAGCCGTA',
       'SAM24449110_rep13-AACCTGATCAGCGCGT',
       'SAM24449110_rep13-AACGAAAAGGTCATCT',
       ...
       'SAM24449110_rep19-TTGGATGAGTACCCTA',
       'SAM24449110_rep19-TTGGATGGTATCGGTT',
       'SAM24449110_rep19-TTGGATGTCACACCCT',
       'SAM24449110_rep19-TTGGTTTAGCTATCCA',
       'SAM24449110_rep19-TTGTGTTAGAACGCGT',
       'SAM24449110_rep19-TTTAGTCCACACCGCA',
       'SAM24449110_rep19-TTTAGTCGTACTAGCT',
       'SAM24449110_rep19-TTTGACTTCCATTCGC',
       'SAM24449110_rep19-TTTGTTGTCAATGTCG',
       'SAM24449110_rep19-TTTGTTGTCCCGTTCA'],
      dtype='object', length=561)

In [13]:
## Flag every cell whose name is in the cleaned positive-control set
is_pos_control_sample = all_samples.isin(pos_control_samples)

## Sanity check: the mask must select exactly the flagged cells. A larger
## number would mean duplicated obs names are causing extra cells to be dropped.
n_flagged = int(is_pos_control_sample.sum())
assert n_flagged == len(pos_control_samples), (
    f"Expected to drop {len(pos_control_samples)} cells but the mask selects "
    f"{n_flagged}; check for duplicated obs names"
)

## Remove positive controls from data
adata_no_pos_controls = adata[~is_pos_control_sample, :]


In [14]:
## Display the summary of the dataset with positive controls removed
adata_no_pos_controls

View of AnnData object with n_obs × n_vars = 595042 × 36603
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'qc_pass', 'S_score', 'G2M_score', 'phase'
    var: 'Symbol'
    layers: 'counts'

In [15]:
## The filtered object currently has no .X, so build a fresh AnnData whose
## .X is the matrix stored in the "counts" layer
counts_matrix = adata_no_pos_controls.layers["counts"]
adata_no_pos_controls_out = ad.AnnData(X=counts_matrix)

In [16]:
## Add row and column names
## (copied from the filtered object so the new matrix is labeled by the same
## observation and variable names)
adata_no_pos_controls_out.obs_names = adata_no_pos_controls.obs_names
adata_no_pos_controls_out.var_names = adata_no_pos_controls.var_names

## Add sample info and gene info
## NOTE(review): obs/var are assigned without .copy(), so they are presumably
## shared with the filtered object until the later .copy() — confirm
adata_no_pos_controls_out.obs = adata_no_pos_controls.obs
adata_no_pos_controls_out.var = adata_no_pos_controls.var

In [17]:
## Store as a copy just in case to make future calculations easier;
## the steps below operate on this copy rather than on the rebuilt object
adata_no_pos_controls_out_final = adata_no_pos_controls_out.copy()

In [18]:
## Register the matrix held in .X under the "counts" layer as well, so the
## saved object carries a "counts" layer like the input did
adata_no_pos_controls_out_final.layers["counts"] = adata_no_pos_controls_out_final.X

In [20]:
## Directory that receives all three exports; defined once so the paths
## below cannot drift apart
OUTPUT_DIR = "/gstore/scratch/u/ghaffars/glmGamPoi/sublib1_bdev/data"

## Save h5ad object
adata_no_pos_controls_out_final.write_h5ad(f"{OUTPUT_DIR}/remove_pos_cont_counts_obs_var.h5ad")

## Save sample data (the obs table) as CSV
adata_no_pos_controls_out_final.obs.to_csv(f"{OUTPUT_DIR}/remove_pos_cont_sample_data.csv")

## Save gene data (the var table) as CSV
adata_no_pos_controls_out_final.var.to_csv(f"{OUTPUT_DIR}/remove_pos_cont_gene_data.csv")