## Step 1: Load Libraries

In [1]:
import scanpy as sc
import numpy as np
import scipy
import pandas as pd
from scipy.io import mmwrite
import argparse
import os

## Step 2: Load Library 1 Data

In [2]:
## Path to the Library 1 input file (.h5ad)
file_load =  "/gstore/scratch/u/ghaffars/glmGamPoi/sublib1_bdev_NE/data/remove_pos_cont_counts_obs_var.h5ad"
## Read the file with scanpy; every later cell works from `data`
data = sc.read_h5ad(file_load)

## Step 3: Obtain Column Meta Data

In [3]:
## Two-level label per cell: 'aNTC' when class is "nonessential", 'zAll' otherwise
## NOTE(review): presumably the a/z prefixes make 'aNTC' sort first as the reference level - confirm
is_ntc = data.obs['class'] == "nonessential"
data.obs["label"] = np.where(is_ntc, 'aNTC', 'zAll')

In [4]:
## Build the 'gem' identifier as "<NGS_ID>-<10Xrun>" (used as batchid in the model metadata)
ngs_part = data.obs['NGS_ID'].astype(str)
run_part = data.obs['10Xrun'].astype(str)
data.obs['gem'] = ngs_part + '-' + run_part

In [5]:
## Independent copy restricted to the cells whose class is "nonessential"
nonessential_mask = data.obs["class"] == "nonessential"
NE = data[nonessential_mask].copy()

In [6]:
## Display a summary of the nonessential-only subset
NE

AnnData object with n_obs × n_vars = 2651 × 36603
    obs: 'Sample', 'Barcode', 'DemuxType_crispr', 'DemuxAssignment_crispr', 'DemuxType_hashing', 'DemuxAssignment_hashing', 'cellline', 'timepoint', 'HTO', 'NGS_ID', 'Biological_replicate', '10Xrun', 'sublibrary', 'gRNA_library_MOI', 'gene_symbol', 'class', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'qc_pass', 'S_score', 'G2M_score', 'phase', 'label', 'gem'
    var: 'Symbol'
    layers: 'counts'

In [7]:
## Distinct gene symbols found among the nonessential cells
ne_genes = list(NE.obs['gene_symbol'].unique())

In [8]:
## Number of distinct gene symbols among the nonessential cells
len(ne_genes)

20

In [9]:
## Per-cell metadata table for the pyGamPoi model (one row per cell)
cell_obs = data.obs
meta = pd.DataFrame({
    'label': cell_obs["label"],
    'cell': cell_obs.index,
    'total_counts': cell_obs["total_counts"],
    ## pct_counts_mt is divided by 100 to give a fraction
    'fct_counts_mt': cell_obs["pct_counts_mt"] / 100,
    'batchid': cell_obs["gem"],
    'guidename': cell_obs["DemuxAssignment_crispr"],
    'targetname': cell_obs["gene_symbol"],
})

In [10]:
## Swap every comma in guidename for a colon so the value does not confuse bash later
guide_names = meta["guidename"].str.replace(",", ":")
meta["guidename"] = guide_names


In [18]:
## Destination CSV for the per-cell metadata table
meta_file  = "/gstore/scratch/u/ghaffars/glmGamPoi/sublib1_bdev_NE/data/coldata_for_glmgampois.csv"
## Write without the index; the cell identifier is already stored in the 'cell' column
meta.to_csv(meta_file, index = False)

In [19]:
## Save row (gene-level) data: the columns of data.var
full_gene_file  = "/gstore/scratch/u/ghaffars/glmGamPoi/sublib1_bdev_NE/data/full_gene_data.csv"
## NOTE(review): index = False drops the data.var index, so only the var columns
## (e.g. 'Symbol') are written - confirm the downstream step does not need the index values
data.var.to_csv(full_gene_file, index = False)

## Step 4: Split and Save by Target

In [13]:
## Target group per cell: every nonessential-class cell is pooled under "NE",
## all other cells keep their own gene symbol
in_ne_class = data.obs["class"] == "nonessential"
data.obs['tar'] = np.where(in_ne_class, "NE", data.obs['gene_symbol'])

In [14]:
## Count the distinct target groups (all nonessential cells share the single "NE" group)
data.obs["tar"].nunique()

4956

In [15]:
## Sanity check: collect the metadata rows that ended up in the "NE" target group
A=data.obs[data.obs["tar"]=="NE"]

In [16]:
## Rows in the "NE" group whose class is not "nonessential"; an empty table means
## no cell outside the nonessential class carries a gene symbol equal to "NE"
A[A["class"]!="nonessential"]

Unnamed: 0,Sample,Barcode,DemuxType_crispr,DemuxAssignment_crispr,DemuxType_hashing,DemuxAssignment_hashing,cellline,timepoint,HTO,NGS_ID,...,pct_counts_mt,total_counts_ribo,pct_counts_ribo,qc_pass,S_score,G2M_score,phase,label,gem,tar


In [17]:
## List of targets: one entry per distinct value of data.obs["tar"]
targets = data.obs["tar"].unique()

## Folder that receives one count matrix (and its row names) per target
by_target_folder =  "/gstore/scratch/u/ghaffars/glmGamPoi/sublib1_bdev_NE/data/by_target/"

## Ensure folder exists; exist_ok avoids the separate check-then-create step
os.makedirs(by_target_folder, exist_ok=True)

## Report how many targets will be written
print("length target is " + str(len(targets)))

## Target label of every cell, looked up once instead of on every iteration
tar_labels = data.obs["tar"]

## Iterate over targets and save
for i, target in enumerate(targets):
    ## Progress marker every 100 targets
    if i % 100 == 0:
        print(i)
    ## Cells assigned to the current target
    data_target = data[tar_labels == target]
    ## Shared prefix of both output files for this target
    out_prefix = by_target_folder + "count_mat" + target
    ## Save as .mm
    ## NOTE(review): this writes data_target.X, not layers['counts'] - confirm X holds the counts the model expects
    ## NOTE(review): depending on the SciPy version, mmwrite may append '.mtx' to a name that
    ## does not already end in it - confirm the file name the downstream step reads
    mmwrite(out_prefix + ".mm", data_target.X)
    ## Save the cell identifiers, in the same order as the matrix rows
    sample_info = {'samples': data_target.obs.index}
    pd.DataFrame(sample_info).to_csv(out_prefix + "_rownames.csv", index = False)

print("filtered and saved by target")
print("all done")


      

length target is 4956
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
filtered and saved by target
all done
