In [41]:
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)

import scanpy as sc
import numpy as np
from scipy import sparse
import random
import numpy as np
import scipy.sparse as sp

## Create new dataset

In [42]:
# PARAMETERS FOR THE SUBSET
data_path = "/cluster/work/bewi/data/tahoe100/h5ad/controls_Trametinib_mergedtahoe100_hvg.h5ad"
adata = sc.read_h5ad(data_path)
export_path = "/cluster/work/bewi/members/rquiles/experiments/datasets/test_3cell_lines.h5ad"

change_col_names = {"drug":"Agg_Treatment", "cell_line": "covariates"}
control_name = "DMSO_TF"
cell_lines_keep = np.unique(adata.obs["cell_line"])[:3]
treatments_keep = np.unique(adata.obs["drugname_drugconc"])[[0,1,2]]

In [43]:
## UPDATE COLUMNS NEW DATASET
adata.obs = adata.obs.rename(columns=change_col_names)
adata.obs["control"] = adata.obs[change_col_names["drug"]] == control_name
adata.obs["control"] = adata.obs["control"].astype(int)
adata.uns["fields"] = []
adata.obs["dose"] = adata.obs["drugname_drugconc"].str.split(",").str[1].astype(float)

In [44]:
## SUBSET THE DATASET
idx=[] #List of row indexes to mantain in the AnnData
keep_rows = []

keep_rows = []
for cell_line in cell_lines_keep:
    for treatment in treatments_keep:
        mask = (adata.obs["covariates"] == cell_line) & (adata.obs["drugname_drugconc"] == treatment)
        row_indexes = adata.obs[mask].index
        keep_rows.extend(row_indexes)
        
# Randomize row order
random.shuffle(keep_rows)

#Subset the AnnData
filtered_adata = adata[keep_rows, :]

In [45]:
## PREPROCESS AND EXPORT
sc.pp.normalize_total(filtered_adata, target_sum=1e4)
sc.pp.log1p(filtered_adata)
sc.pp.highly_variable_genes(filtered_adata, n_top_genes=2000, subset=True)
sc.pp.scale(filtered_adata, max_value=10)
filtered_adata.write(export_path)

  view_to_actual(adata)


## Test

In [40]:
test = sc.read_h5ad(export_path)
test.obs

Unnamed: 0_level_0,sample,gene_count,tscp_count,mread_count,drugname_drugconc,Agg_Treatment,covariates,sublibrary,BARCODE,pcnt_mito,S_score,G2M_score,phase,pass_filter,cell_name,plate,source_plate,n_genes,control,dose
BARCODE_SUB_LIB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
95_165_144-lib_895,smp_1589,1551,2411,2992,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0069,lib_895,95_165_144,0.115305,-0.014286,-0.052885,G1,full,SK-MEL-2,plate1,plate1,1551,1,0.0
96_090_113-lib_1205,smp_1974,922,1251,1461,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0023,lib_1205,96_090_113,0.071143,-0.047961,-0.013736,G1,full,A549,plate5,plate5,922,1,0.0
95_046_043-lib_1330,smp_2069,1297,1865,2185,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0069,lib_1330,95_046_043,0.093298,-0.157895,-0.118681,G1,full,SK-MEL-2,plate6,plate6,1297,1,0.0
95_045_037-lib_1508,smp_2261,936,1173,1372,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0028,lib_1508,95_045_037,0.046036,-0.061905,0.140476,G2M,full,AN3 CA,plate8,plate8,936,1,0.0
26_079_034-lib_1506,smp_2192,1016,1322,1528,"[('Trametinib (DMSO_TF solvate)', 0.5, 'uM')]",Trametinib (DMSO_TF solvate),CVCL_0023,lib_1506,26_079_034,0.043116,-0.200957,-0.200000,G1,full,A549,plate8,plate8,1016,0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96_131_081-lib_1311,smp_2070,2942,6012,6869,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0023,lib_1311,96_131_081,0.055722,-0.092949,1.050549,G2M,full,A549,plate6,plate6,2942,1,0.0
95_090_127-lib_2383,smp_2261,1300,1904,2261,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0023,lib_2383,95_090_127,0.098214,-0.020375,-0.562529,G1,full,A549,plate8,plate8,1300,1,0.0
96_013_173-lib_1558,smp_2262,803,1095,1322,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0023,lib_1558,96_013_173,0.113242,-0.235019,-0.333150,G1,full,A549,plate8,plate8,803,1,0.0
96_071_170-lib_2304,smp_2070,1093,1611,1806,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0023,lib_2304,96_071_170,0.070763,-0.028571,0.188462,G2M,full,A549,plate6,plate6,1093,1,0.0
