## Split Plates

The objective of this script is to generate the split in the plates so that they do not have to be split every time the dataset is loaded during FCR

In [10]:
import scanpy as sc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import anndata as ad

In [11]:
for plate_number in range(12,15):
    print(f"Splitting Plate {plate_number}")
    plate_path = f"/cluster/work/bewi/data/tahoe100/h5ad/plate{plate_number}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad"
    adata = sc.read_h5ad(plate_path, backed="r")

    print("Splitting...\n")
    test_ratio = 0.2
    random_state = 42
    adata.obs["split"] = "train"
    idx_train, idx_test = train_test_split(
        adata.obs.index, test_size=test_ratio, random_state=random_state
    )
    adata.obs["split"].loc[idx_train] = "train"
    adata.obs.loc[idx_test, "split"] = "test"
    
    # Changes are saved automatically because backed='r+'
    adata.write(plate_path)

Splitting Plate 12
Splitting...



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs["split"].loc[idx_train] = "train"


Splitting Plate 13
Splitting...



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs["split"].loc[idx_train] = "train"


Splitting Plate 14
Splitting...



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs["split"].loc[idx_train] = "train"


## Check

In [12]:
adata_path = f"/cluster/work/bewi/data/tahoe100/h5ad/plate1_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad"
adata = sc.read_h5ad(adata_path, backed="r")
adata.obs

Unnamed: 0_level_0,sample,gene_count,tscp_count,mread_count,drugname_drugconc,drug,cell_line,sublibrary,BARCODE,pcnt_mito,S_score,G2M_score,phase,pass_filter,cell_name,plate,split
BARCODE_SUB_LIB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
01_001_025-lib_841,smp_1495,1676,2441,2892,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_0131,lib_841,01_001_025,0.025399,-0.066667,-0.095055,G1,full,A-172,plate1,test
01_001_026-lib_841,smp_1495,1657,2454,2925,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_0480,lib_841,01_001_026,0.042787,0.128571,0.650549,G2M,full,PANC-1,plate1,train
01_001_048-lib_841,smp_1495,1749,2521,2963,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_0293,lib_841,01_001_048,0.056724,0.242857,0.308791,G2M,full,HEC-1-A,plate1,train
01_001_076-lib_841,smp_1495,834,1038,1258,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_0397,lib_841,01_001_076,0.066474,0.009524,0.245788,G2M,full,LS 180,plate1,test
01_001_088-lib_841,smp_1495,1275,1710,2006,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_1097,lib_841,01_001_088,0.028655,-0.100000,-0.085348,G1,full,C32,plate1,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96_190_045-lib_912,smp_1590,1335,1900,2252,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0546,lib_912,96_190_045,0.058421,-0.028936,0.615753,G2M,full,SW480,plate1,train
96_190_086-lib_912,smp_1590,2676,6496,7730,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0152,lib_912,96_190_086,0.163639,-0.215881,-0.219544,G1,full,AsPC-1,plate1,train
96_191_055-lib_912,smp_1590,832,1254,1475,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0152,lib_912,96_191_055,0.066188,-0.095694,-0.076463,G1,full,AsPC-1,plate1,train
96_192_065-lib_912,smp_1590,576,792,948,"[('DMSO_TF', 0.0, 'uM')]",DMSO_TF,CVCL_0366,lib_912,96_192_065,0.036616,-0.086124,-0.086124,G1,full,SNU-423,plate1,test


In [18]:
adata[[2,1,4]].obs

Unnamed: 0_level_0,sample,gene_count,tscp_count,mread_count,drugname_drugconc,drug,cell_line,sublibrary,BARCODE,pcnt_mito,S_score,G2M_score,phase,pass_filter,cell_name,plate,split
BARCODE_SUB_LIB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
01_001_048-lib_841,smp_1495,1749,2521,2963,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_0293,lib_841,01_001_048,0.056724,0.242857,0.308791,G2M,full,HEC-1-A,plate1,train
01_001_026-lib_841,smp_1495,1657,2454,2925,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_0480,lib_841,01_001_026,0.042787,0.128571,0.650549,G2M,full,PANC-1,plate1,train
01_001_088-lib_841,smp_1495,1275,1710,2006,"[('Infigratinib', 0.05, 'uM')]",Infigratinib,CVCL_1097,lib_841,01_001_088,0.028655,-0.1,-0.085348,G1,full,C32,plate1,train
