In [1]:
import pathlib

import pandas as pd
import pyarrow.parquet as pq

In [2]:
# Cell type identifier; it is interpolated into the split-index and parquet
# file paths built from it in this notebook.
cell_type = "PBMC"

In [3]:
# Both inputs must already exist, so resolve each of them strictly: a missing
# or misspelled path raises FileNotFoundError here rather than later inside
# the reader call.

# tab-separated table holding the data split label for each labeled_data_index
path_to_indexes = pathlib.Path(
    f"../indexes/{cell_type}/MultiClass_MLP_data_split_indexes.tsv"
).resolve(strict=True)
# parquet file for the same cell type
file_path = pathlib.Path(
    f"../../../data/{cell_type}_preprocessed_sc_norm.parquet"
).resolve(strict=True)

In [4]:
# Load the data split table; read_table uses a tab separator by default.
indexes_df = pd.read_table(path_to_indexes)
# report (rows, columns), then preview the first records
print(indexes_df.shape)
indexes_df.head()

(8318724, 2)


Unnamed: 0,labeled_data_index,label
0,6135062,train
1,5358238,train
2,3875296,train
3,7361184,train
4,4631111,train


In [5]:
# Key the split table by labeled_data_index and order it by that key, so the
# labels can be matched to rows through the index.
indexes_df = indexes_df.set_index("labeled_data_index").sort_index()
indexes_df.head()

Unnamed: 0_level_0,label
labeled_data_index,Unnamed: 1_level_1
0,val
1,test
2,test
3,test
4,test


In [6]:
# Read only the treatment column from the parquet file instead of the whole table.
df = pd.read_parquet(
    path=file_path,
    columns=["oneb_Metadata_Treatment_Dose_Inhibitor_Dose"],
)
# report (rows, columns), then preview the first records
print(df.shape)
df.head()

(8318724, 1)


Unnamed: 0,oneb_Metadata_Treatment_Dose_Inhibitor_Dose
0,LPS_0.010_ug_per_ml_DMSO_0.025_%
1,LPS_0.010_ug_per_ml_DMSO_0.025_%
2,LPS_0.010_ug_per_ml_DMSO_0.025_%
3,LPS_0.010_ug_per_ml_DMSO_0.025_%
4,LPS_0.010_ug_per_ml_DMSO_0.025_%


In [7]:
# Attach the data split label to every row; pandas aligns the label Series
# to the dataframe on the index.
df = df.assign(data_split=indexes_df["label"])
df.head()

Unnamed: 0,oneb_Metadata_Treatment_Dose_Inhibitor_Dose,data_split
0,LPS_0.010_ug_per_ml_DMSO_0.025_%,val
1,LPS_0.010_ug_per_ml_DMSO_0.025_%,test
2,LPS_0.010_ug_per_ml_DMSO_0.025_%,test
3,LPS_0.010_ug_per_ml_DMSO_0.025_%,test
4,LPS_0.010_ug_per_ml_DMSO_0.025_%,test


In [8]:
# Count the rows for every (data split, treatment) pair.
# NOTE: df is expected to hold only the two grouping columns at this point,
# so the group size *is* the count. size() returns an unnamed Series, which
# keeps the count column labelled 0 on every pandas version (newer pandas
# releases name the value_counts() result "count" instead).
grouped_df = (
    df.groupby(["data_split", "oneb_Metadata_Treatment_Dose_Inhibitor_Dose"])
    .size()
    .to_frame()
)
# pivot the data_split index level into columns (one column per split);
# split/treatment combinations that never occur are left as NaN
grouped_df = grouped_df.unstack(level="data_split")
# sort by treatment
grouped_df = grouped_df.sort_index()
grouped_df

Unnamed: 0_level_0,0,0,0,0,0
data_split,holdout,test,train,treatment_holdout,val
oneb_Metadata_Treatment_Dose_Inhibitor_Dose,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
DMSO_0.100_%_DMSO_0.025_%,60328.0,214165.0,171333.0,,42833.0
DMSO_0.100_%_DMSO_1.000_%,59877.0,88089.0,70471.0,,17618.0
DMSO_0.100_%_Z-VAD-FMK_100.000_uM,56755.0,77058.0,61646.0,,15411.0
DMSO_0.100_%_Z-VAD-FMK_30.000_uM,62967.0,84113.0,67290.0,,16823.0
Disulfiram_0.100_uM_DMSO_0.025_%,67467.0,84953.0,67962.0,,16990.0
Disulfiram_1.000_uM_DMSO_0.025_%,55502.0,87595.0,70076.0,,17519.0
Disulfiram_2.500_uM_DMSO_0.025_%,58013.0,93462.0,74770.0,,18692.0
Flagellin_0.100_ug_per_ml_DMSO_0.025_%,53708.0,,,145024.0,
Flagellin_1.000_ug_per_ml_DMSO_0.025_%,57805.0,,,143859.0,
Flagellin_1.000_ug_per_ml_Disulfiram_1.000_uM,32379.0,,,149118.0,
