# 3. Subsetting CPJUMP1 controls 

In this notebook, we subset control samples from the CPJUMP1 CRISPR dataset using stratified sampling. We generate 10 different random seeds to create multiple subsets, each containing 15% of the original control data stratified by plate and well metadata. This approach ensures reproducible sampling while maintaining the distribution of controls across experimental conditions.

The subsampled datasets are saved as individual parquet files for downstream analysis and model training purposes.


In [1]:
import sys
import json
import pathlib
import polars as pl

sys.path.append("../../")
from utils.data_utils import split_meta_and_features

Load helper functions

In [2]:
def load_group_stratified_data(
    profiles: str | pathlib.Path | pl.DataFrame,
    group_columns: list[str] = ["Metadata_Plate", "Metadata_Well"],
    sample_percentage: float = 0.2,
    seed: int = 0
) -> pl.DataFrame:
    """Memory-efficiently sample a percentage of rows from each group in a dataset.

    This function performs stratified sampling by loading only the grouping columns first
    to dtermine group memberships and sizes, then samples indices from each group, and
    finally loads the full dataset filtered to only the sampled rows. This approach
    minimizes memory usage compared to loading the entire dataset upfront.

    Parameters
    ----------
    dataset_path : str or pathlib.Path
        Path to the parquet dataset file to sample from
    group_columns : list[str], default ["Metadata_Plate", "Metadata_Well"]
        Column names to use for grouping. Sampling will be performed independently
        within each unique combination of these columns
    sample_percentage : float, default 0.2
        Fraction of rows to sample from each group (must be between 0.0 and 1.0)

    Returns
    -------
    pl.DataFrame
        Subsampled dataframe containing the sampled rows from each group,
        preserving all original columns

    Raises
    ------
    ValueError
        If sample_percentage is not between 0 and 1
    FileNotFoundError
        If dataset_path does not exist
    """
    # validate inputs
    if not 0 <= sample_percentage <= 1:
        raise ValueError("sample_percentage must be between 0 and 1")

    # convert str types to pathlib types
    if isinstance(profiles, str):
        profiles = pathlib.Path(profiles).resolve(strict=True)

    # load only the grouping columns to determine groups
    if isinstance(profiles, pl.DataFrame):
        # if a polars DataFrame is provided, use it directly
        metadata_df = profiles.select(group_columns).with_row_index("original_idx")
    else:
        metadata_df = pl.read_parquet(profiles, columns=group_columns).with_row_index(
            "original_idx"
        )

    # sample indices for each group based on the group_columns
    sampled_indices = (
        metadata_df
        # group rows by the specified columns (e.g., Plate and Well combinations)
        .group_by(group_columns)
        # for each group, randomly sample a fraction of the original row indices
        .agg(
            pl.col("original_idx")
            .sample(fraction=sample_percentage, seed=seed)  # sample specified percentage from each group
            .alias("sampled_idx")  # rename the sampled indices column
        )
        # extract only the sampled indices column, discarding group identifiers
        .select("sampled_idx")
        # convert list of indices per group into individual rows (flatten the structure)
        .explode("sampled_idx")
        # extract the sampled indices as a single column series
        .get_column("sampled_idx")
        .sort()
    )

    # load the entire dataset and filter to sampled indices
    sampled_df = (
        profiles
        .with_row_index("idx")
        .filter(pl.col("idx").is_in(sampled_indices.implode()))
        .drop("idx")
    )

    return sampled_df

Setting input and output paths

In [3]:
# setting data path
data_dir = pathlib.Path("../0.download-data/data").resolve(strict=True)
download_module_results_dir = pathlib.Path("../0.download-data/results").resolve(
    strict=True
)

# setting directory where all the single-cell profiles are stored
profiles_dir = (data_dir / "sc-profiles").resolve(strict=True)
    
exp_metadata_path = (
    profiles_dir / "cpjump1" / "CPJUMP1-experimental-metadata.csv"
).resolve(strict=True)

# Setting feature selection path
shared_features_config_path = (
    profiles_dir / "cpjump1" / "feature_selected_sc_qc_features.json"
).resolve(strict=True)

# setting cpjump1 data dir
cpjump_crispr_data_dir = (data_dir / "sc-profiles" / "cpjump1-crispr-negcon").resolve()
cpjump_crispr_data_dir.mkdir(exist_ok=True)


# setting negative control 
negcon_data_dir = (profiles_dir / "cpjump1" / "negcon").resolve()
negcon_data_dir.mkdir(exist_ok=True)
poscon_data_dir = (profiles_dir / "cpjump1" / "poscon").resolve()
poscon_data_dir.mkdir(exist_ok=True)


Loading data

In [4]:
# Load experimental metadata
# selecting plates that pertains to the cpjump1 CRISPR dataset
exp_metadata = pl.read_csv(exp_metadata_path)
crispr_plate_names = (
    exp_metadata.select("Assay_Plate_Barcode").unique().to_series().to_list()
)
crispr_plate_paths = [
    (profiles_dir / "cpjump1" / f"{plate}_feature_selected_sc_qc.parquet").resolve(
        strict=True
    )
    for plate in crispr_plate_names
]
# Load shared features
with open(shared_features_config_path) as f:
    loaded_shared_features = json.load(f)

shared_features = loaded_shared_features["shared-features"]


In [5]:
control_df = []
for plate_path in crispr_plate_paths:
    
    # load plate data and filter to controls 
    plate_controls_df = pl.read_parquet(plate_path).filter(
        pl.col("Metadata_pert_type") == "control"
    )

    # split features
    controls_meta, _ = split_meta_and_features(plate_controls_df)

    # select metadata and shared features together
    controls_df = plate_controls_df.select(controls_meta + shared_features)

    # then append to list
    control_df.append(controls_df)

# concatenate dataframes 
controls_df = pl.concat(control_df)


In [6]:
negcon_df = controls_df.filter(pl.col("Metadata_control_type") == "negcon")
negcon_df

Metadata_broad_sample,Metadata_ImageNumber,Metadata_Plate,Metadata_Site,Metadata_Well,Metadata_TableNumber,Metadata_ObjectNumber_cytoplasm,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_Nuclei,Metadata_ObjectNumber_cells,Metadata_ObjectNumber,Metadata_gene,Metadata_pert_type,Metadata_control_type,Metadata_target_sequence,Metadata_negcon_control_type,__index_level_0__,Nuclei_Texture_InverseDifferenceMoment_ER_5_01_256,Cytoplasm_AreaShape_Zernike_4_2,Cytoplasm_AreaShape_Zernike_9_3,Nuclei_RadialDistribution_RadialCV_AGP_2of4,Nuclei_Correlation_Correlation_DNA_HighZBF,Cells_Texture_Correlation_HighZBF_5_00_256,Cells_AreaShape_Solidity,Nuclei_RadialDistribution_MeanFrac_HighZBF_4of4,Nuclei_AreaShape_Orientation,Nuclei_Texture_Correlation_ER_5_02_256,Cytoplasm_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_InfoMeas2_DNA_10_01_256,Cells_RadialDistribution_FracAtD_DNA_2of4,Cells_RadialDistribution_MeanFrac_HighZBF_1of4,Nuclei_RadialDistribution_MeanFrac_Mito_4of4,Cells_Correlation_RWC_DNA_ER,Cells_Texture_InfoMeas2_ER_3_00_256,Cells_RadialDistribution_MeanFrac_HighZBF_2of4,Cytoplasm_RadialDistribution_RadialCV_ER_3of4,Nuclei_RadialDistribution_RadialCV_HighZBF_2of4,…,Nuclei_AreaShape_MinFeretDiameter,Nuclei_AreaShape_Zernike_5_3,Nuclei_AreaShape_Zernike_4_2,Nuclei_RadialDistribution_MeanFrac_HighZBF_3of4,Cytoplasm_Granularity_1_Brightfield,Nuclei_Correlation_Correlation_ER_Mito,Nuclei_AreaShape_Zernike_6_0,Cytoplasm_AreaShape_Solidity,Nuclei_RadialDistribution_FracAtD_DNA_3of4,Nuclei_AreaShape_Zernike_8_4,Nuclei_Intensity_MassDisplacement_HighZBF,Cytoplasm_Texture_Correlation_LowZBF_3_03_256,Cells_RadialDistribution_RadialCV_Mito_1of4,Nuclei_Intensity_MassDisplacement_AGP,Nuclei_Correlation_Correlation_AGP_Mito,Nuclei_RadialDistribution_MeanFrac_HighZBF_1of4,Cells_RadialDistribution_RadialCV_DNA_2of4,Nuclei_RadialDistribution_RadialCV_RNA_3of4,Cells_AreaShape_Zernike_7_3,Nuclei_RadialDistribution_MeanFrac_ER_1of4,Cytoplasm_Correlation_Overlap_ER_RNA,Cells_Texture_Correlation_LowZBF_5_01_256,Cytoplasm_Texture_Correlation_HighZBF_3_01_256,Nuclei_Texture_Correlation_LowZBF_3_02_256,Cells_RadialDistribution_RadialCV_AGP_3of4,Nuclei_RadialDistribution_MeanFrac_AGP_1of4,Cells_RadialDistribution_MeanFrac_Brightfield_1of4,Cytoplasm_Correlation_Correlation_DNA_HighZBF,Nuclei_Intensity_MassDisplacement_DNA,Cytoplasm_RadialDistribution_MeanFrac_DNA_4of4,Cells_Correlation_Correlation_AGP_DNA,Cytoplasm_Texture_InfoMeas2_RNA_3_01_256,Cells_RadialDistribution_RadialCV_DNA_4of4,Nuclei_Correlation_Correlation_DNA_LowZBF,Cytoplasm_Texture_Correlation_LowZBF_5_00_256,Cytoplasm_RadialDistribution_RadialCV_HighZBF_2of4,Nuclei_RadialDistribution_MeanFrac_Brightfield_1of4
str,i64,str,i64,str,str,i64,f64,f64,i64,i64,str,str,str,str,str,i64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
"""BRDN0001147100""",109,"""BR00118047""",1,"""A13""","""308628357980232446257703298793…",1,1.0,1.0,1,1,,"""control""","""negcon""","""ACAGCGCTCTCGTGTACTAT""","""NO_SITE (5 zeros)""",7010,-1.028713,1.059229,0.115462,0.473156,1.324485,-0.084368,0.71923,-0.81904,1.265652,0.042607,0.541244,1.751658,-1.741249,1.467187,-0.309894,0.577766,0.859873,0.780703,-1.449886,-0.195239,…,0.208276,0.865554,-0.603492,1.166788,-0.990647,0.781832,-1.52143,1.362438,0.532911,0.120666,-0.12049,0.124058,1.330898,-0.068765,1.012425,-1.437147,-2.012827,-0.017953,-0.365368,0.513308,-0.541348,0.344488,0.414022,-0.256932,-1.467478,-0.275899,0.554505,0.40101,-0.714905,-0.055816,0.79722,1.111975,-0.111111,-1.704028,-0.116456,1.41343,-0.510386
"""BRDN0001147100""",109,"""BR00118047""",1,"""A13""","""308628357980232446257703298793…",2,2.0,2.0,2,2,,"""control""","""negcon""","""ACAGCGCTCTCGTGTACTAT""","""NO_SITE (5 zeros)""",7011,0.122049,0.004791,-0.857309,0.038176,-0.078726,-0.079663,0.17033,0.967084,1.055542,0.686653,-0.291474,-0.070826,-0.156032,0.269196,0.217574,0.521351,0.71286,0.264704,0.20849,1.046584,…,-0.494419,-0.712622,-0.373149,-0.305804,-0.191004,0.618058,0.97991,0.643491,-0.105978,0.734167,-0.689359,-0.330821,-0.467458,-0.354211,0.688672,1.372547,0.891099,0.978844,-1.100139,-0.91033,0.072436,-0.275345,1.882924,0.087411,0.508982,-0.439045,0.008573,0.585336,-0.823073,-0.250795,-0.142025,1.011612,0.861527,-0.863903,-0.447796,-0.354031,0.312585
"""BRDN0001147100""",109,"""BR00118047""",1,"""A13""","""308628357980232446257703298793…",3,3.0,3.0,3,3,,"""control""","""negcon""","""ACAGCGCTCTCGTGTACTAT""","""NO_SITE (5 zeros)""",7012,0.370399,-0.262972,-1.404616,-0.693991,0.502991,0.618994,-0.066264,0.098727,1.524291,-1.025337,0.297346,0.339764,0.420589,0.519443,-0.287341,0.159021,0.614568,0.398127,1.703434,-0.608472,…,-0.434808,0.72182,-0.773704,-0.517222,-0.026364,0.007683,-0.784433,0.630488,0.626985,-0.974103,0.125999,0.111416,1.104746,-0.371487,-1.39123,-0.90087,1.074054,-0.850756,-1.633757,1.140424,-0.000907,-0.535922,-0.056651,-0.43723,-0.377323,1.054326,-0.015501,-0.080123,-0.539807,-0.26611,1.090343,0.151405,-0.609261,-0.735018,-0.15961,0.330829,-0.514569
"""BRDN0001147100""",109,"""BR00118047""",1,"""A13""","""308628357980232446257703298793…",4,4.0,4.0,4,4,,"""control""","""negcon""","""ACAGCGCTCTCGTGTACTAT""","""NO_SITE (5 zeros)""",7013,-0.787941,-0.374708,0.78542,-0.419399,-0.267538,-0.381069,1.084841,0.33705,1.199166,0.54354,-1.196335,-0.198555,1.621134,-0.343463,-0.217215,0.258121,0.864777,-0.369991,-1.101691,-0.227312,…,0.632325,-0.417053,-1.454626,-0.310669,-0.304528,0.087402,0.437442,1.017849,0.961979,-0.784957,0.182204,0.043337,-0.386272,-0.386587,-0.058351,-0.600421,-0.361494,0.086438,0.527366,-0.233363,0.012496,-0.111335,-0.311494,-0.374822,-0.709454,1.474652,-0.065686,0.385537,-0.705235,-0.634491,0.101058,0.561559,-1.088893,0.515875,0.273117,0.459208,-0.088421
"""BRDN0001147100""",109,"""BR00118047""",1,"""A13""","""308628357980232446257703298793…",5,5.0,5.0,5,5,,"""control""","""negcon""","""ACAGCGCTCTCGTGTACTAT""","""NO_SITE (5 zeros)""",7014,-0.42181,-0.080816,2.363815,0.220237,-0.272926,-0.324976,-0.597176,0.279955,0.70838,0.387093,0.468885,-0.135322,0.864667,0.116455,-0.335405,0.231344,-1.679674,-1.150475,-0.361051,0.137044,…,0.230192,-0.72051,-0.301345,0.562988,-0.249575,0.674224,2.867441,-0.959345,-0.268888,-0.757371,-0.029052,-0.882626,-0.531049,-1.307925,0.457686,0.987683,-1.385461,-0.58981,0.040728,-0.037037,-2.149111,-0.479564,1.679714,0.407997,-1.281839,-0.073656,-0.153475,-0.337976,-0.755771,0.203726,1.045206,-0.463189,-0.398109,-0.935839,-0.754607,0.125589,-0.289761
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BRDN0001146476""",3348,"""BR00118043""",9,"""P12""","""119960994267979118783648899547…",129,129.0,129.0,129,129,,"""control""","""negcon""","""ACTAGCCTGTTCGCGAGTAG""","""NO_SITE (5 zeros)""",387345,-0.262129,-0.451563,1.08365,-0.565492,-1.377005,1.032865,0.917688,1.435191,-0.724395,-0.10208,-2.342868,-1.672678,0.069266,-0.514682,0.426476,0.916992,0.383645,0.030663,1.310647,0.233206,…,-0.315055,-0.966533,-1.484386,-0.332623,-1.991891,-0.966093,0.854135,1.092622,1.67505,0.101743,0.682051,1.732903,1.658742,-0.342558,-0.857745,-1.493278,2.048434,-0.546937,0.563514,-0.797819,-0.165102,0.24081,0.652409,0.935035,0.961801,1.385283,-0.730183,-1.097008,-0.058342,0.023226,-0.051859,0.304116,-0.882071,1.085634,1.677004,-0.305542,0.938798
"""BRDN0001146476""",3348,"""BR00118043""",9,"""P12""","""119960994267979118783648899547…",130,130.0,130.0,130,130,,"""control""","""negcon""","""ACTAGCCTGTTCGCGAGTAG""","""NO_SITE (5 zeros)""",387346,-0.109972,-0.19425,-0.317486,2.963791,-2.336638,0.948756,0.861868,1.636599,-1.328884,0.288599,-1.774356,-0.683632,-1.983382,0.807104,-1.039564,-0.404527,-0.776086,1.507437,-2.858381,-0.458741,…,-0.630868,0.274432,-1.190532,-0.699095,-1.330493,0.634812,1.268949,1.523775,0.150757,0.191521,0.251803,0.379428,-0.661837,-0.99228,0.113257,0.508658,-1.787135,-0.199529,-0.56785,0.601707,0.21728,0.737103,0.975092,-0.467949,0.474577,3.040004,-0.158844,0.663654,-1.193295,0.462751,1.726538,-0.804577,0.33413,1.068649,1.244595,0.024562,-0.273189
"""BRDN0001146476""",3348,"""BR00118043""",9,"""P12""","""119960994267979118783648899547…",131,131.0,131.0,131,131,,"""control""","""negcon""","""ACTAGCCTGTTCGCGAGTAG""","""NO_SITE (5 zeros)""",387347,2.224269,1.624416,-0.059406,-1.096799,0.290633,0.354355,0.58592,-0.388792,-1.734747,0.369879,0.89592,-1.229484,-0.254668,0.834035,-0.797126,-0.390029,0.896478,0.624295,-2.139969,-1.546824,…,0.12378,-1.113251,-1.336006,0.197241,-0.739147,0.618262,1.521116,0.810462,-0.811166,-0.878595,-0.516524,-1.129364,1.057978,-0.973787,-0.247707,0.433158,0.588788,-1.622477,-1.437321,1.6247,-0.756856,-0.452766,-0.077749,-0.363133,0.854473,0.944136,-0.382993,0.059144,0.01668,0.187787,-1.788669,0.270751,-0.527319,-0.466399,0.25835,-0.921918,-0.395227
"""BRDN0001146476""",3348,"""BR00118043""",9,"""P12""","""119960994267979118783648899547…",132,132.0,132.0,132,132,,"""control""","""negcon""","""ACTAGCCTGTTCGCGAGTAG""","""NO_SITE (5 zeros)""",387348,0.534403,1.721981,-0.474416,-0.101497,-0.185324,-0.776172,0.805823,0.11892,-0.412498,1.22031,0.231701,-0.736331,0.768541,0.502666,0.276547,-0.068513,0.152254,0.216654,-0.083206,-0.22703,…,1.177736,0.58795,-1.435349,0.191335,-0.458654,-0.262132,0.203373,0.141632,0.072797,-0.281663,0.203744,-0.069872,1.110613,-1.637295,-2.043472,0.980754,0.071489,-1.606803,-1.762426,0.788461,-1.689507,-0.747968,-0.024951,0.813029,-0.276101,0.278273,-0.457901,0.181623,-0.473566,0.709965,0.530347,0.437383,-0.751677,-0.731785,0.079244,-0.518756,-0.472694


generating 10 seeds of randomly sampled negative controls

In [7]:
for seed_val in range(10):

    # load the dataset with group stratified sub sampling
    subsampled_df = load_group_stratified_data(
        profiles=negcon_df,
        group_columns=["Metadata_Plate", "Metadata_Well"],
        sample_percentage=0.15,
        seed=seed_val,
    )

    # save the file
    subsampled_df.write_parquet(
        negcon_data_dir / f"cpjump1_crispr_negcon_seed{seed_val}.parquet"
    )


Selecting only positive controls and saving it 

In [None]:
# write as parquet file
poscon_cp_df = controls_df.filter((pl.col("Metadata_control_type") == "poscon_cp")).select("Metadata_gene")
poscon_cp_df.write_parquet(poscon_data_dir / "poscon_cp_df.parquet")
