# Aggregate feature selected profiles

## Import libraries

In [1]:
import pathlib

import pandas as pd
from pycytominer import aggregate

## Set paths and variables

In [2]:
# set paths
input_profile_dir = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_norm_fs.parquet"
).resolve(strict=True)
output_profile_dir = pathlib.Path(
    "../data/CP_scDINO_features/combined_CP_scDINO_norm_fs_aggregated.parquet"
).resolve()
fs_df = pd.read_parquet(input_profile_dir)
fs_df.head()

Unnamed: 0,Metadata_plate,Metadata_Well,Metadata_number_of_singlecells,Metadata_compound,Metadata_dose,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Time,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,1.0,...,0.45565,0.394903,1.649449,0.01033,0.426311,-0.33932,1.586745,-0.362606,0.81121,1.215151
1,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,15.0,...,1.055771,0.704042,-1.417577,0.262575,1.097736,-1.700848,0.563939,0.238832,0.71739,0.768155
2,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,16.0,...,-0.834577,0.310103,-0.678053,1.008923,-0.144754,-1.514908,-0.6095,0.447918,1.357684,1.000216
3,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,26.0,...,-1.519428,1.510967,-0.939257,-0.584025,2.944528,-0.227416,0.076889,-1.431668,-0.09765,0.970673
4,1,C-10,150,Staurosporine,78.13,test,11,3,10.0,37.0,...,-1.1729,0.214924,2.372842,-0.591197,-0.3042,0.505422,-0.360512,1.295638,1.948911,0.107559


## Perform aggregation

In [3]:
metadata_cols = fs_df.columns[fs_df.columns.str.contains("Metadata")].to_list()
feature_cols = fs_df.columns[~fs_df.columns.str.contains("Metadata")].to_list()

aggregated_df = aggregate(
    fs_df,
    features=feature_cols,
    strata=["Metadata_Well", "Metadata_Time", "Metadata_dose"],
    operation="median",
)
aggregated_df = pd.merge(
    aggregated_df,
    fs_df[metadata_cols],
    how="left",
    on=["Metadata_Well", "Metadata_Time", "Metadata_dose"],
)
# rearrange the columns such that the metadata columns are first
for col in reversed(aggregated_df.columns):
    if col.startswith("Metadata_"):
        tmp_pop = aggregated_df.pop(col)
        aggregated_df.insert(0, col, tmp_pop)

print(aggregated_df.shape)
aggregated_df.to_parquet(output_profile_dir)
aggregated_df.head()

(142040, 2425)


Unnamed: 0,Metadata_Well,Metadata_Time,Metadata_dose,Metadata_plate,Metadata_number_of_singlecells,Metadata_compound,Metadata_control,Metadata_ImageNumber,Metadata_FOV,Metadata_Cells_Number_Object_Number,...,channel_DNA_cls_feature_91_scDINO,channel_DNA_cls_feature_92_scDINO,channel_DNA_cls_feature_93_scDINO,channel_DNA_cls_feature_94_scDINO,channel_DNA_cls_feature_95_scDINO,channel_DNA_cls_feature_96_scDINO,channel_DNA_cls_feature_97_scDINO,channel_DNA_cls_feature_98_scDINO,channel_DNA_cls_feature_99_scDINO,channel_DNA_cls_feature_9_scDINO
0,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,18.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
1,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,21.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
2,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,27.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
3,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,38.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
4,C-02,0.0,0.0,1,192,Staurosporine,negative,1,3,41.0,...,0.229693,-0.149462,0.096256,-0.093128,0.044964,0.164799,0.055467,0.005083,-0.242997,0.102387
