# Determine expression relationships between constructs
Correlate post feature selection well-aggregated morphology features across the same concentrations.

## Imports

In [1]:
import pathlib
import sys

import pandas as pd

## Find the root of the git repo on the host system

In [2]:
# Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

## Define paths

### Input paths

In [3]:
# Path to correlation class
sys.path.append(
    f"{root_dir}/0.data_analysis/utils"
)

# Class for calculating correlations
from CorrelateData import CorrelateData

platedf_path = pathlib.Path(root_dir / "../nf1_cellpainting_data/3.processing_features/data/single_cell_profiles/Plate_4_bulk_camerons_method.parquet").resolve(strict=True)
platedf = pd.read_parquet(platedf_path)

print(platedf.shape)
platedf.head()

(60, 1153)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,Metadata_Concentration,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_03_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_CY5_3_01_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,B,2,B2,111,NF1,WT,1000,,0,0.0,...,-0.495849,-0.497374,-0.519248,-0.494432,0.195967,0.539823,-0.425107,0.020265,-0.314718,-0.037639
1,B,3,B3,133,NF1,WT,1000,Scramble,1,0.05,...,-0.172777,-0.316575,-0.320163,-0.23808,0.345292,0.215819,-0.260976,0.079127,-0.298335,-0.161293
2,B,4,B4,97,NF1,WT,1000,Scramble,1,0.005,...,-0.178477,-0.36632,-0.351161,-0.237826,0.176531,0.211642,-0.36644,0.046231,-0.220766,-0.1841
3,B,5,B5,124,NF1,WT,1000,,0,0.0,...,-0.17288,-0.233792,-0.229009,-0.233701,0.20331,0.026878,-0.265577,-0.019935,-0.373997,-0.267422
4,B,6,B6,102,NF1,WT,1000,Scramble,1,0.005,...,-0.156743,-0.240875,-0.439017,-0.329501,0.282654,0.331662,0.260922,0.392112,-0.301106,-0.209467


### Output paths

In [4]:
data_path = pathlib.Path("construct_correlation_data")
data_path.mkdir(parents=True, exist_ok=True)

## Label untreated cells

In [5]:
platedf["Metadata_siRNA"].fillna("No Construct", inplace=True)
platedf.dropna(inplace=True)

In [6]:
meta_cols = platedf.filter(like="Metadata").columns
feat_cols = platedf.drop(columns=meta_cols).columns

## Compute Correlations

In [7]:
# Store correlations
corrdfs = []

cp = CorrelateData()

# Include cells with no construct treatment
platedfz = platedf.loc[platedf["Metadata_Concentration"] == 0.0].copy()

# Compute correlations for each concentration
for conc, concdf in platedf.groupby("Metadata_Concentration"):

    # Include the cells not treated with a construct in the correlation comparisons
    concdf = pd.concat([
        concdf,
        platedfz.copy()
    ], axis=0)

    # Correlates all wells between the same siRNA-genotype combinations
    corrdfs.append(cp.intra_correlations(
        _df = concdf.reset_index(drop=True).copy(),
        _antehoc_group_cols = ["Metadata_siRNA", "Metadata_genotype"],
        _feat_cols = feat_cols,
        _posthoc_group_cols = ["Metadata_Well"],
    )
    )

    # Save the concentration and type of comparison
    corrdfs[-1]["Metadata_Concentration"] = conc

    # Don't compute correlations for cells not treated with a construct
    # The cells in this group is already compared to the constructs at every other concentration
    if conc == 0.0:
        continue

    # Correlates all wells between different siRNA-well combinations
    corrdfs.append(cp.inter_correlations(
        _df = platedf.reset_index(drop=True).copy(),
        _antehoc_group_cols = ["Metadata_siRNA", "Metadata_genotype"],
        _feat_cols = feat_cols,
        _posthoc_group_cols = ["Metadata_Well"],
    )
    )

    # Save the concentration
    corrdfs[-1]["Metadata_Concentration"] = conc

## Store Correlation Data

In [8]:
corrdfs = pd.concat(corrdfs, axis=0)
corrdfs.to_parquet(f"{data_path}/plate_4_sc_feature_selected_camerons_agg_well_correlations.parquet", index=False)

In [10]:
print(corrdfs.shape)
corrdfs.head()

(7369, 8)


Unnamed: 0,correlation,Metadata_Well__group0,Metadata_Well__group1,Metadata_siRNA__group0,Metadata_siRNA__group1,Metadata_genotype__group0,Metadata_genotype__group1,Metadata_Concentration
0,0.288538,C2,C5,No Construct,No Construct,Null,Null,0.0
1,0.508665,C2,C8,No Construct,No Construct,Null,Null,0.0
2,0.372519,C2,D2,No Construct,No Construct,Null,Null,0.0
3,0.4761,C2,D5,No Construct,No Construct,Null,Null,0.0
4,0.45751,C2,D8,No Construct,No Construct,Null,Null,0.0
