# Well-Aggregated Plate and Genotype Correlation Analysis
Correlations between groups defined by genotype and plate are determined to understand the similarities between group morphologies.
There are two genotypes {WT, Null}, and three plates {Plate 3, Plate 3 prime, Plate 5} explored in this correlation analysis.
These correlations are computed between cell morphologies aggregated to the well level after feature selection.

In [1]:
import pathlib
import sys

import pandas as pd

# Path to correlation class
sys.path.append(
    "../utils"
)

# Class for calculating correlations
from CorrelateData import CorrelateData

## Find the root of the git repo on the host system

In [2]:
# Start the search from the directory the notebook is executed in
cwd = pathlib.Path.cwd()

# Check the working directory first, then each ancestor in turn, and keep the
# first one that contains a ".git" directory (None when there is no match)
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Stop early when the notebook is not run from inside a Git repository
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# Inputs

In [3]:
# Directory holding the processed plate profiles; strict resolution raises
# immediately if the directory does not exist
data_path = (
    root_dir / "nf1_painting_repo/3.processing_features/data/single_cell_profiles"
).resolve(strict=True)

# data_path is already absolute, so each parquet file is located relative to it
plate3df_path = (data_path / "Plate_3_bulk_camerons_method.parquet").resolve(strict=True)
plate3pdf_path = (data_path / "Plate_3_prime_bulk_camerons_method.parquet").resolve(strict=True)
plate5df_path = (data_path / "Plate_5_bulk_camerons_method.parquet").resolve(strict=True)

# Load one dataframe per plate
plate3df = pd.read_parquet(plate3df_path)
plate3pdf = pd.read_parquet(plate3pdf_path)
plate5df = pd.read_parquet(plate5df_path)

# Outputs

In [4]:
# Output directory for the correlation data, relative to the working directory
plate_correlation_path = pathlib.Path("construct_correlation_data")

# Create the directory (and any missing parents); a pre-existing directory is fine
plate_correlation_path.mkdir(exist_ok=True, parents=True)

# Process Bulk Plate Data

## Combine data
Concatenate the plate data, retaining only the columns that are common to all plates.

In [5]:
# All plates that take part in the analysis, in concatenation order
plate_frames = [plate3df, plate3pdf, plate5df]

# Narrow the first plate's columns down to those shared by every plate
plates_cols = plate_frames[0].columns
for frame in plate_frames[1:]:
    plates_cols = plates_cols.intersection(frame.columns)

# Keep only the shared columns of each plate and stack the plates row-wise
platesdf = pd.concat([frame[plates_cols] for frame in plate_frames], axis=0)

In [6]:
# Preview the first rows of the combined plate data
platesdf.head()

Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Plate,Cytoplasm_AreaShape_Eccentricity,Cytoplasm_AreaShape_FormFactor,Cytoplasm_AreaShape_MajorAxisLength,...,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_GFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_03_256
0,B,1,B1,45,NF1,WT,Plate_3,0.767925,0.15051,0.53336,...,0.959132,1.040629,0.99025,1.022979,-1.190657,-0.854267,-0.880062,-0.599407,-0.757853,0.147649
1,B,2,B2,139,NF1,WT,Plate_3,0.398968,0.094713,0.556944,...,0.068585,0.088951,0.048348,-0.055821,0.269382,-0.152886,-0.17863,-0.072712,-0.025244,-0.300286
2,B,3,B3,297,NF1,WT,Plate_3,0.344648,-0.057618,0.207649,...,0.019349,-0.082521,-0.012171,-0.018876,0.665645,-0.040054,-0.027139,0.036813,0.169018,-0.777099
3,B,4,B4,559,NF1,WT,Plate_3,0.114339,-0.558137,-0.233815,...,-0.132589,-0.113166,0.002563,0.006693,0.391801,-0.414899,-0.346908,-0.310155,-0.21136,-0.147805
4,B,9,B9,71,NF1,Null,Plate_3,0.410983,0.819365,0.228507,...,0.896223,0.887652,0.857612,0.940805,-1.179173,-0.45506,-0.400958,-0.474513,-0.415342,0.225277


In [7]:
# Split the columns by name: anything containing "Metadata" is metadata,
# every other column is treated as a morphology feature
is_metadata = platesdf.columns.str.contains("Metadata", regex=False)
morph_cols = platesdf.columns[~is_metadata].tolist()
meta_cols = platesdf.columns.difference(morph_cols)

# Correlate wells
Wells are correlated with one another across plates and genotypes.

In [8]:
# Collects one correlation dataframe per comparison; concatenated before saving
correlationsdf = []

# Helper object that provides the inter- and intra-group correlation methods
cd = CorrelateData()

## Well Correlations (same genotypes and different plates)

In [9]:
# Correlate wells that share a genotype but come from different plates.
# Each genotype is handled separately, in order of first appearance.
for genotype in platesdf["Metadata_genotype"].unique():

    # Work on an independent copy restricted to the current genotype, so that
    # the plate is the only grouping that differs between compared wells
    genotype_df = platesdf.loc[platesdf["Metadata_genotype"] == genotype].copy()

    # Plates define the groups being compared; wells are used for post hoc
    # grouping and then dropped from the returned correlation data
    correlationsdf.append(
        cd.inter_correlations(
            _df=genotype_df,
            _antehoc_group_cols=["Metadata_Plate"],
            _feat_cols=morph_cols,
            _posthoc_group_cols=["Metadata_Well", "Metadata_genotype"],
            _drop_cols=["Metadata_Well"],
        )
    )

## Well Correlations (different genotypes and all possible plates)
Well correlations are computed between different genotypes, regardless of which plate each well comes from.

In [10]:
# Compare wells across genotypes: genotype defines the groups, while plate and
# well are used for post hoc grouping (the well column is dropped afterwards)
cross_genotype_correlations = cd.inter_correlations(
    _df=platesdf.copy(),
    _antehoc_group_cols=["Metadata_genotype"],
    _feat_cols=morph_cols,
    _posthoc_group_cols=["Metadata_Plate", "Metadata_Well"],
    _drop_cols=["Metadata_Well"],
)
correlationsdf.append(cross_genotype_correlations)

## Well Correlations (same genotype and same plate)

In [11]:
# Compare wells within each (plate, genotype) group; wells are used for post hoc
# grouping and the well column is dropped afterwards
within_group_correlations = cd.intra_correlations(
    _df=platesdf.copy(),
    _antehoc_group_cols=["Metadata_Plate", "Metadata_genotype"],
    _feat_cols=morph_cols,
    _posthoc_group_cols=["Metadata_Well"],
    _drop_cols=["Metadata_Well"],
)
correlationsdf.append(within_group_correlations)

# Save Plate Correlations

In [12]:
# Destination file inside the output directory
correlations_file = plate_correlation_path / "well_agg_plate_genotype_correlations.parquet"

# Stack the collected correlation dataframes row-wise. The name is rebound from
# a list to a single dataframe, so this step cannot be repeated without first
# rebuilding the list.
correlationsdf = pd.concat(correlationsdf, axis=0)
correlationsdf.to_parquet(correlations_file)

In [13]:
# Preview the first rows of the combined correlation data
correlationsdf.head()

Unnamed: 0,correlation,Metadata_Plate__group0,Metadata_Plate__group1,Metadata_genotype__group0,Metadata_genotype__group1
0,0.041393,Plate_3,Plate_3_prime,WT,WT
1,0.161703,Plate_3,Plate_3_prime,WT,WT
2,0.30589,Plate_3,Plate_3_prime,WT,WT
3,-0.204942,Plate_3,Plate_3_prime,WT,WT
4,0.307921,Plate_3,Plate_3_prime,WT,WT
