# Well-Aggregated Plate and Genotype Correlation Analysis
Correlations between groups defined by genotype and plate are determined to understand the similarities between group morphologies.
These correlations are computed between cell morphologies aggregated to the well level.

In [1]:
import pathlib
import sys

import pandas as pd

# Path to correlation class
sys.path.append(
    "../utils"
)

# Class for calculating correlations
from CorrelateData import CorrelateData

## Find the root of the git repo on the host system

In [2]:
# Get the current working directory
cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd

else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

# Inputs

In [3]:
data_path = pathlib.Path(root_dir / "nf1_painting_repo/3.processing_features/data/single_cell_profiles").resolve(strict=True)

plate3df_path = pathlib.Path(root_dir / data_path / "Plate_3_bulk_camerons_method.parquet").resolve(strict=True)
plate3pdf_path = pathlib.Path(root_dir / data_path / "Plate_3_prime_bulk_camerons_method.parquet").resolve(strict=True)
plate5df_path = pathlib.Path(root_dir / data_path / "Plate_5_bulk_camerons_method.parquet").resolve(strict=True)

plate3df = pd.read_parquet(plate3df_path)
plate3pdf = pd.read_parquet(plate3pdf_path)
plate5df = pd.read_parquet(plate5df_path)

# Outputs

In [4]:
plate_correlation_path = pathlib.Path("construct_correlation_data/well_agg_plate_genotype_correlations.parquet")
plate_correlation_path.mkdir(parents=True, exist_ok=True)

# Process Bulk Plate Data

## Combine data
Concat plate data and retain common columns.

In [5]:
plates_cols = plate3df.columns.intersection(plate3pdf.columns).intersection(plate5df.columns)
platesdf = pd.concat([plate3df, plate3pdf, plate5df], axis=0)
platesdf = platesdf[plates_cols]

In [6]:
# Morphology and metadata columns
morph_cols = [col for col in platesdf.columns if "Metadata" not in col]
meta_cols = platesdf.columns.difference(morph_cols)

# Correlate wells
Wells are correlated between plate and genotype.

In [7]:
cd = CorrelateData()
correlationsdf = []

In [8]:
cd.intra_correlations(
    _df=plate3df.loc[plate3df["Metadata_genotype"] == "WT"].copy(),
    _antehoc_group_cols=["Metadata_Plate", "Metadata_genotype"],
    _feat_cols=morph_cols,
    _posthoc_group_cols=["Metadata_Well"],
    _drop_cols=["Metadata_Well"]
)

Unnamed: 0,correlation,Metadata_Plate__group0,Metadata_Plate__group1,Metadata_genotype__group0,Metadata_genotype__group1
0,-0.026079,Plate_3,Plate_3,WT,WT
1,-0.200851,Plate_3,Plate_3,WT,WT
2,-0.427771,Plate_3,Plate_3,WT,WT
3,0.735014,Plate_3,Plate_3,WT,WT
4,-0.000172,Plate_3,Plate_3,WT,WT
...,...,...,...,...,...
271,0.495673,Plate_3,Plate_3,WT,WT
272,0.142012,Plate_3,Plate_3,WT,WT
273,0.653337,Plate_3,Plate_3,WT,WT
274,0.236784,Plate_3,Plate_3,WT,WT


## Well Correlations (same genotypes different plates)

In [9]:
for genotype in platesdf["Metadata_genotype"].unique():

    correlation_params = {
    }

    correlationsdf.append(
        cd.inter_correlations(
            _df=platesdf.loc[platesdf["Metadata_genotype"] == genotype].copy(),
            _antehoc_group_cols=["Metadata_Plate"],
            _feat_cols=morph_cols,
            _posthoc_group_cols=["Metadata_Well", "Metadata_genotype"],
            _drop_cols=["Metadata_Well"]
        )
    )

## Well Correlations (different genotypes and all possible plates)

In [10]:
correlationsdf.append(
    cd.inter_correlations(
        _df=platesdf.copy(),
        _antehoc_group_cols=["Metadata_genotype"],
        _feat_cols=morph_cols,
        _posthoc_group_cols=["Metadata_Plate", "Metadata_Well"],
        _drop_cols=["Metadata_Well"]
    )
)

## Well Correlations (same genotype and same plate)

In [11]:
correlationsdf.append(
    cd.intra_correlations(
        _df=platesdf.copy(),
        _antehoc_group_cols=["Metadata_Plate", "Metadata_genotype"],
        _feat_cols=morph_cols,
        _posthoc_group_cols=["Metadata_Well"],
        _drop_cols=["Metadata_Well"]
    )
)

# Save Plate Correlations

In [12]:
correlationsdf = pd.concat(correlationsdf, axis=0)

correlationsdf.to_parquet()

b'PAR1\x15\x04\x15\x80\x87\n\x15\x92\x87\nL\x15\xf0\xa0\x01\x15\x00\x12\x00\x00\xc0\x83\x05\xf4\xff\xffYl\xcd\xc2|1\xa5?T\xb3\xa6\x8c\xb0\xb2\xc4?\xf2+\x04\xd0\xb3\x93\xd3?\x90\x0eq=\x8e;\xca\xbf\xef\xa9\x8f\x7f\xfa\xb4\xd3?o\xc0\xaf\xf1\xe6\x17\xa0\xbf\xf2\xea|\xb4\x88H\xc5?\xc70I#\xd8\xe5\xd9\xbf\x04=\xd4+\xae\x19\xc5?\xd8\xc0\x14\xfa\xfd\x1c\xb6?m\xff\xd10a\x9d\xaf?\xb4\xd6\xff\x11R\xba\xdd\xbfo,\x0c{+"\xc2?\xf3\x08t\xfa"X\xba?\x8aO\xd1\xa5\x9c\xc4\xd0?\xc8\x943\xdf\xfb)\xdd\xbf\xf7\x10\xc9\xe6hC\xc1?\x14z\xd0\x12\xa0\x8e\xc4?\x0e\x98\xf5\xc0\x9b\xe9\xd8?HK\nn\n\xfd\xd8\xbfv\xc5\xdd\x95\xa2\x80\xad?[\x19c%\xd3\xb6\xc3?k\x89\x9d\xe3\x9e,\xc7?\xad;\x1b~\x1b\x94\xda\xbfw\xbf6e/\xb3\xd6?\xe2f\xc2K\x08\x9a\xd9?\xe19\x7f\x19|\xcd\xd3?"\xb6s\xce\xa8\x8a\xd6?v\xe4\xcbjM\x13\xda?\x03cf\t\xa5)\xdf?\xb2\x89\xd6\x90\xb9\xc2\xda?\xd5\xdb\xb3fR\xbf\xd0?\xee\xc6\xfd\xdcH\x16\xd2?@\xacG\x06\xe9\x12\xdc?F\x95\x93\x0fc\x13\xe0?\x8ec\xd6\x8f8\x0f\xc5?\x8c\xc7B\x94\x9d\x14\xd0?\xbd\x1b\x0fg\xbeQ\xda?\x

In [13]:
correlationsdf.head()

Unnamed: 0,correlation,Metadata_Plate__group0,Metadata_Plate__group1,Metadata_genotype__group0,Metadata_genotype__group1
0,0.041393,Plate_3,Plate_3_prime,WT,WT
1,0.161703,Plate_3,Plate_3_prime,WT,WT
2,0.30589,Plate_3,Plate_3_prime,WT,WT
3,-0.204942,Plate_3,Plate_3_prime,WT,WT
4,0.307921,Plate_3,Plate_3_prime,WT,WT
