# Correlate CellProfiler Aggregated Wells in Plate 5

In [1]:
import pathlib
import sys

import pandas as pd

## Find the root of the git repo on the host system

In [2]:
# Locate the repository root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a ".git" entry.
cwd = pathlib.Path.cwd()

# ".git" is a directory in an ordinary clone but a plain file ("gitdir: ...")
# in linked worktrees and submodule checkouts, so test for existence rather
# than for a directory.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").exists()
    ),
    None,
)

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

## Define paths

### Input paths

In [3]:
# Make the project's correlation utilities importable
utils_dir = f"{root_dir}/0.data_analysis/utils"
sys.path.append(utils_dir)

# Class for calculating correlations (must be imported after the sys.path change)
from CorrelateData import CorrelateData

# Load the Plate 5 profiles; resolve(strict=True) fails early if the file is missing
platedf_path = (
    root_dir
    / "nf1_painting_repo/3.processing_features/data/single_cell_profiles/Plate_5_sc_feature_selected.parquet"
).resolve(strict=True)
platedf = pd.read_parquet(platedf_path)

### Output paths

In [4]:
# Directory that receives the correlation output. It is created on demand
# (together with any missing parents) and left as-is when it already exists.
data_path = pathlib.Path(
    "plate_5_sc_feature_selected_camerons_agg_well_corr_data"
)
data_path.mkdir(exist_ok=True, parents=True)

## Drop rows with missing values

In [5]:
# Keep only the rows that have no missing value in any column
platedf = platedf.dropna()

## Aggregate cells per well with Cameron's method

In [6]:
def _first_value(values):
    """Return the first entry of a group's column."""
    return values.iloc[0]


# Columns whose name contains "Metadata" are metadata; the rest are features
meta_cols = platedf.filter(like="Metadata").columns
feat_cols = platedf.columns.drop(meta_cols)

# Features are summarised by their median ...
median_cols = dict.fromkeys(feat_cols, "median")

# ... while each metadata column keeps the value from the group's first row
meta_cols = dict.fromkeys(meta_cols, _first_value)

# Merge both mappings into a single aggregation specification
median_cols.update(meta_cols)

# Collapse the single-cell rows into one row per well
welldf = platedf.groupby("Metadata_Well").agg(median_cols)

## Compute Correlations

In [7]:
cd = CorrelateData()

# Collect the correlation frames in a list, starting with the
# correlations of aggregated wells across genotype
correlationsdf = [
    cd.inter_correlations(
        welldf.reset_index(drop=True),
        ["Metadata_Well"],
        feat_cols,
        ["Metadata_genotype"],
    )
]

In [8]:
# Correlations of aggregated wells within genotype, added to the collected frames
intra_corrdf = cd.intra_correlations(
    welldf.reset_index(drop=True),
    ["Metadata_Well"],
    feat_cols,
    ["Metadata_genotype"],
)
correlationsdf.append(intra_corrdf)

## Store Correlation Data

In [9]:
# Stack the collected correlation frames row-wise and write them to disk
corr_path = f"{data_path}/plate_5_sc_feature_selected_camerons_agg_well_corr.parquet"
correlationsdf = pd.concat(correlationsdf, axis="index")
correlationsdf.to_parquet(corr_path)

In [10]:
# Preview the first five rows of the stored correlations
correlationsdf.head(n=5)

Unnamed: 0,correlation,Metadata_genotype__group0,Metadata_genotype__group1,Metadata_Well__group0,Metadata_Well__group1
0,0.643891,HET,HET,B5,B5
1,0.636822,HET,HET,B5,B5
2,0.45699,HET,HET,B5,B5
3,0.584771,HET,HET,B5,B5
4,0.576349,HET,HET,B5,B5
