In [1]:
import pandas as pd
import pathlib
import numpy as np

In [2]:
# Locate the repository root: the closest directory, starting at the current
# working directory and moving upward, that contains a ".git" folder
cwd = pathlib.Path.cwd()

root_dir = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / ".git").is_dir()),
    None,
)

# Stop here when no directory on the way up holds a ".git" folder
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

## Function for pairwise correlations

In [3]:
def generate_correlations(df, feat_cols):
    """Compute pairwise Pearson correlations between the rows of ``df``.

    Every row of ``df`` is treated as one profile. Each unordered pair of
    distinct rows is reported exactly once (the upper triangle of the
    correlation matrix, diagonal excluded), together with the metadata of both
    members of the pair.

    Rows are paired by position, so the result does not depend on the index
    labels of ``df`` (duplicated or non-consecutive labels are fine).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``feat_cols`` plus the columns 'Metadata_Well',
        'Metadata_genotype' and 'Metadata_Plate'. 'Metadata_seed_density' is
        optional.
    feat_cols : list
        Labels of the feature columns used to compute the correlations.

    Returns
    -------
    pandas.DataFrame
        One row per pair with the columns 'correlation',
        'Metadata_Well__group0/1', 'Metadata_genotype__group0/1',
        'Metadata_plate__group0/1' and 'Metadata_seed_density__group0/1'.
        Pairs whose correlation is NaN are omitted.

    Raises
    ------
    KeyError
        If one of the required metadata columns is missing from ``df``.
    """
    # Positionally indexed copy: the caller's frame is left untouched and the
    # metadata lookups below cannot be confused by duplicated index labels
    df_corr = df.reset_index(drop=True)

    # Pearson correlation between all rows (transpose so rows become columns)
    corr_matrix = df_corr.loc[:, feat_cols].transpose().corr(method='pearson').to_numpy()

    # Positions of the strict upper triangle, in row-major order
    group0_pos, group1_pos = np.triu_indices(corr_matrix.shape[0], k=1)
    pair_values = corr_matrix[group0_pos, group1_pos]

    # Drop undefined correlations explicitly instead of relying on
    # DataFrame.stack() discarding NaN, which newer pandas no longer does
    has_value = ~np.isnan(pair_values)
    group0_pos = group0_pos[has_value]
    group1_pos = group1_pos[has_value]

    correlations = pd.DataFrame({'correlation': pair_values[has_value]})

    # Attach the metadata of both pair members. The output keeps the lowercase
    # 'Metadata_plate__' prefix even though the input column is 'Metadata_Plate'.
    for source_col, output_prefix in [
        ('Metadata_Well', 'Metadata_Well'),
        ('Metadata_genotype', 'Metadata_genotype'),
        ('Metadata_Plate', 'Metadata_plate'),
    ]:
        column_values = df_corr[source_col].to_numpy()
        correlations[f'{output_prefix}__group0'] = column_values[group0_pos]
        correlations[f'{output_prefix}__group1'] = column_values[group1_pos]

    # Conditionally include Metadata_seed_density
    if 'Metadata_seed_density' in df_corr.columns:
        seed_density = df_corr['Metadata_seed_density'].to_numpy()
        correlations['Metadata_seed_density__group0'] = seed_density[group0_pos]
        correlations['Metadata_seed_density__group1'] = seed_density[group1_pos]
    else:  # Default to 0 since the column has to be of the same type (can't be a str)
        correlations['Metadata_seed_density__group0'] = 0
        correlations['Metadata_seed_density__group1'] = 0

    return correlations

## Load in data and compute correlations per well

### Plate 4 only controls

In [4]:
# Read the plate 4 bulk profiles, label wells without an siRNA annotation as
# 'No Construct', and keep only the rows whose Metadata_siRNA contains that label
plate4_path = pathlib.Path(f"{root_dir}/../nf1_cellpainting_data/3.processing_features/data/single_cell_profiles/Plate_4_bulk_camerons_method.parquet")
plate4df = (
    pd.read_parquet(plate4_path)
    .assign(Metadata_siRNA=lambda profiles: profiles['Metadata_siRNA'].fillna('No Construct'))
    .loc[lambda profiles: profiles['Metadata_siRNA'].str.contains('No Construct')]
)

print(plate4df.shape)
plate4df.head()

(15, 1153)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_siRNA,Metadata_RNAiMax,Metadata_Concentration,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_03_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_CY5_3_01_256,Nuclei_Texture_SumVariance_DAPI_3_01_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,B,2,B2,111,NF1,WT,1000,No Construct,0,0.0,...,-0.495849,-0.497374,-0.519248,-0.494432,0.195967,0.539823,-0.425107,0.020265,-0.314718,-0.037639
3,B,5,B5,124,NF1,WT,1000,No Construct,0,0.0,...,-0.17288,-0.233792,-0.229009,-0.233701,0.20331,0.026878,-0.265577,-0.019935,-0.373997,-0.267422
6,B,8,B8,125,NF1,WT,1000,No Construct,0,0.0,...,-0.273476,-0.227579,-0.272331,-0.303979,0.359007,0.097057,-0.400673,0.006489,-0.27764,-0.260283
9,B,11,B11,101,NF1,WT,1000,No Construct,1,0.0,...,-0.064904,-0.131255,-0.002931,-0.031453,0.264286,0.019452,-0.242469,0.003148,-0.305344,-0.263122
10,C,2,C2,140,NF1,Null,1000,No Construct,0,0.0,...,-0.134203,0.000661,0.083006,0.069496,0.216541,-0.214588,-0.402276,-0.242664,-0.354394,-0.432532


### Plate 3

In [5]:
# Read the plate 3 bulk profiles from the sibling data repository
plate3_path = pathlib.Path(
    f"{root_dir}/../nf1_cellpainting_data/3.processing_features/data/single_cell_profiles/Plate_3_bulk_camerons_method.parquet"
)
plate3df = pd.read_parquet(path=plate3_path)

# Report the dimensions, then preview the first rows
print(plate3df.shape)
plate3df.head()

(48, 1161)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_Plate,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_03_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_CY5_3_01_256,Nuclei_Texture_SumVariance_DAPI_3_03_256,Nuclei_Texture_SumVariance_GFP_3_01_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,B,1,B1,45,NF1,WT,500,Plate_3,-0.428712,0.767925,...,-0.854267,-0.880062,-0.599407,-0.757853,0.589546,0.98551,0.466899,0.147649,0.526576,0.56245
1,B,2,B2,139,NF1,WT,1000,Plate_3,-0.394507,0.398968,...,-0.152886,-0.17863,-0.072712,-0.025244,0.313276,-0.168541,-0.344835,-0.300286,-0.34456,-0.370466
2,B,3,B3,297,NF1,WT,2000,Plate_3,-0.293458,0.344648,...,-0.040054,-0.027139,0.036813,0.169018,-0.382035,-0.266505,-0.441555,-0.777099,-0.409013,-0.377588
3,B,4,B4,559,NF1,WT,4000,Plate_3,0.145103,0.114339,...,-0.414899,-0.346908,-0.310155,-0.21136,0.256204,-0.133034,-0.358743,-0.147805,-0.378605,-0.351599
4,B,9,B9,71,NF1,Null,500,Plate_3,-0.751483,0.410983,...,-0.45506,-0.400958,-0.474513,-0.415342,0.61032,0.952098,-0.051223,0.225277,1.029452,0.239045


### Plate 3 prime

In [6]:
# Read the plate 3 prime bulk profiles and label every row with
# Metadata_Plate = 'Plate_3_prime'
plate3p_path = pathlib.Path(
    f"{root_dir}/../nf1_cellpainting_data/3.processing_features/data/single_cell_profiles/Plate_3_prime_bulk_camerons_method.parquet"
)
plate3pdf = pd.read_parquet(plate3p_path).assign(Metadata_Plate='Plate_3_prime')

# Report the dimensions, then preview the first rows
print(plate3pdf.shape)
plate3pdf.head()

(48, 1136)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_seed_density,Metadata_Plate,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_BoundingBoxArea,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_CY5_3_02_256,Nuclei_Texture_SumEntropy_DAPI_3_01_256,Nuclei_Texture_SumEntropy_RFP_3_00_256,Nuclei_Texture_SumVariance_CY5_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_03_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_03_256
0,B,1,B1,30,NF1,WT,500,Plate_3_prime,0.655545,0.543898,...,-0.041663,-0.162629,0.297921,-0.98454,0.592404,0.038935,-0.586258,0.471944,0.1346,-0.251341
1,B,2,B2,55,NF1,WT,1000,Plate_3_prime,0.357419,0.289749,...,-0.245817,-0.118157,-0.364698,-1.308534,0.359833,0.462807,-0.651352,-0.001366,-0.183358,-0.042808
2,B,3,B3,77,NF1,WT,2000,Plate_3_prime,0.405366,0.292209,...,-0.491104,-0.398379,-0.148732,0.482892,-0.152515,0.204035,-0.074894,-0.564649,-0.274598,-0.171764
3,B,4,B4,219,NF1,WT,4000,Plate_3_prime,0.03974,-0.05598,...,-0.141572,-0.292074,-0.185948,0.303884,-0.126844,-0.062962,-0.232299,-0.626833,-0.345926,-0.356443
4,B,9,B9,47,NF1,Null,500,Plate_3_prime,0.542509,0.424221,...,-0.059991,0.084494,-0.175281,-0.48408,0.400114,0.202381,-0.431911,0.556435,-0.047649,-0.21765


### Plate 5

In [7]:
# Read the plate 5 bulk profiles from the sibling data repository
plate5_path = pathlib.Path(
    f"{root_dir}/../nf1_cellpainting_data/3.processing_features/data/single_cell_profiles/Plate_5_bulk_camerons_method.parquet"
)
plate5df = pd.read_parquet(path=plate5_path)

# Report the dimensions, then preview the first rows
print(plate5df.shape)
plate5df.head()

(48, 1164)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_Well,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_Plate,Cytoplasm_AreaShape_Area,Cytoplasm_AreaShape_Compactness,Cytoplasm_AreaShape_Eccentricity,...,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256,Nuclei_Texture_SumEntropy_DAPI_3_02_256,Nuclei_Texture_SumEntropy_RFP_3_01_256,Nuclei_Texture_SumVariance_CY5_3_03_256,Nuclei_Texture_SumVariance_DAPI_3_03_256,Nuclei_Texture_SumVariance_GFP_3_03_256,Nuclei_Texture_SumVariance_RFP_3_01_256
0,B,1,B1,79,NF1,WT,Plate_5,0.083938,-0.066136,0.3485,...,-0.299631,-0.08148,-0.301456,-0.125438,0.19862,0.00016,-0.574768,-0.294731,-0.258817,-0.316784
1,B,2,B2,97,NF1,WT,Plate_5,-0.131937,-0.206127,0.316345,...,-0.345351,-0.459259,-0.293587,-0.310423,0.436946,0.261724,-0.264806,-0.064707,-0.369254,-0.236444
2,B,3,B3,79,NF1,WT,Plate_5,-0.182499,0.033284,0.217583,...,-0.109244,-0.067043,-0.219248,-0.17034,0.522507,-0.06101,-0.45781,-0.055988,-0.452981,-0.321178
3,B,4,B4,112,NF1,WT,Plate_5,0.481475,-0.001622,0.107639,...,-0.161333,-0.128519,-0.282394,-0.109331,0.389289,-0.113606,-0.558904,-0.199895,-0.5013,-0.339054
4,B,9,B9,173,NF1,Null,Plate_5,0.042459,-0.381537,0.086107,...,-0.103168,-0.022629,-0.113879,-0.098732,0.44531,-0.171365,-0.577611,-0.087306,-0.476644,-0.341957


## Concat data and generate correlations

In [8]:
# Plates to combine, in the order their rows appear in the result
dfs = [plate3df, plate4df, plate3pdf, plate5df]

# Metadata columns that are always carried over, even when a plate lacks them
metadata_columns = ['Metadata_Plate', 'Metadata_Well', 'Metadata_genotype', 'Metadata_seed_density']

# Columns shared by every plate, excluding only the four metadata columns
# listed above (any other Metadata_* column present in all plates stays in)
feature_sets = []
for df in dfs:
    feature_sets.append(set(df.columns) - set(metadata_columns))

# Sorted so the column order is the same on every run
common_feature_columns = sorted(set.intersection(*feature_sets))

# Final column layout: metadata first, then the shared columns
all_columns = [*metadata_columns, *common_feature_columns]

# Bring every plate to that layout; a metadata column a plate lacks is filled with NA
dfs_reindexed = [frame.reindex(columns=all_columns, fill_value=pd.NA) for frame in dfs]

# Stack the plates on top of each other with a fresh 0..n-1 index
result = pd.concat(dfs_reindexed, ignore_index=True)

print(result.shape)
result.head()

(159, 860)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_genotype,Metadata_seed_density,Cells_AreaShape_MedianRadius,Cells_AreaShape_Zernike_2_0,Cells_AreaShape_Zernike_3_1,Cells_AreaShape_Zernike_4_0,Cells_AreaShape_Zernike_4_2,Cells_AreaShape_Zernike_5_1,...,Nuclei_Texture_InfoMeas1_RFP_3_02_256,Nuclei_Texture_InfoMeas2_GFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_00_256,Nuclei_Texture_InfoMeas2_RFP_3_01_256,Nuclei_Texture_InfoMeas2_RFP_3_02_256,Nuclei_Texture_InfoMeas2_RFP_3_03_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_00_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_01_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_02_256,Nuclei_Texture_InverseDifferenceMoment_RFP_3_03_256
0,Plate_3,B1,WT,500,0.014679,-0.317731,-0.481998,0.159338,-0.152619,-0.088,...,-0.908469,1.033512,0.959132,1.040629,0.99025,1.022979,-0.854267,-0.880062,-0.599407,-0.757853
1,Plate_3,B2,WT,1000,0.570749,-0.361819,-0.2073,-0.140704,-0.131434,0.166185,...,0.356421,0.172682,0.068585,0.088951,0.048348,-0.055821,-0.152886,-0.17863,-0.072712,-0.025244
2,Plate_3,B3,WT,2000,0.107733,-0.055177,-0.174336,0.073085,-0.038556,-0.077802,...,0.406876,-0.223375,0.019349,-0.082521,-0.012171,-0.018876,-0.040054,-0.027139,0.036813,0.169018
3,Plate_3,B4,WT,4000,-0.346284,0.208626,0.022589,-0.240306,-0.058304,-0.079331,...,0.450849,-0.193052,-0.132589,-0.113166,0.002563,0.006693,-0.414899,-0.346908,-0.310155,-0.21136
4,Plate_3,B9,Null,500,0.436601,-0.293247,-0.0741,-0.176156,0.063355,0.116358,...,-0.621968,1.081576,0.896223,0.887652,0.857612,0.940805,-0.45506,-0.400958,-0.474513,-0.415342


In [9]:
# Feature columns are all columns whose name does not start with 'Metadata'
feat_cols = [col for col in result.columns if not col.startswith('Metadata')]

# Pairwise correlations between all rows of the concatenated plates
result_corr = generate_correlations(df=result, feat_cols=feat_cols)

# Save the pairwise correlations as a Parquet file. to_parquet does not create
# missing parent directories, so make sure the output folder exists first.
output_path = pathlib.Path('./construct_correlation_data/concatenated_all_plates_correlations.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
result_corr.to_parquet(output_path, index=False)

print(result_corr.shape)
result_corr.head()

(12561, 9)


Unnamed: 0,correlation,Metadata_Well__group0,Metadata_Well__group1,Metadata_genotype__group0,Metadata_genotype__group1,Metadata_plate__group0,Metadata_plate__group1,Metadata_seed_density__group0,Metadata_seed_density__group1
0,-0.016705,B1,B2,WT,WT,Plate_3,Plate_3,500,1000
1,-0.18332,B1,B3,WT,WT,Plate_3,Plate_3,500,2000
2,-0.432502,B1,B4,WT,WT,Plate_3,Plate_3,500,4000
3,0.53271,B1,B9,WT,Null,Plate_3,Plate_3,500,500
4,0.104033,B1,B10,WT,Null,Plate_3,Plate_3,500,1000
