# <font color="red"> CellProfiler: profiling of P-Bodies (DCP1A, LSM14A) Sorbitol Experiment across all batches </font>
 
JIRA task: NN-151

Why Linear Mixed-Effects Models (LMMs)?
Your experimental structure involves:

- Two groups: WT_Untreated (reference) vs WT_stress (sorbitol)

- 3 batches per group (batch1, batch2, batch3)

- Measurements per site image (the dependent variable), 50–250 site images per batch (i.e., image-level measurements)


- Random variation across batches, modelled as a random intercept per batch (i.e., inter-batch variation)

This design includes both fixed effects (groups) and random effects (batches).
Goal: estimate how WT_stress affects each CellProfiler feature relative to WT_Untreated, while accounting for batch effects (inter-batch variation).

In [1]:
from pathlib import Path
import glob
import sys
import os


# Point the NOVA tooling at the shared project checkout.
os.environ['NOVA_HOME'] = '/home/projects/hornsteinlab/Collaboration/NOVA/'
# NOVA_HOME ends with a slash, so plain string formatting produced ".../NOVA//input";
# os.path.join yields the normalised ".../NOVA/input".
os.environ['NOVA_DATA_HOME'] = os.path.join(os.environ['NOVA_HOME'], 'input')
print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
# Make the project packages (cell_profiler, manuscript, ...) importable by the imports that follow.
sys.path.insert(1, os.getenv('NOVA_HOME'))


import numpy as np
import pandas as pd
import seaborn as sns
from markdown import markdown
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from cell_profiler.code.cp_effect_size_utils import CP_OUTPUTS_FOLDER, validate_cp_files, extract_path_parts, merge_on_group, collect_cp_results_by_cell_line, load_cp_results, get_features_per_image, get_aggregated_features_per_image, collect_all_features, run_analysis_generate_report, print_mixedlm_conclusions, plot_cp_feature_grouped_by_gene, measures_to_plot
from manuscript.plot_config import PlotConfig

%load_ext autoreload    
%autoreload 2

NOVA_HOME is at /home/projects/hornsteinlab/Collaboration/NOVA/


# New Sorbitol Experiment dataset - DCP1A

In [2]:
# --- DCP1A analysis configuration ---
# Sub-folder (relative to CP_OUTPUTS_FOLDER) that holds the CellProfiler results for this marker
ANALYSIS_TYPE = 'PB_profiling/Sorbitol_DCP1A'

BATCHES = ['batch1', 'batch2', 'batch3']

# Save figures here
save_path = '/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen/figures/Sorbitol/cell_profiler/PB_profiling/Sorbitol_DCP1A'

# Display alias and colour for every "<cell_line>_<condition>" group
_color_mapping = dict(
    WT_Untreated=dict(alias='Wild-Type', color='#37AFD7'),
    WT_stress=dict(alias='Stress', color='#11AFD7'),
)


In [3]:
# Key columns handed to collect_all_features() together with the loaded CP data
group_by_columns = [
    'ImageNumber',
    'batch',
    'rep',
    'cell_line',
    'condition',
]

# CellProfiler output tables handed to load_cp_results()
REQUIRED_FILES = [
    'Image.csv',
    'Pbodies.csv',
    'Cytoplasm.csv',
]


In [4]:

# Sanity check of the CP outputs: for every marker folder, print which
# batch / cell line / condition / replicate the analysed images were read from.
# This exposes runs where CellProfiler wrote its results into the wrong folder.
pattern = os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, '*', '*', '*', '*', '*', '*')
for marker_path in glob.glob(pattern):
    # Anything that is not a directory cannot be a marker folder
    if not os.path.isdir(marker_path):
        print(f"Not a marker folder directory:{marker_path}")
        continue

    try:
        image_df = pd.read_csv(marker_path +'/Image.csv')

        # Path components are parsed from the DAPI image path
        # (use 'PathName_nucleus' instead for pipelines that name the channel that way)
        parts_df = image_df['PathName_DAPI'].apply(extract_path_parts)
        print(marker_path, parts_df['batch'].unique(), parts_df['cell_line'].unique(), parts_df['condition'].unique(), parts_df['rep'].unique(), )

        # Marker = last path component; cell line = folder four levels above it
        marker = os.path.basename(marker_path)
        cell_line = Path(marker_path).resolve().parents[3].name
    except FileNotFoundError as e:
        # A folder without Image.csv is reported but does not stop the scan
        print("!!!!")
        print(e)


/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_DCP1A/batch3/WT/panelA/stress/rep3/DCP1A ['batch3'] ['WT'] ['stress'] ['rep3']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_DCP1A/batch3/WT/panelA/stress/rep4/DCP1A ['batch3'] ['WT'] ['stress'] ['rep4']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_DCP1A/batch3/WT/panelA/stress/rep1/DCP1A ['batch3'] ['WT'] ['stress'] ['rep1']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_DCP1A/batch3/WT/panelA/stress/rep2/DCP1A ['batch3'] ['WT'] ['stress'] ['rep2']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_DCP1A/batch3/WT/panelA/Untreated/rep3/DCP1A 

## Collect CP files by "cell_line+condition" and Load CP data

In [5]:
# Collect paths of CP output files for the current ANALYSIS_TYPE.
# NOTE(review): include_condition=True presumably keys the result by "<cell_line>_<condition>"
# (the loader below reports "WT_stress" / "WT_Untreated") — confirm in cp_effect_size_utils.
paths_by_cell_line = collect_cp_results_by_cell_line(ANALYSIS_TYPE, include_condition=True)

In [6]:
# Load CP data for the collected paths, passing the list of required CSV files (REQUIRED_FILES)
cp_data = load_cp_results(paths_by_cell_line, REQUIRED_FILES)


number of subjects from cell line WT_stress: 12
number of subjects from cell line WT_Untreated: 12


In [7]:
# Get the calculated features from all CP output files, merged into a single table
# (the helper prints intermediate shapes and the shape after merging)
cp_measurements = collect_all_features(cp_data, group_by_columns)

WT_stress (2674, 6) (2674, 32) (2674, 18)
(2674, 33)
(2674, 46)
WT_Untreated (2823, 6) (2823, 32) (2823, 18)
(2823, 33)
(2823, 46)
Shape after merging is: (5497, 46)


In [8]:
# Preview the merged feature table (pandas truncates the display)
cp_measurements

Unnamed: 0,ImageNumber,batch,rep,cell_line,condition,num_pb,mean_AreaShape_Area,mean_AreaShape_Compactness,mean_AreaShape_Eccentricity,mean_AreaShape_EquivalentDiameter,...,mean_Math_Texture_Contrast_DCP1A_pb_only_5,mean_Math_Texture_Contrast_DCP1A_pb_only_9,mean_Math_Texture_Entropy_DCP1A_pb_only_15,mean_Math_Texture_Entropy_DCP1A_pb_only_3,mean_Math_Texture_Entropy_DCP1A_pb_only_5,mean_Math_Texture_Entropy_DCP1A_pb_only_9,mean_Math_Texture_Homogeneity_DCP1A_pb_only_15,mean_Math_Texture_Homogeneity_DCP1A_pb_only_3,mean_Math_Texture_Homogeneity_DCP1A_pb_only_5,mean_Math_Texture_Homogeneity_DCP1A_pb_only_9
0,1,batch1,rep1,WT,stress,10.282051,25.271820,0.792330,0.564398,5.464352,...,135.814722,29.119141,0.100133,0.836307,0.599088,0.281511,0.026699,0.169592,0.093890,0.063550
1,1,batch1,rep2,WT,stress,17.275862,23.950100,0.795016,0.568774,5.295376,...,65.387192,34.717262,0.332918,0.827373,0.645985,0.449233,0.141234,0.187459,0.123284,0.097678
2,1,batch1,rep3,WT,stress,12.117647,23.347087,0.772651,0.539623,5.230422,...,134.867132,26.421741,0.090862,0.798397,0.491850,0.249366,0.011753,0.171916,0.091922,0.065669
3,1,batch1,rep4,WT,stress,14.466667,25.951613,0.818639,0.582715,5.526321,...,72.692730,48.756461,0.141244,0.926325,0.709060,0.468966,0.054369,0.167290,0.167158,0.112909
4,1,batch2,rep1,WT,stress,14.593750,24.182013,0.792462,0.561082,5.336245,...,131.725029,21.158761,0.192388,0.937440,0.692591,0.346618,0.044577,0.158811,0.094750,0.079037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5492,246,batch2,rep4,WT,Untreated,8.545455,24.319149,0.763612,0.488463,5.355413,...,91.299167,10.135525,0.029112,0.731760,0.422588,0.093660,0.032886,0.150635,0.100497,0.024060
5493,247,batch2,rep2,WT,Untreated,10.511628,24.796460,0.782070,0.539933,5.379492,...,91.603439,24.110549,0.082067,0.761978,0.584771,0.197595,0.063270,0.221922,0.147233,0.055116
5494,247,batch2,rep4,WT,Untreated,11.441176,26.406170,0.799506,0.518545,5.593704,...,111.579842,19.583811,0.117775,0.839603,0.566913,0.229708,0.044934,0.153180,0.093709,0.185583
5495,248,batch2,rep2,WT,Untreated,8.490909,23.231263,0.761080,0.481098,5.257826,...,61.734043,16.734490,0.031328,0.747191,0.362847,0.143436,0.018134,0.117651,0.087549,0.078532


In [12]:
# Number of distinct ImageNumber values per batch
cp_measurements.groupby('batch')['ImageNumber'].nunique(dropna=False)

batch
batch1    241
batch2    249
batch3    245
Name: ImageNumber, dtype: int64

# Add new variable "group"

In [8]:


# Add group label: "<cell_line>_<condition>"
cp_measurements['group'] = cp_measurements['cell_line'] + "_" + cp_measurements['condition']

# Filter by lines.
# .copy() makes the filtered frame independent, so the column assignment below
# writes to a real DataFrame instead of a slice (avoids SettingWithCopyWarning).
lines_to_include = ["WT_Untreated", "WT_stress"]
cp_measurements = cp_measurements[cp_measurements['group'].isin(lines_to_include)].copy()
print(cp_measurements.shape)

# Important to put the reference group first in order for mixedlm() - has to be Categorical!
cp_measurements["group"] = pd.Categorical(
    cp_measurements["group"],
    categories=lines_to_include,
    ordered=True
)


(5497, 47)


In [9]:
# Number of images per (replicate, group) combination
cp_measurements.value_counts(subset=['rep', 'group'])


rep   group       
rep4  WT_stress       726
rep2  WT_Untreated    720
rep3  WT_Untreated    707
rep1  WT_Untreated    706
rep4  WT_Untreated    690
rep1  WT_stress       667
rep3  WT_stress       644
rep2  WT_stress       637
Name: count, dtype: int64

In [10]:
# Number of images per batch
cp_measurements.value_counts(subset=['batch'])

batch 
batch2    1848
batch1    1846
batch3    1803
Name: count, dtype: int64

In [11]:
# Number of images per group
cp_measurements.value_counts(subset=['group'])

group       
WT_Untreated    2823
WT_stress       2674
Name: count, dtype: int64

# Effect size modeling

The terms in the formula:
- measurement: The CellProfiler feature (e.g., mean number of P-bodies per image).
- group: A fixed effect to test the difference between WT_stress and the reference group WT_Untreated.
- batch: Random intercept for each batch, accounting for correlation between site images of the same batch.

What This Model Gives You:
- Estimates of group differences: WT_stress vs WT_Untreated, with significance testing.

- Within-batch variability: Captures how consistent measurements are across images for a given batch.

- Between-batch variability: Tests whether observed effects are reproducible across batches.

- P-values or confidence intervals: For significance of gene group effects.



In [12]:
# Summary statistics of the P-body count for every (group, batch) combination
cp_measurements.groupby(['group', 'batch'], observed=False)[['num_pb']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_pb,num_pb,num_pb,num_pb,num_pb,num_pb,num_pb,num_pb
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
group,batch,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
WT_Untreated,batch1,901.0,12.038027,4.007868,6.569444,9.469388,10.916667,13.15625,43.5
WT_Untreated,batch2,985.0,10.302657,2.694218,6.493671,8.490909,9.666667,11.225806,28.6
WT_Untreated,batch3,937.0,11.489772,4.377915,6.477273,8.734694,10.217391,12.628571,45.0
WT_stress,batch1,945.0,13.486353,4.87042,7.25,10.333333,12.117647,14.96,44.375
WT_stress,batch2,863.0,12.693563,5.090366,7.059701,10.091614,11.585366,13.983871,109.0
WT_stress,batch3,866.0,13.995236,6.979262,7.363636,10.245427,12.075047,15.905956,120.25


In [13]:
# Every column that is not an identifier or grouping key is treated as a CellProfiler feature
excluded_columns = set(group_by_columns) | {'batch', 'group'}
cp_features_columns = [col for col in cp_measurements.columns if col not in excluded_columns]

# Fit one model per feature; the report is written next to the CP outputs of this analysis
report_dir = os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, 'mixed_effect_report')
results_df_DCP1A = run_analysis_generate_report(
    df=cp_measurements,
    feature_columns=cp_features_columns,
    group_col="group",
    batch_col="batch",
    output_dir=report_dir,
)







Analysing CP feature: num_pb
❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: num_pb
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     119.2
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           8.46e-75
Time:                        18:05:06   Log-Likelihood:                -16424.
No. Observations:                5497   AIC:                         3.286e+04
Df Residuals:                    5493   BIC:                         3.288e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
         

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.168
Model:                            OLS   Adj. R-squared:                  0.167
Method:                 Least Squares   F-statistic:                     368.6
Date:                Tue, 30 Sep 2025   Prob (F-statistic):          4.07e-218
Time:                        18:05:06   Log-Likelihood:                 10672.
No. Observations:                5497   AIC:                        -2.134e+04
Df Residuals:                    5493   BIC:                        -2.131e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.52

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     94.96
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           6.48e-60
Time:                        18:05:06   Log-Likelihood:                 43546.
No. Observations:                5497   AIC:                        -8.708e+04
Df Residuals:                    5493   BIC:                        -8.706e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.00

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.488
Model:                            OLS   Adj. R-squared:                  0.487
Method:                 Least Squares   F-statistic:                     1743.
Date:                Tue, 30 Sep 2025   Prob (F-statistic):               0.00
Time:                        18:05:06   Log-Likelihood:                 14614.
No. Observations:                5497   AIC:                        -2.922e+04
Df Residuals:                    5493   BIC:                        -2.919e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 1.36

❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: mean_AreaShape_NormalizedMoment_1_2
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.079
Date:                Tue, 30 Sep 2025   Prob (F-statistic):              0.357
Time:                        18:05:07   Log-Likelihood:                 41001.
No. Observations:                5497   AIC:                        -8.199e+04
Df Residuals:                    5493   BIC:                        -8.197e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
            

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.047
Method:                 Least Squares   F-statistic:                     90.46
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           3.87e-57
Time:                        18:05:07   Log-Likelihood:                 37000.
No. Observations:                5497   AIC:                        -7.399e+04
Df Residuals:                    5493   BIC:                        -7.397e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.00

❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: mean_AreaShape_NormalizedMoment_3_3
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.052
Model:                            OLS   Adj. R-squared:                  0.052
Method:                 Least Squares   F-statistic:                     100.7
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           1.98e-63
Time:                        18:05:07   Log-Likelihood:                 48061.
No. Observations:                5497   AIC:                        -9.611e+04
Df Residuals:                    5493   BIC:                        -9.609e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
            

❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: mean_Math_Texture_Contrast_DCP1A_pb_only_5
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.115
Model:                            OLS   Adj. R-squared:                  0.115
Method:                 Least Squares   F-statistic:                     238.5
Date:                Tue, 30 Sep 2025   Prob (F-statistic):          1.87e-145
Time:                        18:05:07   Log-Likelihood:                -25450.
No. Observations:                5497   AIC:                         5.091e+04
Df Residuals:                    5493   BIC:                         5.094e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
     

❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: mean_Math_Texture_Entropy_DCP1A_pb_only_5
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.056
Method:                 Least Squares   F-statistic:                     110.7
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           1.36e-69
Time:                        18:05:08   Log-Likelihood:                 3562.9
No. Observations:                5497   AIC:                            -7118.
Df Residuals:                    5493   BIC:                            -7091.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
      

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     57.65
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           1.09e-36
Time:                        18:05:08   Log-Likelihood:                 10675.
No. Observations:                5497   AIC:                        -2.134e+04
Df Residuals:                    5493   BIC:                        -2.132e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.09

In [14]:

def _pvalue_to_label(p):
    """Return the significance label ("***", "**", "*" or "n.s. (p = ...)") for a p-value."""
    if p < 0.001:
        return "***"
    if p < 0.01:
        return "**"
    if p < 0.05:
        return "*"
    return f"n.s. (p = {p:.2f})"


def plot_cp_feature_grouped_by_gene(cp_measurements, cp_feature_col, group_col="gene_group", patient_col="patient_id", color_mapping=None, model_results_df=None, pdf_file=None, comparison="WT_stress"):
    """
    Plot one CellProfiler feature per group: box plot (mean line, no fliers), all
    individual data points as a grey strip plot, the mean of every
    (group, patient/batch) combination as an overlaid dot, and a significance
    bracket taken from the model results.

    NOTE: this definition shadows the function of the same name imported from
    cell_profiler.code.cp_effect_size_utils at the top of the notebook.

    Parameters
    ----------
    cp_measurements : pandas.DataFrame
        Table holding `cp_feature_col`, `group_col` and `patient_col`. It is copied,
        the caller's frame is not modified.
    cp_feature_col : str
        Name of the feature column drawn on the y axis.
    group_col : str
        Column defining the x-axis groups (converted to categorical if it is not already).
    patient_col : str
        Column whose levels get one mean marker per group (e.g. "batch").
    color_mapping : dict or None
        {group value: {"alias": display label, "color": colour}}. Values without an
        entry keep their original label. None is treated as an empty mapping.
    model_results_df : pandas.DataFrame or None
        Results table with columns "comparison", "feature" and "pval". When None, or
        when it has no row for (`comparison`, `cp_feature_col`), the significance
        bracket is skipped.
    pdf_file : matplotlib.backends.backend_pdf.PdfPages or None
        When given, the figure is appended to it and closed; otherwise it is shown.
    comparison : str
        Value of the "comparison" column to read the p-value from.

    Notes
    -----
    - Groups are placed on the x axis in alphabetical order of their display labels.
    - The bracket is drawn between x positions 0 and 1, i.e. it assumes two groups.
    - The x position of the per-batch means is jittered with the global NumPy RNG,
      so it differs between runs unless the RNG is seeded.
    """
    df = cp_measurements.copy()
    # None means "no aliases / no custom colours" instead of a TypeError on `in None`
    color_mapping = color_mapping or {}

    # Define fixed color and label mapping for both group_col and patient_col
    patient_values = df[patient_col].astype(str).unique().tolist()
    groups = df[group_col].astype(str).unique().tolist() + patient_values
    _palette = {
        color_mapping[g]['alias']: color_mapping[g]['color']
        for g in groups
        if g in color_mapping
    }
    if patient_col == 'batch':
        for b in patient_values:
            _palette[b] = 'gray'

    # Rename groups to aliases (astype("category") keeps an existing categorical untouched
    # and lets plain string columns work as well)
    label_mapping = {k: v["alias"] for k, v in color_mapping.items() if k in groups}
    df[group_col] = df[group_col].astype("category").cat.rename_categories(label_mapping)
    df[patient_col] = df[patient_col].astype("category").cat.rename_categories(label_mapping)

    # Get x-axis positions for each group
    x_pos_map = {label: i for i, label in enumerate(sorted(df[group_col].unique()))}
    group_order = list(x_pos_map.keys())

    # seaborn rejects a palette dict that misses a hue level, so only pass it when complete
    box_palette = _palette if all(g in _palette for g in group_order) else None

    sns.set(style="white", font_scale=1.0)
    fig, ax = plt.subplots(figsize=(3, 4))
    line_width = 1.5

    # Boxplot with custom order
    sns.boxplot(
        data=df,
        x=group_col,
        hue=group_col,  # or patient_col if needed
        y=cp_feature_col,
        palette=box_palette,
        width=0.3,
        linewidth=line_width,
        showfliers=False,
        showmeans=True,
        meanline=True,
        meanprops={
            "linestyle": "-",
            "color": "black",
            "linewidth": line_width * 2
        },
        boxprops=dict(facecolor='white', edgecolor='black', linewidth=line_width),
        whiskerprops=dict(linewidth=line_width, color='black'),
        capprops=dict(linewidth=line_width, color='black'),
        medianprops=dict(visible=False),
        order=group_order,
        ax=ax
    )

    # ============================
    # Full distribution: raw cell-level/image-level points
    # ============================
    sns.stripplot(
        data=df,
        x=group_col,
        y=cp_feature_col,
        order=group_order,
        color="lightgray",
        size=2.5,
        jitter=0.25,
        alpha=0.4,
        dodge=False,
        zorder=1,
        ax=ax
    )

    # ============================
    # Overlay per-batch means
    # ============================
    batch_means = df.groupby([group_col, patient_col], observed=True)[cp_feature_col].mean().reset_index()

    for _, row in batch_means.iterrows():
        group = row[group_col]
        batch = row[patient_col]
        x = x_pos_map[group]
        y = row[cp_feature_col]
        # Jitter x slightly (mean = x, small std deviation)
        jittered_x = np.random.normal(loc=x, scale=0.05)  # adjust scale if needed

        ax.scatter(
            x=jittered_x,
            y=y,
            color=_palette.get(group, 'black'),
            edgecolor=None,
            s=20,
            zorder=3,
            label=batch  # deduplicated in the legend below
        )

    # Build the legend only now: it previously ran before the scatter calls above,
    # so the per-batch labels could never appear in it.
    handles, labels = ax.get_legend_handles_labels()
    unique = dict(zip(labels, handles))
    ax.legend(
        unique.values(),
        unique.keys(),
        title=patient_col,
        bbox_to_anchor=(1.02, 1),
        loc="upper left"
    )

    # Ensure tick marks are shown on both axes
    ax.tick_params(
        axis='both',
        which='both',
        direction='out',   # or 'in' or 'inout'
        length=4,          # length of the tick marks
        width=1,           # thickness of the ticks
        bottom=True, top=False,    # x-axis ticks
        left=True, right=False     # y-axis ticks
    )

    # ============================
    # p-value annotation LMM
    # ============================
    stat = None
    if model_results_df is not None:
        stat = model_results_df[(model_results_df['comparison'] == comparison) & (model_results_df['feature'] == cp_feature_col)]

    if stat is None or stat.empty:
        # Without a model result there is nothing to annotate; still produce the plot
        print(f"No model result for feature '{cp_feature_col}' (comparison '{comparison}'); skipping p-value annotation")
    else:
        p_text = _pvalue_to_label(float(stat['pval'].iloc[0]))

        # Use actual plot limits to place annotation
        ymin, ymax = ax.get_ylim()
        y_range = ymax - ymin
        line_y = ymax - 0.1 * y_range
        text_y = line_y

        # Bridge line between the two groups
        x1, x2 = 0, 1
        ax.plot([x1, x1, x2, x2],
                [line_y, line_y + 0.01*y_range, line_y + 0.01*y_range, line_y],
                lw=1.5, c='black')

        # Annotation text
        ax.text((x1 + x2) / 2, text_y, p_text, ha='center', va='bottom')

    # Format
    ax.set_ylabel(cp_feature_col)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.subplots_adjust(top=0.88)  # add top space if needed

    # Save the plot
    if pdf_file is not None:
        pdf_file.savefig(fig, bbox_inches='tight')
        # Close after saving so figures do not accumulate when called in a loop
        plt.close(fig)
    else:
        plt.show()
    

In [15]:
# One PDF page per plotted feature; features missing from the table are skipped
pdf_path = f"{save_path}/cell_profiler_Sorbitol_p_bodies_DCP1A.pdf"
available_features = [feature for feature in measures_to_plot if feature in cp_measurements.columns]

with PdfPages(pdf_path) as pdf:
    for cp_feature_col in available_features:
        plot_cp_feature_grouped_by_gene(
            cp_measurements,
            cp_feature_col=cp_feature_col,
            group_col='group',
            patient_col="batch",
            color_mapping=_color_mapping,
            model_results_df=results_df_DCP1A,
            pdf_file=pdf,
        )

            


# New Sorbitol  dataset - LSM14A

In [16]:

# LSM14A analysis configuration: re-assigns the names used for the DCP1A analysis earlier in the notebook
ANALYSIS_TYPE = 'PB_profiling/Sorbitol_LSM14A'
BATCHES = ['batch1', 'batch2', 'batch3']

# Save figures here
save_path = '/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen/figures/Sorbitol/cell_profiler/PB_profiling/Sorbitol_LSM14A'

In [17]:

# Sanity check of the CP outputs: for every marker folder, print which
# batch / cell line / condition / replicate the analysed images were read from.
# This exposes runs where CellProfiler wrote its results into the wrong folder.
pattern = os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, '*', '*', '*', '*', '*', '*')
for marker_path in glob.glob(pattern):
    # Anything that is not a directory cannot be a marker folder
    if not os.path.isdir(marker_path):
        print(f"Not a marker folder directory:{marker_path}")
        continue

    try:
        image_df = pd.read_csv(marker_path +'/Image.csv')

        # Path components are parsed from the DAPI image path
        # (use 'PathName_nucleus' instead for pipelines that name the channel that way)
        parts_df = image_df['PathName_DAPI'].apply(extract_path_parts)
        print(marker_path, parts_df['batch'].unique(), parts_df['cell_line'].unique(), parts_df['condition'].unique(), parts_df['rep'].unique(), )

        # Marker = last path component; cell line = folder four levels above it
        marker = os.path.basename(marker_path)
        cell_line = Path(marker_path).resolve().parents[3].name
    except FileNotFoundError as e:
        # A folder without Image.csv is reported but does not stop the scan
        print("!!!!")
        print(e)


/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_LSM14A/batch3/WT/panelB/stress/rep3/LSM14A ['batch3'] ['WT'] ['stress'] ['rep3']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_LSM14A/batch3/WT/panelB/stress/rep4/LSM14A ['batch3'] ['WT'] ['stress'] ['rep4']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_LSM14A/batch3/WT/panelB/stress/rep1/LSM14A ['batch3'] ['WT'] ['stress'] ['rep1']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_LSM14A/batch3/WT/panelB/stress/rep2/LSM14A ['batch3'] ['WT'] ['stress'] ['rep2']
/home/projects/hornsteinlab/Collaboration/NOVA/cell_profiler/outputs/cell_profiler_RUNS/Final_cp_analysis/PB_profiling/Sorbitol_LSM14A/batch3/WT/panelB/Untreated/re

## Collect CP files by "cell_line+condition" and Load CP data

In [18]:

# Collect paths of CP output files for the current ANALYSIS_TYPE.
# NOTE(review): include_condition=True presumably keys the result by "<cell_line>_<condition>"
# (the loader below reports "WT_stress" / "WT_Untreated") — confirm in cp_effect_size_utils.
paths_by_cell_line = collect_cp_results_by_cell_line(ANALYSIS_TYPE, include_condition=True)  # optional debugging switch: validate=False

In [19]:
# Load CP data for the collected paths, passing the list of required CSV files (REQUIRED_FILES)
cp_data = load_cp_results(paths_by_cell_line, REQUIRED_FILES)


number of subjects from cell line WT_stress: 12
number of subjects from cell line WT_Untreated: 12


In [20]:
# Get the calculated features from all CP output files.
# The LSM14A-specific measurement names are passed explicitly as PB_in_cyto_measures.
LSM14A_PB_in_cyto_measures = [
    # P-body over cytoplasm measurement
    "Math_LSM14A_PB_over_cyto",
    # Texture contrast inside P-bodies (suffixes 15 / 3 / 5 / 9)
    "Math_Texture_Contrast_LSM14A_pb_only_15",
    "Math_Texture_Contrast_LSM14A_pb_only_3",
    "Math_Texture_Contrast_LSM14A_pb_only_5",
    "Math_Texture_Contrast_LSM14A_pb_only_9",
    # Texture entropy inside P-bodies
    "Math_Texture_Entropy_LSM14A_pb_only_15",
    "Math_Texture_Entropy_LSM14A_pb_only_3",
    "Math_Texture_Entropy_LSM14A_pb_only_5",
    "Math_Texture_Entropy_LSM14A_pb_only_9",
    # Texture homogeneity inside P-bodies
    "Math_Texture_Homogeneity_LSM14A_pb_only_15",
    "Math_Texture_Homogeneity_LSM14A_pb_only_3",
    "Math_Texture_Homogeneity_LSM14A_pb_only_5",
    "Math_Texture_Homogeneity_LSM14A_pb_only_9",
]

cp_measurements = collect_all_features(
    cp_data,
    group_by_columns,
    PB_in_cyto_measures=LSM14A_PB_in_cyto_measures,
)

WT_stress (2591, 6) (2591, 32) (2591, 18)
(2591, 33)
(2591, 46)
WT_Untreated (2727, 6) (2727, 32) (2727, 18)
(2727, 33)
(2727, 46)
Shape after merging is: (5318, 46)


# add new variable "group"

In [21]:


# Add group label: "<cell_line>_<condition>"
cp_measurements['group'] = cp_measurements['cell_line'] + "_" + cp_measurements['condition']

# Filter by lines.
# .copy() makes the filtered frame independent, so the column assignment below
# writes to a real DataFrame instead of a slice (avoids SettingWithCopyWarning).
lines_to_include = ["WT_Untreated", "WT_stress"]
cp_measurements = cp_measurements[cp_measurements['group'].isin(lines_to_include)].copy()
print(cp_measurements.shape)

# Important to put the reference group first in order for mixedlm() - has to be Categorical!
cp_measurements["group"] = pd.Categorical(
    cp_measurements["group"],
    categories=lines_to_include,
    ordered=True
)


(5318, 47)


In [22]:
# Number of images per (replicate, group) combination
cp_measurements.value_counts(subset=['rep', 'group'])


rep   group       
rep1  WT_Untreated    694
rep4  WT_Untreated    684
rep3  WT_Untreated    679
rep4  WT_stress       678
rep2  WT_Untreated    670
rep1  WT_stress       640
rep2  WT_stress       640
rep3  WT_stress       633
Name: count, dtype: int64

In [23]:
# Number of images per batch
cp_measurements.value_counts(subset=['batch'])

batch 
batch1    1826
batch3    1756
batch2    1736
Name: count, dtype: int64

In [24]:
# Number of images per group
cp_measurements.value_counts(subset=['group'])

group       
WT_Untreated    2727
WT_stress       2591
Name: count, dtype: int64

# Effect size modeling

The terms in the formula:
- measurement: The CellProfiler feature (e.g., mean number of P-bodies per image).
- group: A fixed effect to test the difference between WT_stress and the reference group WT_Untreated.
- batch: Random intercept for each batch, accounting for correlation between site images of the same batch.

What This Model Gives You:
- Estimates of group differences: WT_stress vs WT_Untreated, with significance testing.

- Within-batch variability: Captures how consistent measurements are across images for a given batch.

- Between-batch variability: Tests whether observed effects are reproducible across batches.

- P-values or confidence intervals: For significance of gene group effects.



In [25]:
# Summary statistics of the P-body count for every (group, batch) combination
cp_measurements.groupby(['group', 'batch'], observed=False)[['num_pb']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_pb,num_pb,num_pb,num_pb,num_pb,num_pb,num_pb,num_pb
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
group,batch,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
WT_Untreated,batch1,900.0,8.613726,1.457981,5.392857,7.550978,8.473611,9.439236,15.0
WT_Untreated,batch2,891.0,8.13565,1.332103,5.557377,7.228524,7.895833,8.850713,15.166667
WT_Untreated,batch3,936.0,7.919097,1.454472,5.166667,6.974107,7.67029,8.503788,20.0
WT_stress,batch1,926.0,9.907762,2.743192,5.298507,8.102572,9.363698,10.938416,41.75
WT_stress,batch2,845.0,9.161898,2.419958,5.637681,7.609756,8.636364,10.060606,30.125
WT_stress,batch3,820.0,9.152514,2.809151,5.088889,7.473684,8.506757,10.0,34.2


In [26]:
# Columns that are metadata / model factors rather than CellProfiler features
non_feature_columns = group_by_columns + ['batch', 'group']

# Every remaining column is a CellProfiler feature to estimate an effect for
cp_features_columns = [
    column
    for column in cp_measurements.columns
    if column not in non_feature_columns
]

# Run the effect-size analysis with "group" as the effect of interest and
# "batch" as the batch factor; the report is written under the CP outputs folder
# NOTE(review): judging by the cell output, the helper fits a mixed model per
# feature and falls back to OLS when the random intercept cannot be fitted - confirm in cp_effect_size_utils
mixed_effect_report_dir = os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, 'mixed_effect_report')
results_df_LSM14A = run_analysis_generate_report(
    df=cp_measurements,
    feature_columns=cp_features_columns,
    group_col="group",
    batch_col="batch",
    output_dir=mixed_effect_report_dir,
)







Analysing CP feature: num_pb
❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: num_pb
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.094
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     184.3
Date:                Tue, 30 Sep 2025   Prob (F-statistic):          1.05e-113
Time:                        18:07:18   Log-Likelihood:                -11536.
No. Observations:                5318   AIC:                         2.308e+04
Df Residuals:                    5314   BIC:                         2.311e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
         

             Mixed Linear Model Regression Results
Model:               MixedLM  Dependent Variable:  feature_value
No. Observations:    5318     Method:              ML           
No. Groups:          3        Scale:               0.0880       
Min. group size:     1736     Log-Likelihood:      -1087.5002   
Max. group size:     1826     Converged:           Yes          
Mean group size:     1772.7                                     
----------------------------------------------------------------
                      Coef. Std.Err.    z    P>|z| [0.025 0.975]
----------------------------------------------------------------
Intercept             5.964    0.024 248.076 0.000  5.917  6.011
C(group)[T.WT_stress] 0.100    0.008  12.250 0.000  0.084  0.116
Group Var             0.002    0.005                            


### Fixed Effects: Gene Group Differences from Ctrl
- Ctrl: estimated average = 5.96
- WT_stress: +0.09972 units vs Ctrl (p = 0.00000, 95% CI: [0.08377, 0.11568]) → ✔️

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.231
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     532.0
Date:                Tue, 30 Sep 2025   Prob (F-statistic):          2.48e-302
Time:                        18:07:18   Log-Likelihood:                 54126.
No. Observations:                5318   AIC:                        -1.082e+05
Df Residuals:                    5314   BIC:                        -1.082e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept              1.319e-

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.245
Model:                            OLS   Adj. R-squared:                  0.245
Method:                 Least Squares   F-statistic:                     575.2
Date:                Tue, 30 Sep 2025   Prob (F-statistic):          9.88e-324
Time:                        18:07:19   Log-Likelihood:                 27113.
No. Observations:                5318   AIC:                        -5.422e+04
Df Residuals:                    5314   BIC:                        -5.419e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.08

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.203
Model:                            OLS   Adj. R-squared:                  0.203
Method:                 Least Squares   F-statistic:                     451.5
Date:                Tue, 30 Sep 2025   Prob (F-statistic):          2.61e-261
Time:                        18:07:19   Log-Likelihood:                 26943.
No. Observations:                5318   AIC:                        -5.388e+04
Df Residuals:                    5314   BIC:                        -5.385e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.08

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     91.70
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           7.45e-58
Time:                        18:07:19   Log-Likelihood:                 37185.
No. Observations:                5318   AIC:                        -7.436e+04
Df Residuals:                    5314   BIC:                        -7.434e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.00

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.511
Model:                            OLS   Adj. R-squared:                  0.511
Method:                 Least Squares   F-statistic:                     1852.
Date:                Tue, 30 Sep 2025   Prob (F-statistic):               0.00
Time:                        18:07:20   Log-Likelihood:                -33519.
No. Observations:                5318   AIC:                         6.705e+04
Df Residuals:                    5314   BIC:                         6.707e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept               611.85

                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.074
Model:                            OLS   Adj. R-squared:                  0.074
Method:                 Least Squares   F-statistic:                     142.4
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           9.64e-89
Time:                        18:07:20   Log-Likelihood:                 2840.1
No. Observations:                5318   AIC:                            -5672.
Df Residuals:                    5314   BIC:                            -5646.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.82

  return 1 - self.ssr/self.centered_tss


                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.045
Model:                            OLS   Adj. R-squared:                  0.045
Method:                 Least Squares   F-statistic:                     83.75
Date:                Tue, 30 Sep 2025   Prob (F-statistic):           6.10e-53
Time:                        18:07:20   Log-Likelihood:                 12529.
No. Observations:                5318   AIC:                        -2.505e+04
Df Residuals:                    5314   BIC:                        -2.502e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                 0.04

In [27]:
# Plot every requested measure that exists in this dataset; the open PDF handle
# is passed to the plotting helper together with the model results
pdf_report_path = f"{save_path}/cell_profiler_Sorbitol_p_bodies_LSM14A.pdf"
with PdfPages(pdf_report_path) as pdf:
    for cp_feature_col in measures_to_plot:
        # Skip measures that are not present as columns in cp_measurements
        if cp_feature_col not in cp_measurements.columns:
            continue
        plot_cp_feature_grouped_by_gene(
            cp_measurements,
            cp_feature_col=cp_feature_col,
            group_col='group',
            patient_col="batch",
            color_mapping=_color_mapping,
            model_results_df=results_df_LSM14A,
            pdf_file=pdf,
        )

            


In [28]:
# Completion marker: shows that every cell above ran to the end
print("Done!")

Done!
