# <font color="red"> CellProfiler: profiling of P-Bodies (DCP1A) V5 OE across all batches </font>
 
JIRA task: NN-83

Why Linear Mixed-Effects Models (LMMs)?
Your experimental structure involves:

- Three groups: 'TDP43_dNLS_Monocolonal', 'TDP43_WT_Polycolonal','TDP43_dNLS_Polycolonal'

- Two conditions:+/- DOX 

- 3 batches per group

- Measurements per site image (the dependent variable), 50–250 site images per batch (i.e., image-level measurements)


- Random variation across batches random intercept per batch (i.e., variation across batches)

This design includes both fixed effects (groups) and random effects (batches)
Estimate how dNLS_DOX affects each CellProfiler feature vs dNLS_Untreated, accounting for batch effects (inter-batch variation).

In [1]:
from pathlib import Path
import glob
import sys
import os


os.environ['NOVA_HOME'] = '/home/projects/hornsteinlab/Collaboration/NOVA/'
os.environ['NOVA_DATA_HOME'] = f"{os.environ['NOVA_HOME']}/input"
print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
sys.path.insert(1, os.getenv('NOVA_HOME'))


import numpy as np
import pandas as pd
import seaborn as sns
from markdown import markdown
import matplotlib.pyplot as plt

from cell_profiler.code.cp_effect_size_utils import CP_OUTPUTS_FOLDER, validate_cp_files, extract_path_parts, merge_on_group, collect_cp_results_by_cell_line, load_cp_results, get_features_per_image, get_aggregated_features_per_image, collect_all_features, run_analysis_generate_report, print_mixedlm_conclusions, measures_to_plot
from manuscript.plot_config import PlotConfig

%load_ext autoreload    
%autoreload 2

NOVA_HOME is at /home/projects/hornsteinlab/Collaboration/NOVA/


# V5 dNLS dataset - DCP1A

In [2]:
ANALYSIS_TYPE = 'PB_profiling/WT_OE_DCP1A'
BATCHES = ['batch1', 'batch2', 'batch3']

# Save figures here
save_path = '/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen/figures/dNLS_V5/cell_profiler/PB_profiling/WT_OE_DCP1A'

# Font
FONT_PATH = '/home/projects/hornsteinlab/sagyk/anaconda3/envs/nova/fonts/arial.ttf'
from matplotlib import font_manager as fm
import matplotlib
fm.fontManager.addfont(FONT_PATH)
matplotlib.rcParams['font.family'] = 'Arial'

plt.rcParams.update({
    'font.family': 'Arial',
    'font.size': 6
})

In [3]:
group_by_columns = ['ImageNumber', 'batch', 'rep', 'cell_line', 'condition']
REQUIRED_FILES = ['Image.csv', 'Pbodies.csv', 'Cytoplasm.csv']


In [4]:

# Test CP outputs (number of images)
if True:
    pattern = os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, '*', '*', '*', '*', '*', '*')
    # store marker folders by cell line
    for marker_path in glob.glob(pattern):
        if os.path.isdir(marker_path):
            try:
                image_df = pd.read_csv(marker_path +'/Image.csv')
                #print(marker_path, image_df.shape)
                #print(image_df[['Count_Pbodies', 'Count_nucleus']].head(10))
                
                # DEBUG CODE: to recognise problems in CP wiriting to the wrong folder
                #parts_df = image_df['PathName_DAPI'].apply(extract_path_parts)                
                #print(marker_path, parts_df['batch'].unique(), parts_df['cell_line'].unique(), parts_df['condition'].unique(), parts_df['rep'].unique(), )
                # DEBUG CODE

                marker = os.path.basename(marker_path)    
                cell_line = Path(marker_path).resolve().parents[3].name
            except FileNotFoundError as e:
                print("!!!!")
                print(e)
        else:
            print(f"Not a marker folder directory:{marker_path}")


## Collect CP files by "cell_line+condition" and Load CP data

In [5]:
# Collect paths of CP output files
paths_by_cell_line = collect_cp_results_by_cell_line(ANALYSIS_TYPE, include_condition=True)

In [6]:
for cl in paths_by_cell_line.keys():
    print(cl, len(paths_by_cell_line[cl]))
    

TDP43_dNLS_Monocolonal_stress 12
TDP43_dNLS_Monocolonal_Untreated 12
TDP43_WT_Polycolonal_stress 12
TDP43_WT_Polycolonal_Untreated 12
TDP43_dNLS_Polycolonal_stress 12
TDP43_dNLS_Polycolonal_Untreated 12
iw11-NGN_Untreated 14
iw11-NGN_stress 8


In [None]:
# Load CP data
cp_data = load_cp_results(paths_by_cell_line, REQUIRED_FILES)


In [None]:
# Get the calculated features from all CP output files
cp_measurements = collect_all_features(cp_data, group_by_columns)


TDP43_dNLS_Monocolonal_stress (1909, 6) (1909, 32) (1909, 18)
(1909, 33)
(1909, 46)
TDP43_dNLS_Monocolonal_Untreated (2067, 6) (2067, 32) (2067, 18)
(2067, 33)
(2067, 46)
TDP43_WT_Polycolonal_stress (2049, 6) (2049, 32) (2049, 18)
(2049, 33)
(2049, 46)
⚠️ TDP43_WT_Polycolonal_Untreated: Removed 5 of 1193 site images with 0 nuclei.
TDP43_WT_Polycolonal_Untreated (1188, 6) (1193, 32) (1188, 18)
(1193, 33)
(1193, 46)
⚠️ TDP43_dNLS_Polycolonal_stress: Removed 9 of 1313 site images with 0 nuclei.
TDP43_dNLS_Polycolonal_stress (1304, 6) (1313, 32) (1304, 18)
(1313, 33)
(1313, 46)
⚠️ TDP43_dNLS_Polycolonal_Untreated: Removed 7 of 407 site images with 0 nuclei.
TDP43_dNLS_Polycolonal_Untreated (400, 6) (407, 32) (400, 18)
(407, 33)
(407, 46)
iw11-NGN_Untreated (2345, 6) (1493, 32) (1493, 18)
(2345, 33)
(2345, 46)
iw11-NGN_stress (1488, 6) (1488, 32) (1488, 18)
(1488, 33)
(1488, 46)
Shape after merging is: (12771, 46)


In [None]:
cp_measurements.to_csv("cp_measurements.csv")

In [None]:
cp_measurements = pd.read_csv("cp_measurements.csv")

# Add "group"

In [None]:

# Rename condition to "DOX" (Bar called it "stress")
cp_measurements.loc[cp_measurements['condition']=='stress', 'condition'] = 'DOX'

# Add group
cp_measurements['group'] = cp_measurements['cell_line']+"_"+cp_measurements['condition']

In [None]:
feature = 'mean_AreaShape_Eccentricity' 
cp_measurements[['batch', 'group', feature]].groupby(['batch', 'group'], observed=False).describe()


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_AreaShape_Eccentricity,mean_AreaShape_Eccentricity,mean_AreaShape_Eccentricity,mean_AreaShape_Eccentricity,mean_AreaShape_Eccentricity,mean_AreaShape_Eccentricity,mean_AreaShape_Eccentricity,mean_AreaShape_Eccentricity
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
batch,group,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
batch1,TDP43_WT_Polycolonal_DOX,630.0,0.619129,0.048307,0.468053,0.583019,0.623994,0.656535,0.71701
batch1,TDP43_WT_Polycolonal_Untreated,316.0,0.65486,0.037216,0.512696,0.63515,0.660893,0.68,0.761473
batch1,TDP43_dNLS_Monocolonal_DOX,624.0,0.60133,0.053668,0.470898,0.562732,0.596705,0.63753,0.721965
batch1,TDP43_dNLS_Monocolonal_Untreated,698.0,0.580768,0.04676,0.462878,0.54935,0.580448,0.61073,0.721532
batch1,TDP43_dNLS_Polycolonal_DOX,503.0,0.616878,0.058486,0.484064,0.566762,0.618399,0.670797,0.738299
batch1,TDP43_dNLS_Polycolonal_Untreated,131.0,0.658018,0.032619,0.546019,0.641382,0.661311,0.680522,0.712884
batch1,iw11-NGN_DOX,731.0,0.633893,0.033234,0.509313,0.614439,0.637243,0.657393,0.732868
batch1,iw11-NGN_Untreated,1012.0,0.664105,0.038245,0.535862,0.635668,0.669728,0.697296,0.731368
batch2,TDP43_WT_Polycolonal_DOX,691.0,0.619221,0.049318,0.471111,0.582301,0.625275,0.656153,0.730076
batch2,TDP43_WT_Polycolonal_Untreated,339.0,0.645746,0.041912,0.500123,0.619111,0.65244,0.678927,0.735583


In [None]:
feature = 'mean_AreaShape_MeanRadius'
cp_measurements[['batch', 'group', feature]].groupby(['batch', 'group'], observed=False).describe()


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_AreaShape_MeanRadius,mean_AreaShape_MeanRadius,mean_AreaShape_MeanRadius,mean_AreaShape_MeanRadius,mean_AreaShape_MeanRadius,mean_AreaShape_MeanRadius,mean_AreaShape_MeanRadius,mean_AreaShape_MeanRadius
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
batch,group,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
batch1,TDP43_WT_Polycolonal_DOX,630.0,1.409408,0.04608,1.279369,1.377405,1.414787,1.439203,1.535613
batch1,TDP43_WT_Polycolonal_Untreated,316.0,1.404336,0.035767,1.299928,1.381598,1.403252,1.426496,1.506155
batch1,TDP43_dNLS_Monocolonal_DOX,624.0,1.421189,0.03531,1.274019,1.408247,1.426452,1.441476,1.546096
batch1,TDP43_dNLS_Monocolonal_Untreated,698.0,1.411327,0.022006,1.294597,1.396963,1.409884,1.424275,1.504334
batch1,TDP43_dNLS_Polycolonal_DOX,503.0,1.374052,0.040284,1.260659,1.343136,1.374534,1.403278,1.536696
batch1,TDP43_dNLS_Polycolonal_Untreated,131.0,1.402362,0.04277,1.294696,1.377649,1.396978,1.429468,1.539443
batch1,iw11-NGN_DOX,731.0,1.419154,0.036379,1.286456,1.397672,1.421789,1.443783,1.558172
batch1,iw11-NGN_Untreated,1012.0,1.450983,0.038439,1.306527,1.425741,1.44979,1.482116,1.535864
batch2,TDP43_WT_Polycolonal_DOX,691.0,1.404683,0.043722,1.260107,1.376523,1.409178,1.432685,1.559073
batch2,TDP43_WT_Polycolonal_Untreated,339.0,1.402378,0.034292,1.319581,1.379186,1.4007,1.424674,1.517263


In [14]:
# # TDP43_dNLS_Polycolonal_Untreated > TDP43_dNLS_Polycolonal_DOX > TDP43_WT_Polycolonal_DOX

# # TDP43_dNLS_Polycolonal_DOX - TDP43_WT_Polycolonal_DOX



# 'TDP43_dNLS_Monocolonal'


In [15]:
df_TDP43_dNLS_Monocolonal = cp_measurements[cp_measurements['cell_line']=='TDP43_dNLS_Monocolonal']

print(df_TDP43_dNLS_Monocolonal[['batch', 'group']].value_counts())

# Fiilter by lines
lines_to_include = ["TDP43_dNLS_Monocolonal_Untreated", "TDP43_dNLS_Monocolonal_DOX"]
df_TDP43_dNLS_Monocolonal = df_TDP43_dNLS_Monocolonal[df_TDP43_dNLS_Monocolonal['group'].isin(lines_to_include)]
print(df_TDP43_dNLS_Monocolonal.shape)

# Important to put the reference group first in order for mixedlm() - has to be Categorical!
df_TDP43_dNLS_Monocolonal["group"] = pd.Categorical(
    df_TDP43_dNLS_Monocolonal["group"],
    categories=lines_to_include,
    ordered=True
)

# get the CellProfiler features you want to calculate effect for 
cp_features_columns = ['mean_AreaShape_Eccentricity', 'mean_AreaShape_MeanRadius'] 

results_df_TDP43_dNLS_Monocolonal = run_analysis_generate_report(
                                df=df_TDP43_dNLS_Monocolonal,
                                feature_columns=cp_features_columns,
                                group_col="group",
                                batch_col="batch",
                                output_dir=os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, 'mixed_effect_report', 'TDP43_dNLS_Monocolonal')
)




batch   group                           
batch2  TDP43_dNLS_Monocolonal_Untreated    715
batch1  TDP43_dNLS_Monocolonal_Untreated    698
batch3  TDP43_dNLS_Monocolonal_Untreated    654
batch2  TDP43_dNLS_Monocolonal_DOX          645
batch3  TDP43_dNLS_Monocolonal_DOX          640
batch1  TDP43_dNLS_Monocolonal_DOX          624
Name: count, dtype: int64
(3976, 48)



Analysing CP feature: mean_AreaShape_Eccentricity
❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: mean_AreaShape_Eccentricity
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     68.61
Date:                Thu, 18 Sep 2025   Prob (F-statistic):           2.97e

# 'TDP43_WT_Polycolonal'

In [None]:

# get the CellProfiler features you want to calculate effect for 
cp_features_columns = ['mean_AreaShape_MeanRadius'] 

In [None]:

df_TDP43_WT_Polycolonal = cp_measurements[cp_measurements['cell_line']=='TDP43_WT_Polycolonal']

print(df_TDP43_WT_Polycolonal[['batch', 'group']].value_counts())

# Fiilter by lines
lines_to_include = ["TDP43_WT_Polycolonal_Untreated", "TDP43_WT_Polycolonal_DOX"]
df_TDP43_WT_Polycolonal = df_TDP43_WT_Polycolonal[df_TDP43_WT_Polycolonal['group'].isin(lines_to_include)]
print(df_TDP43_WT_Polycolonal.shape)

# Important to put the reference group first in order for mixedlm() - has to be Categorical!
df_TDP43_WT_Polycolonal["group"] = pd.Categorical(
    df_TDP43_WT_Polycolonal["group"],
    categories=lines_to_include,
    ordered=True
)


results_TDP43_WT_Polycolonal = run_analysis_generate_report(
                                df=df_TDP43_WT_Polycolonal,
                                feature_columns=cp_features_columns,
                                group_col="group",
                                batch_col="batch",
                                output_dir=os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, 'mixed_effect_report', 'TDP43_WT_Polycolonal')
)


results_TDP43_WT_Polycolonal

batch   group                         
batch3  TDP43_WT_Polycolonal_DOX          728
batch2  TDP43_WT_Polycolonal_DOX          691
batch1  TDP43_WT_Polycolonal_DOX          630
batch3  TDP43_WT_Polycolonal_Untreated    538
batch2  TDP43_WT_Polycolonal_Untreated    339
batch1  TDP43_WT_Polycolonal_Untreated    316
Name: count, dtype: int64
(3242, 48)



Analysing CP feature: mean_AreaShape_Eccentricity
❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: mean_AreaShape_Eccentricity
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.114
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     138.5
Date:                Thu, 18 Sep 2025   Prob (F-statistic):           1.87e-84
Time:     

# 'TDP43_dNLS_Polycolonal'

In [None]:


df_TDP43_dNLS_Polycolonal = cp_measurements[cp_measurements['cell_line']=='TDP43_dNLS_Polycolonal']

print(df_TDP43_dNLS_Polycolonal[['batch', 'group']].value_counts())

# Fiilter by lines
lines_to_include = ["TDP43_dNLS_Polycolonal_Untreated", "TDP43_dNLS_Polycolonal_DOX"]
df_TDP43_dNLS_Polycolonal = df_TDP43_dNLS_Polycolonal[df_TDP43_dNLS_Polycolonal['group'].isin(lines_to_include)]
print(df_TDP43_dNLS_Polycolonal.shape)

# Important to put the reference group first in order for mixedlm() - has to be Categorical!
df_TDP43_dNLS_Polycolonal["group"] = pd.Categorical(
    df_TDP43_dNLS_Polycolonal["group"],
    categories=lines_to_include,
    ordered=True
)


results_TDP43_dNLS_Polycolonal = run_analysis_generate_report(
                                df=df_TDP43_dNLS_Polycolonal,
                                feature_columns=cp_features_columns,
                                group_col="group",
                                batch_col="batch",
                                output_dir=os.path.join(CP_OUTPUTS_FOLDER, ANALYSIS_TYPE, 'mixed_effect_report', 'TDP43_dNLS_Polycolonal')
)

results_TDP43_dNLS_Polycolonal

batch   group                           
batch1  TDP43_dNLS_Polycolonal_DOX          503
batch3  TDP43_dNLS_Polycolonal_DOX          465
batch2  TDP43_dNLS_Polycolonal_DOX          345
batch3  TDP43_dNLS_Polycolonal_Untreated    167
batch1  TDP43_dNLS_Polycolonal_Untreated    131
batch2  TDP43_dNLS_Polycolonal_Untreated    109
Name: count, dtype: int64
(1720, 48)



Analysing CP feature: mean_AreaShape_Eccentricity
❌ Random effect variance is near zero. — Unable to fit random intercept (e.g., low variance or convergence issue)
⚠️ Fallback to fixed-effects model for feature: mean_AreaShape_Eccentricity
                            OLS Regression Results                            
Dep. Variable:          feature_value   R-squared:                       0.078
Model:                            OLS   Adj. R-squared:                  0.077
Method:                 Least Squares   F-statistic:                     48.64
Date:                Thu, 18 Sep 2025   Prob (F-statistic):           3.61e

In [None]:
feature = 'mean_AreaShape_Eccentricity' # dNLS_DOX effect=0.0045 and Intercept=0.6628

def do(df, name):
    tmp = df.loc[(df['feature']==feature) & (df['comparison']!='Intercept'), ['effect_size', 'pval']]
    tmp['analysis'] = name
    return tmp

pd.concat(
    [do(results_TDP43_WT_Polycolonal, 'WT_Polycolonal'), 
     do(results_TDP43_dNLS_Polycolonal, 'dNLS_Polycolonal'), 
     do(results_df_TDP43_dNLS_Monocolonal, 'dNLS_Monocolonal')])

In [19]:

feature = 'mean_AreaShape_MeanRadius' #dNLS_DOX effect=-0.0556  and Intercept=1.5170

def do(df, name):
    tmp = df.loc[(df['feature']==feature) & (df['comparison']!='Intercept'), ['effect_size', 'pval']]
    tmp['analysis'] = name
    return tmp

pd.concat(
    [do(results_TDP43_WT_Polycolonal, 'WT_Polycolonal'), 
     do(results_TDP43_dNLS_Polycolonal, 'dNLS_Polycolonal'), 
     do(results_df_TDP43_dNLS_Monocolonal, 'dNLS_Monocolonal')])

Unnamed: 0,effect_size,pval,analysis
3,0.003374,0.02514406,WT_Polycolonal
3,-0.031882,2.461183e-45,dNLS_Polycolonal
3,0.012267,1.0763670000000001e-43,dNLS_Monocolonal


In [20]:

def plot(cp_measurements, cp_feature_col, group_col="gene_group", patient_col="patient_id", color_mapping=None, model_results_df=None, save_path=None):
    
    df = cp_measurements.copy()

    # Define fixed color and label mapping for both group_col and patient_col
    _palette = {}
    groups = df[group_col].astype(str).unique().tolist() + df[patient_col].astype(str).unique().tolist()
    for g in groups:
        if g in color_mapping: _palette[color_mapping[g]['alias']] = color_mapping[g]['color']
    if patient_col=='batch':
        for b in df[patient_col].astype(str).unique().tolist():
            _palette[b]='gray'

    
    # Rename groups to aliases
    label_mapping = {k: v["alias"] for k, v in color_mapping.items() if k in groups}
    df[group_col] = df[group_col].cat.rename_categories(label_mapping)
    df[patient_col] = df[patient_col].astype("category").cat.rename_categories(label_mapping)
    
    # Determine group order and x-axis positions for each group (reversed for visual preference)
    groups_order = sorted(df[group_col].unique(), reverse=True)
    x_spacing = 0.4
    x_pos_map = {label: i * x_spacing for i, label in enumerate(groups_order)}

    
    # Setup plot
    sns.set(style="white", font_scale=1.0)
    fig, ax = plt.subplots(figsize=(3, 5))
    line_width = 1

    
    # ============================
    # Plot each group manually by numeric x
    # ============================
    for i, group in enumerate(groups_order):
        
        group_data = df[df[group_col] == group]
        xpos = x_pos_map[group]
        
        # Boxplot for group
        sns.boxplot(
            data=group_data,
            y=cp_feature_col,
            ax=ax,
            width=0.3,
            linewidth=line_width,
            showfliers=False,
            showmeans=True,
            meanline=True,
            meanprops={"linestyle": "-", "color": "black", "linewidth": line_width},
            boxprops=dict(facecolor='none', edgecolor='black', linewidth=line_width),
            whiskerprops=dict(linewidth=line_width-0.3, color='black'),
            capprops=dict(linewidth=line_width, color='black'),
            medianprops=dict(visible=False),
            positions=[x_pos_map[group]]
        )

        # ============================
        # Full distribution: raw cell-level/image-level points (light gray)
        # ============================ 
        ax.scatter(
            x=np.random.normal(loc=x_pos_map[group], scale=0.05, size=len(group_data)),  # jitter
            y=group_data[cp_feature_col],
            color='lightgray',
            s=2.5,
            alpha=0.4,
            zorder=1
        )

    # ============================
    # Overlay per-batch means (means as colored points)
    # ============================
    batch_means = df.groupby([group_col, patient_col], observed=True)[cp_feature_col].mean().reset_index()

    for _, row in batch_means.iterrows():
        group = row[group_col]
        batch = row[patient_col]

        # Skip if value is missing
        if pd.isna(group) or pd.isna(batch) or pd.isna(row[cp_feature_col]):
            continue

        xpos = x_pos_map.get(group)
        y = row[cp_feature_col]

        if pd.isna(xpos) or pd.isna(y):
            continue

        jittered_x = np.random.normal(loc=xpos, scale=0.05)
        
        ax.scatter(
            x=jittered_x,
            y=y,
            color=_palette.get(group, 'black'),
            edgecolor=None,
            s=5,
            zorder=3,
            label=batch
        )

    # Deduplicate legend
    handles, labels = ax.get_legend_handles_labels()
    unique = dict(zip(labels, handles))
    ax.legend(
        unique.values(),
        unique.keys(),
        title=patient_col,
        bbox_to_anchor=(1.02, 1),
        loc="upper left"
    )

    # Set axis formatting
    ax.set_xlim(-x_spacing+0.1, max(x_pos_map.values()) + x_spacing - 0.1)
    
    ax.set_xticks(list(x_pos_map.values()))
    ax.set_xticklabels(groups_order, rotation=90)
    ax.set_ylabel(cp_feature_col)
    ax.margins(x=0)
    
    # Ensure tick marks are shown on both axes
    ax.tick_params(axis='both', which='both', direction='out',
                   length=4, width=1, bottom=True, top=False, left=True, right=False)
    

    # ============================
    # P-value annotation LMM
    # ============================
    stat = model_results_df.loc[(model_results_df['comparison'] != 'Intercept') & (model_results_df['feature'] ==cp_feature_col)]
    p = float(stat['pval'].iloc[0])
    effect_size = float(stat['effect_size'].iloc[0])
    ci_low = float(stat['ci_lower'].values[0])
    ci_high = float(stat['ci_upper'].values[0])
    txt = f"Effect size: {effect_size} \n(p = {p}, 95% CI: \n[{ci_low:.4f}, {ci_high:.4f}])"

    # Format p display
    if p < 0.001:
        p_text = "***"
    elif p < 0.01:
        p_text = "**"
    elif p < 0.05:
        p_text = "*"
    else:
        p_text = f"n.s. (p = {p:.2f})"
        
    # Plot annotation - use actual plot limits to place annotation
    ymin, ymax = ax.get_ylim()
    y_range = ymax - ymin
    
    line_y = ymax - 0.1 * y_range
    text_y = line_y - 0.02 * y_range

    # Significance - Bridge line between two groups
    x_keys = list(x_pos_map.values())
    if len(x_keys) >= 2:
        x1, x2 = x_keys[0], x_keys[1]
        ymin, ymax = ax.get_ylim()
        y_range = ymax - ymin
        line_y = ymax - 0.1 * y_range
        text_y = line_y - 0.02 * y_range


        ax.plot([x1, x1, x2, x2],
                [line_y, line_y + 0.01*y_range, 
                 line_y + 0.01*y_range, line_y],
                lw=1.5, c='black')

        # Annotation text
        ax.text((x1 + x2) / 2, text_y, p_text, ha='center', va='bottom')
    
    # Remove extra space around plot
    plt.tight_layout()
    # Add top space if needed
    plt.subplots_adjust(top=0.8)
    plt.suptitle(txt, fontsize=8)
    
     # Save the plot
    if save_path is not None:
        plt.savefig(save_path, dpi=300)
        plt.close(fig)
    else:
        plt.show()

In [21]:
_color_mapping = {
     'TDP43_WT_Polycolonal_Untreated': {'alias': 'TDP43_WT_Polycolonal_Untreated', 'color': '#6BAD31'},
     'TDP43_WT_Polycolonal_DOX': {'alias': 'TDP43_WT_Polycolonal_DOX', 'color': '#90278E'},
     'TDP43_dNLS_Polycolonal_Untreated': {'alias': 'TDP43_dNLS_Polycolonal_Untreated', 'color': '#6BAD31'},
     'TDP43_dNLS_Polycolonal_DOX': {'alias': 'TDP43_dNLS_Polycolonal_DOX', 'color': '#90278E'},
}


In [22]:

plot(cp_measurements=df_TDP43_WT_Polycolonal, 
    cp_feature_col='mean_AreaShape_MeanRadius', 
    group_col="group", 
    patient_col="batch", 
    color_mapping=_color_mapping, 
    model_results_df=results_TDP43_WT_Polycolonal, 
    save_path=f"{save_path}/cell_profiler_V5OE_mean_AreaShape_MeanRadius_WT_Polycolonal.pdf")

plot(cp_measurements=df_TDP43_dNLS_Polycolonal, 
                                cp_feature_col='mean_AreaShape_MeanRadius', 
                                group_col="group", 
                                patient_col="batch", 
                                color_mapping=_color_mapping, 
                                model_results_df=results_TDP43_dNLS_Polycolonal,
                                save_path=f"{save_path}/cell_profiler_V5OE_mean_AreaShape_MeanRadius_dNLS_Polycolonal.pdf")

In [23]:
plot(cp_measurements=df_TDP43_WT_Polycolonal, 
    cp_feature_col='mean_AreaShape_Eccentricity', 
    group_col="group", 
    patient_col="batch", 
    color_mapping=_color_mapping, 
    model_results_df=results_TDP43_WT_Polycolonal, 
    save_path=f"{save_path}/cell_profiler_V5OE_mean_AreaShape_Eccentricity_WT_Polycolonal.pdf")

plot(cp_measurements=df_TDP43_dNLS_Polycolonal, 
                                cp_feature_col='mean_AreaShape_Eccentricity', 
                                group_col="group", 
                                patient_col="batch", 
                                color_mapping=_color_mapping, 
                                model_results_df=results_TDP43_dNLS_Polycolonal,
                                save_path=f"{save_path}/cell_profiler_V5OE_mean_AreaShape_Eccentricity_dNLS_Polycolonal.pdf")

In [24]:
print("Done!")

Done!
