# Extract logistic regression model coefficients per CellProfiler feature

The coefficients from the machine learning model fall into three groups: positive (x > 0), where an increase in the feature value makes a cell more likely to be predicted as the Healthy cell type; negative (x < 0), where an increase in the feature value makes a cell more likely to be predicted as the Failing cell type; and zero (x = 0), where the feature has no impact on predicting whether a cell is Healthy or Failing.

## Import libraries

In [1]:
from joblib import load
import pathlib
import pandas as pd

## Load in the training data to collect the CellProfiler feature columns

In [2]:
# Path to data dir
data_dir = pathlib.Path("./data/")

# Path to the training data, which is read here only for its column names
path_to_training_data = data_dir / "training_data.csv"

# Read just the header row (nrows=0) so the column names are collected without loading any rows
all_columns = pd.read_csv(path_to_training_data, nrows=0).columns

# Drop columns that start with 'Metadata_' so only the feature columns from CellProfiler remain
feature_columns = [col for col in all_columns if not col.startswith('Metadata_')]

# Report how many feature columns were found, followed by a short preview
print(len(feature_columns))
print("Examples of first five feature columns:")
for example_column in feature_columns[:5]:
    print(example_column)

631
Examples of first five feature columns:
Cytoplasm_AreaShape_Compactness
Cytoplasm_AreaShape_Eccentricity
Cytoplasm_AreaShape_Extent
Cytoplasm_AreaShape_FormFactor
Cytoplasm_AreaShape_MajorAxisLength


## Set paths and load in the final model

In [3]:
# Location of the serialized final model; strict=True makes resolve() raise
# immediately if the joblib file does not exist
path_to_final_model = pathlib.Path(
    "./models/log_reg_fs_plate_4_final_downsample.joblib"
).resolve(strict=True)

# Deserialize the final model from disk
# NOTE(review): joblib files are pickle-based, so only load model files from a trusted source
final_model = load(path_to_final_model)

## Collect coefficients from the model and match with the correct feature in a dataframe

In [4]:
# Get the coefficients (assuming it's a linear model that exposes a 2D `coef_` array)
coefficients = final_model.coef_

# Print the coefficients shape so it can be compared with the number of feature columns
print(coefficients.shape)

# The coefficients are paired with the feature names by position below, so there must be
# exactly one row of coefficients with one entry per feature column. Stop with a clear
# error otherwise instead of continuing into a confusing DataFrame construction failure.
expected_shape = (1, len(feature_columns))
if coefficients.shape != expected_shape:
    raise ValueError(
        "The number of coefficients does not match the number of feature columns: "
        f"got coefficients of shape {coefficients.shape}, expected {expected_shape}."
    )
print("The number of coefficients matches the number of feature columns.")

# Create a DataFrame with the coefficients and features
# NOTE(review): matching counts does not prove the training-data column order is the
# order the model was fitted with -- confirm against the training notebook
coefficients_df = pd.DataFrame({'Feature': feature_columns, 'Coefficient': coefficients.flatten()})

# Preview the first rows of the DataFrame
coefficients_df.head()

(1, 631)
The number of coefficients matches the number of feature columns.


Unnamed: 0,Feature,Coefficient
0,Cytoplasm_AreaShape_Compactness,0.084341
1,Cytoplasm_AreaShape_Eccentricity,0.05739
2,Cytoplasm_AreaShape_Extent,0.0
3,Cytoplasm_AreaShape_FormFactor,0.01602
4,Cytoplasm_AreaShape_MajorAxisLength,0.0


## Split the data frame by negative, positive, and zero coefficients, which relate to different class importance

In [5]:
# Partition the features by the sign of their coefficient and relabel the
# coefficient column of each new frame to reflect the group it holds
positive_coeffs_df = coefficients_df.loc[coefficients_df['Coefficient'] > 0].set_axis(
    ['Feature', 'Healthy_Coeffs'], axis=1
)
negative_coeffs_df = coefficients_df.loc[coefficients_df['Coefficient'] < 0].set_axis(
    ['Feature', 'Failing_Coeffs'], axis=1
)
zero_coeffs_df = coefficients_df.loc[coefficients_df['Coefficient'] == 0].set_axis(
    ['Feature', 'Zero_Coeffs'], axis=1
)

# Write each group to its own CSV in the data folder (row index is not saved)
for coeffs_frame, csv_path in (
    (positive_coeffs_df, f'{data_dir}/positive_coeffs.csv'),
    (negative_coeffs_df, f'{data_dir}/negative_coeffs.csv'),
    (zero_coeffs_df, f'{data_dir}/zero_coeffs.csv'),
):
    coeffs_frame.to_csv(csv_path, index=False)


# Report how many features landed in each group, then preview the negative group
print("Positive Coefficients:", len(positive_coeffs_df))
print("\nNegative Coefficients:", len(negative_coeffs_df))
print("\nZero Coefficients:", len(zero_coeffs_df))
negative_coeffs_df.head()

Positive Coefficients: 208

Negative Coefficients: 216

Zero Coefficients: 207


Unnamed: 0,Feature,Failing_Coeffs
7,Cytoplasm_AreaShape_Solidity,-0.23075
8,Cytoplasm_AreaShape_Zernike_0_0,-0.09169
10,Cytoplasm_AreaShape_Zernike_2_0,-0.003549
12,Cytoplasm_AreaShape_Zernike_3_1,-0.079241
15,Cytoplasm_AreaShape_Zernike_4_2,-0.027028


## Explore the coefficients

In [6]:
# Look up the index label of the largest coefficient, then pull that full row
max_coefficient_label = coefficients_df['Coefficient'].idxmax()
max_row = coefficients_df.loc[max_coefficient_label]

# Unpack the feature name and its coefficient from the row
max_feature, max_coefficient_value = max_row['Feature'], max_row['Coefficient']

# Report the strongest positive feature
print("Feature with the highest coefficient:", max_feature)
print("Coefficient value:", max_coefficient_value)

Feature with the highest coefficient: Nuclei_Intensity_MeanIntensityEdge_Hoechst
Coefficient value: 1.264448489875555


In [7]:
# Order the features from the most positive coefficient down to the most negative
coefficients_healthy_df = coefficients_df.sort_values('Coefficient', ascending=False)

# The first ten rows are the top ranking features for predicting the "Healthy" class
coefficients_healthy_df.head(n=10)

Unnamed: 0,Feature,Coefficient
502,Nuclei_Intensity_MeanIntensityEdge_Hoechst,1.264448
504,Nuclei_Intensity_MinIntensityEdge_Hoechst,0.807379
487,Nuclei_Intensity_IntegratedIntensity_PM,0.764603
482,Nuclei_Intensity_IntegratedIntensityEdge_Hoechst,0.734242
80,Cytoplasm_Intensity_StdIntensityEdge_Hoechst,0.582452
443,Nuclei_Correlation_K_Mitochondria_PM,0.553185
199,Cytoplasm_Texture_InverseDifferenceMoment_Acti...,0.473053
569,Nuclei_Texture_AngularSecondMoment_PM_3_01_256,0.457448
353,Cells_Texture_AngularSecondMoment_Hoechst_3_01...,0.425375
472,Nuclei_Correlation_RWC_Mitochondria_PM,0.397943


In [8]:
# Look up the index label of the most negative coefficient, then pull that full row
min_coefficient_label = coefficients_df['Coefficient'].idxmin()
min_row = coefficients_df.loc[min_coefficient_label]

# Unpack the feature name and its coefficient from the row
min_feature, min_coefficient_value = min_row['Feature'], min_row['Coefficient']

# Report the strongest negative feature
print("Feature with the most negative coefficient:", min_feature)
print("Coefficient value:", min_coefficient_value)

Feature with the most negative coefficient: Cells_Intensity_IntegratedIntensityEdge_Actin
Coefficient value: -0.8649601345230622


In [9]:
# Order the features from the most negative coefficient up to the most positive
coefficients_failing_df = coefficients_df.sort_values('Coefficient', ascending=True)

# The first ten rows are the top ranking features for predicting the "Failing" class
coefficients_failing_df.head(n=10)

Unnamed: 0,Feature,Coefficient
272,Cells_Intensity_IntegratedIntensityEdge_Actin,-0.86496
292,Cells_Intensity_StdIntensityEdge_Hoechst,-0.653109
462,Nuclei_Correlation_RWC_ER_Hoechst,-0.498114
510,Nuclei_Intensity_StdIntensityEdge_Hoechst,-0.496293
387,Nuclei_AreaShape_MajorAxisLength,-0.494908
43,Cytoplasm_Correlation_RWC_ER_Actin,-0.487502
213,Cells_AreaShape_Area,-0.479219
388,Nuclei_AreaShape_MinFeretDiameter,-0.472828
175,Cytoplasm_Texture_DifferenceVariance_Hoechst_3...,-0.463773
469,Nuclei_Correlation_RWC_Mitochondria_Actin,-0.44689
