# Extract logistic regression model coefficients per CellProfiler feature

Each coefficient from the machine learning model falls into one of three cases: positive (x > 0), where an increase in the feature value makes a cell more likely to be predicted as the Healthy cell type; negative (x < 0), where an increase in the feature value makes a cell more likely to be predicted as the Failing cell type; or zero (x = 0), where the feature has no impact on predicting whether a cell is Healthy or Failing.

## Import libraries

In [1]:
from joblib import load
import pathlib
import pandas as pd

## Load in the training data to collect the CellProfiler feature columns

In [2]:
# Path to data dir with the training, testing, and holdout splits
data_dir = pathlib.Path("../0.train_logistic_regression/data/")

# Output dir for coefficients data (created on first run, reused afterwards)
coeff_dir = pathlib.Path("./coeff_data/")
coeff_dir.mkdir(exist_ok=True)

# Path to training data to access the feature columns
path_to_training_data = data_dir / "training_data.csv"

# Read only the header row (nrows=0) to get the column names without loading any data
all_columns = pd.read_csv(path_to_training_data, nrows=0).columns

# Drop the columns that start with 'Metadata_' so only the CellProfiler feature columns remain
feature_columns = [col for col in all_columns if not col.startswith('Metadata_')]

# Report how many feature columns were found and show a small sample of them
print(len(feature_columns))
print("Examples of first five feature columns:")
for example_column in feature_columns[:5]:
    print(example_column)

625
Examples of first five feature columns:
Cytoplasm_AreaShape_BoundingBoxArea
Cytoplasm_AreaShape_Compactness
Cytoplasm_AreaShape_Eccentricity
Cytoplasm_AreaShape_Extent
Cytoplasm_AreaShape_FormFactor


## Set paths and load in the final model

In [3]:
# Location of the serialized final model; strict resolution raises immediately
# if the joblib file is missing instead of failing later inside the loader
path_to_final_model = pathlib.Path(
    "../0.train_logistic_regression/models/log_reg_fs_plate_4_final_downsample.joblib"
).resolve(strict=True)

# Deserialize the final model
# NOTE: joblib files are pickle-based, so only load model files from a trusted source
final_model = load(path_to_final_model)

## Collect coefficients from the model and match with the correct feature in a dataframe

In [4]:
# Get the coefficients (a 2-D array; the recorded output of this cell shows shape (1, 625))
coefficients = final_model.coef_

# Print the coefficients shape
print(coefficients.shape)

# Confirm there is exactly one row of coefficients with one value per feature column.
# Fail fast on a mismatch: pairing coefficients with the wrong feature names would
# silently corrupt every downstream CSV, so a warning alone is not enough.
expected_shape = (1, len(feature_columns))
if coefficients.shape != expected_shape:
    raise ValueError(
        f"The coefficients shape {coefficients.shape} does not match the expected "
        f"shape {expected_shape} derived from the feature columns."
    )
print("The number of coefficients matches the number of feature columns.")

# Create a DataFrame with the coefficients and features
# NOTE(review): this assumes the training-data column order is the order the model
# was fitted with — confirm against the training notebook.
coefficients_df = pd.DataFrame({'Feature': feature_columns, 'Coefficient': coefficients[0]})

# Save the all coefficient data into a CSV file prior to splitting
coefficients_df.to_csv(f'{coeff_dir}/all_coeffs.csv', index=False)

# Show the first rows of the DataFrame
coefficients_df.head()

(1, 625)
The number of coefficients matches the number of feature columns.


Unnamed: 0,Feature,Coefficient
0,Cytoplasm_AreaShape_BoundingBoxArea,0.0
1,Cytoplasm_AreaShape_Compactness,0.258104
2,Cytoplasm_AreaShape_Eccentricity,0.043979
3,Cytoplasm_AreaShape_Extent,0.0
4,Cytoplasm_AreaShape_FormFactor,0.0


## Split the data frame by negative, positive, and zero coefficients, which relate to different class importance

In [5]:
# Partition the coefficient table by sign and relabel the value column per subset:
# positive -> 'Healthy_Coeffs', negative -> 'Failing_Coeffs', zero -> 'Zero_Coeffs'
positive_coeffs_df = (
    coefficients_df.loc[coefficients_df['Coefficient'] > 0]
    .set_axis(['Feature', 'Healthy_Coeffs'], axis='columns')
)

# Negative coefficients are stored as magnitudes to prevent issues during plotting
negative_coeffs_df = (
    coefficients_df.loc[coefficients_df['Coefficient'] < 0]
    .assign(Coefficient=lambda subset: subset['Coefficient'].abs())
    .set_axis(['Feature', 'Failing_Coeffs'], axis='columns')
)

zero_coeffs_df = (
    coefficients_df.loc[coefficients_df['Coefficient'] == 0]
    .set_axis(['Feature', 'Zero_Coeffs'], axis='columns')
)

# Write each subset to its own CSV inside the coefficient output folder
positive_coeffs_df.to_csv(f'{coeff_dir}/healthy_coeffs.csv', index=False)
negative_coeffs_df.to_csv(f'{coeff_dir}/failing_coeffs.csv', index=False)
zero_coeffs_df.to_csv(f'{coeff_dir}/zero_coeffs.csv', index=False)


# Report how many features fall into each group, then preview the negative group
print("Positive Coefficients:", len(positive_coeffs_df))
print("\nNegative Coefficients:", len(negative_coeffs_df))
print("\nZero Coefficients:", len(zero_coeffs_df))
negative_coeffs_df.head()

Positive Coefficients: 191

Negative Coefficients: 202

Zero Coefficients: 232


Unnamed: 0,Feature,Failing_Coeffs
8,Cytoplasm_AreaShape_Solidity,0.368831
9,Cytoplasm_AreaShape_Zernike_0_0,0.070718
15,Cytoplasm_AreaShape_Zernike_4_0,0.031305
16,Cytoplasm_AreaShape_Zernike_4_2,0.095932
18,Cytoplasm_AreaShape_Zernike_5_3,0.062056


## Explore the coefficients

In [6]:
# Index label of the largest (most positive) coefficient, then the full row at that label
max_coefficient_label = coefficients_df['Coefficient'].idxmax()
max_row = coefficients_df.loc[max_coefficient_label]

# Pull the feature name and its coefficient out of that row
max_feature, max_coefficient_value = max_row['Feature'], max_row['Coefficient']

# Report the strongest positive feature
print("Feature with the highest coefficient:", max_feature)
print("Coefficient value:", max_coefficient_value)

Feature with the highest coefficient: Nuclei_Intensity_MeanIntensityEdge_Hoechst
Coefficient value: 1.4188322866894274


In [7]:
# Order every feature from the most positive coefficient down to the most negative
coefficients_healthy_df = coefficients_df.sort_values('Coefficient', ascending=False)

# Display the ten highest-ranking features for predicting the "Healthy" class
coefficients_healthy_df.iloc[:10]

Unnamed: 0,Feature,Coefficient
495,Nuclei_Intensity_MeanIntensityEdge_Hoechst,1.418832
497,Nuclei_Intensity_MinIntensityEdge_Hoechst,0.810532
349,Cells_Texture_AngularSecondMoment_Hoechst_3_01...,0.74815
82,Cytoplasm_Intensity_StdIntensityEdge_Hoechst,0.68339
436,Nuclei_Correlation_K_Mitochondria_PM,0.668301
499,Nuclei_Intensity_MinIntensity_Actin,0.623373
475,Nuclei_Intensity_IntegratedIntensityEdge_Hoechst,0.555998
480,Nuclei_Intensity_IntegratedIntensity_PM,0.551413
476,Nuclei_Intensity_IntegratedIntensity_Actin,0.493182
561,Nuclei_Texture_AngularSecondMoment_PM_3_01_256,0.482055


In [8]:
# Index label of the smallest (most negative) coefficient, then the full row at that label
min_coefficient_label = coefficients_df['Coefficient'].idxmin()
min_row = coefficients_df.loc[min_coefficient_label]

# Pull the feature name and its coefficient out of that row
min_feature, min_coefficient_value = min_row['Feature'], min_row['Coefficient']

# Report the strongest negative feature
print("Feature with the most negative coefficient:", min_feature)
print("Coefficient value:", min_coefficient_value)

Feature with the most negative coefficient: Cells_Intensity_IntegratedIntensityEdge_Actin
Coefficient value: -1.3572041220936908


In [9]:
# Order every feature from the most negative coefficient up to the most positive
coefficients_failing_df = coefficients_df.sort_values('Coefficient', ascending=True)

# Display the ten highest-ranking features for predicting the "Failing" class
coefficients_failing_df.iloc[:10]

Unnamed: 0,Feature,Coefficient
268,Cells_Intensity_IntegratedIntensityEdge_Actin,-1.357204
288,Cells_Intensity_StdIntensityEdge_Hoechst,-0.736874
183,Cytoplasm_Texture_InfoMeas1_PM_3_00_256,-0.518741
503,Nuclei_Intensity_StdIntensityEdge_Hoechst,-0.515995
286,Cells_Intensity_MinIntensityEdge_Actin,-0.506231
378,Nuclei_AreaShape_ConvexArea,-0.431671
381,Nuclei_AreaShape_MinFeretDiameter,-0.41316
210,Cells_AreaShape_Area,-0.400735
458,Nuclei_Correlation_RWC_Hoechst_Actin,-0.390901
42,Cytoplasm_Correlation_RWC_ER_Actin,-0.382378


## Add ranking column with sorted descending values and save the CSV for visualization

Rank is assigned in descending order of coefficient value: the feature with the highest positive coefficient has rank one, and the rank increases as the coefficient decreases. We expect to see that the model takes into account many different features (with positive and negative coefficients, which relate to the different classes) and that many features have a coefficient of zero, meaning they are redundant to the model.

In [10]:
# Sort coefficients_df from the most positive to the most negative coefficient.
# A stable sort keeps tied coefficients (e.g. the many features at exactly zero)
# in their existing relative order, so the ranks assigned below are reproducible
# instead of depending on the sort implementation.
# NOTE: this rebinds coefficients_df to the sorted frame and adds a third column;
# cells above that expect the two-column frame should not be re-run after this one.
coefficients_df = coefficients_df.sort_values(by='Coefficient', ascending=False, kind='stable')

# Add a new column 'Rank' (1 = highest coefficient), following the sorted row order
coefficients_df['Rank'] = range(1, len(coefficients_df) + 1)

# Save the ranked df
coefficients_df.to_csv(f'{coeff_dir}/ranked_coeffs.csv', index=False)

# Show df to assess if the ranking was performed correctly
print(coefficients_df.shape)
coefficients_df.head()

(625, 3)


Unnamed: 0,Feature,Coefficient,Rank
495,Nuclei_Intensity_MeanIntensityEdge_Hoechst,1.418832,1
497,Nuclei_Intensity_MinIntensityEdge_Hoechst,0.810532,2
349,Cells_Texture_AngularSecondMoment_Hoechst_3_01...,0.74815,3
82,Cytoplasm_Intensity_StdIntensityEdge_Hoechst,0.68339,4
436,Nuclei_Correlation_K_Mitochondria_PM,0.668301,5
