## Train machine learning models to predict failing or healthy cell status

## Import libraries

In [1]:
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import parallel_backend

sys.path.append("../utils")
from training_utils import downsample_data, get_X_y_data

## Set paths and variables

In [2]:
# Seed numpy's global random number generator so that any numpy-based random
# operations performed in this notebook are reproducible
np.random.seed(0)

# Location of the training data set
training_data_path = pathlib.Path("./data/training_data.csv")

# Metadata column that holds the class to predict
label = "Metadata_cell_type"

# Output directories: one for the trained models, one for the label encoder
model_dir = pathlib.Path("./models")
encoder_dir = pathlib.Path("./encoder_results")

# Create each output directory (and any missing parents) if it does not exist yet
for output_dir in (model_dir, encoder_dir):
    output_dir.mkdir(exist_ok=True, parents=True)

## Load in training data

In [3]:
# Read the full training data set from the configured CSV path
df = pd.read_csv(filepath_or_buffer=training_data_path)

# Report (rows, columns), then preview the first five rows
print(df.shape)
df.head(n=5)

(8668, 645)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,E,2,2,Healthy,,,441.483467,778.252011,438.909951,749.408845,...,0.869128,0.758897,1.195791,1.283194,-0.208418,0.326266,-0.333189,-0.261369,-0.254243,6
1,G,6,4,Failing,Dilated_Cardiomyopathy,,701.516199,301.958963,714.290835,311.671313,...,0.545165,0.173092,0.479434,0.147169,1.577848,1.025633,-0.260897,-0.085654,-0.33715,8
2,G,2,4,Failing,Dilated_Cardiomyopathy,,638.387773,151.165939,697.226395,127.074661,...,0.745489,0.870165,0.959225,1.031354,1.28224,0.73549,-0.339162,-0.250675,-0.329171,1
3,B,2,2,Healthy,,,895.475589,107.883838,927.093275,120.202047,...,1.078607,1.086057,0.99211,1.175198,0.688815,0.704334,-0.345029,-0.248479,-0.284084,1
4,B,3,19,Failing,Dilated_Cardiomyopathy,,625.849673,580.358751,654.460339,586.267165,...,0.730539,0.765736,1.09822,0.887757,0.798801,0.202608,-0.320407,-0.259002,-0.336574,5


## Perform downsampling on training data and output as data frame

In [4]:
# Load the plate 4 training data, downsampled so that every class is reduced to
# the size of the smallest class (see `downsample_data` in training_utils)
downsample_df = downsample_data(path_to_data=training_data_path, label=label)

print(downsample_df.shape)
# Count cells per class using the same `label` column that was passed to
# `downsample_data`, so this check always reflects the column actually balanced
print(downsample_df[label].value_counts())
downsample_df.head()

(6492, 644)
Failing    3246
Healthy    3246
Name: Metadata_cell_type, dtype: int64


Unnamed: 0_level_0,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
Metadata_WellRow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D,8,4,Failing,Dilated_Cardiomyopathy,,462.301616,125.17684,447.577325,101.181091,32,...,0.64554,0.628166,-0.171782,0.005777,-0.02602,0.133574,-0.323449,-0.166209,-0.279224,3
F,7,23,Failing,Dilated_Cardiomyopathy,,563.943837,491.020423,563.18848,529.42636,15,...,0.254929,0.403851,-0.198852,0.439955,0.963652,0.668016,-0.283631,-0.117615,-0.279644,0
F,9,23,Failing,Dilated_Cardiomyopathy,,726.982414,756.063949,766.07038,773.330319,25,...,1.163046,1.05629,1.076159,1.12141,1.19457,1.127103,-0.339849,-0.257978,-0.352242,3
F,7,23,Failing,Dilated_Cardiomyopathy,,880.060143,228.900102,896.201017,205.120417,23,...,0.585016,0.662624,0.012257,-0.208008,0.922835,0.759215,-0.341053,-0.216576,-0.322021,5
B,5,19,Failing,Dilated_Cardiomyopathy,,860.353646,311.37962,870.167182,314.207839,37,...,0.40433,0.588658,0.866026,0.727757,-0.675272,0.47468,-0.298338,-0.247866,-0.131623,6


## Get X and y data for both final and shuffled models

In [5]:
# Non-shuffled features and labels from the downsampled data (for the "final" model)
X_train, y_train = get_X_y_data(
    df=downsample_df,
    label=label,
    shuffle=False,
)

# Shuffled counterpart from the same downsampled data (for the "shuffled_baseline" model)
X_shuffled_train, y_shuffled_train = get_X_y_data(
    df=downsample_df,
    label=label,
    shuffle=True,
)

## Encode labels in both shuffled and non-shuffled data

**Note:** Failing will be considered as 0 and Healthy will be 1.

In [6]:
# Fit the label encoder on the non-shuffled class labels
le = LabelEncoder().fit(y_train)

# Replace the string labels of both data sets with their encoded integer values
y_shuffled_train = le.transform(y_shuffled_train)
y_train = le.transform(y_train)

# Show each original class name next to the integer it was encoded as
class_mapping = {
    class_name: encoded_value
    for class_name, encoded_value in zip(le.classes_, le.transform(le.classes_))
}
print("Class Mapping:")
print(class_mapping)

Class Mapping:
{'Failing': 0, 'Healthy': 1}


## Train the models

**Note:** We will be using RandomizedSearchCV to tune the model's hyperparameters. Since we have a larger dataset, it is easier to try random combinations of hyperparameters than to evaluate all combinations.

### Set up the model and hyperparameter search method

In [7]:
# Use stratified 10-fold cross validation (sklearn's default would be 5 folds);
# the data is split in order, without shuffling
stratified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

# Fixed Logistic Regression settings shared by every candidate in the search
# (max_iter is raised to 1000 rather than left at sklearn's default of 100)
logreg_params = {
    "penalty": "elasticnet",
    "solver": "saga",
    "max_iter": 1000,
    "n_jobs": -1,
    "random_state": 0,
    "class_weight": "balanced",
}

# Define the hyperparameter search space for RandomizedSearchCV:
# C takes 7 log-spaced values from 1e-3 to 1e3, l1_ratio takes 11 values from 0 to 1
param_dist = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}

# Set the random search parameters ("n_iter" is left at its default; "cv" uses
# the stratified 10-fold splitter defined above)
random_search_params = {
    "param_distributions": param_dist,
    "scoring": "f1_weighted",
    "random_state": 0,
    "n_jobs": -1,
    "cv": stratified_k_folds,
}

### Train final model

In [8]:
# Only train when the "models" folder has no file with "final" in its name
if any(model_dir.glob("*final*")):
    print("Model training skipped as a 'final' model already exists.")
else:
    # Generate logistic regression model for non-shuffled training data
    final_logreg = LogisticRegression(**logreg_params)

    # Initialize the RandomizedSearchCV
    final_random_search = RandomizedSearchCV(final_logreg, **random_search_params)

    # Silence sklearn's ConvergenceWarning during the search (noted by the
    # notebook authors as not impacting the result)
    with parallel_backend("multiprocessing"):
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=ConvergenceWarning, module="sklearn"
            )
            # Perform the random hyperparameter search
            final_random_search.fit(X_train, y_train)

    # Print the best parameters inside the training branch: the search object is
    # only created here, so printing it after a skipped training run would raise
    # a NameError on a fresh kernel
    print("Optimal parameters:", final_random_search.best_params_)

Optimal parameters: {'l1_ratio': 0.4, 'C': 0.1}


### Train shuffled baseline model

In [9]:
# Train the shuffled baseline only when the "models" folder has no file with
# "shuffled" in its name
if not any(model_dir.glob("*shuffled*")):
    # Logistic regression estimator for the shuffled training data
    shuffled_logreg = LogisticRegression(**logreg_params)

    # Wrap the estimator in the randomized hyperparameter search
    shuffled_random_search = RandomizedSearchCV(
        estimator=shuffled_logreg, **random_search_params
    )

    # Run the search on the multiprocessing backend while silencing sklearn's
    # ConvergenceWarning (noted by the notebook authors as not impacting the result)
    with parallel_backend("multiprocessing"), warnings.catch_warnings():
        warnings.filterwarnings(
            action="ignore", category=ConvergenceWarning, module="sklearn"
        )
        shuffled_random_search.fit(X_shuffled_train, y_shuffled_train)
else:
    print("Model training skipped as a 'shuffled' model already exists.")

## Save the models and label encoder

In [10]:
# Prefix shared by every file written from this notebook
data_prefix = "log_reg_fs_plate_4"

# Skip saving if any file with "final" or "shuffled" in its name already exists
# in the models folder
if any(model_dir.glob("*final*")) or any(model_dir.glob("*shuffled*")):
    print(
        "No models were generated or saved because 'final' and/or 'shuffled' files already exist."
    )
else:
    # Save the best estimator found by each hyperparameter search
    dump(
        final_random_search.best_estimator_,
        f"{model_dir}/{data_prefix}_final_downsample.joblib",
    )
    dump(
        shuffled_random_search.best_estimator_,
        f"{model_dir}/{data_prefix}_shuffled_downsample.joblib",
    )

    # Save label encoder
    dump(le, f"{encoder_dir}/label_encoder_{data_prefix}.joblib")

    # Report success only when the files were actually written; printing this
    # unconditionally would contradict the "No models were ... saved" message
    print("Models and label encoder have been saved!")

Models and label encoder have been saved!
