## Train machine learning models to predict failing or healthy cell status

## Import libraries

In [1]:
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import parallel_backend

sys.path.append("../utils")
from training_utils import downsample_data, get_X_y_data

## Set paths and variables

In [2]:
# Seed NumPy's global random generator so random operations performed later are reproducible
np.random.seed(0)

# Name of the metadata column that holds the class to predict
label = "Metadata_cell_type"

# Location of the training data set (relative to the working directory)
training_data_path = pathlib.Path("./data/training_data.csv")

# Output locations: one folder for trained models, one for the fitted label encoder
model_dir = pathlib.Path("./models")
encoder_dir = pathlib.Path("./encoder_results")

# Create both output folders up front; missing parents are created and
# already-existing folders are left untouched
model_dir.mkdir(parents=True, exist_ok=True)
encoder_dir.mkdir(parents=True, exist_ok=True)

## Load in training data

In [3]:
# Read the full training set (before any downsampling) to inspect its size and layout
df = pd.read_csv(filepath_or_buffer=training_data_path)

# Report (rows, columns), then display the first five rows as the cell output
print(df.shape)
df.head(n=5)

(8918, 651)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,C,6,7,Healthy,,,376.242424,128.480938,381.32624,137.099836,...,0.707842,0.606332,0.549532,0.638221,0.610855,0.219378,-0.346219,-0.25298,-0.313307,5.0
1,B,3,19,Failing,Dilated_Cardiomyopathy,,866.876056,460.705164,853.963621,364.509569,...,0.829509,0.930983,0.830988,1.201172,0.542919,0.559025,-0.344126,-0.268366,-0.316401,2.0
2,F,8,7,Healthy,,,456.878341,499.134562,435.320179,491.308636,...,0.837364,0.842956,1.101873,0.796826,1.620575,1.55982,-0.342015,-0.266188,-0.393977,6.0
3,D,6,4,Failing,Dilated_Cardiomyopathy,,969.914847,552.963974,968.154904,548.139516,...,0.43736,0.881944,1.076897,0.869873,0.418556,0.207326,-0.336241,-0.261156,-0.33134,3.0
4,E,6,2,Healthy,,,523.463244,62.512252,495.42779,83.191793,...,-0.657142,-0.029122,-0.066978,-0.511275,-1.176767,-0.860554,-0.23222,-0.151296,-0.021158,4.0


## Perform downsampling on training data and output as data frame

In [4]:
# Load the training data through the project's downsampling helper.
# The intent is that every class ends up with as many rows as the smallest class
# (behavior lives in training_utils.downsample_data; confirm there if in doubt).
downsample_df = downsample_data(path_to_data=training_data_path, label=label)

# Sanity check: overall shape and per-class row counts.
# The counts are taken from the configured `label` column (the same one passed to
# the helper) rather than a hard-coded column name, so the check stays correct
# if `label` is changed in the configuration cell.
print(downsample_df.shape)
print(downsample_df[label].value_counts())
downsample_df.head()

(6696, 650)
Failing    3348
Healthy    3348
Name: Metadata_cell_type, dtype: int64


Unnamed: 0_level_0,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_01_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
Metadata_WellRow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B,9,19,Failing,Dilated_Cardiomyopathy,,514.269565,227.987826,484.3093,276.219322,32,...,0.714296,0.499177,0.3438,0.464,0.160815,0.643087,-0.311519,-0.240635,-0.323208,4.0
F,7,23,Failing,Dilated_Cardiomyopathy,,474.185151,475.505958,450.688099,454.02817,33,...,0.87855,1.031134,0.100154,0.286933,1.562716,1.56194,-0.337908,-0.221147,-0.397044,4.0
F,9,23,Failing,Dilated_Cardiomyopathy,,222.640763,512.182004,242.508374,521.577954,27,...,1.289552,1.316277,1.182249,1.032297,1.508439,1.626475,-0.358266,-0.27252,-0.395561,5.0
B,7,19,Failing,Dilated_Cardiomyopathy,,163.564347,667.011439,175.958528,715.618335,29,...,0.624865,0.844951,-0.34205,0.047592,-1.048463,-0.380218,-0.323891,-0.22089,-0.113575,4.0
C,9,23,Failing,Dilated_Cardiomyopathy,,435.642342,521.645946,422.971475,513.217996,23,...,0.862337,0.928782,1.429672,1.525531,1.396693,0.761789,-0.350207,-0.278043,-0.385237,4.0


## Get X and y data for both final and shuffled models

In [5]:
# "final" model inputs: features and labels requested from the downsampled data
# with shuffling turned off
X_train, y_train = get_X_y_data(
    df=downsample_df,
    label=label,
    shuffle=False,
)

# "shuffled_baseline" model inputs: same data frame and label column, requested
# with shuffle=True (how the shuffling is done is defined in training_utils.get_X_y_data)
X_shuffled_train, y_shuffled_train = get_X_y_data(
    df=downsample_df,
    label=label,
    shuffle=True,
)

## Encode labels for both the shuffled and non-shuffled data

**Note:** Failing will be considered as 0 and Healthy will be 1.

In [6]:
# Fit the encoder on the non-shuffled labels; the same fitted encoder is then
# applied to both label sets so they share one class -> integer mapping
le = LabelEncoder().fit(y_train)

# NOTE: y_train is rebound to its encoded form here, so re-running this cell
# without first re-running the previous one would fit the encoder on
# already-encoded values instead of the original class names
y_train = le.transform(y_train)
y_shuffled_train = le.transform(y_shuffled_train)

# Show each original class name next to its encoded value
class_mapping = {
    class_name: encoded_value
    for class_name, encoded_value in zip(le.classes_, le.transform(le.classes_))
}
print("Class Mapping:")
print(class_mapping)

Class Mapping:
{'Failing': 0, 'Healthy': 1}


## Train the models

**Note:** We use RandomizedSearchCV to tune the model's hyperparameters: because the dataset is large, sampling random hyperparameter combinations is more practical than exhaustively trying every combination.

### Set up the model and hyper parameter method

In [7]:
# Stratified k-fold cross validation with 10 folds and no shuffling
# (10 is an explicit choice here, not the scikit-learn default)
straified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

# Fixed Logistic Regression settings shared by the final and shuffled models:
# elastic-net penalty with the saga solver, max_iter explicitly set to 1000,
# all available cores, a fixed seed, and balanced class weights
logreg_params = dict(
    penalty="elasticnet",
    solver="saga",
    max_iter=1000,
    n_jobs=-1,
    random_state=0,
    class_weight="balanced",
)

# Search space sampled by RandomizedSearchCV:
# 7 log-spaced values of C from 1e-3 to 1e3 and 11 evenly spaced l1_ratio values from 0 to 1
param_dist = dict(
    C=np.logspace(-3, 3, 7),
    l1_ratio=np.linspace(0, 1, 11),
)

# Random search settings: weighted F1 scoring, fixed seed, all available cores,
# and the stratified folds defined above ("n_iter" is not set, so its default is used)
random_search_params = dict(
    param_distributions=param_dist,
    scoring="f1_weighted",
    random_state=0,
    n_jobs=-1,
    cv=straified_k_folds,
)

### Train final model

In [8]:
# Only train when the "models" folder has no file with "final" in its name
if not any(model_dir.glob("*final*")):
    # Logistic regression estimator for the non-shuffled training data
    final_logreg = LogisticRegression(**logreg_params)

    # Wrap the estimator in the randomized hyperparameter search
    final_random_search = RandomizedSearchCV(final_logreg, **random_search_params)

    # Run the search on the multiprocessing backend while ignoring sklearn's
    # ConvergenceWarning for the duration of the fit
    with parallel_backend("multiprocessing"), warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=ConvergenceWarning, module="sklearn"
        )
        final_random_search.fit(X_train, y_train)
else:
    print("Model training skipped as a 'final' model already exists.")

Model training skipped as a 'final' model already exists.


### Train shuffled baseline model

In [9]:
# Only train when the "models" folder has no file with "shuffled" in its name
if not any(model_dir.glob("*shuffled*")):
    # Logistic regression estimator for the shuffled training data
    shuffled_logreg = LogisticRegression(**logreg_params)

    # Wrap the estimator in the randomized hyperparameter search
    shuffled_random_search = RandomizedSearchCV(shuffled_logreg, **random_search_params)

    # Run the search on the multiprocessing backend while ignoring sklearn's
    # ConvergenceWarning for the duration of the fit
    with parallel_backend("multiprocessing"), warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=ConvergenceWarning, module="sklearn"
        )
        shuffled_random_search.fit(X_shuffled_train, y_shuffled_train)
else:
    print("Model training skipped as a 'shuffled' model already exists.")

Model training skipped as a 'shuffled' model already exists.


## Save the models and label encoder

In [10]:
data_prefix = "log_reg_fs_plate_4"

# Check each model separately, using the same file-name patterns as the training
# cells. A model is trained by its cell exactly when its pattern matches nothing,
# so that is also exactly when its search object exists and needs saving.
# (Previously, one existing model blocked saving of the other, freshly trained one.)
final_exists = any(model_dir.glob("*final*"))
shuffled_exists = any(model_dir.glob("*shuffled*"))

# Names of the models written during this run
saved_models = []

if not final_exists:
    dump(
        final_random_search.best_estimator_,
        f"{model_dir}/{data_prefix}_final_downsample.joblib",
    )
    saved_models.append("final")

if not shuffled_exists:
    dump(
        shuffled_random_search.best_estimator_,
        f"{model_dir}/{data_prefix}_shuffled_downsample.joblib",
    )
    saved_models.append("shuffled")

if not saved_models:
    # Both models were already on disk, so nothing was trained or written
    print(
        "No models were generated or saved because 'final' and/or 'shuffled' files already exist."
    )
else:
    # Save the label encoder alongside any newly saved model
    dump(le, f"{encoder_dir}/label_encoder_{data_prefix}.joblib")

    if len(saved_models) == 2:
        print("Models and label encoder have been saved!")
    else:
        print(
            f"The '{saved_models[0]}' model and label encoder have been saved; "
            "the other model already existed and was left untouched."
        )

No models were generated or saved because 'final' and/or 'shuffled' files already exist.
