## Train machine learning models to predict failing or healthy cell status

## Import libraries

In [1]:
import pathlib
import sys
import warnings

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import parallel_backend

sys.path.append("../utils")
from training_utils import downsample_data, get_X_y_data

## Set paths and variables

In [2]:
# Seed numpy's global RNG so that any random operations performed are reproducible
np.random.seed(0)

# Location of the training data set
training_data_path = pathlib.Path("./data/training_data.csv")

# Metadata column used as the prediction class
label = "Metadata_cell_type"

# Output directories: one for the trained models, one for the label encoder
model_dir = pathlib.Path("./models")
encoder_dir = pathlib.Path("./encoder_results")

# Create both directories (including missing parents); existing ones are left alone
for output_dir in (model_dir, encoder_dir):
    output_dir.mkdir(exist_ok=True, parents=True)

## Load in training data

In [3]:
# Read the full training set (before any downsampling) into a data frame
df = pd.read_csv(training_data_path)

# Print the (rows, columns) shape, then preview the first rows
print(df.shape)
df.head()

(8702, 658)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
0,F,2,7,Healthy,,,613.422998,254.98152,630.647224,233.819879,...,0.821706,0.892537,0.977827,1.142503,1.894191,-0.32555,-0.265339,-0.248069,-0.384025,1
1,F,7,23,Failing,Dilated_Cardiomyopathy,,275.111654,580.180929,218.303429,625.545988,...,1.359632,0.896813,0.919688,1.773802,1.723262,-0.339526,-0.36467,-0.253798,-0.306529,5
2,C,9,23,Failing,Dilated_Cardiomyopathy,,485.506726,736.105381,491.177433,723.542717,...,-0.348476,-0.155386,-0.169902,-0.13877,-1.085878,-0.086118,1.301086,-0.122886,0.077051,2
3,B,5,19,Failing,Dilated_Cardiomyopathy,,359.107872,637.41691,405.214423,641.678827,...,-0.664908,0.596938,0.237781,-0.342517,-0.756091,-0.080471,-0.195668,-0.208327,-0.208316,5
4,B,11,2,Healthy,,,590.700501,914.219298,572.155363,919.109394,...,0.083843,-0.985349,-1.172097,-1.524154,-1.343887,-0.282244,0.048215,0.057349,0.238584,2


## Perform downsampling on training data and output as data frame

In [4]:
# Load the training plate 4 data, downsampled to the lowest class
# (the downsampling itself is implemented in training_utils.downsample_data)
downsample_df = downsample_data(path_to_data=training_data_path, label=label)

print(downsample_df.shape)
# Count rows per class using the configured label column, so this check
# keeps reporting the right column if `label` is ever changed
print(downsample_df[label].value_counts())
downsample_df.head()

(6522, 657)
Failing    3261
Healthy    3261
Name: Metadata_cell_type, dtype: int64


Unnamed: 0_level_0,Metadata_WellCol,Metadata_heart_number,Metadata_cell_type,Metadata_heart_failure_type,Metadata_treatment,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,Metadata_Image_Count_Cells,...,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_01_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_03_256,Nuclei_Texture_SumVariance_ER_3_03_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256,Metadata_Neighbors_Adjacent
Metadata_WellRow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B,3,19,Failing,Dilated_Cardiomyopathy,,627.541315,485.854132,625.686494,468.664476,30,...,1.43307,1.496215,1.50268,1.247278,1.309461,-0.340186,-0.331621,-0.258459,-0.360749,1
D,4,4,Failing,Dilated_Cardiomyopathy,,733.984631,135.354508,711.48403,126.972989,17,...,-0.272718,-0.270392,-0.078268,0.120369,0.278177,-0.204931,-0.098995,-0.183604,-0.293393,5
B,3,19,Failing,Dilated_Cardiomyopathy,,465.48593,295.227179,482.387108,273.99045,35,...,-0.835077,-2.118458,-1.40833,-2.232201,-1.821909,0.912571,1.753897,0.28786,2.013099,3
E,5,19,Failing,Dilated_Cardiomyopathy,,876.896514,937.699346,865.839058,949.917594,26,...,-1.787664,-1.044779,-1.759282,-0.572932,-1.005298,-0.141144,-0.097852,-0.040746,0.038565,4
C,3,23,Failing,Dilated_Cardiomyopathy,,703.206457,836.039932,716.919096,819.884167,22,...,1.215096,0.833805,1.021828,0.56635,0.309508,-0.34576,-0.333346,-0.251009,-0.302852,4


## Get X and y data for both final and shuffled models

In [5]:
# Get the non-shuffled training data from the downsampled df (used for the "final" model)
X_train, y_train = get_X_y_data(df=downsample_df, label=label, shuffle=False)

# Get the shuffled training data from the downsampled df (used for the "shuffled_baseline" model)
# NOTE(review): what exactly is shuffled is decided inside training_utils.get_X_y_data — confirm there
X_shuffled_train, y_shuffled_train = get_X_y_data(
    df=downsample_df, label=label, shuffle=True
)

## Encode labels in both the shuffled and non-shuffled data

**Note:** `Failing` is encoded as 0 and `Healthy` is encoded as 1.

In [6]:
# Learn the class-to-integer encoding from the non-shuffled labels
le = LabelEncoder().fit(y_train)

# Apply the fitted encoder to the non-shuffled and the shuffled labels.
# NOTE: both label variables are overwritten with their encoded values, so
# re-run the previous cell before re-running this one.
y_train = le.transform(y_train)
y_shuffled_train = le.transform(y_shuffled_train)

# Show each original class next to its encoded value
class_mapping = {
    class_name: encoded_value
    for class_name, encoded_value in zip(le.classes_, le.transform(le.classes_))
}
print("Class Mapping:")
print(class_mapping)

Class Mapping:
{'Failing': 0, 'Healthy': 1}


## Train the models

**Note:** We use RandomizedSearchCV to tune the model's hyperparameters: because the dataset is large, sampling random combinations from the search space is cheaper than exhaustively trying every combination.

### Set up the model and hyperparameter search method

In [7]:
# Stratified 10-fold cross-validation; rows are not shuffled before splitting
stratified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

# Logistic Regression model parameters
# (max_iter is set to 1000, above scikit-learn's default of 100)
logreg_params = {
    "penalty": "elasticnet",
    "solver": "saga",
    "max_iter": 1000,
    "n_jobs": -1,
    "random_state": 0,
    "class_weight": "balanced",
}

# Hyperparameter search space for RandomizedSearchCV:
# 7 log-spaced values of C from 1e-3 to 1e3 and 11 evenly spaced l1_ratio values from 0 to 1
param_dist = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}

# Random search settings: "n_iter" is not set, so scikit-learn's default applies;
# "cv" uses the stratified folds defined at the top of this cell
random_search_params = {
    "param_distributions": param_dist,
    "scoring": "f1_weighted",
    "random_state": 0,
    "n_jobs": -1,
    "cv": stratified_k_folds,
}

### Train final model

In [8]:
# Only train when the "models" folder has no file with "final" in its name
if not any(model_dir.glob("*final*")):
    # Logistic regression estimator for the non-shuffled training data,
    # wrapped in a randomized hyperparameter search
    final_logreg = LogisticRegression(**logreg_params)
    final_random_search = RandomizedSearchCV(final_logreg, **random_search_params)

    # Run the search with sklearn's ConvergenceWarning filtered out
    with parallel_backend("multiprocessing"), warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=ConvergenceWarning, module="sklearn"
        )
        final_random_search.fit(X_train, y_train)

    # Report the best hyperparameter combination found by the search
    print("Optimal parameters:", final_random_search.best_params_)
else:
    print("Model training skipped as a 'final' model already exists.")

Optimal parameters: {'l1_ratio': 0.4, 'C': 0.1}


### Train shuffled baseline model

In [9]:
# Only train when the "models" folder has no file with "shuffled" in its name
if not any(model_dir.glob("*shuffled*")):
    # Logistic regression estimator for the shuffled training data,
    # wrapped in a randomized hyperparameter search
    shuffled_logreg = LogisticRegression(**logreg_params)
    shuffled_random_search = RandomizedSearchCV(shuffled_logreg, **random_search_params)

    # Run the search with sklearn's ConvergenceWarning filtered out
    with parallel_backend("multiprocessing"), warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=ConvergenceWarning, module="sklearn"
        )
        shuffled_random_search.fit(X_shuffled_train, y_shuffled_train)
else:
    print("Model training skipped as a 'shuffled' model already exists.")

## Save the models and label encoder

In [10]:
data_prefix = "log_reg_fs_plate_4"

# Evaluate the same guards the training cells use, before anything is written.
# A model was trained in this run exactly when its guard found no existing file,
# so each model is saved independently instead of all-or-nothing; otherwise a
# freshly trained model would be discarded whenever the other one already exists.
final_exists = any(model_dir.glob("*final*"))
shuffled_exists = any(model_dir.glob("*shuffled*"))

# Names of the models written by this cell
saved_models = []

if not final_exists:
    dump(
        final_random_search.best_estimator_,
        f"{model_dir}/{data_prefix}_final_downsample.joblib",
    )
    saved_models.append("final")

if not shuffled_exists:
    dump(
        shuffled_random_search.best_estimator_,
        f"{model_dir}/{data_prefix}_shuffled_downsample.joblib",
    )
    saved_models.append("shuffled")

if not saved_models:
    print(
        "No models were generated or saved because 'final' and/or 'shuffled' files already exist."
    )
else:
    # Save the label encoder alongside any newly saved model
    dump(le, f"{encoder_dir}/label_encoder_{data_prefix}.joblib")

    if len(saved_models) == 2:
        print("Models and label encoder have been saved!")
    else:
        print(
            f"Only the '{saved_models[0]}' model and the label encoder have been saved; "
            "the other model file already exists."
        )

Models and label encoder have been saved!
