### Load Libraries


In [1]:
import pathlib
import warnings
from joblib import load
import itertools

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.utils import parallel_backend
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import f1_score

import sys

sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_X_y_data
from evaluate_utils import get_SCM_model_data

### Load/Preview Labeled Data


In [2]:
# location of the labeled data archive, relative to this notebook
labeled_data_path = pathlib.Path("../0.download_data/data/labeled_data.csv.gz")
# load the labeled data through the project helper from split_utils
labeled_data = get_features_data(labeled_data_path)

# report the dataframe dimensions, then display the first five rows
print(labeled_data.shape)
labeled_data.head()

(2862, 1458)


Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.85668,-0.934949,0.725091,2.25545,-0.565433,1.628086,-0.605625,-0.748135
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.74209,0.36552,0.643759,-1.906097,1.01937
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.950706,-0.811825,-0.522427,-1.402842,-0.28994,2.66125,0.126978,-0.824945,-0.494285,1.763332


In [3]:
# count the distinct image paths (Metadata_DNA values) found in the labeled data
unique_image_paths = labeled_data["Metadata_DNA"].unique()
num_images = len(unique_image_paths)
print(f"There are {num_images} images to perform LOIO evaluation on per model.")


There are 270 images to perform LOIO evaluation on per model.


### Get LOIO probabilities


In [4]:
# directory to load the models from
models_dir = pathlib.Path("../2.train_model/models/multi_class_models")

# use a list to collect one wide-format dataframe (cell metadata + one probability column per class)
# for every held-out image of every model
compiled_LOIO_wide_data = []

# each unique image path defines one leave-one-image-out (LOIO) fold
# the folds do not depend on the model, so they are computed once outside the loops
image_paths = labeled_data["Metadata_DNA"].unique()

# iterate through the saved models in sorted (deterministic) order
for model_path in sorted(models_dir.iterdir()):
    # only perform LOIO with hyper params from final models so skip shuffled_baseline models
    if "shuffled" in model_path.name:
        continue

    # load the model; only its hyperparameters (C, l1_ratio) are reused below
    # NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source
    model = load(model_path)
    # determine feature type from model file name
    # assumes names look like "<model_type>__<feature_type>.joblib" -- TODO confirm against the training step
    feature_type = model_path.name.split("__")[1].replace(".joblib", "")

    print(
        f"Performing LOIO for feature type {feature_type} with parameters C: {model.C}, l1_ratio: {model.l1_ratio}"
    )

    # iterate through image paths
    for image_path in image_paths:
        # every cell from the held-out image is for testing, the rest are for training
        is_test_cell = labeled_data["Metadata_DNA"] == image_path
        train_cells = labeled_data.loc[~is_test_cell]
        test_cells = labeled_data.loc[is_test_cell]

        # get X, y from training and testing cells
        X_train, y_train = get_X_y_data(train_cells, feature_type)
        X_test, y_test = get_X_y_data(test_cells, feature_type)

        # capture convergence warning from sklearn
        # this warning does not affect the model but takes up lots of space in the output
        # this warning must be caught with parallel_backend because the logistic regression model uses parallel_backend
        # (n_jobs=-1 means use all processors)
        with parallel_backend("multiprocessing"):
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", category=ConvergenceWarning, module="sklearn"
                )

                # fit a logistic regression model on the training X, y
                # using the hyperparameters of the loaded final model
                LOIO_model = LogisticRegression(
                    penalty="elasticnet",
                    solver="saga",
                    max_iter=100,
                    n_jobs=-1,
                    random_state=0,
                    C=model.C,
                    l1_ratio=model.l1_ratio,
                ).fit(X_train, y_train)

        # create metadata dataframe for test cells with model parameters
        metadata_dataframe = test_cells[
            ["Cell_UUID", "Metadata_DNA", "Mitocheck_Phenotypic_Class"]
        ].reset_index(drop=True)
        metadata_dataframe["Model_Feature_Type"] = feature_type
        metadata_dataframe["Model_C"] = model.C
        metadata_dataframe["Model_l1_ratio"] = model.l1_ratio

        # predict probabilities for test cells and make these probabilities into a dataframe
        # the columns are labeled with the classes of the model that produced the probabilities
        # (LOIO_model), not the loaded model, so labels always match the predict_proba column order
        probas = LOIO_model.predict_proba(X_test)
        probas_dataframe = pd.DataFrame(probas, columns=LOIO_model.classes_)

        # combine metadata and probabilities dataframes for test cells to create wide data
        test_cells_wide_data = pd.concat([metadata_dataframe, probas_dataframe], axis=1)

        # add wide data for this held-out image to compiled data
        compiled_LOIO_wide_data.append(test_cells_wide_data)

Performing LOIO for feature type CP with parameters C: 1.0, l1_ratio: 0.30000000000000004


### Format and save LOIO probabilities


In [None]:
# compile list of wide data into one dataframe
# the result is bound to a new name so the list built by the LOIO loop is left intact
# and this cell can be re-run without pd.concat receiving a dataframe instead of a list
compiled_LOIO_wide_dataframe = pd.concat(compiled_LOIO_wide_data).reset_index(drop=True)

# metadata columns that identify a test cell and the model parameters used for it
# these are listed explicitly so this cell does not depend on variables leaked from the last loop iteration
metadata_columns = [
    "Cell_UUID",
    "Metadata_DNA",
    "Mitocheck_Phenotypic_Class",
    "Model_Feature_Type",
    "Model_C",
    "Model_l1_ratio",
]
# every remaining column is a predicted probability for one phenotypic class
probability_columns = [
    column
    for column in compiled_LOIO_wide_dataframe.columns
    if column not in metadata_columns
]

# convert wide data to tidy long data and sort by Model_Feature_Type, Cell_UUID, and Model_Phenotypic_Class for pretty formatting
compiled_LOIO_tidy_long_data = (
    pd.melt(
        compiled_LOIO_wide_dataframe,
        id_vars=metadata_columns,
        value_vars=probability_columns,
        var_name="Model_Phenotypic_Class",
        value_name="Predicted_Probability",
    )
    .sort_values(["Model_Feature_Type", "Cell_UUID", "Model_Phenotypic_Class"])
    .reset_index(drop=True)
)

# specify results directory
LOIO_probas_dir = pathlib.Path("evaluations/LOIO_probas/")
LOIO_probas_dir.mkdir(parents=True, exist_ok=True)

# define save path
# the file name spelling is kept unchanged in case other code reads this exact path
compiled_LOIO_save_path = pathlib.Path(
    f"{LOIO_probas_dir}/compiled_LOIO_probabilites.tsv"
)

# save data as tsv
compiled_LOIO_tidy_long_data.to_csv(compiled_LOIO_save_path, sep="\t")

# preview tidy long data
compiled_LOIO_tidy_long_data


Unnamed: 0,Cell_UUID,Metadata_DNA,Mitocheck_Phenotypic_Class,Model_Feature_Type,Model_C,Model_l1_ratio,Model_Phenotypic_Class,Predicted_Probability
0,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Elongated,CP,1.0,0.3,ADCCM,0.000290
1,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Elongated,CP,1.0,0.3,Anaphase,0.000040
2,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Elongated,CP,1.0,0.3,Apoptosis,0.000022
3,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Elongated,CP,1.0,0.3,Binuclear,0.000015
4,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Elongated,CP,1.0,0.3,Elongated,0.991291
...,...,...,...,...,...,...,...,...
1075,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Elongated,DP,1.0,1.0,MetaphaseAlignment,0.000490
1076,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Elongated,DP,1.0,1.0,OutOfFocus,0.000562
1077,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Elongated,DP,1.0,1.0,Polylobed,0.000023
1078,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Elongated,DP,1.0,1.0,Prometaphase,0.001219


### Get LOIO probabilities (single class models)


In [None]:
# directory to load the models from
models_dir = pathlib.Path("../2.train_model/models/single_class_models")

# use a list to collect one wide-format dataframe (cell metadata + positive/negative probability columns)
# for every held-out image of every model combination
compiled_LOIO_wide_data = []

# define combinations to test over
model_types = ["final"] # only perform LOIO with hyper params from final models so skip shuffled_baseline models
feature_types = ["CP", "DP", "CP_and_DP"]
phenotypic_classes = labeled_data["Mitocheck_Phenotypic_Class"].unique()

# each unique image path defines one leave-one-image-out (LOIO) fold
# the folds do not depend on the model, so they are computed once outside the loops
image_paths = labeled_data["Metadata_DNA"].unique()

# iterate through each combination of model_types, feature_types, phenotypic_classes
for model_type, feature_type, phenotypic_class in itertools.product(
    model_types, feature_types, phenotypic_classes
):
    single_class_model_path = pathlib.Path(
        f"{models_dir}/{phenotypic_class}_models/{model_type}__{feature_type}.joblib"
    )

    # load the model; only its hyperparameters (C, l1_ratio) are reused below
    # NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source
    model = load(single_class_model_path)

    print(
        f"Performing LOIO on {phenotypic_class} model for feature type {feature_type} with parameters C: {model.C}, l1_ratio: {model.l1_ratio}"
    )

    # iterate through image paths
    for image_path in image_paths:
        # every cell from the held-out image is for testing, the rest are for training
        is_test_cell = labeled_data["Metadata_DNA"] == image_path
        train_cells = labeled_data.loc[~is_test_cell]
        test_cells = labeled_data.loc[is_test_cell]

        # rename negative label and downsample over represented classes
        # (behavior of get_SCM_model_data is defined in evaluate_utils and not visible here)
        train_cells = get_SCM_model_data(train_cells, phenotypic_class, "train")
        test_cells = get_SCM_model_data(test_cells, phenotypic_class, "test")

        # get X, y from training and testing cells
        X_train, y_train = get_X_y_data(train_cells, feature_type)
        X_test, y_test = get_X_y_data(test_cells, feature_type)

        # capture convergence warning from sklearn
        # this warning does not affect the model but takes up lots of space in the output
        # this warning must be caught with parallel_backend because the logistic regression model uses parallel_backend
        # (n_jobs=-1 means use all processors)
        with parallel_backend("multiprocessing"):
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", category=ConvergenceWarning, module="sklearn"
                )

                # fit a logistic regression model on the training X, y
                # using the hyperparameters of the loaded final model
                LOIO_model = LogisticRegression(
                    penalty="elasticnet",
                    solver="saga",
                    max_iter=100,
                    n_jobs=-1,
                    random_state=0,
                    C=model.C,
                    l1_ratio=model.l1_ratio,
                ).fit(X_train, y_train)

        # create metadata dataframe for test cells with model parameters
        metadata_dataframe = test_cells[
            ["Cell_UUID", "Metadata_DNA", "Mitocheck_Phenotypic_Class"]
        ].reset_index(drop=True)
        metadata_dataframe["Model_Feature_Type"] = feature_type
        metadata_dataframe["Model_C"] = model.C
        metadata_dataframe["Model_l1_ratio"] = model.l1_ratio
        metadata_dataframe["Model_Phenotypic_Class"] = phenotypic_class

        # predict probabilities for test cells and make these probabilities into a dataframe
        # the columns are labeled with the classes of the model that produced the probabilities
        # (LOIO_model), not the loaded model, so labels always match the predict_proba column order
        probas = LOIO_model.predict_proba(X_test)
        probas_dataframe = pd.DataFrame(probas, columns=LOIO_model.classes_)
        # make column names consistent for all single class models (SCMs)
        # positive label corresponds to that SCM's phenotypic class, negative is all other labels
        probas_dataframe = probas_dataframe.rename(
            columns={
                phenotypic_class: "Positive_Label",
                f"Not {phenotypic_class}": "Negative_Label",
            }
        )

        # combine metadata and probabilities dataframes for test cells to create wide data
        test_cells_wide_data = pd.concat([metadata_dataframe, probas_dataframe], axis=1)

        # add wide data for this held-out image to compiled data
        compiled_LOIO_wide_data.append(test_cells_wide_data)


../2.train_model/models/single_class_models/Large_models/final__CP.joblib
Performing LOIO on Large model for feature type CP with parameters C: 0.01, l1_ratio: 0.1
../2.train_model/models/single_class_models/Prometaphase_models/final__CP.joblib
Performing LOIO on Prometaphase model for feature type CP with parameters C: 1.0, l1_ratio: 0.7000000000000001
../2.train_model/models/single_class_models/Grape_models/final__CP.joblib
Performing LOIO on Grape model for feature type CP with parameters C: 0.01, l1_ratio: 0.0
../2.train_model/models/single_class_models/Interphase_models/final__CP.joblib
Performing LOIO on Interphase model for feature type CP with parameters C: 1.0, l1_ratio: 0.0
../2.train_model/models/single_class_models/Apoptosis_models/final__CP.joblib
Performing LOIO on Apoptosis model for feature type CP with parameters C: 0.01, l1_ratio: 0.1
../2.train_model/models/single_class_models/ADCCM_models/final__CP.joblib
Performing LOIO on ADCCM model for feature type CP with param

### Format and save LOIO probabilities (single class models)

In [None]:
# compile list of wide data into one dataframe
# the result is bound to a new name so the list built by the LOIO loop is left intact
# and this cell can be re-run without pd.concat receiving a dataframe instead of a list
compiled_SCM_LOIO_wide_dataframe = pd.concat(compiled_LOIO_wide_data).reset_index(drop=True)

# metadata columns that identify a test cell, the single class model, and the model parameters used for it
# these are listed explicitly so this cell does not depend on variables leaked from the last loop iteration
metadata_columns = [
    "Cell_UUID",
    "Metadata_DNA",
    "Mitocheck_Phenotypic_Class",
    "Model_Feature_Type",
    "Model_C",
    "Model_l1_ratio",
    "Model_Phenotypic_Class",
]
# every remaining column is a predicted probability (Positive_Label / Negative_Label)
probability_columns = [
    column
    for column in compiled_SCM_LOIO_wide_dataframe.columns
    if column not in metadata_columns
]

# convert wide data to tidy long data and sort by Model_Feature_Type, Cell_UUID, Model_Phenotypic_Class,
# and Predicted_Label for pretty formatting
# Predicted_Label is the last sort key so the order of the negative/positive rows of each cell is
# deterministic instead of depending on the column order of the wide data
compiled_LOIO_tidy_long_data = (
    pd.melt(
        compiled_SCM_LOIO_wide_dataframe,
        id_vars=metadata_columns,
        value_vars=probability_columns,
        var_name="Predicted_Label",
        value_name="Predicted_Probability",
    )
    .sort_values(
        ["Model_Feature_Type", "Cell_UUID", "Model_Phenotypic_Class", "Predicted_Label"]
    )
    .reset_index(drop=True)
)

# specify results directory
LOIO_probas_dir = pathlib.Path("evaluations/LOIO_probas/")
LOIO_probas_dir.mkdir(parents=True, exist_ok=True)

# define save path
# the file name spelling is kept unchanged in case other code reads this exact path
compiled_LOIO_save_path = pathlib.Path(
    f"{LOIO_probas_dir}/compiled_SCM_LOIO_probabilites.tsv"
)

# save data as tsv
compiled_LOIO_tidy_long_data.to_csv(compiled_LOIO_save_path, sep="\t")

# preview tidy long data
compiled_LOIO_tidy_long_data

Unnamed: 0,Cell_UUID,Metadata_DNA,Mitocheck_Phenotypic_Class,Model_Feature_Type,Model_C,Model_l1_ratio,Model_Phenotypic_Class,Predicted_Label,Predicted_Probability
0,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Not ADCCM,CP,0.01,0.2,ADCCM,Negative_Label,0.651219
1,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Not ADCCM,CP,0.01,0.2,ADCCM,Positive_Label,0.348781
2,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Not Anaphase,CP,0.01,0.0,Anaphase,Negative_Label,0.647665
3,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Not Anaphase,CP,0.01,0.0,Anaphase,Positive_Label,0.352335
4,009a4190-7583-4821-9a17-737f0485d252,LT0010_27/LT0010_27_173_83.tif,Not Apoptosis,CP,0.01,0.1,Apoptosis,Negative_Label,0.913495
...,...,...,...,...,...,...,...,...,...
2155,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Not Polylobed,DP,0.10,0.4,Polylobed,Positive_Label,0.057508
2156,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Not Prometaphase,DP,0.01,0.0,Prometaphase,Negative_Label,0.971908
2157,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Not Prometaphase,DP,0.01,0.0,Prometaphase,Positive_Label,0.028092
2158,f74ce97f-5862-419b-a9a8-44ad53d332a2,LT0010_27/LT0010_27_173_83.tif,Not SmallIrregular,DP,0.01,0.2,SmallIrregular,Negative_Label,0.701965
