### Import Libraries


In [1]:
import sys
import pathlib
import itertools

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from joblib import load

sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_dataset, get_X_y_data
from evaluate_utils import get_SCM_model_data


### Load necessary data


In [2]:
# load features data from indexes and features dataframe
data_split_path = pathlib.Path("../1.split_data/indexes/data_split_indexes.tsv")
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
features_dataframe_path = pathlib.Path("../0.download_data/data/labeled_data.csv.gz")
features_dataframe = get_features_data(features_dataframe_path)

### Get Each Model Predictions on Each Dataset (Multi Class Models)


In [3]:
# directory to load the models from
models_dir = pathlib.Path("../2.train_model/models/multi_class_models")

# use a list to keep track of scores in tidy long format for each model and dataset combination
compiled_predictions = []

# iterate through each model (final model, shuffled baseline model, etc)
# sorted so final models are shown before shuffled_baseline
for model_path in sorted(models_dir.iterdir()):
    model = load(model_path)
    # determine model/feature type from model file name
    model_type = model_path.name.split("__")[0]
    feature_type = model_path.name.split("__")[1].replace(".joblib", "")

    # iterate through label datasets (labels correspond to train, test, etc)
    # with nested for loops, we test each model on each dataset(corresponding to a label)
    for label in data_split_indexes["label"].unique():
        print(
            f"Getting predictions for model: {model_type}, trained with features: {feature_type}, on dataset: {label}"
        )

        # load dataset (train, test, etc)
        data = get_dataset(features_dataframe, data_split_indexes, label)

        # get features and labels dataframes
        X, y = get_X_y_data(data, feature_type)

        # get predictions from model
        y_pred = model.predict(X)

        # create dataframe with dataset index of cell being predicted,
        # predicted phenotypic class,
        # true phenotypic class,
        # and which dataset/models are involved in prediction
        predictions_df = pd.DataFrame(
            {
                "Cell_UUID": data["Cell_UUID"],
                "Phenotypic_Class_Predicted": y,
                "Phenotypic_Class_True": y_pred,
                "data_split": label,
                "shuffled": "shuffled" in model_type,
                "feature_type": feature_type,
            }
        )

        compiled_predictions.append(predictions_df)


Getting predictions for model: final, trained with features: CP, on dataset: train
Getting predictions for model: final, trained with features: CP, on dataset: test
Getting predictions for model: final, trained with features: CP_and_DP, on dataset: train
Getting predictions for model: final, trained with features: CP_and_DP, on dataset: test
Getting predictions for model: final, trained with features: DP, on dataset: train
Getting predictions for model: final, trained with features: DP, on dataset: test
Getting predictions for model: shuffled_baseline, trained with features: CP, on dataset: train
Getting predictions for model: shuffled_baseline, trained with features: CP, on dataset: test
Getting predictions for model: shuffled_baseline, trained with features: CP_and_DP, on dataset: train
Getting predictions for model: shuffled_baseline, trained with features: CP_and_DP, on dataset: test
Getting predictions for model: shuffled_baseline, trained with features: DP, on dataset: train
Gett

### Compile and Save Predictions


In [4]:
# compile predictions and reset index of dataframe
compiled_predictions = pd.concat(compiled_predictions).reset_index(drop=True)

# specify save path
compiled_predictions_save_path = pathlib.Path("predictions/compiled_predictions.tsv")
compiled_predictions_save_path.parent.mkdir(parents=True, exist_ok=True)

# save data as tsv
compiled_predictions.to_csv(compiled_predictions_save_path, sep="\t")

# preview compiled predictions
compiled_predictions

Unnamed: 0,Cell_UUID,Phenotypic_Class_Predicted,Phenotypic_Class_True,data_split,shuffled,feature_type
0,21da27ab-873a-41f4-ab98-49170cae9a2d,Interphase,Interphase,train,False,CP
1,82f7949b-4ea2-45c8-8dd9-7854caf49077,OutOfFocus,OutOfFocus,train,False,CP
2,cec7234f-fe35-4411-aded-f8112bb31219,Apoptosis,Apoptosis,train,False,CP
3,43d9e7c9-c9ec-45ce-8820-048bfb896989,Interphase,Interphase,train,False,CP
4,63ce6652-338e-4afd-9c77-dbc0e903bf92,Prometaphase,Prometaphase,train,False,CP
...,...,...,...,...,...,...
17167,9afa05fd-b8f8-4c3b-8542-74bd89cacb08,Hole,Polylobed,test,True,DP
17168,7526cf67-9cba-489e-9d4c-b2c1b6c900b3,ADCCM,Binuclear,test,True,DP
17169,2ae329f7-6948-40da-93be-fd45ad4c6587,Interphase,Large,test,True,DP
17170,91e9f94c-8fd6-4bf6-916e-a470c8f0f16b,Prometaphase,Interphase,test,True,DP


### Get Each Model Predictions on Each Dataset (Single Class Models)


In [5]:
# directory to load the models from
models_dir = pathlib.Path("../2.train_model/models/single_class_models")

# use a list to keep track of scores in tidy long format for each model and dataset combination
compiled_predictions = []

# define combinations to test over
model_types = [
    "final",
    "shuffled_baseline",
]  # only perform LOIO with hyper params from final models so skip shuffled_baseline models
feature_types = ["CP", "DP", "CP_and_DP"]
evaluation_types = ["train", "test"]
phenotypic_classes = features_dataframe["Mitocheck_Phenotypic_Class"].unique()

# iterate through each combination of feature_types, evaluation_types, phenotypic_classes
for model_type, feature_type, phenotypic_class, evaluation_type in itertools.product(
    model_types, feature_types, phenotypic_classes, evaluation_types
):
    # load single class model for this combination of model type, feature type, and phenotypic class
    single_class_model_path = pathlib.Path(
        f"{models_dir}/{phenotypic_class}_models/{model_type}__{feature_type}.joblib"
    )
    model = load(single_class_model_path)

    print(
        f"Getting predictions for {phenotypic_class} model: {model_type}, trained with features: {feature_type}, on dataset: {evaluation_type}"
    )

    # load dataset (train, test, etc)
    data = get_SCM_model_data(features_dataframe, phenotypic_class, evaluation_type)

    # get features and labels dataframe
    X, y = get_X_y_data(data, feature_type)

    # get predictions from model
    y_pred = model.predict(X)

    # create dataframe with dataset index of cell being predicted,
    # predicted phenotypic class,
    # true phenotypic class,
    # and which dataset/models are involved in prediction
    predictions_df = pd.DataFrame(
        {
            "Cell_UUID": data["Cell_UUID"],
            "Phenotypic_Class_Predicted": y,
            "Phenotypic_Class_True": y_pred,
            "data_split": evaluation_type,
            "shuffled": "shuffled" in model_type,
            "feature_type": feature_type,
        }
    )

    compiled_predictions.append(predictions_df)

Getting predictions for Large model: final, trained with features: CP, on dataset: train
Getting predictions for Large model: final, trained with features: CP, on dataset: test
Getting predictions for Prometaphase model: final, trained with features: CP, on dataset: train
Getting predictions for Prometaphase model: final, trained with features: CP, on dataset: test
Getting predictions for Grape model: final, trained with features: CP, on dataset: train
Getting predictions for Grape model: final, trained with features: CP, on dataset: test
Getting predictions for Interphase model: final, trained with features: CP, on dataset: train
Getting predictions for Interphase model: final, trained with features: CP, on dataset: test
Getting predictions for Apoptosis model: final, trained with features: CP, on dataset: train
Getting predictions for Apoptosis model: final, trained with features: CP, on dataset: test
Getting predictions for ADCCM model: final, trained with features: CP, on dataset: 

### Compile and Save Predictions


In [6]:
# compile predictions and reset index of dataframe
compiled_predictions = pd.concat(compiled_predictions).reset_index(drop=True)

# specify save path
compiled_predictions_save_path = pathlib.Path(
    "predictions/compiled_SCM_predictions.tsv"
)
compiled_predictions_save_path.parent.mkdir(parents=True, exist_ok=True)

# save data as tsv
compiled_predictions.to_csv(compiled_predictions_save_path, sep="\t")

# preview compiled predictions
compiled_predictions

Unnamed: 0,Cell_UUID,Phenotypic_Class_Predicted,Phenotypic_Class_True,data_split,shuffled,feature_type
0,21da27ab-873a-41f4-ab98-49170cae9a2d,Large,Large,train,False,CP
1,82f7949b-4ea2-45c8-8dd9-7854caf49077,Large,Large,train,False,CP
2,cec7234f-fe35-4411-aded-f8112bb31219,Large Negative,Large Negative,train,False,CP
3,43d9e7c9-c9ec-45ce-8820-048bfb896989,Large Negative,Large Negative,train,False,CP
4,63ce6652-338e-4afd-9c77-dbc0e903bf92,Large,Large,train,False,CP
...,...,...,...,...,...,...
291919,380728fc-28b0-423f-b8a7-07be1af590d9,OutOfFocus Negative,OutOfFocus,test,True,CP_and_DP
291920,30ed67c7-8de2-4d78-bce9-3fa1aff28565,OutOfFocus Negative,OutOfFocus Negative,test,True,CP_and_DP
291921,2960b13e-6090-4592-b2a9-d1c4c1b24b50,OutOfFocus Negative,OutOfFocus,test,True,CP_and_DP
291922,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,OutOfFocus,OutOfFocus Negative,test,True,CP_and_DP
