In [1]:
import pathlib
import warnings

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.utils import shuffle, parallel_backend
from sklearn.exceptions import ConvergenceWarning
from joblib import dump, load

import sys

sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_dataset, get_X_y_data

In [2]:
# set numpy seed to make random operations reproduceable
np.random.seed(0)

# create results directory
results_dir = pathlib.Path("models/")
results_dir.mkdir(parents=True, exist_ok=True)

# load training data from indexes and features dataframe
data_split_path = pathlib.Path(f"../1.split_data/indexes/data_split_indexes.tsv")
labled_data_path = pathlib.Path("../0.download_data/data/labeled_data.csv.gz")

# dataframe with only the labeled data we want (exclude certain phenotypic classes)
labled_data = get_features_data(labled_data_path)
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

# get training data from labeled data
# training_data = get_dataset(features_dataframe, data_split_indexes, "train")

labled_data.head(5)

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.85668,-0.934949,0.725091,2.25545,-0.565433,1.628086,-0.605625,-0.748135
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.74209,0.36552,0.643759,-1.906097,1.01937
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.950706,-0.811825,-0.522427,-1.402842,-0.28994,2.66125,0.126978,-0.824945,-0.494285,1.763332


In [3]:
# see number of images to 
num_images = labled_data["Metadata_DNA"].unique().shape[0]
print(f"There are {num_images} to perform LOIO evaluation on.")

There are 270 to perform LOIO evaluation on.


In [5]:
# directory to load the models from
models_dir = pathlib.Path("../2.train_model/models/")

for model_path in sorted(models_dir.iterdir()):
    # only look at final models
    if "final" not in model_path.name:
        continue

    model = load(model_path)
    # determine feature type, C, and l1_ratio from model
    feature_type = model_path.name.split("__")[1].replace(".joblib", "")

    print(
        f"Performing LOIO for feature type {feature_type} with parameters C: {model.C}, l1_ratio: {model.l1_ratio}"
    )

    # iterate through image paths
    for image_path in labled_data["Metadata_DNA"].unique():
        # get training and testing cells from image path
        # every cell from the image path is for testing, the rest are for training
        train_cells = labled_data.loc[labled_data["Metadata_DNA"] != image_path]
        test_cells = labled_data.loc[labled_data["Metadata_DNA"] == image_path]

        # get X, y from training and testing cells
        X_train, y_train = get_X_y_data(train_cells, "CP_and_DP")
        X_test, y_test = get_X_y_data(test_cells, "CP_and_DP")

        # capture convergence warning from sklearn
        # this warning does not affect the model but takes up lots of space in the output
        with parallel_backend("multiprocessing"):
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore", category=ConvergenceWarning, module="sklearn"
                )
                
                # fit a logisitc regression model on the training X, y
                LOIO_model = LogisticRegression(
                    penalty="elasticnet",
                    solver="saga",
                    max_iter=10, #CHANGE TO 100
                    n_jobs=-1,
                    random_state=0,
                    C=model.C,
                    l1_ratio=model.l1_ratio,
                ).fit(X_train, y_train)

        probas = LOIO_model.predict_proba(X_test)
        score = LOIO_model.score(X_test, y_test)

        print(
            f"Leaving out image: {image_path}; number of cells: {test_cells.shape[0]}; score on image: {score}"
        )
        
        break

    break

Performing LOIO for feature type CP with parameters C: 1.0, l1_ratio: 0.30000000000000004
Leaving out image: LT0010_27/LT0010_27_173_83.tif; number of cells: 24; score on image: 0.7916666666666666
Leaving out image: LT0013_38/LT0013_38_42_75.tif; number of cells: 15; score on image: 0.8
Leaving out image: LT0013_38/LT0013_38_42_95.tif; number of cells: 22; score on image: 0.8181818181818182
Leaving out image: LT0017_19/LT0017_19_365_65.tif; number of cells: 9; score on image: 0.4444444444444444
Leaving out image: LT0019_06/LT0019_06_140_70.tif; number of cells: 29; score on image: 0.6206896551724138
Leaving out image: LT0027_44/LT0027_44_292_88.tif; number of cells: 10; score on image: 0.9
Leaving out image: LT0027_44/LT0027_44_292_95.tif; number of cells: 10; score on image: 0.7
Leaving out image: LT0038_27/LT0038_27_250_8.tif; number of cells: 9; score on image: 0.7777777777777778
Leaving out image: LT0042_10/LT0042_10_144_36.tif; number of cells: 3; score on image: 1.0
Leaving out i