In [1]:
import pathlib
import warnings

import pandas as pd
import numpy as np
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.utils import shuffle, parallel_backend
from sklearn.exceptions import ConvergenceWarning
from joblib import dump

import sys

sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_dataset, get_X_y_data

In [2]:
# seed numpy's global random state so random operations are reproducible
np.random.seed(0)

# locations of the data split indexes and of the labeled features dataframe
data_split_path = pathlib.Path("../1.split_data/indexes/data_split_indexes.tsv")
features_dataframe_path = pathlib.Path("../0.download_data/data/labeled_data.csv.gz")

# split indexes are stored tab-separated; the first column is used as the index
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

# features dataframe restricted to the labeled data we want
# (per the project helper, certain phenotypic classes are excluded)
features_dataframe = get_features_data(features_dataframe_path)

# keep only the rows that the split indexes assign to the "train" set
training_data = get_dataset(features_dataframe, data_split_indexes, "train")
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.856680,-0.934949,0.725091,2.255450,-0.565433,1.628086,-0.605625,-0.748135
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.742090,0.365520,0.643759,-1.906097,1.019370
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.950706,-0.811825,-0.522427,-1.402842,-0.289940,2.661250,0.126978,-0.824945,-0.494285,1.763332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,OutOfFocus,380728fc-28b0-423f-b8a7-07be1af590d9,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,0.549654,8.142944,1.619399,-1.521878,-0.182734,-1.608294,-0.783477,-2.613400,0.442609,1.977761
2912,OutOfFocus,30ed67c7-8de2-4d78-bce9-3fa1aff28565,975,294,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,0.358861,6.294227,1.827482,-0.997080,-0.614779,-1.270435,-1.335869,-0.560155,0.836314,3.473351
2913,OutOfFocus,2960b13e-6090-4592-b2a9-d1c4c1b24b50,898,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,0.570003,10.106912,1.130243,-1.288302,-0.956321,-1.409762,-0.058448,-0.025529,0.628679,1.657651
2914,OutOfFocus,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.023441,11.088221,2.068912,-0.977407,-1.108647,-1.399433,-2.744383,-2.037700,0.667556,2.438798


In [3]:
# specify model types, feature types, and phenotypic classes
model_types = ["final", "shuffled_baseline"]
feature_types = ["CP", "DP", "CP_and_DP"]
phenotypic_classes = training_data["Mitocheck_Phenotypic_Class"].unique()

# create stratified data sets for k-fold cross validation
# (variable name kept exactly as originally spelled in case other cells reference it)
straified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

# create logistic regression model with following parameters
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    n_jobs=-1,
    random_state=0,
    class_weight="balanced",
)

# specify parameters to tune for
parameters = {"C": np.logspace(-3, 3, 7), "l1_ratio": np.linspace(0, 1, 11)}
print(f"Parameters being tested during grid search: {parameters}\n")

# create grid search with cross validation over the hyperparameters above
grid_search_cv = GridSearchCV(
    log_reg_model, parameters, cv=straified_k_folds, n_jobs=-1, scoring="f1_weighted"
)

# train one binary (class vs. "Not class") model for each combination of
# model type, feature type, and phenotypic class
for model_type, feature_type, phenotypic_class in itertools.product(
    model_types, feature_types, phenotypic_classes
):
    # create results directory
    results_dir = pathlib.Path(
        f"models/single_class_models/{phenotypic_class}_models/"
    )
    results_dir.mkdir(parents=True, exist_ok=True)

    # boolean mask of the rows labeled with this phenotypic class (the positive labels)
    is_positive = training_data["Mitocheck_Phenotypic_Class"] == phenotypic_class

    # indexes and number of positive labels for this specific phenotypic_class
    positive_label_indexes = training_data.loc[is_positive].index
    phenotypic_class_counts = len(positive_label_indexes)
    print(
        f"Training {model_type} model on {feature_type} features for {phenotypic_class} phenotypic class with {phenotypic_class_counts} positive labels..."
    )

    # because the label imbalance is so large for some classes (ex: 50 positive labels to 2400 negative labels),
    # it is necessary to undersample the negative labels:
    # draw as many negative rows as there are positive rows, capped at the number of
    # negative rows available so that sampling cannot fail for a majority class
    negative_rows = training_data.loc[~is_positive]
    negative_sample_size = min(phenotypic_class_counts, len(negative_rows))
    negative_label_indexes = negative_rows.sample(
        negative_sample_size, random_state=0
    ).index

    # the class training data are the two subsets found above (rows ordered by sorted index);
    # selecting them first avoids copying and relabeling the whole training set every iteration
    class_training_data = training_data.loc[
        positive_label_indexes.union(negative_label_indexes)
    ]
    # relabel every row that is not the phenotypic class as the negative label;
    # assign() returns a new dataframe, so the original training data is never modified
    class_labels = class_training_data["Mitocheck_Phenotypic_Class"]
    class_training_data = class_training_data.assign(
        Mitocheck_Phenotypic_Class=class_labels.where(
            class_labels == phenotypic_class, f"Not {phenotypic_class}"
        )
    )

    # get X (features) and y (labels) data
    X, y = get_X_y_data(class_training_data, feature_type)
    print(f"X has shape {X.shape}, y has shape {y.shape}")

    # shuffle columns of X (features) independently to create shuffled baseline
    # NOTE(review): this relies on X being a numpy array so that each row of X.T is a
    # view that np.random.shuffle can permute in place -- confirm against get_X_y_data
    if model_type == "shuffled_baseline":
        for column in X.T:
            np.random.shuffle(column)

    # fit grid search cv to X and y data
    # sklearn's ConvergenceWarning is silenced here only because it takes up
    # lots of space in the output; the fitted models are saved either way
    with parallel_backend("multiprocessing"):
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=ConvergenceWarning, module="sklearn"
            )
            # fit() returns the search object itself, so no rebinding is needed
            grid_search_cv.fit(X, y)

    # print info for best estimator
    print(f"Best parameters: {grid_search_cv.best_params_}")
    print(f"Score of best estimator: {grid_search_cv.best_score_}\n")

    # save final estimator (the best estimator found by the grid search)
    dump(
        grid_search_cv.best_estimator_,
        results_dir / f"{model_type}__{feature_type}.joblib",
    )


Parameters being tested during grid search: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}

Training final model on CP features for Large phenotypic class with 67 positive labels...
X has shape (134, 165), y has shape (134,)
Best parameters: {'C': 0.01, 'l1_ratio': 0.1}
Score of best estimator: 0.9069855144855146

Training final model on CP features for Prometaphase phenotypic class with 293 positive labels...
X has shape (586, 165), y has shape (586,)
Best parameters: {'C': 1.0, 'l1_ratio': 0.7000000000000001}
Score of best estimator: 0.9022945870637846

Training final model on CP features for Grape phenotypic class with 63 positive labels...
X has shape (126, 165), y has shape (126,)
Best parameters: {'C': 0.01, 'l1_ratio': 0.0}
Score of best estimator: 0.9291841491841492

Training final model on CP features for Interphase phenotypic class with 357 positive labels...
X has shape (714, 