### Import libraries

In [1]:
import pathlib
import warnings

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.utils import shuffle
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import parallel_backend
from joblib import dump

import sys
sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_dataset, get_X_y_data

### Load training data and create stratified folds for cross validation

In [2]:
# seed numpy's global random state so random operations are reproducible
# (the shuffled-baseline column shuffling in the training cell draws from it)
np.random.seed(0)

# directory the trained models are saved to; created if it does not exist yet
results_dir = pathlib.Path("models/")
results_dir.mkdir(parents=True, exist_ok=True)

# load training data from indexes and features dataframe
# (plain string literal: the path has no placeholders, so no f-string is needed)
data_split_path = pathlib.Path("../1.split_data/indexes/data_split_indexes.tsv")
features_dataframe_path = pathlib.Path("../0.download_data/data/labeled_data.csv.gz")

# dataframe with only the labeled data we want (exclude certain phenotypic classes)
features_dataframe = get_features_data(features_dataframe_path)
# tab-separated split indexes; the first column of the file is used as the index
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

# get training data from labeled data
training_data = get_dataset(features_dataframe, data_split_indexes, "train")
# bare last expression so the notebook renders a preview of the training dataframe
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Frame,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,397,618,LT0010_27,173,1,83,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,RAB21,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,359,584,LT0010_27,173,1,83,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,RAB21,...,-0.482883,-1.354858,-0.856680,-0.934949,0.725091,2.255450,-0.565433,1.628086,-0.605625,-0.748135
2,Large,383,685,LT0010_27,173,1,83,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,RAB21,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556
3,Large,932,532,LT0013_38,42,1,75,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,KIF14,...,-1.001625,-0.801021,-0.586539,0.076197,0.599191,1.742090,0.365520,0.643759,-1.906097,1.019370
4,Large,477,130,LT0013_38,42,1,75,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,KIF14,...,0.950706,-0.811825,-0.522427,-1.402842,-0.289940,2.661250,0.126978,-0.824945,-0.494285,1.763332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,OutOfFocus,383,219,LT0601_01,217,1,49,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,ABCB8,...,0.549654,8.142944,1.619399,-1.521878,-0.182734,-1.608294,-0.783477,-2.613400,0.442609,1.977761
2912,OutOfFocus,975,294,LT0603_03,2,1,49,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,0.358861,6.294227,1.827482,-0.997080,-0.614779,-1.270435,-1.335869,-0.560155,0.836314,3.473351
2913,OutOfFocus,898,302,LT0603_03,2,1,49,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,0.570003,10.106912,1.130243,-1.288302,-0.956321,-1.409762,-0.058448,-0.025529,0.628679,1.657651
2914,OutOfFocus,946,281,LT0603_03,2,1,49,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,-0.023441,11.088221,2.068912,-0.977407,-1.108647,-1.399433,-2.744383,-2.037700,0.667556,2.438798


In [3]:
model_types = ["final", "shuffled_baseline"]
feature_types = ["CP", "DP", "CP_and_DP"]

# fit and save one grid-searched model per (model type, feature type) pair
for model_type in model_types:
    # the baseline variant is trained on features whose columns were shuffled
    is_shuffled_baseline = model_type == "shuffled_baseline"

    for feature_type in feature_types:
        print(f"Training {model_type} model on {feature_type} features...")

        X, y = get_X_y_data(training_data, feature_type)
        print(f"X has shape {X.shape}, y has shape {y.shape}")

        if is_shuffled_baseline:
            # shuffle every feature column independently, in place, using the
            # global numpy random state
            # assumes X is a NumPy array, so each row of X.T is a view onto a
            # column of X -- TODO confirm against get_X_y_data
            for column in X.T:
                np.random.shuffle(column)

        # elastic-net regularized logistic regression fitted with the saga solver
        log_reg_model = LogisticRegression(
            penalty="elasticnet",
            solver="saga",
            max_iter=100,
            n_jobs=-1,
            random_state=0,
        )

        # hyperparameter grid: 7 values of C (1e-3 .. 1e3) x 11 values of l1_ratio (0 .. 1)
        parameters = {
            "C": np.logspace(-3, 3, 7),
            "l1_ratio": np.linspace(0, 1, 11),
        }
        print(f"Parameters being tested: {parameters}")

        # exhaustive search over the grid, scored by weighted F1 on
        # 10 stratified, unshuffled cross-validation folds
        grid_search_cv = GridSearchCV(
            estimator=log_reg_model,
            param_grid=parameters,
            cv=StratifiedKFold(n_splits=10, shuffle=False),
            n_jobs=-1,
            scoring="f1_weighted",
        )

        # silence sklearn's ConvergenceWarning, which otherwise takes up lots of
        # space in the cell output
        # NOTE(review): with max_iter=100 some fits may stop before converging;
        # confirm that is acceptable for the saved models
        with parallel_backend("multiprocessing"), warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn")
            grid_search_cv.fit(X, y)

        # report the winning hyperparameters and their mean cross-validated score
        print(f"Best parameters: {grid_search_cv.best_params_}")
        print(f"Score of best estimator: {grid_search_cv.best_score_}\n")

        # persist the best estimator found by the search
        model_path = f"{results_dir}/{model_type}__{feature_type}.joblib"
        dump(grid_search_cv.best_estimator_, model_path)

Training final model on CP features...
X has shape (2432, 165), y has shape (2432,)
Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}
Best parameters: {'C': 1.0, 'l1_ratio': 0.30000000000000004}
Score of best estimator: 0.8160827228903825

Training final model on DP features...
X has shape (2432, 1280), y has shape (2432,)
Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}
Best parameters: {'C': 1.0, 'l1_ratio': 1.0}
Score of best estimator: 0.7383705853823289

Training final model on CP_and_DP features...
X has shape (2432, 1445), y has shape (2432,)
Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}
Best parameters: {'C': 0.1,