### Import libraries

In [37]:
import pandas as pd
import numpy as np
import pathlib
from typing import Tuple, Any, List, Union

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    cross_validate,
    cross_val_predict,
    GridSearchCV,
)
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score
from sklearn.utils import shuffle
from joblib import dump
import matplotlib.pyplot as plt

import seaborn as sns

### Helper functions

In [38]:
def get_features_data(load_path: pathlib.Path) -> pd.DataFrame:
    """load the DP features csv and normalize its phenotypic class labels

    Rows labeled "ADCCM" are dropped, and every "Shape1"/"Shape3" value is
    renamed to "Binuclear"/"Polylobed" respectively.

    Args:
        load_path (pathlib.Path): path to the features csv; its first column is used as the index

    Returns:
        pd.DataFrame: features dataframe without ADCCM rows and with corrected labels
    """
    features = pd.read_csv(load_path, index_col=0)

    # ADCCM was not used for classification in the original paper, so it is excluded
    keep_row = features["Mitocheck_Phenotypic_Class"] != "ADCCM"

    # Shape1 and Shape3 are swapped for their correct respective classes;
    # note that replace() is applied to every column, not only the class column
    return (
        features[keep_row]
        .replace("Shape1", "Binuclear")
        .replace("Shape3", "Polylobed")
    )

def get_training_data(features_dataframe: pd.DataFrame, data_split_indexes: pd.DataFrame) -> pd.DataFrame:
    """select the rows of the features dataframe that belong to the training split

    Args:
        features_dataframe (pd.DataFrame): dataframe with all features data
        data_split_indexes (pd.DataFrame): dataframe with a "label" column naming the split
            and an "index" column holding the matching row labels of features_dataframe

    Returns:
        pd.DataFrame: rows of features_dataframe whose split label is "train",
            in the order they are listed in data_split_indexes
    """
    # row labels of every entry assigned to the "train" split
    is_train = data_split_indexes["label"] == "train"
    train_row_labels = data_split_indexes.loc[is_train, "index"]

    # label-based lookup, so the "index" values must exist in features_dataframe's index
    return features_dataframe.loc[train_row_labels]
    

def get_X_y_data(training_data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """generate X (features) and y (labels) arrays from training data

    Args:
        training_data (pd.DataFrame): training dataframe containing a
            "Mitocheck_Phenotypic_Class" column and feature columns whose
            names contain "efficientnet"

    Returns:
        Tuple[np.ndarray, np.ndarray]: X, a 2-D array with one row per sample and
            one column per feature, and y, a 1-D array of phenotypic class labels.
            Both are shuffled together with a fixed random state.
    """

    # all features from DeepProfiler have "efficientnet" in their column name
    morphology_features = [
        col for col in training_data.columns if "efficientnet" in col
    ]

    # extract features as a 2-D numpy array
    X = training_data.loc[:, morphology_features].to_numpy()

    # extract phenotypic class labels as a 1-D numpy array
    y = training_data["Mitocheck_Phenotypic_Class"].to_numpy()

    # shuffle data because, as it comes from MitoCheck, same labels tend to be in groups;
    # X and y are shuffled in unison and random_state=0 keeps the order reproducible
    X, y = shuffle(X, y, random_state=0)

    return X, y



### Load training data and create stratified folds for cross validation

In [39]:
# locations of the data split labels and of the formatted features
data_split_path = pathlib.Path("results/data_split_indexes.tsv")
features_dataframe_path = pathlib.Path("../1.format_data/data/training_data.csv.gz")

# the split file is tab separated and its first column is used as the row index
data_split_indexes = pd.read_csv(data_split_path, index_col=0, sep="\t")
features_dataframe = get_features_data(features_dataframe_path)

# keep only the feature rows assigned to the "train" split
training_data = get_training_data(
    features_dataframe=features_dataframe,
    data_split_indexes=data_split_indexes,
)
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Mitocheck_Object_ID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
4,Polylobed,10.0,1212.640449,21.314607,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,1.764085,-0.364659,-0.623983,0.087524,-0.678471,-1.047430,0.119700,0.254014,0.080685,-0.808582
5,MetaphaseAlignment,42.0,69.902174,104.782609,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-0.030402,-0.306105,0.471312,1.111647,-0.395580,0.265579,0.337486,-0.728758,0.519263,1.143726
7,Interphase,85.0,1155.936170,191.180851,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-1.264048,-0.678396,0.076916,3.142620,0.202174,0.331271,0.567700,0.072269,-1.715632,1.303155
9,Artefact,108.0,795.484536,242.752577,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-1.406520,0.368818,0.568022,1.618059,-0.320691,0.527715,0.130431,-0.293846,-0.755968,0.025133
10,Artefact,100.0,744.214286,236.428571,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-0.313731,0.189396,0.359831,0.693035,-0.039102,0.101826,0.783466,-0.191989,-0.278789,1.324181
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4303,SmallIrregular,37.0,828.268657,338.328358,LT0106_02,287_33,1,LT0106_02_287_33,LT0106_02/287/33/LT0106_02_287_33.tif,ENSG00000186143,...,-0.010054,2.490791,0.112932,-0.448705,-0.573112,-1.219449,0.756078,-0.434373,-0.617329,2.989479
4304,SmallIrregular,45.0,62.742424,384.424242,LT0106_02,287_33,1,LT0106_02_287_33,LT0106_02/287/33/LT0106_02_287_33.tif,ENSG00000186143,...,0.828838,2.328690,2.365700,-1.219878,-0.377726,0.285707,0.072360,-0.101487,0.592109,-0.326425
4305,SmallIrregular,49.0,799.772727,407.651515,LT0106_02,287_33,1,LT0106_02_287_33,LT0106_02/287/33/LT0106_02_287_33.tif,ENSG00000186143,...,0.342158,1.118108,2.618269,-1.146326,-0.574519,0.284514,0.491826,-0.489022,0.969788,-0.492233
4306,SmallIrregular,52.0,105.014085,429.056338,LT0106_02,287_33,1,LT0106_02_287_33,LT0106_02/287/33/LT0106_02_287_33.tif,ENSG00000186143,...,-0.890952,0.301522,0.345463,0.594489,0.737245,3.037339,-0.636915,0.061156,1.849867,-0.896322


In [40]:
# split the training dataframe into a feature matrix and a label vector
X, y = get_X_y_data(training_data)

# sanity check: X should have one row per entry of y
print(X.shape, y.shape, sep="\n")

# create stratified data sets for 10-fold cross validation;
# no shuffling here because get_X_y_data already shuffles X and y
straified_k_folds = StratifiedKFold(shuffle=False, n_splits=10)

(3378, 1280)
(3378,)


### Define model without C/l1_ratio parameters


In [41]:
# elastic-net logistic regression fit with the saga solver;
# C and l1_ratio are deliberately not set here, they are chosen by the grid search
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    random_state=0,
    n_jobs=-1,
)

### Perform grid search for best C and l1_ratio parameters

In [42]:
# hyperparameter grid: C over 7 log-spaced values from 1e-3 to 1e3,
# l1_ratio over 11 evenly spaced values from 0 to 1
# (for a quick trial run, a single-point grid such as {"C": [1.0], "l1_ratio": [0.8]} can be used)
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# evaluate every C/l1_ratio combination on the stratified folds
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [43]:
# report the winning hyperparameters and their mean cross-validated score
print(f"Best parameters: {grid_search_cv.best_params_}")
print(f"Score of best estimator: {grid_search_cv.best_score_}")

# build a new, not yet fitted, estimator configured with the winning hyperparameters
best_params = grid_search_cv.best_params_
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    C=best_params["C"],
    l1_ratio=best_params["l1_ratio"],
    max_iter=100,
    random_state=0,
    n_jobs=-1,
)

Best parameters: {'C': 1.0, 'l1_ratio': 0.1}
Score of best estimator: 0.8084622407950416


### Save best model

In [44]:
# ensure the output directory exists (no error if it is already there)
results_dir = pathlib.Path("results/")
results_dir.mkdir(exist_ok=True, parents=True)

# persist the best estimator found by the grid search
model_save_path = f"{results_dir}/log_reg_model.joblib"
dump(grid_search_cv.best_estimator_, model_save_path)

['results/log_reg_model.joblib']

## Repeat process with shuffling to create shuffled baseline model

In [45]:
# rebuild X and y from the training data so the shuffling below starts from fresh arrays
X, y = get_X_y_data(training_data)

print(X.shape)
print(y.shape)

# shuffle the feature values within each row of X independently to create the shuffled baseline;
# a seeded generator is used so the baseline is reproducible across kernel restarts
# (the global np.random state is unseeded and gives a different baseline on every run)
shuffle_rng = np.random.default_rng(0)
X = shuffle_rng.permuted(X, axis=1)

# create stratified data sets for k-fold cross validation
straified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

(3378, 1280)
(3378,)


In [46]:
# elastic-net logistic regression fit with the saga solver;
# C and l1_ratio are deliberately not set here, they are chosen by the grid search
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    random_state=0,
    n_jobs=-1,
)

In [47]:
# hyperparameter grid: C over 7 log-spaced values from 1e-3 to 1e3,
# l1_ratio over 11 evenly spaced values from 0 to 1
# (for a quick trial run, a single-point grid such as {"C": [1.0], "l1_ratio": [0.8]} can be used)
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# evaluate every C/l1_ratio combination on the stratified folds of the shuffled data
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [48]:
# report the winning hyperparameters and their mean cross-validated score
print(f"Best parameters: {grid_search_cv.best_params_}")
print(f"Score of best estimator: {grid_search_cv.best_score_}")

# build a new, not yet fitted, estimator configured with the winning hyperparameters
best_params = grid_search_cv.best_params_
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    C=best_params["C"],
    l1_ratio=best_params["l1_ratio"],
    max_iter=100,
    random_state=0,
    n_jobs=-1,
)

Best parameters: {'C': 0.01, 'l1_ratio': 0.5}
Score of best estimator: 0.3475427106561551


In [49]:
# ensure the output directory exists (no error if it is already there)
results_dir = pathlib.Path("results/")
results_dir.mkdir(exist_ok=True, parents=True)

# persist the best estimator found by the grid search on the shuffled data
model_save_path = f"{results_dir}/shuffled_baseline_log_reg_model.joblib"
dump(grid_search_cv.best_estimator_, model_save_path)

['results/shuffled_baseline_log_reg_model.joblib']