### Import libraries

In [1]:
import pandas as pd
import numpy as np
import pathlib

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.utils import shuffle
from joblib import dump

from utils.MlPipelineUtils import get_features_data, get_dataset, get_X_y_data

### Load training data and create stratified folds for cross validation

In [2]:
# Inputs: the train/test split indexes written by the previous step and the
# compressed table of per-cell features.
data_split_path = pathlib.Path("results/0.data_split_indexes.tsv")
features_dataframe_path = pathlib.Path("../1.format_data/data/training_data.csv.gz")

# Split indexes are stored tab-separated; the first column becomes the index.
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)
features_dataframe = get_features_data(features_dataframe_path)

# Keep only the rows labelled "train" in the split indexes.
training_data = get_dataset(features_dataframe, data_split_indexes, "train")
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Mitocheck_Object_ID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
4,Polylobed,10.0,1212.640449,21.314607,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,1.764085,-0.364659,-0.623983,0.087524,-0.678471,-1.047430,0.119700,0.254014,0.080685,-0.808582
5,MetaphaseAlignment,42.0,69.902174,104.782609,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-0.030402,-0.306105,0.471312,1.111647,-0.395580,0.265579,0.337486,-0.728758,0.519263,1.143726
6,Interphase,72.0,517.024390,159.317073,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-2.070584,-0.419038,-0.716160,2.525790,-0.300407,0.243762,0.270543,0.473745,-1.024547,-0.401801
8,Artefact,100.0,748.324675,220.935065,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-0.834010,-0.404291,0.839559,0.230029,-0.322646,-0.254167,-0.602655,-0.273222,-0.722049,0.554533
9,Artefact,108.0,795.484536,242.752577,LT0043_48,166_55,1,LT0043_48_166_55,LT0043_48/166/55/LT0043_48_166_55.tif,OGG1,...,-1.406520,0.368818,0.568022,1.618059,-0.320691,0.527715,0.130431,-0.293846,-0.755968,0.025133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4302,SmallIrregular,70.0,645.173913,664.536232,LT0106_02,287_6,1,LT0106_02_287_6,LT0106_02/287/6/LT0106_02_287_6.tif,ENSG00000186143,...,0.481624,-0.066337,-0.298825,-1.073172,-0.263557,-0.922345,0.761749,0.721974,1.400016,-0.244034
4303,SmallIrregular,37.0,828.268657,338.328358,LT0106_02,287_33,1,LT0106_02_287_33,LT0106_02/287/33/LT0106_02_287_33.tif,ENSG00000186143,...,-0.010054,2.490791,0.112932,-0.448705,-0.573112,-1.219449,0.756078,-0.434373,-0.617329,2.989479
4304,SmallIrregular,45.0,62.742424,384.424242,LT0106_02,287_33,1,LT0106_02_287_33,LT0106_02/287/33/LT0106_02_287_33.tif,ENSG00000186143,...,0.828838,2.328690,2.365700,-1.219878,-0.377726,0.285707,0.072360,-0.101487,0.592109,-0.326425
4306,SmallIrregular,52.0,105.014085,429.056338,LT0106_02,287_33,1,LT0106_02_287_33,LT0106_02/287/33/LT0106_02_287_33.tif,ENSG00000186143,...,-0.890952,0.301522,0.345463,0.594489,0.737245,3.037339,-0.636915,0.061156,1.849867,-0.896322


In [3]:
# Separate the training dataframe into features (X) and labels (y).
X, y = get_X_y_data(training_data)

# Sanity check: report the dimensions of the features and the labels.
for array in (X, y):
    print(array.shape)

# 10-fold stratified splitter used for cross validation; shuffle=False keeps
# the fold assignment deterministic.
straified_k_folds = StratifiedKFold(shuffle=False, n_splits=10)

(3417, 1280)
(3417,)


### Define logistic regression model (C and l1_ratio are tuned by the grid search below)


In [4]:
# Base elastic-net logistic regression estimator. C and l1_ratio are not set
# here; they are supplied by the grid search.
# NOTE(review): max_iter=100 may be too low for saga to converge on this data;
# check for ConvergenceWarning when fitting.
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    n_jobs=-1,
    random_state=0,
)

### Perform grid search for best C and l1_ratio parameters

In [5]:
# Hyperparameter grid: 7 log-spaced values of C (1e-3 .. 1e3) crossed with
# 11 evenly spaced l1_ratio values (0 .. 1).
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# Exhaustive search over the grid, scored by weighted F1 on the stratified
# folds; fit() returns the fitted search object itself.
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    scoring="f1_weighted",
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [6]:
# Report the winning hyperparameter combination and the cross-validated score
# of the corresponding estimator.
print(
    f"Best parameters: {grid_search_cv.best_params_}",
    f"Score of best estimator: {grid_search_cv.best_score_}",
    sep="\n",
)

Best parameters: {'C': 1.0, 'l1_ratio': 0.6000000000000001}
Score of best estimator: 0.7977295975762645


### Save best model

In [7]:
# Ensure the output directory exists (no error if it is already there).
results_dir = pathlib.Path("results/")
results_dir.mkdir(exist_ok=True, parents=True)

# Persist the best estimator found by the grid search; dump() returns the
# list of written file names, which the cell displays.
model_save_path = f"{results_dir}/1.log_reg_model.joblib"
dump(grid_search_cv.best_estimator_, model_save_path)

['results/1.log_reg_model.joblib']

## Repeat process with shuffling to create shuffled baseline model

In [8]:
# Re-derive features (X) and labels (y) from the training data so the shuffled
# baseline starts from the same inputs as the real model.
X, y = get_X_y_data(training_data)

print(X.shape)
print(y.shape)

# Work on an independent ndarray copy so the in-place shuffle below cannot
# write through to training_data.
# NOTE(review): whether get_X_y_data returns a view of training_data is not
# visible from this notebook; the copy is a cheap safeguard either way.
X = np.array(X)

# Shuffle each column of X (features) independently to create the shuffled
# baseline: every feature keeps its own value distribution but loses its
# association with the labels and with the other features.
# A seeded generator is used so the baseline (and the model saved from it) is
# reproducible across kernel restarts, consistent with random_state=0 used for
# the estimator.
shuffle_rng = np.random.default_rng(0)
for column in X.T:
    # each row of X.T is a view onto one column of X, so this shuffles X in place
    shuffle_rng.shuffle(column)

# create stratified data sets for k-fold cross validation
straified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

(3417, 1280)
(3417,)


In [9]:
# Fresh elastic-net logistic regression estimator for the shuffled baseline,
# configured identically to the one used for the real model. C and l1_ratio
# are supplied by the grid search.
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    n_jobs=-1,
    random_state=0,
)

In [10]:
# Same hyperparameter grid as for the real model: 7 log-spaced values of C
# (1e-3 .. 1e3) crossed with 11 evenly spaced l1_ratio values (0 .. 1).
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# Exhaustive search over the grid on the shuffled features, scored by weighted
# F1 on the stratified folds; fit() returns the fitted search object itself.
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    scoring="f1_weighted",
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [11]:
# Report the best hyperparameters and cross-validated score obtained on the
# shuffled baseline data.
best_params_line = f"Best parameters: {grid_search_cv.best_params_}"
best_score_line = f"Score of best estimator: {grid_search_cv.best_score_}"
print(best_params_line)
print(best_score_line)

Best parameters: {'C': 0.01, 'l1_ratio': 0.1}
Score of best estimator: 0.19819771935157435


In [12]:
# Ensure the output directory exists (no error if it is already there).
results_dir = pathlib.Path("results/")
results_dir.mkdir(exist_ok=True, parents=True)

# Persist the best shuffled-baseline estimator; dump() returns the list of
# written file names, which the cell displays.
baseline_save_path = f"{results_dir}/1.shuffled_baseline_log_reg_model.joblib"
dump(grid_search_cv.best_estimator_, baseline_save_path)

['results/1.shuffled_baseline_log_reg_model.joblib']