### Import libraries

In [1]:
import pandas as pd
import numpy as np
import pathlib

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.utils import shuffle
from joblib import dump

import sys
sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_dataset, get_X_y_data

### Load training data and create stratified folds for cross validation

In [2]:
# Seed NumPy's global RNG so that operations drawing from it
# (e.g. np.random.shuffle used for the shuffled baseline) are reproducible.
np.random.seed(0)

# Directory the fitted models are written to; created if it does not exist yet.
results_dir = pathlib.Path("models/")
results_dir.mkdir(parents=True, exist_ok=True)

# Locations of the data split indexes and of the features dataframe.
data_split_path = pathlib.Path("../1.split_data/indexes/data_split_indexes.tsv")
features_dataframe_path = pathlib.Path("../0.download_data/data/training_data.csv.gz")

# Load the features and the split indexes (tab-separated, first column used as index).
features_dataframe = get_features_data(features_dataframe_path)
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

# Build the "train" dataset from the features and split indexes
# (presumably keeps only the rows assigned to the train split -- verify in train_utils).
training_data = get_dataset(features_dataframe, data_split_indexes, "train")
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
0,MetaphaseAlignment,572.214286,58.185714,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.048350,-0.721622,0.749788,-1.377590,0.454974,0.188488,0.141427,-1.553405,2.346107,-1.774278
1,Artefact,1117.070423,342.732394,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.172767,-0.290257,-0.709041,-1.431541,-0.063308,-0.412793,0.452684,-1.906647,1.962141,-0.223039
2,Artefact,1116.500000,362.000000,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.093582,-0.323180,-0.663069,-1.427502,-0.901764,-0.355080,0.418053,-2.298449,1.098266,-0.069326
3,Artefact,1106.348485,370.469697,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.943948,-0.211267,-0.346355,-1.365543,-0.276932,0.023856,0.376514,-1.700348,1.833686,-0.625385
5,Prometaphase,1305.853333,656.426667,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.581095,0.635676,-0.597231,-1.204226,0.247975,0.923955,0.060671,-2.054225,1.040119,-0.528491
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5518,OutOfFocus,383.075269,220.198925,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,ABCB8,...,0.525202,8.110262,1.777901,-1.512628,-0.225867,-1.612982,-0.679415,-2.581475,0.501395,1.981009
5519,OutOfFocus,975.747253,293.868132,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,0.277908,5.959773,2.362540,-1.000032,-0.723652,-1.460720,-1.919148,-0.301130,0.779582,3.084642
5520,OutOfFocus,898.614815,302.407407,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,0.553313,10.086836,1.170072,-1.317000,-0.994644,-1.406541,-0.104613,-0.056216,0.714624,1.612470
5521,OutOfFocus,946.758621,281.689655,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,-0.152000,11.078217,2.460915,-0.989282,-1.141723,-1.424882,-2.959780,-1.999795,0.702134,2.306039


In [3]:
# Extract the model inputs (X) and targets (y) from the training data.
X, y = get_X_y_data(training_data)

# Print the dimensions of both as a quick sanity check.
for data_array in (X, y):
    print(data_array.shape)

# Cross-validation splitter: 10 stratified folds, without shuffling before splitting.
straified_k_folds = StratifiedKFold(shuffle=False, n_splits=10)

(4640, 1280)
(4640,)


### Define model without C/l1_ratio parameters


In [4]:
# Elastic-net penalized logistic regression using the SAGA solver.
# C and l1_ratio are deliberately not set here; the grid search supplies them.
# NOTE(review): max_iter=100 may be too few iterations for SAGA to converge
# on this data -- confirm no ConvergenceWarning is raised during fitting.
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    n_jobs=-1,
    random_state=0,
)

### Perform grid search for best C and l1_ratio parameters

In [5]:
# Hyperparameter grid: 7 log-spaced values of C (1e-3 to 1e3) crossed with
# 11 evenly spaced values of l1_ratio (0 to 1).
# For a quick single-point run, use e.g. {"C": [0.1], "l1_ratio": [0.0]} instead.
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# Exhaustive grid search over the stratified folds, scored by weighted F1.
# fit() returns the fitted search object, so it can be chained onto the constructor.
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    scoring="f1_weighted",
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [6]:
# Report the winning hyperparameter combination and its mean cross-validated score.
print(
    f"Best parameters: {grid_search_cv.best_params_}",
    f"Score of best estimator: {grid_search_cv.best_score_}",
    sep="\n",
)

Best parameters: {'C': 0.1, 'l1_ratio': 0.1}
Score of best estimator: 0.8041639526745415


### Save best model

In [7]:
# Persist the best estimator found by the grid search into the results directory.
dump(
    value=grid_search_cv.best_estimator_,
    filename=f"{results_dir}/log_reg_model.joblib",
)

['models/log_reg_model.joblib']

## Repeat process with shuffling to create shuffled baseline model

In [8]:
# Re-extract the model inputs (X) and targets (y) from the training data.
X, y = get_X_y_data(training_data)

# Print the dimensions of both as a quick sanity check.
for data_array in (X, y):
    print(data_array.shape)

# Shuffle every feature column in place, each one independently of the others,
# to create the shuffled baseline. Draws from NumPy's global RNG.
# NOTE(review): assumes X is a 2-D NumPy array so that X[:, i] is a writable
# view onto column i -- confirm against get_X_y_data.
for column_index in range(X.shape[1]):
    np.random.shuffle(X[:, column_index])

# Cross-validation splitter: 10 stratified folds, without shuffling before splitting.
straified_k_folds = StratifiedKFold(shuffle=False, n_splits=10)

(4640, 1280)
(4640,)


In [9]:
# Fresh elastic-net logistic regression (SAGA solver) for the shuffled baseline,
# configured the same way as the model used for the unshuffled data.
# C and l1_ratio are deliberately not set here; the grid search supplies them.
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    n_jobs=-1,
    random_state=0,
)

In [10]:
# Hyperparameter grid: 7 log-spaced values of C (1e-3 to 1e3) crossed with
# 11 evenly spaced values of l1_ratio (0 to 1).
# For a quick single-point run, use e.g. {"C": [1.0], "l1_ratio": [0.8]} instead.
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# Exhaustive grid search over the stratified folds, scored by weighted F1.
# fit() returns the fitted search object, so it can be chained onto the constructor.
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    scoring="f1_weighted",
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [11]:
# Report the winning hyperparameter combination and its mean cross-validated score
# for the shuffled baseline search.
print(
    f"Best parameters: {grid_search_cv.best_params_}",
    f"Score of best estimator: {grid_search_cv.best_score_}",
    sep="\n",
)

Best parameters: {'C': 0.01, 'l1_ratio': 0.1}
Score of best estimator: 0.15943335633696815


In [12]:
# Persist the best shuffled-baseline estimator into the results directory.
dump(
    value=grid_search_cv.best_estimator_,
    filename=f"{results_dir}/shuffled_baseline_log_reg_model.joblib",
)

['models/shuffled_baseline_log_reg_model.joblib']