### Import libraries

In [1]:
import pandas as pd
import numpy as np
import pathlib

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.utils import shuffle
from joblib import dump

import sys
sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_dataset, get_X_y_data

### Load training data and create stratified folds for cross validation

In [2]:
# seed numpy's global random generator so random operations are reproducible
np.random.seed(0)

# directory that receives the trained models (created when missing)
results_dir = pathlib.Path("models/")
results_dir.mkdir(parents=True, exist_ok=True)

# locations of the data split indexes and of the features dataframe
# (plain string literal: there is nothing to interpolate, so no f-prefix)
data_split_path = pathlib.Path("../1.split_data/indexes/data_split_indexes.tsv")
features_dataframe_path = pathlib.Path("../0.download_data/data/training_data.csv.gz")

features_dataframe = get_features_data(features_dataframe_path)
data_split_indexes = pd.read_csv(data_split_path, sep="\t", index_col=0)

# build the "train" dataset from the features dataframe and the split indexes,
# then display it as the cell output
training_data = get_dataset(features_dataframe, data_split_indexes, "train")
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Object_Outline,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
0,Large,[[396 595]\n [395 596]\n [394 596]\n [393 596]...,397.288288,618.558559,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.527232,-0.388696,-0.715081,-0.939058,-0.079103,1.966414,18.714635,0.061156,2.639598,-0.085996
1,Large,[[361 563]\n [360 564]\n [359 564]\n [358 564]...,359.535714,585.062500,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.483285,-1.355129,-0.857182,-0.934520,0.724273,2.254263,-0.564988,1.629091,-0.606195,-0.747432
3,Large,[[923 515]\n [922 516]\n [921 516]\n [920 516]...,934.568807,534.385321,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-1.424769,-0.863939,-0.582677,-0.228617,0.426065,1.906163,0.193329,0.909489,-1.723110,0.955596
4,Large,[[483 96]\n [482 97]\n [481 97]\n [480 98]...,481.007143,121.978571,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,0.693119,-0.581144,-0.256527,-1.401117,-0.357608,2.607508,6.506821,-1.335764,-0.264039,0.978577
5,Large,[[456 803]\n [455 804]\n [454 804]\n [453 804]...,465.732824,822.656489,LT0013_38,42,95,1,LT0013_38_42,LT0013_38/LT0013_38_42_95.tif,...,1.221643,0.427448,1.270162,-0.513263,2.415464,1.093042,0.218068,-1.291372,0.512568,1.728560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2910,OutOfFocus,[[480 884]\n [479 885]\n [478 885]\n [477 885]...,477.566667,900.411111,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,0.020656,8.914342,3.211937,-1.092754,-0.879064,-1.162240,-1.896239,-0.418936,0.504933,1.216141
2911,OutOfFocus,[[413 338]\n [412 339]\n [411 339]\n [410 339]...,411.508929,361.410714,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-0.724141,4.936666,2.227179,-1.458934,-0.774605,-1.679045,-2.429998,-0.826404,0.730059,1.286466
2912,OutOfFocus,[[797 520]\n [796 521]\n [795 521]\n [794 522]...,808.740741,536.296296,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-1.463249,4.504158,4.272579,-1.529108,-0.932259,-0.898132,-3.123627,-0.719680,-0.425281,1.687259
2913,OutOfFocus,[[736 898]\n [735 899]\n [734 899]\n [733 899]...,732.185567,916.804124,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-0.755542,5.010617,6.062460,-1.249060,-1.027013,-1.500677,-2.383565,-0.655265,0.690708,-0.007728


In [3]:
# separate the training data into features (X) and labels (y)
X, y = get_X_y_data(training_data)

# report the dimensions of both, one per line
print(X.shape, y.shape, sep="\n")

# 10-fold stratified splitter (no shuffling) used for cross validation;
# the variable name is kept as-is because later cells reference it
straified_k_folds = StratifiedKFold(shuffle=False, n_splits=10)

(2215, 1280)
(2215,)


### Define model without C/l1_ratio parameters


In [4]:
# elastic-net logistic regression fitted with the saga solver;
# C and l1_ratio are intentionally not set here (they are tuned by grid search)
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    n_jobs=-1,
    random_state=0,
)

### Perform grid search for best C and l1_ratio parameters

In [5]:
# hyperparameter grid: 7 log-spaced C values (1e-3 .. 1e3) and
# 11 evenly spaced l1_ratio values (0 .. 1)
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# exhaustive search over the grid, scored by weighted F1 on the stratified folds;
# fit() returns the fitted search object itself, so the call can be chained
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    scoring="f1_weighted",
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [6]:
# report the winning hyperparameters and their cross-validated score
print(
    f"Best parameters: {grid_search_cv.best_params_}",
    f"Score of best estimator: {grid_search_cv.best_score_}",
    sep="\n",
)

Best parameters: {'C': 0.01, 'l1_ratio': 0.0}
Score of best estimator: 0.7281902865069328


### Save best model

In [7]:
# persist the best estimator found by the grid search into the results directory
dump(
    value=grid_search_cv.best_estimator_,
    filename=f"{results_dir}/log_reg_model.joblib",
)

['models/log_reg_model.joblib']

## Repeat process with shuffling to create shuffled baseline model

In [8]:
X, y = get_X_y_data(training_data)

print(X.shape)
print(y.shape)

# The shuffle below works in place. Whether get_X_y_data returns a fresh array
# or a view onto training_data is not visible from this notebook, so shuffle a
# private copy to guarantee training_data itself is never altered.
X = X.copy()

# shuffle each column of the X (features) array independently to create the
# shuffled baseline; iterating X.T yields views, so X is modified in place
for column in X.T:
    np.random.shuffle(column)

# create stratified data sets for k-fold cross validation
straified_k_folds = StratifiedKFold(n_splits=10, shuffle=False)

(2215, 1280)
(2215,)


In [9]:
# same elastic-net logistic regression configuration as before, re-created
# so the shuffled baseline starts from an unfitted model
log_reg_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=100,
    n_jobs=-1,
    random_state=0,
)

In [10]:
# hyperparameter grid: 7 log-spaced C values (1e-3 .. 1e3) and
# 11 evenly spaced l1_ratio values (0 .. 1)
parameters = {
    "C": np.logspace(-3, 3, 7),
    "l1_ratio": np.linspace(0, 1, 11),
}
print(f"Parameters being tested: {parameters}")

# exhaustive search over the grid on the shuffled features, scored by weighted F1;
# fit() returns the fitted search object itself, so the call can be chained
grid_search_cv = GridSearchCV(
    estimator=log_reg_model,
    param_grid=parameters,
    scoring="f1_weighted",
    cv=straified_k_folds,
    n_jobs=-1,
).fit(X, y)

Parameters being tested: {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])}




In [11]:
# report the winning hyperparameters and score for the shuffled baseline
print(
    f"Best parameters: {grid_search_cv.best_params_}",
    f"Score of best estimator: {grid_search_cv.best_score_}",
    sep="\n",
)

Best parameters: {'C': 1.0, 'l1_ratio': 0.2}
Score of best estimator: 0.10270631232504415


In [12]:
# persist the best shuffled-baseline estimator into the results directory
dump(
    value=grid_search_cv.best_estimator_,
    filename=f"{results_dir}/shuffled_baseline_log_reg_model.joblib",
)

['models/shuffled_baseline_log_reg_model.joblib']