In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
# import XGBoost
from xgboost import XGBClassifier
import os
import itertools
import tqdm
# Hyper-parameter grid for the logistic-regression sweep: each entry maps a
# LogisticRegression keyword argument to the list of candidate values to try.
# Entries are declared in the same order as before, so the expanded grid is
# enumerated in the same order.
# Disabled alternative: 'penalty': ['l1', 'l2'] with 'solver': ['liblinear'].
logistic_regression_param_grid = dict(
    penalty=['elasticnet'],
    solver=['saga'],
    l1_ratio=[0, 0.25, 0.33, 0.5, 0.66, 0.75, 1],
    C=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10],
    class_weight=['balanced'],
    max_iter=[10000],
    tol=[1e-3, 1e-4],
)

In [3]:
# Directory holding the fine-tuning CSV splits. The original machine-specific
# path stays the default; set the FINETUNE_DATA_DIR environment variable to
# run the notebook against a copy of the data stored elsewhere.
data_dir = os.environ.get('FINETUNE_DATA_DIR') or (
    '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_finetune_optimization/April_30_Finetune_Data'
)
# Folder under the current user's home directory ('~' is expanded here).
local_dir = os.path.expanduser('~/Desktop/saved_models')



In [15]:
def _load_csv(filename):
    """Read one CSV from ``data_dir``, using its first column as the index."""
    return pd.read_csv(os.path.join(data_dir, filename), index_col=0)


# Features (X_*) and labels (y_*) for each split.
X_train, y_train = _load_csv('X_finetune_train.csv'), _load_csv('y_finetune_train.csv')
X_val, y_val = _load_csv('X_finetune_val.csv'), _load_csv('y_finetune_val.csv')
X_test, y_test = _load_csv('X_finetune_test.csv'), _load_csv('y_finetune_test.csv')

# Train rows followed by validation rows, stacked into one frame each.
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

In [16]:
y_col = 'IMDC BINARY'


def _select_labeled(X, y, label_col):
    """Keep only the samples that have a non-missing value for ``label_col``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain every index label that survives in ``y``.
    y : pandas.DataFrame or pandas.Series
        Label table containing ``label_col``, or the already-extracted label
        Series. Accepting a Series makes this cell safe to re-run: the names
        below are rebound from DataFrame to Series on the first run.
    label_col : str
        Name of the label column to extract when ``y`` is a DataFrame.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Rows of ``X`` selected with the surviving label index, and the label
        Series with missing values dropped.
    """
    labels = y[label_col] if isinstance(y, pd.DataFrame) else y
    # dropna() returns a new Series, so nothing is mutated in place.
    labels = labels.dropna()
    return X.loc[labels.index], labels


X_train, y_train = _select_labeled(X_train, y_train, y_col)
X_val, y_val = _select_labeled(X_val, y_val, y_col)
X_trainval, y_trainval = _select_labeled(X_trainval, y_trainval, y_col)
X_test, y_test = _select_labeled(X_test, y_test, y_col)

In [12]:
# Display the shape of the validation labels (rendered as the cell output).
y_val.shape

(61,)

In [11]:
# Display the (rows, columns) shape of the validation features (rendered as the cell output).
X_val.shape

(61, 2736)

In [17]:
param_grid = logistic_regression_param_grid

# Expand the grid into one keyword-argument dict per combination
# (cartesian product of the candidate lists, in key order).
param_combs = list(itertools.product(*param_grid.values()))
grid_search = [dict(zip(param_grid.keys(), values)) for values in param_combs]

# The array conversions do not depend on the hyper-parameters, so do them
# once instead of on every iteration.
X_train_arr, y_train_arr = X_train.to_numpy(), y_train.to_numpy()
X_val_arr, y_val_arr = X_val.to_numpy(), y_val.to_numpy()

# Fit on the training split and score on the validation split for every
# combination. hp_summary maps the combination's position in grid_search to
# its validation AUC, validation accuracy and the parameters used.
hp_summary = {}
# Wrapping the list itself (rather than the enumerate iterator) lets tqdm
# read its length, so the bar shows a total and an ETA.
for i, gs in enumerate(tqdm.tqdm(grid_search)):

    # The 'saga' solver shuffles the data, so fix the seed to make the sweep
    # reproducible. A 'random_state' entry in the grid would take precedence.
    # 'params' below still stores gs unchanged.
    model = LogisticRegression(**{'random_state': 42, **gs})
    model.fit(X_train_arr, y_train_arr)
    y_pred = model.predict(X_val_arr)
    # Column 1 of predict_proba is the probability of model.classes_[1].
    y_pred_proba = model.predict_proba(X_val_arr)[:,1]
    auc = roc_auc_score(y_val_arr, y_pred_proba)
    acc = accuracy_score(y_val_arr, y_pred)
    hp_summary[i] = {'auc': auc, 'acc': acc, 'params': gs}


77it [04:48,  7.02s/it]

In [None]:
# Refit the top result from the hp summary on train+val and score it on the
# held-out test split.

# Pick the grid entry with the highest validation AUC (first one wins on ties).
best_hp = max(hp_summary, key=lambda x: hp_summary[x]['auc'])
best_params = hp_summary[best_hp]['params']
# Fix the seed so the refit is reproducible with a shuffling solver such as
# 'saga'; a 'random_state' already present in best_params takes precedence.
model = LogisticRegression(**{'random_state': 42, **best_params})

model.fit(X_trainval, y_trainval)
y_pred = model.predict(X_test)
# Column 1 of predict_proba is the probability of model.classes_[1].
y_pred_proba = model.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)

print(f'Best HP: {best_params}')
print(f'Test AUC: {auc}')
# Accuracy was previously computed but never shown.
print(f'Test accuracy: {acc}')

