In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, recall_score
from sklearn.preprocessing import OneHotEncoder
#SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
#sys.path.append(os.path.dirname(SCRIPT_DIR))
#from config import RANDOM_STATE, N_SPLITS, features_path, targets_path, gwas_results_path, data_path

### Grid Search

In [2]:
# Hyper-parameters that are searched identically in every sub-grid below.
_shared_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'fit_intercept': [True, False],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000],
    'random_state': [42],
}

# Three sub-grids, one per solver/penalty combination that is searched:
#   1. lbfgs with an l2 penalty or no penalty,
#   2. liblinear with an l2 penalty, in both primal and dual formulation,
#   3. liblinear with an l1 penalty.
# NOTE(review): the string 'none' for `penalty` is only accepted by some
# scikit-learn releases -- confirm against the installed version.
param_grid = [
    {**_shared_grid, 'solver': ['lbfgs'], 'penalty': ['l2', 'none']},
    {**_shared_grid, 'solver': ['liblinear'], 'penalty': ['l2'], 'dual': [False, True]},
    {**_shared_grid, 'solver': ['liblinear'], 'penalty': ['l1']},
]

In [3]:
# Feature matrix and the per-feature p-values used to rank its columns.
data = np.load("../stroke_data/features")
pvals = np.load('../stroke_data/features_p_values')

# Labels are shifted down by one: (1, 2) -> (0, 1).
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# files from a trusted source.
raw_targets = np.load("../stroke_data/targets", allow_pickle=True)
targets = np.array(raw_targets) - 1

# Keep the 30000 columns with the smallest p-values, in ascending p-value order.
top_columns = np.argsort(pvals)[:30000]
data = data[:, top_columns]

In [6]:
# Base estimator; every hyper-parameter under study is supplied by param_grid.
lr = LogisticRegression()

# Recall computed with label 0 treated as the positive class.
recall_class_zero = make_scorer(recall_score, pos_label=0)

# Multi-metric grid search over param_grid with 5-fold cross-validation.
clf = GridSearchCV(
    lr,
    param_grid,
    scoring={'recall 0': recall_class_zero,
             'f1 macro': 'f1_macro',
             'accuracy': 'accuracy'},
    refit='recall 0',  # metric used to pick and refit the best candidate
    cv=5,
    verbose=0,
    error_score=0,     # a candidate whose fit fails is scored 0 instead of raising
    n_jobs=-1,         # run on all cores, consistent with the oh_clf search
)

In [7]:
%%capture output
clf.fit(data, targets)

In [16]:
# Parameter set with the highest mean cross-validated 'recall 0' score.
best_recall_idx = np.argmax(clf.cv_results_["mean_test_recall 0"])
clf.cv_results_["params"][best_recall_idx]

{'C': 0.01,
 'class_weight': 'balanced',
 'fit_intercept': True,
 'max_iter': 1000,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs'}

In [17]:
# One-hot encode every column of the selected feature matrix; the fitted
# encoder is kept in `enc` and the encoded matrix in `oh_data`.
enc = OneHotEncoder()
enc.fit(data)
oh_data = enc.transform(data)

In [20]:
# Fresh estimator and class-0 recall scorer for the second grid search.
lr = LogisticRegression()
recall_class_zero = make_scorer(recall_score, pos_label=0)

# Metrics reported for every candidate; 'recall 0' drives model selection.
oh_scoring = {
    'recall 0': recall_class_zero,
    'f1 macro': 'f1_macro',
    'accuracy': 'accuracy',
}

# Same grid, metrics and 5-fold CV as the `clf` search, run on all cores.
oh_clf = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring=oh_scoring,
    refit='recall 0',
    cv=5,
    verbose=0,
    error_score=0,
    n_jobs=-1,
)

In [21]:
%%capture output
oh_clf.fit(data, targets)

In [23]:
# Parameter set with the highest mean cross-validated 'recall 0' score
# in the oh_clf search.
oh_best_recall_idx = np.argmax(oh_clf.cv_results_["mean_test_recall 0"])
oh_clf.cv_results_["params"][oh_best_recall_idx]

{'C': 0.01,
 'class_weight': 'balanced',
 'fit_intercept': True,
 'max_iter': 1000,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs'}