In [1]:
import os
import sys
import pandas as pd
import numpy as np
import json
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold

from misc import save_json, get_dropbox_dir


import sklearn
print('sklearn version: ', sklearn.__version__)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

sklearn version:  1.3.2


In [71]:
def my_sklearn_fitter(data_dict, param_grid, 
                      output_dir, model_kind, model_name,
                      cv=None, n_iter=10, scoring_func=None, verbose=0):
    """Tune one classical classifier with a randomized search and score it on every split.

    The search is fit on the training pair only; the refit best estimator is then
    scored (weighted ROC AUC on the second `predict_proba` column) on the train,
    validation and test pairs. A JSON summary is written to
    ``<output_dir>/<model_name>_summary.json`` via the project helper `save_json`.

    Parameters
    ----------
    data_dict : dict
        Must provide 'X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test'.
        The X entries must expose `.shape`.
    param_grid : dict
        Forwarded to RandomizedSearchCV as `param_distributions`.
    output_dir : str
        Directory that receives the summary file.
    model_kind : str
        One of 'logistic_regression', 'random_forest', 'decision_tree', 'svc'.
    model_name : str
        Label stored in the summary and used in the output file name.
    cv : cross-validator, optional
        Must expose `get_n_splits()`. Defaults to
        RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=42).
    n_iter : int
        Number of parameter settings sampled by the search.
    scoring_func : callable, optional
        Scorer for the search. Defaults to a weighted ROC AUC scorer on probabilities.
    verbose : int
        Verbosity forwarded to RandomizedSearchCV.

    Returns
    -------
    dict
        The summary that was written to disk.

    Raises
    ------
    ValueError
        If `model_kind` is not one of the supported kinds.
    """
    # Resolve the estimator first so an unsupported kind fails immediately with a
    # clear message instead of surfacing later as an unbound `base_model`.
    if model_kind == 'logistic_regression':
        base_model = LogisticRegression(max_iter=1000)
    elif model_kind == 'random_forest':
        base_model = RandomForestClassifier()
    elif model_kind == 'decision_tree':
        base_model = DecisionTreeClassifier()
    elif model_kind == 'svc':
        base_model = SVC(probability=True)
    else:
        raise ValueError(
            f"Unknown model_kind {model_kind!r}; expected one of "
            "'logistic_regression', 'random_forest', 'decision_tree', 'svc'"
        )

    if cv is None:
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=42)

    if scoring_func is None:
        # depends on which sklearn version we are using
        # scoring_func = make_scorer(roc_auc_score, average='weighted', response_method='predict_proba')
        scoring_func = make_scorer(roc_auc_score, average='weighted', needs_proba=True)

    model = RandomizedSearchCV(base_model, 
                               param_distributions=param_grid, 
                               n_iter=n_iter, 
                               cv=cv, 
                               scoring=scoring_func, 
                               verbose=verbose, 
                               random_state=1010,
                               n_jobs=1)
    X_train, y_train = data_dict['X_train'], data_dict['y_train']
    X_val, y_val = data_dict['X_val'], data_dict['y_val']
    X_test, y_test = data_dict['X_test'], data_dict['y_test']

    model.fit(X_train, y_train)

    # Best-ranked candidate first; its mean/std over the CV folds is the CV score.
    model_results = pd.DataFrame(model.cv_results_).sort_values('rank_test_score')
    best_row = model_results.iloc[0]
    cv_score = best_row['mean_test_score']
    cv_score_std = best_row['std_test_score']
    print(f'Train CV score: {cv_score} +/- {cv_score_std}')

    def _weighted_auc(X, y):
        # Score the refit best estimator on the probability of the second class.
        return roc_auc_score(y, model.predict_proba(X)[:, 1], average='weighted')

    train_score = _weighted_auc(X_train, y_train)
    print(f'Training score: {train_score}')

    val_score = _weighted_auc(X_val, y_val)
    print(f'Validation score: {val_score}')

    test_score = _weighted_auc(X_test, y_test)
    print(f'Test score: {test_score}')

    model_summary = {
        'model_kind': model_kind,
        'model_name': model_name,
        'n_input ft': X_train.shape[1],
        'param_grid' : param_grid,
        'best_params': model.best_params_,
        'score name': 'roc_auc (weighted)',
        'cv_score': cv_score,
        'cv_score_std': cv_score_std,
        'train_score': train_score,
        'val_score': val_score,
        'test_score': test_score,
        'train sz': X_train.shape[0],
        'val sz': X_val.shape[0],
        'test sz': X_test.shape[0],
        'n_trials': n_iter,
        'n_folds': cv.get_n_splits(),
    }

    save_json(model_summary, os.path.join(output_dir, f'{model_name}_summary.json'))

    return model_summary


# Root of the Dropbox tree, as reported by the project helper `misc.get_dropbox_dir`.
dropbox_dir = get_dropbox_dir()
# Working directory for this alignment run; every input read by get_key_ft_dct lives below it.
base_dir = os.path.join(dropbox_dir, 'development_CohortCombination','alignment_RCC_2024_Feb_27')

# Minimum 'freq' a reference-cohort peak must have to survive the filter in get_key_ft_dct.
ref_freq = 0.6
# input_freq = 0.1
# NOTE(review): grid_id is not referenced anywhere in the visible part of the notebook.
grid_id = 1
# Folder of '*.txt' feature lists loaded by get_key_ft_dct.
matt_ft_dir = os.path.join(base_dir, 'matt_top_fts')

# %%
def get_key_ft_dct():
    """Load the named feature lists and the frequency-filtered reference peak table.

    Reads the module-level settings `matt_ft_dir`, `base_dir` and `ref_freq`.

    Every '*.txt' file in `matt_ft_dir` becomes one entry of the returned dict:
    the key is the file name up to '_feats', the value is the list of feature
    ids (one per line of the file). A final 'rcc_targets' entry holds the
    features flagged as `potential_target` in the matched-targets table.

    Returns
    -------
    (dict, pandas.DataFrame)
        The feature-list dict, and the rows of 'rcc_result/peak_info.csv'
        whose 'freq' is at least `ref_freq`.
    """
    matt_ft_files = [f for f in os.listdir(matt_ft_dir) if f.endswith('.txt')]

    matt_ft_dict = {}
    for f in matt_ft_files:
        ft_name = f.split('_feats')[0]
        with open(os.path.join(matt_ft_dir, f), 'r') as file:
            raw_lines = file.read().splitlines()
        # Strip commas, quotes and whitespace in ONE pass. Chaining separate
        # .strip() calls is order-dependent: "'FT1', " would come out as "FT1',".
        ft = [x.strip(' ,"\'\t\n') for x in raw_lines]
        # Blank lines are not features; drop them so the counts stay honest.
        ft = [x for x in ft if x]
        matt_ft_dict[ft_name] = ft
        print(ft_name + ': ' + str(len(ft)))

    # --- RCC target metabolites ---
    rcc_peak_info_file = os.path.join(base_dir, 'rcc_result', 'peak_info.csv')
    rcc_peak_info_df = pd.read_csv(rcc_peak_info_file, index_col=0)

    rcc_peak_info_df = rcc_peak_info_df[rcc_peak_info_df['freq'] >= ref_freq].copy()

    print(f'Number of peaks in the reference cohort after {ref_freq} filter: ', rcc_peak_info_df.shape[0])

    rcc_matched_targets_file = os.path.join(base_dir,'rcc_result', 'matched_targets HILIC POSITIVE ION MODE.csv')
    rcc_matched_targets_df = pd.read_csv(rcc_matched_targets_file, index_col=0)
    # The result of this lookup is discarded, so its only effect is to raise
    # KeyError when a filtered peak has no row in the matched-targets table.
    # NOTE(review): if the intent was to restrict the targets to the
    # frequency-filtered peaks, the result must be assigned back; that would
    # change the counts printed below, so it is left untouched here.
    rcc_matched_targets_df.loc[rcc_peak_info_df.index]

    potential_feats = rcc_matched_targets_df[rcc_matched_targets_df['potential_target']].index.to_list()
    print('Number of features that potentially match to a target metabolite: ', len(potential_feats))

    double_match_ids = rcc_matched_targets_df[rcc_matched_targets_df['potential_target_count'] > 1]
    print('Number of features that potentially match to more than one target metabolite: ', double_match_ids.shape[0])
    print(rcc_matched_targets_df.loc[double_match_ids.index, 'potential_target_id'])

    # here are the double matches in RCC, two are the same metabolite (but different adducts?)
    # FT3202                           tryptophanTryptophan_μM
    # FT3237                           kynurenineKynurenine_μM
    # FT8451    C18:1 LPC plasmalogen_AC18:1 LPC plasmalogen_B

    potential_feat_names = rcc_matched_targets_df.loc[potential_feats]['potential_target_id'].unique()
    print(potential_feat_names)

    print('Number of target metabolite captured: ', len(potential_feat_names))

    # for now don't remove the double counts, since they are NOT actually double counts
    matt_ft_dict['rcc_targets'] = potential_feats

    return matt_ft_dict, rcc_peak_info_df


# Helper functions for finding the number and percentage of captured features
def get_captured_fts(matt_ft_list, align_ft_list):
    """Return the entries of `matt_ft_list` that are also in `align_ft_list`.

    Order and duplicates of `matt_ft_list` are preserved; membership is
    decided with the `in` operator of `align_ft_list`.
    """
    found = []
    for feature in matt_ft_list:
        if feature in align_ft_list:
            found.append(feature)
    return found

def get_captured_perc(matt_ft_list, align_ft_list):
    """Return the captured fraction relative to each of the two lists.

    Returns
    -------
    (float, float)
        (captured / len(matt_ft_list), captured / len(align_ft_list)).
        A fraction whose denominator list is empty is reported as 0.0
        instead of raising ZeroDivisionError.
    """
    n_captured = len(get_captured_fts(matt_ft_list, align_ft_list))
    n_matt = len(matt_ft_list)
    n_align = len(align_ft_list)
    matt_capture_perc = n_captured / n_matt if n_matt else 0.0
    align_capture_perc = n_captured / n_align if n_align else 0.0
    return matt_capture_perc, align_capture_perc


In [79]:
def optimize_with_val(model, data_dict, param_grid, n_iter=20, random_state=None):
    """Score randomly sampled parameter settings on the train/val/test splits.

    For each sampled setting the model is fit on the training split and scored
    (weighted ROC AUC on the second `predict_proba` column) on train, val and
    test; it is then refit on train+val and scored on test again.

    Parameters
    ----------
    model : estimator
        Must expose `set_params`, `fit` and `predict_proba`. It is mutated:
        after the call it holds the last setting, fit on train+val.
    data_dict : dict
        Must provide 'X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test'
        as pandas objects (train and val are joined with `pd.concat`).
    param_grid : dict
        Maps parameter name to the list of candidate values.
    n_iter : int
        Maximum number of distinct settings evaluated; 3*n_iter are drawn and
        de-duplicated first.
    random_state : int, optional
        Seed for the sampling. With None the global NumPy random state is used,
        so an earlier `np.random.seed(...)` call is still honoured.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated setting: the parameter values, the four scores,
        and the constant columns 'train sz', 'val sz', 'test sz', 'n_trials'
        and 'n_peaks'.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Sample by index rather than np.random.choice: choice() converts the
    # candidates to one NumPy array, which silently turns e.g. the int 2 in
    # [0.05, 2] into 2.0, a value sklearn rejects for min_samples_split.
    param_combinations = []
    seen = set()
    for _ in range(3*n_iter):
        param_kwargs = {}
        for param_name, candidates in param_grid.items():
            candidates = list(candidates)
            param_kwargs[param_name] = candidates[rng.randint(len(candidates))]

        # Drop duplicates while keeping draw order, so the truncation below
        # does not depend on set iteration order.
        key = tuple(param_kwargs.items())
        if key not in seen:
            seen.add(key)
            param_combinations.append(param_kwargs)

    print(len(param_combinations))
    param_combinations = param_combinations[:n_iter]

    X_train, y_train = data_dict['X_train'], data_dict['y_train']
    X_val, y_val = data_dict['X_val'], data_dict['y_val']
    X_test, y_test = data_dict['X_test'], data_dict['y_test']

    # The joined train+val set is the same for every setting; build it once.
    X_train_val = pd.concat([X_train, X_val], axis=0)
    y_train_val = pd.concat([y_train, y_val], axis=0)

    def _weighted_auc(X, y):
        return roc_auc_score(y, model.predict_proba(X)[:, 1], average='weighted')

    all_res = []
    for model_param in param_combinations:
        param = dict(model_param)
        model.set_params(**model_param)

        model.fit(X_train, y_train)
        param['train_score (fit on train)'] = _weighted_auc(X_train, y_train)
        param['val_score (fit on train)'] = _weighted_auc(X_val, y_val)
        param['test_score (fit on train)'] = _weighted_auc(X_test, y_test)

        model.fit(X_train_val, y_train_val)
        param['test_score (fit on train+val)'] = _weighted_auc(X_test, y_test)

        all_res.append(param)

    res_summary = pd.DataFrame(all_res)
    res_summary['train sz'] = X_train.shape[0]
    res_summary['val sz'] = X_val.shape[0]
    res_summary['test sz'] = X_test.shape[0]
    res_summary['n_trials'] = n_iter
    res_summary['n_peaks'] = X_train.shape[1]

    return res_summary

In [3]:
# Derived from base_dir rather than a hardcoded '/Users/<name>/...' path so the
# notebook runs on any machine where get_dropbox_dir() resolves.
# NOTE(review): assumes get_dropbox_dir() returns the Dropbox root that the old
# absolute path started with - confirm the printed subset_dir is unchanged.
subset_dir = os.path.join(base_dir, 'March_12_Data')

In [4]:
# Load the named feature lists and the frequency-filtered reference peak table
# (this call also prints a short report on the matched target metabolites).
matt_ft_dict, rcc_peak_info_df = get_key_ft_dct()


top_10: 10
top_25: 25
168_os_pfs: 168
net_matched: 86
Number of peaks in the reference cohort after 0.6 filter:  4016
Number of features that potentially match to a target metabolite:  188
Number of features that potentially match to more than one target metabolite:  3
feats
FT3202                           tryptophanTryptophan_μM
FT3237                           kynurenineKynurenine_μM
FT8451    C18:1 LPC plasmalogen_AC18:1 LPC plasmalogen_B
Name: potential_target_id, dtype: object
['trimethylamine-N-oxide' 'alanine' 'GABA' 'serine' 'hypotaurine'
 'cytosine' 'creatinine' 'betaine' 'threonine' 'niacinamide' 'taurine'
 'ornithine' 'N-acetylalanine' 'N-carbamoyl-beta-alanine'
 'N-methylproline' 'leucine' 'hydroxyproline' 'N-acetylputrescine'
 '1-methylnicotinamide' 'trigonelline' 'anthranilic acid' 'urocanic acid'
 'imidazole propionate' 'ectoine' 'proline-betaine' 'glutamate'
 '4-acetamidobutanoate' 'butyrobetaine' 'glutamine' 'lysine' 'methionine'
 'N1-methyl-2-pyridone-5-carboxamide' 

In [5]:
# Feature matrix; the first CSV column is used as the row index.
X_data = pd.read_csv(os.path.join(subset_dir, 'X.csv'), index_col=0)

In [6]:
# Missing-value mask; later cells sum/average it per column to get detection frequencies.
# NOTE(review): presumably truthy where the value in X.csv was originally missing - confirm.
nan_mask = pd.read_csv(os.path.join(subset_dir, 'nans.csv'), index_col=0)


In [7]:
# Sample metadata; later cells read its 'Set' column and the label column 'MSKCC BINARY'.
y_data = pd.read_csv(os.path.join(subset_dir, 'y.csv'), index_col=0)

In [8]:
# Report the size of the feature matrix (rows, then columns).
print(f'number of samples:  {X_data.shape[0]}')
print(f'number of features:  {X_data.shape[1]}')

number of samples:  17685
number of features:  2736


In [9]:
# Sample ids of each predefined partition, read from the 'Set' column of the metadata.
pretrain_files = y_data.loc[y_data['Set'].eq('Pretrain')].index.tolist()
finetune_files = y_data.loc[y_data['Set'].eq('Finetune')].index.tolist()
holdout_test_files = y_data.loc[y_data['Set'].eq('Test')].index.tolist()
holdout_val_files = y_data.loc[y_data['Set'].eq('Validation')].index.tolist()


In [10]:
# Per-feature statistics used by the feature filters further down:
#   *_freq       = 1 - (sum of the nan mask over the subset) / (subset size)
#   finetune_var = variance of the feature over the finetune samples
finetune_freq = 1 - nan_mask.loc[finetune_files].sum(axis=0).div(len(finetune_files))
pretrain_freq = 1 - nan_mask.loc[pretrain_files].sum(axis=0).div(len(pretrain_files))
finetune_var = X_data.loc[finetune_files].var(axis=0)

# One row per feature, one named column per statistic.
temp = pd.concat(
    [finetune_freq, pretrain_freq, finetune_var],
    axis=1,
    keys=['finetune_freq', 'pretrain_freq', 'finetune_var'],
)
print(temp.shape)

(2736, 3)


In [11]:
# For every named feature list, report how many of its features are present
# among the peaks of the aligned data set.
print('In the peaks of size ', temp.shape[0])
for matt_ft_name, matt_ft_list in matt_ft_dict.items():
    captured_peaks = get_captured_fts(matt_ft_list, temp.index)
    n_captured, n_listed = len(captured_peaks), len(matt_ft_list)
    print(f'Number of {matt_ft_name} captured: {n_captured} out of {n_listed}: {n_captured/n_listed:.2f}')

In the peaks of size  2736
Number of top_10 captured: 10 out of 10: 1.00
Number of top_25 captured: 23 out of 25: 0.92
Number of 168_os_pfs captured: 134 out of 168: 0.80
Number of net_matched captured: 73 out of 86: 0.85
Number of rcc_targets captured: 159 out of 188: 0.85


In [13]:
# Display the data directory currently in use.
subset_dir

'/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/March_12_Data'

In [82]:
# Create a splits csv file in which the predefined validation files are the
# held-out (True) rows of every fold and the finetune files are the False rows.

stratify_col = 'MSKCC BINARY'
splits_dir = os.path.join(subset_dir, f'{stratify_col} predefined_val')
os.makedirs(splits_dir, exist_ok=True)
n_repeats = 100
yes_remove_nans = True
finetune_and_val = finetune_files + holdout_val_files
# Misspelled name kept as an alias in case other cells still refer to it.
finetuen_and_val = finetune_and_val

metadata_subset = y_data.loc[finetune_and_val].copy()

# check for nans
if metadata_subset[stratify_col].isna().any():
    if yes_remove_nans:
        metadata_subset = metadata_subset[~metadata_subset[stratify_col].isna()]
    else:
        print('There are nans in the stratify column')
        fill_val = metadata_subset[stratify_col].max() + 1
        metadata_subset[stratify_col] = metadata_subset[stratify_col].fillna(fill_val)

# Build the table from the NaN-filtered metadata. Previously it was indexed by
# every finetune+val file, so rows without a label stayed in splits.csv even
# though the info file recorded 'remove nans': True.
kept_files = metadata_subset.index.to_list()
is_val = metadata_subset.index.isin(holdout_val_files)

# All fold columns are identical (the validation set is fixed); create them in
# one go instead of inserting 100 columns one at a time.
rskf_df = pd.DataFrame({f'fold_{i}': is_val for i in range(n_repeats)},
                       index=kept_files)

rskf_df.to_csv(os.path.join(splits_dir, 'splits.csv'))

rskf_info = {
    'rskf_params': None,
    'stratify_col': stratify_col,
    'size': rskf_df.shape[0],
    'Train Bool' : False,
    'Test Bool' : True,
    'remove nans': yes_remove_nans
}

with open(os.path.join(splits_dir, 'splits_info.json'), 'w') as f:
    json.dump(rskf_info, f, indent=4)

In [59]:
# Point at the previously generated finetune folds and rebuild the matching
# cross-validator for the classical models.
stratify_col = 'MSKCC BINARY'
splits_dir = os.path.join(subset_dir, f'{stratify_col} finetune_folds')

# although 50 splits were created, only 5 are used for finetuning

# (the NN runs read 'split_info.json'; the classical models use their own info file)
with open(os.path.join(splits_dir, 'splits_info_classical.json')) as f:
    rskf_info = json.load(f)

rskf_params = rskf_info['rskf_params']
print(rskf_params)

# Override the stored repeat count before constructing the cross-validator.
rskf_params.update(n_repeats=100)
rskf = RepeatedStratifiedKFold(**rskf_params)

splits = pd.read_csv(os.path.join(splits_dir, 'splits.csv'), index_col=0)

{'n_splits': 5, 'n_repeats': 1, 'random_state': 42}


In [60]:
# NOTE(review): yes_dropna is not read anywhere in the visible cells; the
# train/val/test cell below always drops rows with a missing label.
yes_dropna = True #drops nan values from the label column

# Column of the metadata used as the classification target.
finetune_label_col = 'MSKCC BINARY'

# Outputs for this task are written below the folder of the chosen splits.
task_dir = os.path.join(splits_dir, finetune_label_col)
os.makedirs(task_dir, exist_ok=True)

In [61]:
# Feature filter with the variance threshold at the 5% quantile.
# A feature is kept when its finetune variance, finetune frequency and
# pretrain frequency all reach their thresholds.
finetune_var_q = 0.05
finetune_var_th = temp['finetune_var'].quantile(finetune_var_q)
print(finetune_var_th)
# finetune_var_th = 0.5#0.75
finetune_freq_th = 0.9
pretrain_freq_th = 0.3 #0.35

finetune_filter = (
    (finetune_var >= finetune_var_th)
    & (finetune_freq >= finetune_freq_th)
    & (pretrain_freq >= pretrain_freq_th)
)

temp_filter = temp.loc[finetune_filter]
print(temp_filter.shape)
chosen_feats = temp_filter.index.to_list()
all_feats = temp.index.to_list()


# How many features of each named list survive the filter.
print('In the peaks of size ', temp_filter.shape[0])
for matt_ft_name, matt_ft_list in matt_ft_dict.items():
    captured_peaks = get_captured_fts(matt_ft_list, temp_filter.index)
    n_captured, n_listed = len(captured_peaks), len(matt_ft_list)
    print(f'Number of {matt_ft_name} captured: {n_captured} out of {n_listed}: {n_captured/n_listed:.2f}')


# Mean over the kept features of (1 - mean of the nan mask over all samples).
overall_freq = (1 - nan_mask[chosen_feats].mean(axis=0)).mean()
print(f'Overall frequency of chosen features: {overall_freq:.3f}')

0.8965741889899261
(930, 3)
In the peaks of size  930
Number of top_10 captured: 2 out of 10: 0.20
Number of top_25 captured: 9 out of 25: 0.36
Number of 168_os_pfs captured: 50 out of 168: 0.30
Number of net_matched captured: 25 out of 86: 0.29
Number of rcc_targets captured: 80 out of 188: 0.43
Overall frequency of chosen features: 0.549


In [62]:
# Feature filter with the variance threshold at the 10% quantile; the result
# (`filtered_feats`) is the 'filtered peaks' set used by the model cells below.
finetune_var_q = 0.1
# finetune_var_q = 0.05 # meant to use this one, but forgot to change it in the NN optimization
finetune_var_th = temp['finetune_var'].quantile(finetune_var_q)
print(finetune_var_th)
# finetune_var_th = 0.5#0.75
finetune_freq_th = 0.9
pretrain_freq_th = 0.3 #0.35

finetune_filter = (finetune_var >= finetune_var_th) & (finetune_freq >= finetune_freq_th) & (pretrain_freq >= pretrain_freq_th)

temp_filter = temp[finetune_filter]
print(temp_filter.shape)
filtered_feats = temp_filter.index.to_list()
all_feats = temp.index.to_list()


# How many features of each named list survive the filter.
print('In the peaks of size ', temp_filter.shape[0])
for matt_ft_name, matt_ft_list in matt_ft_dict.items():
    captured_peaks = get_captured_fts(matt_ft_list, temp_filter.index)
    print(f'Number of {matt_ft_name} captured: {len(captured_peaks)} out of {len(matt_ft_list)}: {len(captured_peaks)/len(matt_ft_list):.2f}')

# Use the features selected in THIS cell. The copy-pasted line referred to
# `chosen_feats`, i.e. the selection made by the previous (5% quantile) cell.
overall_freq = (1 - nan_mask[filtered_feats].mean(axis=0)).mean()
print(f'Overall frequency of chosen features: {overall_freq:.3f}')

0.9353629930755045
(878, 3)
In the peaks of size  878
Number of top_10 captured: 1 out of 10: 0.10
Number of top_25 captured: 7 out of 25: 0.28
Number of 168_os_pfs captured: 43 out of 168: 0.26
Number of net_matched captured: 21 out of 86: 0.24
Number of rcc_targets captured: 73 out of 188: 0.39


In [63]:
# Create the Train, Val, and Test data sets

def _features_and_labels(files):
    """Return (X, y) for `files`, keeping only the samples whose label is not NaN."""
    labels = y_data.loc[files, finetune_label_col].dropna()
    features = X_data.loc[files].loc[labels.index]
    return features, labels


X_train, y_train = _features_and_labels(finetune_files)
X_val, y_val = _features_and_labels(holdout_val_files)
X_test, y_test = _features_and_labels(holdout_test_files)

In [64]:
# Search spaces for the classical models. Each value is the list of candidates
# that the randomized searches below sample from.

logistic_regression_param_grid = {
            # 'penalty': ['l1', 'l2'],
            # 'solver' : ['liblinear'],
            # scikit-learn spells the elastic-net penalty 'elasticnet' ('elastic'
            # is rejected), and 'saga' is the only solver that supports it.
            'penalty': ['elasticnet'],
            'solver': ['saga'],
            'l1_ratio': [0, 0.25, 0.33, 0.5, 0.66, 0.75, 1],
            'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10],
            'class_weight': ['balanced']
            }

random_forest_param_grid = {
                'n_estimators': [10, 20, 50, 100, 200, 400],
                'max_depth': [2, 3, 5, 10, None],
                # floats are fractions of the sample count, ints are absolute counts
                'min_samples_split': [0.05, 0.1, 0.2, 2, 5, 7, 10],
                'min_samples_leaf': [0.025, 0.05, 0.1, 2, 4, 6, 8],
                'bootstrap': [True, False],
                # 'auto' was removed from RandomForestClassifier in scikit-learn 1.3
                # (it was an alias of 'sqrt' for classifiers), so it is not listed.
                'max_features': ['sqrt', 'log2'],
                'ccp_alpha': [0, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
                'class_weight': ['balanced']
}

decision_tree_param_grid = {
                'max_depth': [2, 3, 5, 10, None],
                'min_samples_split': [0.05, 0.1, 0.2, 2, 5, 7, 10],
                'min_samples_leaf': [0.025, 0.05, 0.1, 2, 4, 6, 8],
                'class_weight': ['balanced']
}

svc_param_grid = {
                'C': [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.66, 1, 1.5, 2, 5, 10],
                'kernel': ['linear', 'rbf', 'poly'], 
                'gamma': ['scale', 'auto'],
                # 'early_stopping': [True],
                'class_weight': ['balanced'],
                # predict_proba is required by the scoring code
                'probability': [True]
                # 'validation_fraction': [0.1, 0.2]
            }

In [81]:
# Fit (or, when a summary already exists, re-evaluate) each model kind on each
# feature set. Earlier single-model configuration, kept for reference:
# param_grids = [logistic_regression_param_grid]#, random_forest_param_grid, svc_param_grid]
# model_kinds = ['logistic_regression']#, 'random_forest', 'svc']

param_grids = [logistic_regression_param_grid,svc_param_grid]
model_kinds = ['logistic_regression','svc']

feat_filt_names = ['filtered peaks', 'all peaks']
chosen_fts_list = [filtered_feats, all_feats]
output_dir = os.path.join(task_dir, 'classical_models_alt2')
os.makedirs(output_dir, exist_ok=True)

# One fresh, untuned estimator per supported kind.
base_model_makers = {
    'logistic_regression': lambda: LogisticRegression(max_iter=1000),
    'random_forest': lambda: RandomForestClassifier(),
    'decision_tree': lambda: DecisionTreeClassifier(),
    'svc': lambda: SVC(probability=True),
}

for param_grid, model_kind in zip(param_grids, model_kinds):
    print(model_kind)
    for feat_filt_name, chosen_fts in zip(feat_filt_names, chosen_fts_list):
        print(feat_filt_name)
        model_name = f'{model_kind} optimal_{feat_filt_name}'

        data_dict = dict(
            X_train=X_train[chosen_fts],
            y_train=y_train,
            X_val=X_val[chosen_fts],
            y_val=y_val,
            X_test=X_test[chosen_fts],
            y_test=y_test,
        )

        summary_path = os.path.join(output_dir, f'{model_name}_summary.json')

        if not os.path.exists(summary_path):
            # No cached result: run the full randomized search.
            out = my_sklearn_fitter(data_dict, param_grid, 
                                output_dir, model_kind, model_name,
                                cv=rskf, n_iter=20, verbose=0)
            continue

        print(f'{model_name} already exists')

        with open(summary_path, 'r') as f:
            model_summary = json.load(f)

        # Reuse the stored best parameters, refit on train+val, score on test.
        param_kwargs = model_summary['best_params']
        base_model = base_model_makers[model_kind]()
        model = base_model.set_params(**param_kwargs)

        X_both = pd.concat([data_dict['X_train'], data_dict['X_val']], axis=0)
        y_both = pd.concat([data_dict['y_train'], data_dict['y_val']], axis=0)
        model.fit(X_both, y_both)

        y_test_proba = model.predict_proba(data_dict['X_test'])[:,1]
        test_score = roc_auc_score(data_dict['y_test'], y_test_proba, average='weighted')
        print(f'Test score: {test_score}')

        # Written to a separate '..._summary2.json' so the original summary stays intact.
        model_summary['test_score (fit on train and val)'] = test_score
        save_json(model_summary, os.path.join(output_dir, f'{model_name}_summary2.json'))

logistic_regression
filtered peaks
logistic_regression optimal_filtered peaks already exists
Test score: 0.8778280542986425
all peaks
logistic_regression optimal_all peaks already exists
Test score: 0.9193061840120663
svc
filtered peaks
svc optimal_filtered peaks already exists
Test score: 0.9087481146304676
all peaks
svc optimal_all peaks already exists
Test score: 0.9147812971342382


In [70]:
# Total number of train/test splits the cross-validator will generate.
rskf.get_n_splits()

500

In [67]:
# Gather every '*summary.json' in output_dir into one table and save it as CSV.
output_files = os.listdir(output_dir)
output_summary_files = [f for f in output_files if f.endswith('summary.json')]
other_files = [f for f in output_files if f not in output_summary_files]

df_cols = ['model_kind','model_name','n_input ft','cv_score','cv_score_std','train_score','val_score', 'test_score']
all_res = []
summary_index = []
for f in output_summary_files:
    print(f)
    model_name = f.split('_summary.json')[0]
    # `with` closes the handle; json.load(open(...)) left it to the garbage collector.
    with open(os.path.join(output_dir, f), 'r') as summary_file:
        res = json.load(summary_file)
    all_res.append({k: res[k] for k in df_cols})
    summary_index.append(model_name)

# Build the frame once from the collected records. Unlike pd.concat on a list
# of one-row frames, this also works (giving an empty table) when no summary
# file has been written yet.
res_summary = pd.DataFrame(all_res, index=summary_index, columns=df_cols)
res_summary = res_summary.round(4)
res_summary.to_csv(os.path.join(task_dir, 'classical_alt_summary.csv'))

logistic_regression optimal_filtered peaks_summary.json
svc optimal_filtered peaks_summary.json
svc optimal_all peaks_summary.json
logistic_regression optimal_all peaks_summary.json


## Do parameter optimization using the validation set

In [80]:
# Evaluate randomly sampled settings of every model kind on every feature set
# and store the per-setting scores as CSV.
param_grids = [logistic_regression_param_grid, random_forest_param_grid, svc_param_grid]
model_kinds = ['logistic_regression', 'random_forest', 'svc']


feat_filt_names = ['filtered peaks', 'all peaks']
chosen_fts_list = [filtered_feats, all_feats]
output_dir = os.path.join(task_dir, 'classical_models_alt3')
os.makedirs(output_dir, exist_ok=True)

# One fresh, untuned estimator per supported kind.
model_makers = {
    'svc': lambda: SVC(probability=True),
    'logistic_regression': lambda: LogisticRegression(max_iter=1000),
    'random_forest': lambda: RandomForestClassifier(),
    'decision_tree': lambda: DecisionTreeClassifier(),
}

for param_grid, model_kind in zip(param_grids, model_kinds):
    print(model_kind)
    for feat_filt_name, chosen_fts in zip(feat_filt_names, chosen_fts_list):
        print(feat_filt_name)
        model_name = f'{model_kind} optimal_{feat_filt_name}'

        data_dict = dict(
            X_train=X_train[chosen_fts],
            y_train=y_train,
            X_val=X_val[chosen_fts],
            y_val=y_val,
            X_test=X_test[chosen_fts],
            y_test=y_test,
        )

        model = model_makers[model_kind]()

        out = optimize_with_val(model, data_dict, param_grid,n_iter=20)

        out.to_csv(os.path.join(output_dir, f'{model_name}_optimal_by_val.csv'))


logistic_regression
filtered peaks
13
all peaks
14
random_forest
filtered peaks
55
all peaks
54
svc
filtered peaks
28
all peaks
27


In [None]:
# TODO: unfinished cell - it held only a bare `for` with no target or body,
# which is a SyntaxError. Fill in the loop or delete the cell.

In [41]:
# (number of labelled finetune samples, number of features)
X_train.shape

(241, 2736)

In [55]:
# Mean of the training labels (the positive-class fraction if the labels are 0/1).
y_train.mean()

0.6473029045643154

In [42]:
# Number of training labels; should match the row count of X_train.
y_train.shape

(241,)

In [53]:
# Mean of the test labels (the positive-class fraction if the labels are 0/1).
y_test.mean()

0.6623376623376623

In [46]:
# Class balance of the held-out test set.
y_test.value_counts()

MSKCC BINARY
1.0    51
0.0    26
Name: count, dtype: int64

In [54]:
# Mean of the validation labels (the positive-class fraction if the labels are 0/1).
y_val.mean()

0.6410256410256411

In [58]:
# Average number of non-missing training labels selected (True) by each of the
# first five columns of `splits`.
# NOTE(review): presumably True marks the held-out part of a fold - confirm against the splits info file.
np.mean([y_train.loc[splits.iloc[:,x]].count() for x in range(0,5)])

48.2

In [56]:
# Mean training label among the rows selected (True) by each of the first five
# columns of `splits`, averaged over those columns.
np.mean([y_train.loc[splits.iloc[:,x]].mean() for x in range(0,5)])

0.6472789115646259

In [57]:
# Same as the previous cell, but for the complementary (False) rows of each column.
np.mean([y_train.loc[~splits.iloc[:,x]].mean() for x in range(0,5)])

0.6473013816925735

In [52]:
# Mean training label among the False rows of the first column of `splits`.
y_train.loc[~splits.iloc[:,0]].mean()

0.6458333333333334