In [1]:
import pandas as pd
import os
import numpy as np
from sklearn_models import run_train_sklearn_model
from prep import ClassifierDataset
import torch

from misc import get_dropbox_dir

# Root of the Dropbox folder, as returned by the project helper imported from `misc`.
dropbox_dir = get_dropbox_dir()

# Data directories of the two studies this notebook can be pointed at;
# the configuration cell below picks one of them as `data_dir`.
mskcc_dir = f'{dropbox_dir}/development_CohortCombination/mskcc_prediction_study_feb19'
benefit_dir = f'{dropbox_dir}/development_CohortCombination/benefit_study_feb20'

In [2]:
# Number of train_<i>/test_<i> subset pairs that the training loop iterates over.
finetune_n_subsets = 30

# Alternative configuration: the "Benefit" study. To switch studies, uncomment
# these three lines and comment out the MSKCC block below.
# data_dir = benefit_dir
# finetune_label_col = 'Benefit'
# finetune_label_encoder = {'CB': 1, 'NCB': 0, 'ICB': np.nan}


# Active configuration: the MSKCC study.
data_dir = mskcc_dir
finetune_label_col = 'MSKCC'
# Maps the raw label strings to a binary target; 'INTERMEDIATE' is encoded as NaN.
# NOTE(review): presumably ClassifierDataset excludes NaN-labelled rows — confirm in `prep`.
finetune_label_encoder  = {'FAVORABLE': 1, 'POOR': 0, 'INTERMEDIATE': np.nan}



In [3]:

# Train every classical model kind on each train/test subset pair and summarize
# the resulting AUROCs. One `<model_name>_summary.csv` is written to `save_dir`
# per model kind.
save_dir = os.path.join(data_dir, 'classical_models')
os.makedirs(save_dir, exist_ok=True)

# Summary of every model kind, keyed by model name. `result_summary` and
# `gather_output` are overwritten on each pass of the outer loop, so without
# this dict only the last model kind would still be inspectable afterwards.
all_result_summaries = {}

for model_kind in ['logistic_regression', 'random_forest', 'svc']:
    gather_output = []  # one output dict per subset, for the current model kind only
    model_name = model_kind + '_default'
    output_summary_file = os.path.join(save_dir, f'{model_name}_summary.csv')
    for subset_id in range(finetune_n_subsets):
        # Each subset id has a matching pair of splits: train_<id> and test_<id>.
        finetune_dataset = ClassifierDataset(data_dir,
                                    subset=f'train_{subset_id}',
                                    label_col=finetune_label_col,
                                    label_encoder=finetune_label_encoder)

        test_dataset = ClassifierDataset(data_dir,
                                    subset=f'test_{subset_id}',
                                    label_col=finetune_label_col,
                                    label_encoder=finetune_label_encoder)

        data_dict = {'train': finetune_dataset, 'test': test_dataset}

        # NOTE(review): the empty param_grid presumably means "no hyperparameter
        # search, use the model defaults" — confirm in `sklearn_models`.
        output_data = run_train_sklearn_model(data_dict, save_dir,
                                model_kind=model_kind,
                                model_name=f'{model_name}_{subset_id}',
                                param_grid={})

        gather_output.append(output_data)

    ## Summarize the important results
    # One row per subset; the 'test' split is what the original called "val".
    val_auroc_list = [output['end_state_auroc']['test'] for output in gather_output]
    train_auroc_list = [output['end_state_auroc']['train'] for output in gather_output]
    auc_summary = pd.DataFrame({
                    'train_auroc': train_auroc_list,
                    'test_auroc': val_auroc_list,})

    # Mean and standard deviation across subsets, labelled 'AVG <col>' / 'STD <col>'.
    result_summary_avg = auc_summary.mean().add_prefix('AVG ')
    result_summary_std = auc_summary.std().add_prefix('STD ')
    result_summary = pd.concat([result_summary_avg, result_summary_std])
    result_summary.to_csv(output_summary_file)

    all_result_summaries[model_name] = result_summary


In [4]:
# Shows the summary left over from the LAST model kind in the training loop ('svc');
# `result_summary` is overwritten on every pass, so earlier model kinds are only in their CSVs.
result_summary

AVG train_auroc    0.996171
AVG test_auroc     0.608014
STD train_auroc    0.001099
STD test_auroc     0.057410
dtype: float64

In [5]:
# Sanity check: one result per subset for the last model kind, so this should equal finetune_n_subsets.
len(gather_output)

30