In [1]:
import os
import sys
import pandas as pd
import numpy as np
import json
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold

from misc import save_json, get_dropbox_dir

from sklearn_models import sklearn_fit_eval_wrapper, sklearn_fitCV_eval_wrapper


import sklearn
print('sklearn version: ', sklearn.__version__)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

sklearn version:  1.3.2


In [2]:
# Root data directory: X.csv, nans.csv, y.csv and the fold folders are read from here.
# Set the RCC_SUBSET_DIR environment variable to run on another machine; when it is
# unset, the original author's local Dropbox path is used unchanged.
subset_dir = os.environ.get(
    'RCC_SUBSET_DIR',
    '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/March_12_Data',
)

In [3]:
# Feature table: one row per sample, one column per feature.
# The first CSV column is used as the row index.
x_csv_path = os.path.join(subset_dir, 'X.csv')
X_data = pd.read_csv(x_csv_path, index_col=0)

In [4]:
# Missing-value mask stored next to X.csv; the first CSV column is used as the row index.
# Presumably a truthy entry marks a value that was missing in the raw data -- TODO confirm.
nan_csv_path = os.path.join(subset_dir, 'nans.csv')
nan_mask = pd.read_csv(nan_csv_path, index_col=0)


In [5]:
# Per-sample metadata table (first CSV column is the row index).
# Later cells read its 'Set' column and the label column from it.
y_csv_path = os.path.join(subset_dir, 'y.csv')
y_data = pd.read_csv(y_csv_path, index_col=0)

In [6]:
# Report the dimensions of the feature table (rows = samples, columns = features).
n_samples, n_features = X_data.shape
print('number of samples: ', n_samples)
print('number of features: ', n_features)

number of samples:  17685
number of features:  2736


In [7]:
def _files_in_set(set_name):
    """Return the index labels of the rows of ``y_data`` whose 'Set' column equals ``set_name``."""
    in_set = y_data['Set'] == set_name
    return y_data[in_set].index.to_list()


# Sample identifiers for each partition recorded in the 'Set' column.
pretrain_files = _files_in_set('Pretrain')
finetune_files = _files_in_set('Finetune')
holdout_test_files = _files_in_set('Test')
holdout_val_files = _files_in_set('Validation')


In [8]:
# Per-feature fraction of samples flagged in nan_mask, computed per cohort.
finetune_nan_frac = nan_mask.loc[finetune_files].sum(axis=0) / len(finetune_files)
pretrain_nan_frac = nan_mask.loc[pretrain_files].sum(axis=0) / len(pretrain_files)

# Frequency = 1 - flagged fraction.
finetune_freq = 1 - finetune_nan_frac
pretrain_freq = 1 - pretrain_nan_frac

# Per-feature variance across the finetune samples only.
finetune_var = X_data.loc[finetune_files].var(axis=0)

# One row per feature, one column per statistic.
temp = pd.concat(
    [finetune_freq, pretrain_freq, finetune_var],
    axis=1,
    keys=['finetune_freq', 'pretrain_freq', 'finetune_var'],
)
print(temp.shape)

(2736, 3)


In [9]:
# Folder that holds the fold definitions for this stratification column.
stratify_col = 'MSKCC BINARY'
splits_dir = os.path.join(subset_dir, f'{stratify_col} finetune_folds')

# Load the RepeatedStratifiedKFold parameters stored with the folds.
# (Original note: although 50 splits were created, only 5 are used for finetuning.)
splits_info_path = os.path.join(splits_dir, 'splits_info.json')
with open(splits_info_path, 'r') as info_file:
    rskf_info = json.load(info_file)

rskf_params = rskf_info['rskf_params']
print(rskf_params)

# Override the stored repeat count before building the splitter. This updates
# rskf_params in place, so the dict nested inside rskf_info changes as well.
rskf_params.update(n_repeats=100)
rskf = RepeatedStratifiedKFold(**rskf_params)

# Fold table saved next to splits_info.json (first CSV column is the row index).
splits = pd.read_csv(os.path.join(splits_dir, 'splits.csv'), index_col=0)

{'n_splits': 5, 'n_repeats': 10, 'random_state': 42}


In [10]:
# Column of y_data used as the finetuning target.
finetune_label_col = 'MSKCC BINARY'

# Flag: drop samples whose value in the label column is NaN.
yes_dropna = True

# Per-task output folder, nested inside the splits folder; created if it does not exist.
task_dir = os.path.join(splits_dir, finetune_label_col)
os.makedirs(task_dir, exist_ok=True)

In [11]:
# --- Feature-selection thresholds ---
# Quantile of the finetune variance used as the minimum-variance cutoff.
# NOTE (carried over from the original comment): 0.05 is the intended quantile,
# but the NN optimization was reportedly not updated to use it.
finetune_var_q = 0.05
# Minimum frequency required in the finetune cohort.
finetune_freq_th = 0.9
# Minimum frequency required in the pretrain cohort (alternative noted in the original: 0.35).
pretrain_freq_th = 0.3

# Turn the quantile into an absolute variance cutoff
# (fixed cutoffs of 0.5 / 0.75 appear as alternatives in the original comments).
finetune_var_th = temp['finetune_var'].quantile(finetune_var_q)
print(finetune_var_th)

# A feature is kept only if it passes all three criteria.
passes_var = finetune_var >= finetune_var_th
passes_finetune_freq = finetune_freq >= finetune_freq_th
passes_pretrain_freq = pretrain_freq >= pretrain_freq_th
finetune_filter = passes_var & passes_finetune_freq & passes_pretrain_freq

temp_filter = temp.loc[finetune_filter]
print(temp_filter.shape)

# Feature-name lists used downstream: the filtered subset and the full set.
filtered_feats = temp_filter.index.tolist()
all_feats = temp.index.tolist()

# Mean over the kept features of (1 - mean of nan_mask over ALL samples).
per_feature_freq = 1 - nan_mask[filtered_feats].mean(axis=0)
overall_freq = per_feature_freq.mean()
print(f'Overall frequency of chosen features: {overall_freq:.3f}')

0.8965741889899261
(930, 3)
Overall frequency of chosen features: 0.549


In [12]:
# Create the Train, Val, and Test data sets

def _labeled_subset(file_ids, label_col, dropna=True):
    """Select the feature rows and labels for one data partition.

    Parameters
    ----------
    file_ids : list
        Index labels (shared by ``X_data`` and ``y_data``) of the partition.
    label_col : str
        Column of ``y_data`` holding the target.
    dropna : bool, default True
        When True, samples whose label is NaN are removed from both outputs.

    Returns
    -------
    (X, y) : tuple
        Rows of ``X_data`` and the matching label Series, with identical index.
    """
    X = X_data.loc[file_ids]
    y = y_data.loc[file_ids, label_col]
    if dropna:
        y = y.dropna()
    # Re-select X with y's index so features and labels stay aligned.
    X = X.loc[y.index]
    return X, y


# The previously unused yes_dropna flag now controls whether unlabeled samples are removed.
X_train, y_train = _labeled_subset(finetune_files, finetune_label_col, dropna=yes_dropna)
X_val, y_val = _labeled_subset(holdout_val_files, finetune_label_col, dropna=yes_dropna)
X_test, y_test = _labeled_subset(holdout_test_files, finetune_label_col, dropna=yes_dropna)

## Run RandomizedSearchCV to choose the optimal model

In [13]:
# Cross-validated hyper-parameter search, disabled by default.
# Set RUN_CV_SEARCH = True to re-run the search and rebuild the summary table.
RUN_CV_SEARCH = False

if RUN_CV_SEARCH:

    model_kinds = ['logistic_regression','svc', 'random_forest']

    feat_filt_names = ['filtered peaks', 'all peaks']
    chosen_fts_list = [filtered_feats, all_feats]
    subdir = 'classical_models_alt2'

    output_dir = os.path.join(task_dir, subdir)
    os.makedirs(output_dir, exist_ok=True)

    for model_kind in model_kinds:
        print(model_kind)
        for feat_filt_name, chosen_fts in zip(feat_filt_names, chosen_fts_list):
            print(feat_filt_name)
            model_name = f'{model_kind} optimal'  + f'_{feat_filt_name}'
            summary_path = os.path.join(output_dir, f'{model_name}_summary.json')

            data_dict = {'X_train': X_train[chosen_fts],
                        'y_train': y_train,
                        'X_val': X_val[chosen_fts],
                        'y_val': y_val,
                        'X_test': X_test[chosen_fts],
                        'y_test': y_test}

            if os.path.exists(summary_path):
                # A summary for this model is already on disk: load it and skip the refit.
                print(f'{model_name} already exists')

                with open(summary_path, 'r') as f:
                    model_summary = json.load(f)

                param_kwargs = model_summary['best_params']
                continue

            else:
                out = sklearn_fitCV_eval_wrapper(data_dict=data_dict,
                                                model_kind=model_kind,
                                                output_dir=output_dir,
                                                model_name=model_name,
                                                cv = rskf,
                                                n_iter=20)

    # compile results into one table

    # Sorted so the row order of the summary CSV is reproducible across runs.
    output_files = sorted(os.listdir(output_dir))
    output_summary_files = [f for f in output_files if f.endswith('summary.json')]
    other_files = [f for f in output_files if f not in output_summary_files]

    all_res = []
    df_cols = ['model_kind','model_name','n_input ft','cv_score','cv_score_std','train_score','val_score', 'test_score', 'test_score (fit on train+val)']
    for f in output_summary_files:
        print(f)
        model_name = f.split('_summary.json')[0]
        # Context manager closes the file handle that json.load(open(...)) used to leak.
        with open(os.path.join(output_dir, f), 'r') as summary_file:
            res = json.load(summary_file)
        res_df = pd.DataFrame({k: res[k] for k in df_cols}, index=[model_name])
        all_res.append(res_df)

    if all_res:
        res_summary = pd.concat(all_res, axis=0)
        res_summary = res_summary.round(4)
        res_summary.to_csv(os.path.join(task_dir, f'{subdir}_summary.csv'))
    else:
        # pd.concat raises on an empty list; report instead of failing.
        print(f'no summary files found in {output_dir}')

## Use the validation set to choose the best model

In [14]:
# Model families to fit, and the two feature sets each one is fitted on.
model_kinds = ['logistic_regression', 'random_forest', 'svc']
feat_filt_names = ['filtered peaks', 'all peaks']
chosen_fts_list = [filtered_feats, all_feats]

# Outputs of this run go to <task_dir>/classical_models_alt4.
subdir = 'classical_models_alt4'
output_dir = os.path.join(task_dir, subdir)
os.makedirs(output_dir, exist_ok=True)

for model_kind in model_kinds:
    print(model_kind)
    for feat_filt_name, chosen_fts in zip(feat_filt_names, chosen_fts_list):
        print(feat_filt_name)
        model_name = f'{model_kind} optimal_{feat_filt_name}'

        # Restrict every partition to the chosen feature columns.
        data_dict = dict(
            X_train=X_train[chosen_fts],
            y_train=y_train,
            X_val=X_val[chosen_fts],
            y_val=y_val,
            X_test=X_test[chosen_fts],
            y_test=y_test,
        )

        # Only fit when no '<model_name>_summary.csv' is present in the output folder.
        # NOTE(review): whether the wrapper writes a file with exactly this name is not
        # visible here -- confirm, otherwise this check never skips and every run refits.
        summary_csv = os.path.join(output_dir, f'{model_name}_summary.csv')
        if not os.path.exists(summary_csv):
            # No param_grid is passed here (the original had it commented out).
            sklearn_fit_eval_wrapper(
                data_dict=data_dict,
                model_kind=model_kind,
                output_dir=output_dir,
                model_name=model_name,
                n_iter=100,
            )


logistic_regression
filtered peaks
all peaks
random_forest
filtered peaks
400


all peaks
398
svc
filtered peaks
77
all peaks
78
