In [1]:
import os
import sys
from  study_alignment.utils import get_dropbox_dir
import pandas as pd
import numpy as np
import json
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold

from ml.prep import SimpleDataset

from study_alignment.standardize import fill_na_by_cohort, standardize_across_cohorts

In [2]:

# Root of the shared Dropbox folder, as returned by the project helper.
dropbox_dir = get_dropbox_dir()

# Alignment run folder, plus the feature-list and alignment/grid-search folders inside it.
base_dir = os.path.join(dropbox_dir, 'development_CohortCombination', 'alignment_RCC_2024_Feb_27')
matt_ft_dir = os.path.join(base_dir, 'matt_top_fts')
data_dir = os.path.join(base_dir, 'alignment_id_36', 'grid_search_index_1')

# Feature subset whose X.csv / nans.csv / y.csv are used by the rest of the notebook.
subset_id = "subset_robust_Freq, Cohort Log Size Weighted_0.2_rem_['549', '551', '547']_recompute"
subset_dir = os.path.join(data_dir, subset_id)

In [3]:
# Helper function to get the key features
def get_key_ft_dct(ref_freq=0.6, matt_ft_dir=matt_ft_dir):
    """Collect the named key-feature lists plus the RCC target-matched features.

    Parameters
    ----------
    ref_freq : float
        Minimum value of the ``freq`` column in ``rcc_result/peak_info.csv``
        for a reference-cohort peak to be kept.
    matt_ft_dir : str
        Directory containing ``.txt`` feature lists, one feature id per line.
        The dictionary key for a file is the part of its name before ``'_feats'``.
        The default is bound to the notebook-level ``matt_ft_dir`` at the time
        this cell is executed.

    Returns
    -------
    matt_ft_dict : dict
        Maps each list name to its list of feature ids, plus an
        ``'rcc_targets'`` entry with the features flagged as
        ``potential_target`` among the peaks that passed the ``ref_freq`` filter.
    rcc_peak_info_df : pandas.DataFrame
        ``peak_info.csv`` restricted to rows with ``freq >= ref_freq``.

    Raises
    ------
    KeyError
        If a frequency-filtered peak id is absent from the matched-targets table.
    """
    matt_ft_dict = {}
    for f in os.listdir(matt_ft_dir):
        if not f.endswith('.txt'):
            continue
        ft_name = f.split('_feats')[0]
        with open(os.path.join(matt_ft_dir, f), 'r') as file:
            raw_lines = file.read().splitlines()
        # remove all of the ', and commas from the strings in the list
        ft = [x.strip(',').strip(' ').strip('"').strip("'").strip('\n').strip('\t') for x in raw_lines]
        # Blank lines would otherwise be counted as (empty-string) features.
        ft = [x for x in ft if x]
        matt_ft_dict[ft_name] = ft
        print(ft_name + ': ' + str(len(ft)))

    # --- RCC target metabolites ---
    rcc_peak_info_file = os.path.join(base_dir, 'rcc_result', 'peak_info.csv')
    rcc_peak_info_df = pd.read_csv(rcc_peak_info_file, index_col=0)

    rcc_peak_info_df = rcc_peak_info_df[rcc_peak_info_df['freq'] >= ref_freq].copy()

    print(f'Number of peaks in the reference cohort after {ref_freq} filter: ', rcc_peak_info_df.shape[0])

    rcc_matched_targets_file = os.path.join(base_dir,'rcc_result', 'matched_targets HILIC POSITIVE ION MODE.csv')
    rcc_matched_targets_df = pd.read_csv(rcc_matched_targets_file, index_col=0)
    # FIX: the original evaluated this selection and threw the result away, so the
    # ref_freq filter never reached the target features returned below.
    rcc_matched_targets_df = rcc_matched_targets_df.loc[rcc_peak_info_df.index].copy()

    potential_feats = rcc_matched_targets_df[rcc_matched_targets_df['potential_target']].index.to_list()
    print('Number of features that potentially match to a target metabolite: ', len(potential_feats))

    double_match_ids = rcc_matched_targets_df[rcc_matched_targets_df['potential_target_count'] > 1]
    print('Number of features that potentially match to more than one target metabolite: ', double_match_ids.shape[0])
    print(rcc_matched_targets_df.loc[double_match_ids.index, 'potential_target_id'])

    # here are the double matches in RCC, two are the same metabolite (but different adducts?)
    # FT3202                           tryptophanTryptophan_μM
    # FT3237                           kynurenineKynurenine_μM
    # FT8451    C18:1 LPC plasmalogen_AC18:1 LPC plasmalogen_B

    potential_feat_names = rcc_matched_targets_df.loc[potential_feats]['potential_target_id'].unique()
    print(potential_feat_names)

    print('Number of target metabolite captured: ', len(potential_feat_names))

    # for now don't remove the double counts, since they are NOT actually double counts
    # add to the matt ft dictionary
    matt_ft_dict['rcc_targets'] = potential_feats

    return matt_ft_dict, rcc_peak_info_df

In [4]:
# Build the key-feature dictionary and the frequency-filtered RCC peak table
# using the function defaults (ref_freq=0.6, notebook-level matt_ft_dir).
matt_ft_dict, rcc_peak_info_df = get_key_ft_dct()

top_10: 10
top_25: 25
168_os_pfs: 168
net_matched: 86
Number of peaks in the reference cohort after 0.6 filter:  4016
Number of features that potentially match to a target metabolite:  188
Number of features that potentially match to more than one target metabolite:  3
feats
FT3202                           tryptophanTryptophan_μM
FT3237                           kynurenineKynurenine_μM
FT8451    C18:1 LPC plasmalogen_AC18:1 LPC plasmalogen_B
Name: potential_target_id, dtype: object
['trimethylamine-N-oxide' 'alanine' 'GABA' 'serine' 'hypotaurine'
 'cytosine' 'creatinine' 'betaine' 'threonine' 'niacinamide' 'taurine'
 'ornithine' 'N-acetylalanine' 'N-carbamoyl-beta-alanine'
 'N-methylproline' 'leucine' 'hydroxyproline' 'N-acetylputrescine'
 '1-methylnicotinamide' 'trigonelline' 'anthranilic acid' 'urocanic acid'
 'imidazole propionate' 'ectoine' 'proline-betaine' 'glutamate'
 '4-acetamidobutanoate' 'butyrobetaine' 'glutamine' 'lysine' 'methionine'
 'N1-methyl-2-pyridone-5-carboxamide' 

In [5]:
# Free-form description string; it is written under the 'desc_str' key of each
# splits_info.json produced by the split cells below.
desc_str = 'x'

In [6]:
# Encoded RCC metadata, stored with the other RCC results under base_dir.
# Built from base_dir (like the peak_info / matched_targets paths) rather than
# embedding a hard-coded '/' inside a path component.
rcc_metadata_file = os.path.join(base_dir, 'rcc_result', 'clean_metadata_encoded.csv')
rcc_metadata = pd.read_csv(rcc_metadata_file, index_col=0)

In [7]:
# Number of RCC samples assigned to each value of the 'Set' column.
rcc_metadata['Set'].value_counts()

Set
Pretrain      903
Finetune      449
Test          149
Validation    143
Name: count, dtype: int64

In [8]:
# Load the feature matrix, its NaN mask and the y table for the selected subset.
# All three CSVs are read with their first column as the index.
X_data, nan_mask, y_data = (
    pd.read_csv(os.path.join(subset_dir, csv_name), index_col=0)
    for csv_name in ('X.csv', 'nans.csv', 'y.csv')
)


## Clean up the y_data

In [8]:
# previous age group was wrong
# y_data.drop('Age_Group', axis=1, inplace=True)

In [26]:
# Replace the stale 'Set' / 'Matt Set' columns with the ones from rcc_metadata.
# errors='ignore' keeps the cell re-runnable when the columns are already gone,
# and the drop returns a new frame instead of mutating y_data in place.
y_data = y_data.drop(columns=['Set', 'Matt Set'], errors='ignore')
missing_y_cols = [c for c in rcc_metadata.columns if c not in y_data.columns]
print(missing_y_cols)

# Left join: keep exactly the samples already in y_data (which must stay aligned
# with X_data). An outer concat would silently append metadata-only rows.
y_data = y_data.join(rcc_metadata[missing_y_cols], how='left')

['Matt Set', 'Set']


In [27]:
# Samples without a 'Set' value (i.e. not covered by rcc_metadata) are treated as pretraining data.
# Assign the result back: calling fillna(inplace=True) on the selected column is
# chained assignment, which is deprecated and has no effect under pandas Copy-on-Write.
y_data['Set'] = y_data['Set'].fillna("Pretrain")

In [28]:
# Sample counts per 'Set' value after the missing entries were filled with "Pretrain".
y_data['Set'].value_counts()

Set
Pretrain      16944
Finetune        449
Test            149
Validation      143
Name: count, dtype: int64

In [29]:
# Persist the updated y table.
# NOTE(review): this writes to the same subset_dir/y.csv that y_data was loaded from,
# so the original file is replaced on disk — confirm this is intended.
y_data.to_csv(os.path.join(subset_dir, 'y.csv'))

In [30]:
# Report the dimensions of the feature matrix (rows = samples, columns = features).
n_samples, n_features = X_data.shape
print('number of samples: ', n_samples)
print('number of features: ', n_features)

number of samples:  17685
number of features:  2736


In [31]:
# Sample counts per 'Set' value; these sets define the file lists built a few cells below.
y_data['Set'].value_counts()

Set
Pretrain      16944
Finetune        449
Test            149
Validation      143
Name: count, dtype: int64

In [32]:
# check how many of the key features are missing
# For every key-feature list, report the fraction of its features that are
# present among the columns of X_data.
kept_fts = X_data.columns.to_list()
kept_ft_lookup = set(kept_fts)

for key_name, key_feat in matt_ft_dict.items():
    n_captured = sum(ft in kept_ft_lookup for ft in key_feat)
    perct_captured = n_captured / len(key_feat)
    print(f'Percentage of {key_name} captured: {perct_captured}')

Percentage of top_10 captured: 1.0
Percentage of top_25 captured: 0.92
Percentage of 168_os_pfs captured: 0.7976190476190477
Percentage of net_matched captured: 0.8488372093023255
Percentage of rcc_targets captured: 0.8457446808510638


In [33]:
# Sample ids (index labels of y_data) belonging to each value of the 'Set' column.
sample_set = y_data['Set']
pretrain_files = sample_set.index[sample_set == 'Pretrain'].to_list()
finetune_files = sample_set.index[sample_set == 'Finetune'].to_list()
holdout_test_files = sample_set.index[sample_set == 'Test'].to_list()
holdout_val_files = sample_set.index[sample_set == 'Validation'].to_list()

## Stratified Split the Pretraining data

In [16]:
# Pretraining folds: stratify on the expanded cohort id, 5 folds x 2 repeats, fixed seed.
stratify_col = 'Cohort ID Expanded'
rskf_params = dict(n_splits=5, n_repeats=2, random_state=42)

# One folder per stratification column, created inside the subset directory.
save_dir = os.path.join(subset_dir, f'{stratify_col} pretrain_folds')
os.makedirs(save_dir, exist_ok=True)


In [17]:
# Generate the repeated stratified folds over the pretraining samples and store,
# for every fold, a boolean column that is True for the held-out (test) samples.
rskf = RepeatedStratifiedKFold(**rskf_params)
pretrain_strata = y_data.loc[pretrain_files, stratify_col]

rskf_list = []
for _, held_out_positions in rskf.split(pretrain_files, pretrain_strata):
    is_held_out = np.zeros(len(pretrain_files), dtype=bool)
    is_held_out[held_out_positions] = True
    rskf_list.append(is_held_out)

rskf_df = pd.DataFrame(
    {f'fold_{fold_num}': fold_mask for fold_num, fold_mask in enumerate(rskf_list)},
    index=pretrain_files,
)
rskf_df.to_csv(os.path.join(save_dir, 'splits.csv'))

# Sidecar JSON describing how splits.csv was produced and how to read its booleans.
rskf_info = {
    'rskf_params': rskf_params,
    'stratify_col': stratify_col,
    'desc_str': desc_str,
    'size': rskf_df.shape[0],
    'Train Bool' : False,
    'Test Bool' : True,
}

with open(os.path.join(save_dir, 'splits_info.json'), 'w') as f:
    json.dump(rskf_info, f, indent=4)





## Stratified Split the Finetune data, using a single column

In [48]:
# Finetune folds stratified on one column: 5 folds x 10 repeats, fixed seed.
stratify_col = 'MSKCC BINARY'  # alternative used previously: 'Nivo Benefit BINARY'
yes_remove_nans = True  # drop samples whose stratification label is missing
rskf_params = dict(n_splits=5, n_repeats=10, random_state=42)

# Work on a copy of the finetune rows so y_data itself is not modified.
metadata_subset = y_data.loc[finetune_files].copy()

save_dir = os.path.join(subset_dir, f'{stratify_col} finetune_folds')
os.makedirs(save_dir, exist_ok=True)

In [49]:
# Largest value present in the stratification column (quick look at the label encoding).
metadata_subset[stratify_col].max()

1.0

In [50]:
# Build repeated stratified folds over the finetune samples using a single label column.
rskf = RepeatedStratifiedKFold(**rskf_params)

# Missing stratification labels: either drop those samples, or give them their
# own extra class (current maximum + 1) so they can still be stratified.
strata_missing = metadata_subset[stratify_col].isna()
if strata_missing.any():
    if yes_remove_nans:
        metadata_subset = metadata_subset[~strata_missing]
    else:
        print('There are nans in the stratify column')
        fill_val = metadata_subset[stratify_col].max() + 1
        metadata_subset[stratify_col] = metadata_subset[stratify_col].fillna(fill_val)

# One boolean mask per fold; True marks the held-out (test) samples.
rskf_list = []
for _, held_out_positions in rskf.split(metadata_subset.index, metadata_subset[stratify_col]):
    is_held_out = np.zeros(metadata_subset.shape[0], dtype=bool)
    is_held_out[held_out_positions] = True
    rskf_list.append(is_held_out)

rskf_df = pd.DataFrame(
    {f'fold_{fold_num}': fold_mask for fold_num, fold_mask in enumerate(rskf_list)},
    index=metadata_subset.index,
)
rskf_df.to_csv(os.path.join(save_dir, 'splits.csv'))

# Sidecar JSON describing how splits.csv was produced and how to read its booleans.
rskf_info = {
    'rskf_params': rskf_params,
    'stratify_col': stratify_col,
    'desc_str': desc_str,
    'size': rskf_df.shape[0],
    'Train Bool' : False,
    'Test Bool' : True,
    'remove nans': yes_remove_nans
}

with open(os.path.join(save_dir, 'splits_info.json'), 'w') as f:
    json.dump(rskf_info, f, indent=4)

## Stratified Split the Finetune data, using multiple columns

In [43]:
# Finetune folds stratified on a combination of columns: 5 folds x 10 repeats, fixed seed.
rskf_params = dict(n_splits=5, n_repeats=10, random_state=42)

# Columns whose values are combined into one stratification label.
# Combinations used previously: ['MSKCC','Treatment','Benefit'] and ['MSKCC','Benefit'].
stratify_cols =['Treatment','Benefit']
stratify_col = 'multiple_categories'

# Work on a copy of the finetune rows and add the combined label, e.g. 'NIVOLUMAB_CB'.
metadata_subset = y_data.loc[finetune_files].copy()
metadata_subset[stratify_col] = metadata_subset[stratify_cols].apply('_'.join, axis=1)

# The folder name embeds the string form of the column list.
save_dir = os.path.join(subset_dir, f'{stratify_cols} finetune_folds')
os.makedirs(save_dir, exist_ok=True)

In [44]:
# Number of finetune samples in each combined stratification class.
metadata_subset['multiple_categories'].value_counts()

multiple_categories
EVEROLIMUS_ICB    97
NIVOLUMAB_ICB     86
NIVOLUMAB_NCB     80
NIVOLUMAB_CB      76
EVEROLIMUS_NCB    55
EVEROLIMUS_CB     55
Name: count, dtype: int64

In [45]:
# Build repeated stratified folds over the finetune samples using the combined
# multi-column label; True in a fold column marks the held-out (test) samples.
rskf = RepeatedStratifiedKFold(**rskf_params)

rskf_list = []
for _, held_out_positions in rskf.split(metadata_subset.index, metadata_subset[stratify_col]):
    is_held_out = np.zeros(len(metadata_subset), dtype=bool)
    is_held_out[held_out_positions] = True
    rskf_list.append(is_held_out)

rskf_df = pd.DataFrame(
    {f'fold_{fold_num}': fold_mask for fold_num, fold_mask in enumerate(rskf_list)},
    index=metadata_subset.index,
)
rskf_df.to_csv(os.path.join(save_dir, 'splits.csv'))

# Sidecar JSON describing how splits.csv was produced and how to read its booleans.
rskf_info = {
    'rskf_params': rskf_params,
    'stratify_col': stratify_col,
    # 'stratify_col' is only the name of the synthetic combined column; also record
    # the real columns it was built from so the split can be reproduced.
    'stratify_cols': list(stratify_cols),
    'desc_str': desc_str,
    'size': rskf_df.shape[0],
    'Train Bool' : False,
    'Test Bool' : True,
}

with open(os.path.join(save_dir, 'splits_info.json'), 'w') as f:
    json.dump(rskf_info, f, indent=4)



## Run Classical Models on the desired tasks

In [47]:
# Show the subset directory that the following cells read splits from and write results to.
print(subset_dir)

/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/alignment_id_36/grid_search_index_1/subset_robust_Freq, Cohort Log Size Weighted_0.2_rem_['549', '551', '547']_recompute


In [51]:
# what are the available splits?

# List the split folders available in the subset directory
# (any entry whose name contains '_folds').
subset_dir_files = os.listdir(subset_dir)
split_dirs = list(filter(lambda entry: '_folds' in entry, subset_dir_files))
print(split_dirs)


['Nivo Benefit BINARY finetune_folds', 'MSKCC BINARY finetune_folds', "['MSKCC', 'Benefit'] finetune_folds", 'Cohort ID Expanded pretrain_folds', "['Treatment', 'Benefit'] finetune_folds", "['MSKCC', 'Treatment', 'Benefit'] finetune_folds", 'x_finetune_folds']


In [52]:
from ml.sklearn_models import run_train_sklearn_model


In [53]:
# Copy of the y_data rows for the finetune samples; it is referenced by the
# commented-out `finetune_filter` options in the task-configuration cell.
rcc3_baseline = y_data.loc[finetune_files].copy()

In [56]:
# --- Task configuration -------------------------------------------------------
# desc_str selects which '<desc_str> finetune_folds' split folder is used.
# Alternatives used previously:
# desc_str =['MSKCC','Treatment','Benefit']
# desc_str = 'Nivo Benefit BINARY'
desc_str = 'MSKCC BINARY'

save_dir = os.path.join(subset_dir, f'{desc_str} finetune_folds')

# Label column, optional value mapper (empty = use the column as is) and optional
# list of sample ids to restrict to (empty = keep every sample in splits.csv).
# Alternative configurations used previously:
# finetune_label_col = 'Benefit'
# finetune_label_mapper  = {'CB': 1, 'NCB': 0, 'ICB': np.nan}
# finetune_filter = rcc3_baseline[rcc3_baseline['Treatment'].isin(['NIVOLUMAB'])].index.to_list()

# finetune_label_col = 'MSKCC'
# finetune_label_mapper  = {'FAVORABLE': 1, 'POOR': 0, 'INTERMEDIATE': np.nan}
# finetune_filter = rcc3_baseline.index.to_list()

# finetune_label_col = 'Nivo Benefit BINARY'
# finetune_label_mapper = {}
# finetune_filter = []

finetune_label_col = 'MSKCC BINARY'
finetune_label_mapper = {}
finetune_filter = []

print(finetune_label_col)
print(finetune_label_mapper)

# Results for this task go into a sub-folder of the split folder named after the label column.
task_dir = os.path.join(save_dir, finetune_label_col)

# --- Load the folds and align the data to them --------------------------------
splits_df = pd.read_csv(os.path.join(save_dir, 'splits.csv'), index_col=0)
if len(finetune_filter) > 0:
    print('number of samples after the finetune filter:' , len(finetune_filter))
    splits_df = splits_df.loc[finetune_filter].copy()

X = X_data.loc[splits_df.index]
y = y_data.loc[splits_df.index]
nans = nan_mask.loc[splits_df.index].astype(bool)

n_folds = splits_df.shape[1]  # one column per fold in splits.csv
yes_dropna = True  # the model cell drops samples whose label is NaN when this is set

# --- Record the task description next to the results --------------------------
task_info = {
    'label_col': finetune_label_col,
    'label_mapper': finetune_label_mapper,
    'filter': finetune_filter,
    'desc_str': desc_str,
    'size': splits_df.shape[0],
    'folds': splits_df.shape[1],
    'dropna': yes_dropna,
}

if finetune_label_mapper:
    # FIX: a trailing comma here used to wrap the dict in a 1-tuple, so the JSON
    # contained a list around the counts whenever a label mapper was set.
    task_info['size by label'] = y[finetune_label_col].map(finetune_label_mapper).value_counts().to_dict()
else:
    task_info['size by label'] = y[finetune_label_col].value_counts().to_dict()

os.makedirs(task_dir, exist_ok=True)
with open(os.path.join(task_dir, 'task_info.json'), 'w') as f:
    json.dump(task_info, f, indent=4)

print(task_dir)

MSKCC BINARY
{}
/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/alignment_id_36/grid_search_index_1/subset_robust_Freq, Cohort Log Size Weighted_0.2_rem_['549', '551', '547']_recompute/MSKCC BINARY finetune_folds/MSKCC BINARY


In [57]:

# Train default logistic regression / random forest / SVC models on every fold of
# the selected task, summarise train/test AUROC per model, and collect the
# per-model summaries into <task_dir>/classical_summary.csv.
output_dir = os.path.join(task_dir, 'classical_models')
# FIX: create the directory the summaries are written to (the original re-created save_dir).
os.makedirs(output_dir, exist_ok=True)

if finetune_label_mapper:
    y_values = y[finetune_label_col].map(finetune_label_mapper)
else:
    y_values = y[finetune_label_col]

if yes_dropna:
    print('dropping nan values in the y column')
    y_values = y_values.dropna()
    X = X.loc[y_values.index]
    splits_df = splits_df.loc[y_values.index]

    print('number of samples after dropping nan values in the y column: ', y_values.shape[0])

for model_kind in ['logistic_regression', 'random_forest', 'svc']:
    gather_output = []
    model_name = model_kind + '_default'
    output_summary_file = os.path.join(output_dir, f'{model_name}_summary.csv')
    # FIX: the loop variable was named `subset_id`, which overwrote the notebook-level
    # subset_id path component with an integer.
    for fold_id in range(n_folds):
        # In splits.csv True marks the held-out samples of a fold.
        fold_is_test = splits_df[f'fold_{fold_id}'].astype(bool)
        train_idx = splits_df.index[~fold_is_test]
        test_idx = splits_df.index[fold_is_test]

        finetune_dataset = SimpleDataset(X.loc[train_idx], y_values.loc[train_idx])
        test_dataset = SimpleDataset(X.loc[test_idx], y_values.loc[test_idx])
        data_dict = {'train': finetune_dataset, 'test': test_dataset}

        output_data = run_train_sklearn_model(data_dict,output_dir,
                                model_kind = model_kind,
                                model_name=f'{model_name}_{fold_id}',
                                param_grid={})

        gather_output.append(output_data)

    ## Summarize the important results
    val_auroc_list = [output['end_state_auroc']['test'] for output in gather_output]
    train_auroc_list = [output['end_state_auroc']['train'] for output in gather_output]
    auc_summary = pd.DataFrame({
                    'train_auroc': train_auroc_list,
                    'test_auroc': val_auroc_list,})

    # Mean and standard deviation over folds, stacked into a single column.
    result_summary_avg = auc_summary.mean()
    result_summary_std = auc_summary.std()
    result_summary_avg.index = [f'AVG {col}' for col in result_summary_avg.index]
    result_summary_std.index = [f'STD {col}' for col in result_summary_std.index]
    result_summary = pd.concat([result_summary_avg, result_summary_std])
    result_summary.to_csv(output_summary_file)


# Gather every '*summary.csv' in output_dir into one table (one column per model).
output_files = os.listdir(output_dir)
output_summary_files = [f for f in output_files if f.endswith('summary.csv')]
other_files = [f for f in output_files if f not in output_summary_files]

all_res = []

for f in output_summary_files:
    print(f)
    model_name = f.split('_summary.csv')[0]
    res = pd.read_csv(os.path.join(output_dir, f), index_col=0)
    res.columns = [model_name]
    all_res.append(res)

# delete the other files to save room
for f in other_files:
    other_path = os.path.join(output_dir, f)
    # os.remove raises on directories, so only plain files are deleted.
    if os.path.isfile(other_path):
        os.remove(other_path)

res_summary = pd.concat(all_res, axis=1)
res_summary = res_summary.round(4)
res_summary.to_csv(os.path.join(task_dir, 'classical_summary.csv'))


dropping nan values in the y column
number of samples after dropping nan values in the y column:  240
svc_default_summary.csv
logistic_regression_default_summary.csv
random_forest_default_summary.csv


In [None]:
# Display the per-model AUROC summary (AVG / STD over folds), rounded to 3 decimals.
res_summary.round(3)

Unnamed: 0,svc_default,logistic_regression_default,random_forest_default
AVG train_auroc,0.999,1.0,1.0
AVG test_auroc,0.902,0.901,0.88
STD train_auroc,0.0,0.0,0.0
STD test_auroc,0.036,0.036,0.039
