# SurvivalLCS Experiment Runs

## Import and Setup

### Load packages

In [47]:
import os
import pandas as pd
import numpy as np
import random
import sys
import glob
from datetime import date
import argparse
from random import shuffle
from random import sample
import matplotlib.pyplot as plt
import sys
import shutil
import sksurv
import pickle
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from survival_LCS_pipeline import survivalLCS

In [48]:
# Add the project checkout to the import path.
# NOTE(review): presumably this is where the `survival_*` modules live; the
# `survival_LCS_pipeline` import in the previous cell runs before this line, so
# on a fresh kernel that import may need this path first — confirm the cell order.
sys.path.append("/home/bandheyh/common/survival-lcs-telo")

In [49]:
# Turn off matplotlib interactive mode.
plt.ioff()
# NOTE(review): the second call repeats the first and looks redundant.
plt.ioff()

<contextlib.ExitStack at 0x15551938b550>

### Import py scripts

In [50]:
import survival_AttributeTracking
import survival_Classifier
import survival_ClassifierSet
import survival_DataManagement
import survival_ExpertKnowledge
import survival_ExSTraCS
import survival_IterationRecord
import survival_Pareto
import survival_Prediction
import survival_StringEnumerator
import survival_OfflineEnvironment
import survival_Timer
import survival_RuleCompaction
import survival_Metrics
import utils
import nonparametric_estimators
import importGametes
import survival_data_simulator

### Test run scripts interactively

In [51]:
# %run -i survival_AttributeTracking.py
# %run -i survival_Classifier.py
# %run -i survival_ClassifierSet.py
# %run -i survival_DataManagement.py
# %run -i survival_ExpertKnowledge.py
# %run -i survival_ExSTraCS.py
# %run -i survival_IterationRecord.py
# %run -i survival_Pareto.py
# %run -i survival_Prediction.py
# %run -i survival_StringEnumerator.py
# %run -i survival_OfflineEnvironment.py
# %run -i survival_Timer.py
# %run -i survival_RuleCompaction.py
# %run -i survival_Metrics.py
# %run -i utils.py
# %run -i nonparametric_estimators.py

## Survival-LCS Parameters

### Set file names and necessary parameters

In [52]:
# When True, jobs are queued and dispatched through dask further down instead
# of being run inline in the job-building loop.
HPC = True

# Root directory for inputs (simulated_datasets/) and all generated output.
homedir = "/home/bandheyh/common/survival-lcs-telo/pipeline2"
# Model codes; get_parameters maps them to 'main_effect', '2way_epistasis',
# 'heterogeneous' and 'additive' respectively.
models = ['me', 'epi', 'het', 'add']
m0s = []  # NOTE(review): not referenced anywhere else in the visible notebook

# NOTE(review): presumably censoring proportions; not referenced by the visible cells — confirm.
c = [0.1,0.4,0.8]
# Feature-count and minor-allele-frequency tags used to build dataset file names.
# NOTE(review): the trailing comment below looks stale, 'f10000' is already in the list.
nfeat = ['f100','f1000', 'f10000'] #add f10000 when on cluster
maf = ['maf0.2','maf0.4']

# NOTE(review): presumably consumed by the survivalLCS pipeline; the visible cells never read them.
iterations = 100000
cv_splits = 5

# Flip to True for a reduced grid (one model, fewer iterations and CV splits).
DEBUG = False
if DEBUG:
    models = ['me']
    c = [0.1]
    nfeat = ['f100', 'f1000']
    maf = ['maf0.2', 'maf0.4']
    iterations = 1000
    cv_splits = 3

### Create empty brier score DataFrame
# Both frames are handed back unchanged by get_parameters.
brier_df = pd.DataFrame()
cox_brier_df = pd.DataFrame()

# other non-parameters

simulated = True # CHANGE THIS TO FALSE IF RUNNING REAL DATA

# lcs_run gates the Brier-score output in the job-building loop;
# dtype_list collects one "<model>_<nfeat>_<maf>" tag per get_parameters call.
lcs_run = True
dtype_list = []

### Import the survival_LCS pipeline

In [53]:
from survival_LCS_pipeline import survivalLCS

### Making the directory structure

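The pipeline writes its output to the following folders and subfolders INSIDE of the home directory. You do not need to create them by hand: the cells below create them automatically (the per-model subfolders are deleted and recreated on every run):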
1. cv_sim_data (with subfolders: cv_me, cv_epi, cv_het, cv_add)
2. pickled_cv_models (with subfolders: me, epi, het, add)
3. sim_lcs_output (with subfolders: me, epi, het, add)

In [54]:
def make_folder(path, overwrite=False):
    """Ensure that the directory ``path`` exists.

    A missing directory is created together with any missing parents.  An
    existing one is left untouched unless ``overwrite`` is true, in which
    case it is removed with all of its contents and recreated empty.

    Args:
        path: Directory to create.
        overwrite: Whether an existing directory should be wiped first.
    """
    already_exists = os.path.exists(path)
    if already_exists and not overwrite:
        return
    if already_exists:
        shutil.rmtree(path)
    os.makedirs(path)

In [55]:
def make_folder_structure(homedir, models, overwrite=True):
    """Create the output folder tree used by the pipeline under ``homedir``.

    The three top-level folders (``cv_sim_data``, ``pickled_cv_models``,
    ``sim_lcs_output``) are created if missing and otherwise kept.  For every
    entry of ``models`` the matching subfolder in each of them is created; an
    existing subfolder is deleted and recreated empty.

    Args:
        homedir: Root directory that receives the folder tree.
        models: Iterable of model name strings (e.g. ``'me'``, ``'epi'``).
        overwrite: Must be truthy; a non-overwriting mode is not implemented.

    Raises:
        NotImplementedError: If ``overwrite`` is false.
    """
    if not overwrite:
        # `NotImplemented` is a comparison sentinel, not an exception; raising
        # it produces a confusing TypeError instead of the intended error.
        raise NotImplementedError(
            "make_folder_structure only supports overwrite=True")

    # Top-level folders are never wiped, only created when missing.
    for top_level in ('cv_sim_data', 'pickled_cv_models', 'sim_lcs_output'):
        make_folder(homedir + '/' + top_level + '/')

    for model in models:
        make_folder(homedir+'/cv_sim_data/cv_' + model, overwrite=overwrite)
        make_folder(homedir+'/pickled_cv_models/' + model, overwrite=overwrite)
        make_folder(homedir+'/sim_lcs_output/' + model, overwrite=overwrite)

In [56]:
# NOTE: with the default overwrite=True this deletes and recreates every
# per-model subfolder, discarding output left there by previous runs.
make_folder_structure(homedir, models)

### Run the survival_LCS pipeline

In [57]:
def get_parameters(models, nfeat, maf, i, j, k):
    """Assemble paths and model descriptors for one simulated dataset.

    The dataset is selected by ``models[i]``, ``nfeat[j]`` and ``maf[k]``.
    Its "<model>_<nfeat>_<maf>" tag is appended to the module-level
    ``dtype_list``, and the dataset path and the tag are printed.

    Returns:
        A 12-tuple ``(g, mtype, d, m, o, e, brier_df, cox_brier_df, m0_path,
        m0_type, m1_path, m1_type)`` where ``g`` is the dataset file under
        ``simulated_datasets/``, ``d``/``m``/``o`` are the per-dataset paths
        under ``cv_sim_data``, ``pickled_cv_models`` and ``sim_lcs_output``,
        ``e`` is the fixed label ``"testallsims"``, and ``brier_df`` /
        ``cox_brier_df`` are the module-level frames passed through unchanged.
        ``m1_path`` and ``m1_type`` are ``None`` for the 'me' and 'epi' models.
    """
    model = models[i]
    model_name = str(model)
    freq = str(maf[k])
    dtype = model_name + '_' + str(nfeat[j]) + '_' + freq

    sim_root = homedir + '/simulated_datasets/EDM-1_one_of_each/'
    g = f'{sim_root}{dtype}_EDM-1_01.txt'
    dtype_list.append(dtype)
    print(g)

    d = f'{homedir}/cv_sim_data/cv_{model_name}/{dtype}'
    m = f'{homedir}/pickled_cv_models/{model_name}/{dtype}'
    o = f'{homedir}/sim_lcs_output/{model_name}/{dtype}'

    # The two model description files available for this allele frequency.
    me_models_file = f'{sim_root}model_files/me_h0.2_{freq}_Models.txt'
    epi_models_file = f'{sim_root}model_files/epi_h0.2_{freq}_Models.txt'

    # First model: main effect for me/add/het, epistasis for everything else.
    if model in ['me', 'add', 'het']:
        m0_path, m0_type = me_models_file, 'main_effect'
    else:
        m0_path, m0_type = epi_models_file, '2way_epistasis'

    # Second model: only present when the dataset is not plain 'me' or 'epi'.
    if model in ['me', 'epi']:
        m1_path, m1_type = None, None
    else:
        m1_path, m1_type = epi_models_file, '2way_epistasis'

    # Anything that is not me/epi/add is treated as heterogeneous.
    mtype = 'heterogeneous'
    for code, label in (('me', 'main_effect'),
                        ('epi', '2way_epistasis'),
                        ('add', 'additive')):
        if model == code:
            mtype = label
            break

    e = "testallsims"
    print(dtype)

    return (g, mtype, d, m, o, e, brier_df, cox_brier_df,
            m0_path, m0_type, m1_path, m1_type)






In [58]:
def run_slcs(survivalLCS):
    """Drive one pipeline object through all of its stages.

    Calls, in order, ``returnPenetrance``, ``returnSurvivalData``,
    ``returnCVDatasets``, ``returnCVModelFiles`` and ``returnIBSresults`` on
    the given object, prints its ``model_type`` and returns whatever
    ``returnIBSresults`` produced.  Result columns are not renamed here; the
    notebook renames them per dataset when it collects the results.

    Args:
        survivalLCS: Pipeline object exposing the methods listed above.
    """
    pipeline = survivalLCS
    pipeline.returnPenetrance()
    pipeline.returnSurvivalData()

    # Local switch, always on.
    # NOTE(review): if this were ever False, `current_ibs` would be unbound at
    # the return statement below.
    lcs_run = True

    if not lcs_run:
        print("Datasets generated only")
    else:
        pipeline.returnCVDatasets()
        pipeline.returnCVModelFiles()
        current_ibs = pipeline.returnIBSresults()

    print(pipeline.model_type)

    return current_ibs

In [59]:
def run(m):
    """Invoke ``m.run()`` and discard whatever it returns."""
    m.run()
    return None

In [60]:
def make_breir_output(brier_df_list, output_path, model_type, models, dtype_list, i):
    """Write the combined Brier-score table and plot for one model.

    The frames in ``brier_df_list`` are concatenated column-wise, saved as
    ``ibs_data_<model_type>.txt`` in the ``sim_lcs_output/<models[i]>`` folder
    below the module-level ``homedir``, and plotted (score with confidence
    band against time) to ``brier_scores_<model_type>.png`` in ``output_path``.

    Args:
        brier_df_list: DataFrames to combine.  Assumed to be indexed by
            ``times`` and to carry ``<dtype>``, ``<dtype>_ci_lower`` and
            ``<dtype>_ci_upper`` columns — TODO confirm against the pipeline.
        output_path: Directory that receives the PNG.
        model_type: Label used in both output file names.
        models: List of model codes; ``models[i]`` selects the output folder.
        dtype_list: Dataset tags whose columns should be plotted.
        i: Index into ``models``.
    """
    brier_df = pd.concat(brier_df_list, axis = 1, sort = False).reset_index()

    # Use the `model_type` argument; the original read an undefined global `mtype`.
    brier_df.to_csv(homedir +'/'+'sim_lcs_output/'+str(models[i])+'/ibs_data_'+model_type+'.txt', index = False)

    fig = plt.figure(figsize=(10, 10))
    plt.xlabel('Time')
    plt.ylabel('Brier score')
    plt.ylim(0,1)

    # dtype_list may hold tags of datasets that are not part of this table
    # (the notebook accumulates it across calls), so only plot tags that have
    # a column here.  dict.fromkeys drops duplicates while keeping order.
    # NOTE(review): the original loop started at index 1 and therefore never
    # plotted the first tag — confirm that was unintended.
    for dtype in dict.fromkeys(dtype_list):
        if dtype not in brier_df.columns:
            continue
        plt.plot(brier_df['times'], brier_df[dtype], label = dtype)
        plt.fill_between(brier_df['times'], brier_df[dtype+'_ci_lower'], brier_df[dtype+'_ci_upper'], color='b', alpha=.1)
    plt.savefig(output_path+'/brier_scores_'+model_type + '.png')
    # Release the figure; this function is called once per model/feature
    # combination and open figures would otherwise pile up.
    plt.close(fig)

In [71]:
class Test:
    """Minimal stand-in job object used to exercise the dispatch code."""

    def run(self):
        """Print a marker line and return the constant ``1``."""
        status = 1
        print("Running")
        return status

In [72]:
%%capture
# Build one job object per (model, nfeat, maf) combination.  With HPC set the
# jobs are only collected in job_obj_list for later dispatch; otherwise each
# job is run inline as soon as it is created.
from survival_LCS_pipeline import survivalLCS
job_obj_list = list()
for i in range(0,len(models)):
    for j in range(0,len(nfeat)):
        brier_df_list = list()
        for k in range(0,len(maf)):
            # The real pipeline construction is commented out below; the
            # Test() stand-in takes its place.
#             g, mtype, d, m, o, e,brier_df,cox_brier_df, m0_path, m0_type, m1_path, m1_type = get_parameters(models, nfeat, maf, i, j, k)
#             slcs = survivalLCS(g, mtype, d, m, o, e, cox_brier_df, m0_path, m0_type, m1_path, m1_type)
            test = Test()
            if HPC == False:
                current_ibs = test.run()
            else:
                job_obj_list.append(test)
        if HPC == False:
            if lcs_run == True:
                # NOTE(review): `survivalLCS` here is the imported class, not an
                # instance, and nothing in this cell appends to brier_df_list —
                # this inline path looks unfinished; confirm before using it.
                make_breir_output(brier_df_list, survivalLCS.output_path, survivalLCS.model_type, models, dtype_list, i)
            else:
                print('LCS not run, no brier scores available')

## HPC Code

In [73]:
import dask
from dask.distributed import Client
from dask_jobqueue import SLURMCluster, LSFCluster, SGECluster

In [74]:
def get_cluster(cluster_type='SLURM', output_path=".", queue='defq', memory=16):
    """Start a dask cluster and return a ``Client`` connected to it.

    Despite the name, the return value is the client, not the cluster object.
    Job-queue clusters use one core per job, adaptive scaling up to 400 jobs,
    and write their logs to ``<output_path>/dask_logs/``.

    Args:
        cluster_type: One of ``'SLURM'``, ``'LSF'``, ``'UGE'`` or ``'Local'``.
        output_path: Directory under which ``dask_logs/`` is placed.
        queue: Scheduler queue name (ignored for ``'Local'``).
        memory: Memory per job in gigabytes (ignored for ``'Local'``).

    Returns:
        A ``dask.distributed.Client`` attached to the new cluster.

    Raises:
        ValueError: If ``cluster_type`` is not one of the supported names.
        RuntimeError: If the cluster or client could not be started; the
            underlying error is chained as ``__cause__``.
    """
    supported = ('SLURM', 'LSF', 'UGE', 'Local')
    if cluster_type not in supported:
        # Reject bad input up front so the message is not hidden by the
        # generic start-up error handling below.
        raise ValueError(
            "Unknown or Unsupported Cluster Type: " + repr(cluster_type))

    mem_spec = str(memory) + "G"
    log_directory = output_path + "/dask_logs/"

    try:
        if cluster_type == 'Local':
            # Client() starts its own local cluster; no second client needed.
            client = Client()
        else:
            if cluster_type == 'SLURM':
                cluster = SLURMCluster(queue=queue,
                                       cores=1,
                                       memory=mem_spec,
                                       walltime="24:00:00",
                                       log_directory=log_directory)
            elif cluster_type == 'LSF':
                cluster = LSFCluster(queue=queue,
                                     cores=1,
                                     mem=memory * 1000000000,
                                     memory=mem_spec,
                                     walltime="24:00",
                                     log_directory=log_directory)
            else:  # 'UGE'
                cluster = SGECluster(queue=queue,
                                     cores=1,
                                     memory=mem_spec,
                                     resource_spec="mem_free=" + mem_spec,
                                     walltime="24:00:00",
                                     log_directory=log_directory)
            cluster.adapt(maximum_jobs=400)
            client = Client(cluster)
    except Exception as err:
        # Keep the real cause visible instead of a generic message.
        raise RuntimeError(
            f"Could not start {cluster_type} dask cluster: {err}") from err

    print("Running dask-cluster")
    print(client.scheduler_info())
    return client

In [78]:
# Despite its name, get_cluster returns the dask Client, so `cluster` is bound
# to the client here.  Logs go to <homedir>/dask_logs/.
cluster = get_cluster(output_path=homedir)

Running dask-cluster
{'type': 'Scheduler', 'id': 'Scheduler-d654c421-1349-44f2-8c93-ccca33ef7399', 'address': 'tcp://172.21.0.91:35577', 'services': {'dashboard': 34803}, 'started': 1732010123.6078544, 'workers': {}}


Perhaps you already have a cluster running?
Hosting the HTTP server on port 34803 instead


In [79]:
# (Re)create the dask log directory, wiping anything already in it.
# NOTE(review): this runs after get_cluster was already pointed at the same
# directory — confirm the ordering is intended.
make_folder(homedir+'/dask_logs/', overwrite=True)

In [80]:
def run_parallel(model, raise_errors=True):
    """Run one job object and return its result.

    Args:
        model: Object exposing a ``run()`` method.
        raise_errors: When true (the default) an exception raised by
            ``model.run()`` propagates to the caller.  When false the
            exception object is returned instead, so that one failing job
            does not abort a whole batch and failures can be inspected
            afterwards.

    Returns:
        The value returned by ``model.run()``, or the caught exception when
        ``raise_errors`` is false.
    """
    try:
        return model.run()
    except Exception as err:
        if raise_errors:
            # Bare raise keeps the original traceback intact.
            raise
        return err

In [81]:
# Show the queued job objects (one per model x nfeat x maf combination).
job_obj_list

[<__main__.Test at 0x155519c54d90>,
 <__main__.Test at 0x15551a1c9700>,
 <__main__.Test at 0x15551a1c9ac0>,
 <__main__.Test at 0x15551a1c90d0>,
 <__main__.Test at 0x15551a1c9b50>,
 <__main__.Test at 0x15551b886520>,
 <__main__.Test at 0x155519c54d60>,
 <__main__.Test at 0x155519367a30>,
 <__main__.Test at 0x155519c4aca0>,
 <__main__.Test at 0x155519367d90>,
 <__main__.Test at 0x1555193678b0>,
 <__main__.Test at 0x155519050070>,
 <__main__.Test at 0x15551b886fd0>,
 <__main__.Test at 0x155519050310>,
 <__main__.Test at 0x1555190509a0>,
 <__main__.Test at 0x155519050100>,
 <__main__.Test at 0x155519050640>,
 <__main__.Test at 0x1555190505e0>,
 <__main__.Test at 0x155519050940>,
 <__main__.Test at 0x155519050b80>,
 <__main__.Test at 0x1555190507c0>,
 <__main__.Test at 0x1555190508b0>,
 <__main__.Test at 0x1555190503d0>,
 <__main__.Test at 0x155519050bb0>]

In [82]:
# Wrap every queued job in a lazy dask task and evaluate them all at once.
# `results` is a tuple in the same order as job_obj_list.  A comprehension is
# used so that no loop variable leaks into the notebook namespace; the
# original loop rebound the module-level `brier_df` DataFrame to a Delayed
# object.
if HPC:
    delayed_results = [dask.delayed(run_parallel)(job) for job in job_obj_list]
    results = dask.compute(*delayed_results)

In [83]:
# Show the computed results (one entry per queued job, in queue order).
results

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

In [1]:
# if HPC:
#     results = dask.compute([dask.delayed(run_parallel)(model) for model in job_obj_list])

In [None]:
# Persist the raw results tuple so it can be reloaded without recomputing.
# NOTE(review): `results` is only defined when the HPC branch above ran.
with open(homedir+'/results.pkl', 'wb') as file:
    pickle.dump(results, file, pickle.HIGHEST_PROTOCOL)

### Error Checking

In [None]:
# Collect the positions of all jobs whose result is an exception object.
# isinstance(..., Exception) catches every error type and subclass; the
# original `type(...) == ValueError` check only matched plain ValueError.
error_idxs = list()
for idx, outcome in enumerate(results):
    if isinstance(outcome, Exception):
        print(idx, outcome)
        error_idxs.append(idx)



In [None]:
# Index grid with one cell per (model, nfeat, maf) combination; only its shape
# is used.  The reshape fails if len(results) does not match the grid size.
arr = np.arange(len(results)).reshape(len(models), len(nfeat), len(maf))

# Convert a 1D index to a 3D index
# Jobs were queued in nested-loop order (model, then nfeat, then maf), so the
# flat result index unravels to the same three positions.
for x in error_idxs:
    i, j, k = np.unravel_index(x, arr.shape)
    print(models[i], nfeat[j], maf[k])

### IBS Tables

In [None]:
# Combine the per-job IBS results into one wide table and save it as CSV.
# Assumes every entry of `results` is a DataFrame with 'mean', 'ci_lower' and
# 'ci_upper' columns — TODO confirm; failed jobs must be filtered out first.
brier_df_list = list()
arr = np.arange(len(results)).reshape(len(models), len(nfeat), len(maf))
for x in range(len(results)):
    # Recover which (model, nfeat, maf) combination produced result x.
    i, j, k = np.unravel_index(x, arr.shape)
    print(models[i], nfeat[j], maf[k])
    current_ibs = results[x]
    # Prefix the generic column names with the dataset tag so the columns stay
    # distinguishable after the column-wise concat below.
    current_ibs = current_ibs.rename(columns={"mean": str(models[i])+'_'+str(nfeat[j])+'_'+str(maf[k]), 
                                            "ci_lower": str(models[i])+'_'+str(nfeat[j])+'_'+str(maf[k])+'_ci_lower', 
                                            "ci_upper": str(models[i])+'_'+str(nfeat[j])+'_'+str(maf[k])+'_ci_upper'})
    brier_df_list.append(current_ibs)
brier_df = pd.concat(brier_df_list, axis = 1, sort = False).reset_index()
#print('brier_df:', brier_df)
brier_df.to_csv(homedir+'/ibs_data_all.csv', index = False)
# Display the combined table in the notebook.
brier_df

me f100 maf0.2
me f100 maf0.4
me f1000 maf0.2
me f1000 maf0.4
me f10000 maf0.2
me f10000 maf0.4
epi f100 maf0.2
epi f100 maf0.4
epi f1000 maf0.2
epi f1000 maf0.4
epi f10000 maf0.2
epi f10000 maf0.4
het f100 maf0.2
het f100 maf0.4
het f1000 maf0.2
het f1000 maf0.4
het f10000 maf0.2
het f10000 maf0.4
add f100 maf0.2
add f100 maf0.4
add f1000 maf0.2
add f1000 maf0.4
add f10000 maf0.2
add f10000 maf0.4


Unnamed: 0,times,me_f100_maf0.2_cens0.1,me_f100_maf0.2_cens0.1_ci_lower,me_f100_maf0.2_cens0.1_ci_upper,me_f100_maf0.2_cens0.4,me_f100_maf0.2_cens0.4_ci_lower,me_f100_maf0.2_cens0.4_ci_upper,me_f100_maf0.2_cens0.8,me_f100_maf0.2_cens0.8_ci_lower,me_f100_maf0.2_cens0.8_ci_upper,...,add_f10000_maf0.2_cens0.8_ci_upper,add_f10000_maf0.4_cens0.1,add_f10000_maf0.4_cens0.1_ci_lower,add_f10000_maf0.4_cens0.1_ci_upper,add_f10000_maf0.4_cens0.4,add_f10000_maf0.4_cens0.4_ci_lower,add_f10000_maf0.4_cens0.4_ci_upper,add_f10000_maf0.4_cens0.8,add_f10000_maf0.4_cens0.8_ci_lower,add_f10000_maf0.4_cens0.8_ci_upper
0,0.0,,,,,,,,,,...,,,,,,,,,,
1,1.0,0.222524,0.165508,0.291573,0.181917,0.124445,0.251519,0.198263,0.153941,0.250794,...,0.0,0.010006,-0.002644,0.024999,0.005000,0.005000,0.005000,0.002018,-0.006053,0.011583
2,2.0,0.213417,0.162385,0.275219,0.185741,0.148247,0.231149,0.220543,0.206041,0.237732,...,0.0,0.010005,-0.002645,0.024998,0.005000,0.005000,0.005000,0.005063,-0.006011,0.018189
3,3.0,0.219092,0.187741,0.257060,0.205159,0.181368,0.233971,0.257546,0.221498,0.300272,...,0.0,0.010004,-0.002647,0.024998,0.005000,0.005000,0.005000,0.006087,-0.003810,0.017817
4,4.0,0.227527,0.206796,0.252633,0.225538,0.217243,0.235582,0.292960,0.227421,0.370638,...,0.0,0.010003,-0.002648,0.024998,0.005000,0.005000,0.005000,0.006087,-0.003810,0.017817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96.0,,,,,,,,,,...,,0.020680,0.009290,0.034180,0.065028,-0.011193,0.157337,0.534299,-0.140229,1.333767
97,97.0,,,,,,,,,,...,,0.018811,0.005251,0.035232,0.043842,0.022488,0.070428,0.585393,-0.262527,1.612274
98,98.0,,,,,,,,,,...,,0.024115,-0.002458,0.058421,,,,1.264741,-0.114296,3.045070
99,99.0,,,,,,,,,,...,,,,,,,,,,
