# SurvivalLCS Experiment Runs

## Import and Setup

### Load packages

In [1]:
import os
import pandas as pd
import numpy as np
import random
import sys
import glob
from datetime import date
import argparse
from random import shuffle
from random import sample
import matplotlib.pyplot as plt
import sys
import shutil
import sksurv
import pickle
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from survival_LCS_coxchecks import survivalLCS_coxChecks as survivalLCS

In [2]:
# Add the survival-lcs checkout to the module search path.
# NOTE(review): `survival_LCS_coxchecks` is imported in the cell *before* this
# one; if that module is only reachable through this path entry, this line has
# to run first on a fresh kernel — confirm the intended cell order.
sys.path.append("/home/bandheyh/common/survival-lcs")

In [3]:
# Turn off matplotlib's interactive mode so figures are not drawn as they are
# created.
plt.ioff()

<contextlib.ExitStack at 0x15555019bc10>

## Survival-LCS Parameters

### Set file names and necessary parameters

In [4]:
# Execution mode: when True the pipeline objects are only queued (in
# job_obj_list) and later executed through dask; when False each one is run
# immediately inside the driver loop.
HPC = True

# Root directory under which datasets, CV splits, pickled models and outputs
# are looked up / written by get_parameters and the result cells.
homedir = "/home/bandheyh/common/survival-lcs/pipeline"
# Model abbreviations; get_parameters maps them to 'main_effect' (me),
# '2way_epistasis' (epi), 'heterogeneous' (het) and 'additive' (add).
models = ['me', 'epi', 'het', 'add']
# NOTE(review): m0s is not referenced anywhere else in the visible code.
m0s = []

# Passed to the pipeline constructor as `c=`.
# Presumably the censoring proportions to simulate — TODO confirm.
c = [0.1,0.4,0.8]
# Feature-count and minor-allele-frequency tags; they are embedded verbatim in
# the dataset file names built by get_parameters.
nfeat = ['f100','f1000', 'f10000'] # f10000 is part of the full grid; the DEBUG grid below omits it
maf = ['maf0.2','maf0.4']

# Passed to the pipeline constructor as `iterations=` and `cv=` respectively.
iterations = 50000
cv_splits = 5

# DEBUG replaces the full grid above with a much smaller one for quick runs.
DEBUG = False
if DEBUG:
    models = ['me']
    c = [0.1]
    nfeat = ['f100', 'f1000']
    maf = ['maf0.2', 'maf0.4']
    iterations = 1000
    cv_splits = 3

### Create empty Brier score DataFrames
# get_parameters returns these module-level frames and the driver loop hands
# them to the pipeline constructor.
brier_df = pd.DataFrame()
cox_brier_df = pd.DataFrame()

# other non-parameters

# NOTE(review): `simulated` is not read anywhere else in the visible code.
simulated = True # CHANGE THIS TO FALSE IF RUNNING REAL DATA

# Checked by the driver loop (non-HPC mode) before Brier outputs are written.
lcs_run = True
# Filled by get_parameters with one '<model>_<nfeat>_<maf>' label per call.
dtype_list = []

### Import the survival_LCS pipeline

In [5]:
from survival_LCS_coxchecks import survivalLCS_coxChecks as survivalLCS

### Run the survival_LCS pipeline

In [6]:
def get_parameters(models, nfeat, maf, i, j, k):
    """Assemble the paths and model descriptors for one simulated dataset.

    The dataset is identified by ``models[i]``, ``nfeat[j]`` and ``maf[k]``.
    All paths are rooted at the module-level ``homedir``.

    Side effects: appends the '<model>_<nfeat>_<maf>' label to the
    module-level ``dtype_list`` and prints the dataset path and the label.

    Returns a 12-tuple
    ``(g, mtype, d, m, o, e, brier_df, cox_brier_df,
       m0_path, m0_type, m1_path, m1_type)`` where

    * ``g`` is the dataset text file path,
    * ``mtype`` is the model type label,
    * ``d``, ``m``, ``o`` are the 'cv_sim_data', 'pickled_cv_models' and
      'sim_lcs_output' locations for this dataset,
    * ``e`` is the fixed tag "testallsims",
    * ``brier_df`` / ``cox_brier_df`` are the module-level frames,
    * ``m0_*`` / ``m1_*`` are the model-file paths and types; the ``m1_*``
      pair is ``None`` for the 'me' and 'epi' models.
    """
    model = models[i]
    model_tag = str(model)
    feat_tag = str(nfeat[j])
    maf_tag = str(maf[k])

    sim_root = homedir + '/simulated_datasets/EDM-1_one_of_each/'
    dtype = model_tag + '_' + feat_tag + '_' + maf_tag

    g = sim_root + dtype + '_EDM-1_01.txt'
    dtype_list.append(dtype)
    print(g)

    d = homedir + '/cv_sim_data/cv_' + model_tag + '/' + dtype
    m = homedir + '/pickled_cv_models/' + model_tag + '/' + dtype
    o = homedir + '/sim_lcs_output/' + model_tag + '/' + dtype

    # Two model files exist per MAF: a main-effect one and an epistatic one.
    me_models_file = sim_root + 'model_files/me_h0.2_' + maf_tag + '_Models.txt'
    epi_models_file = sim_root + 'model_files/epi_h0.2_' + maf_tag + '_Models.txt'

    # First component: main effect for me/add/het, epistatic for anything else.
    if model in ['me', 'add', 'het']:
        m0_path, m0_type = me_models_file, 'main_effect'
    else:
        m0_path, m0_type = epi_models_file, '2way_epistasis'

    # Second component: absent for me/epi, epistatic for anything else.
    if model in ['me', 'epi']:
        m1_path, m1_type = None, None
    else:
        m1_path, m1_type = epi_models_file, '2way_epistasis'

    # Any model that is not me/epi/add is labelled heterogeneous.
    mtype = {
        'me': 'main_effect',
        'epi': '2way_epistasis',
        'add': 'additive',
    }.get(model, 'heterogeneous')

    e = "testallsims"
    print(dtype)

    return (g, mtype, d, m, o, e, brier_df, cox_brier_df,
            m0_path, m0_type, m1_path, m1_type)



In [7]:
def run_slcs(survivalLCS):
    """Run one configured pipeline object and return its results.

    Parameters
    ----------
    survivalLCS : object
        An already constructed pipeline *instance* (the parameter name shadows
        the imported class of the same name inside this function). It must
        provide ``returnCVModelFiles()``, ``return_all_results()`` and a
        ``model_type`` attribute.

    Returns
    -------
    tuple
        ``(ibs, cox_ibs, perm)`` exactly as unpacked from
        ``return_all_results()``.

    Notes
    -----
    There is no "datasets only" mode here: a hard-coded local flag always
    selected the run path, and the alternative path could not have returned
    anything, so it has been removed.
    """
    survivalLCS.returnCVModelFiles()
    ibs, cox_ibs, perm = survivalLCS.return_all_results()

    print(survivalLCS.model_type)

    return ibs, cox_ibs, perm

In [8]:
def make_breir_output(brier_df_list, output_path, model_type, models, dtype_list, i, df_type):
    """Write the combined Brier-score table and its plot for one group of runs.

    Parameters
    ----------
    brier_df_list : list of pandas.DataFrame
        Frames concatenated column-wise. After ``reset_index()`` the result
        must contain a 'times' column and, for every plotted label, the
        columns '<label>', '<label>_ci_lower' and '<label>_ci_upper'.
    output_path : str
        Directory that receives the PNG plot.
    model_type : str
        Used in the names of both the text file and the PNG.
    models, i :
        ``models[i]`` selects the sub-directory of
        ``homedir/sim_lcs_output`` that receives the text file.
    dtype_list : list of str
        Column labels to plot.
    df_type : str
        File-name prefix, e.g. 'ibs' or 'cox_ibs'.

    Side effects: writes '<df_type>_data_<model_type>.txt' and
    '<df_type>_scores_<model_type>.png'.
    """
    brier_df = pd.concat(brier_df_list, axis=1, sort=False).reset_index()

    # Name the file from the model_type argument so the function does not
    # depend on whatever a module-level `mtype` happens to hold.
    brier_df.to_csv(
        homedir + '/' + 'sim_lcs_output/' + str(models[i]) + '/' + df_type + '_data_' + model_type + '.txt',
        index=False,
    )

    fig = plt.figure(figsize=(10, 10))
    try:
        plt.xlabel('Time')
        plt.ylabel('Brier score')
        plt.ylim(0, 1)

        # NOTE(review): the first entry of dtype_list is skipped, as in the
        # original index loop starting at 1 — confirm this is intentional.
        for dtype in dtype_list[1:]:
            plt.plot(brier_df['times'], brier_df[dtype], label=brier_df[dtype].name)
            plt.fill_between(
                brier_df['times'],
                brier_df[dtype + '_ci_lower'],
                brier_df[dtype + '_ci_upper'],
                color='b',
                alpha=.1,
            )
        plt.savefig(output_path + '/' + df_type + '_scores_' + model_type + '.png')
    finally:
        # Release the figure; otherwise every call leaves one more figure open.
        plt.close(fig)

In [9]:
from survival_LCS_coxchecks import survivalLCS_coxChecks as survivalLCS
# Build one pipeline object per (model, feature count, MAF) combination.
# In HPC mode the objects are only queued in job_obj_list, in
# model-major / nfeat / maf order (the result cells rely on that order when
# they map a flat index back to a combination). Otherwise each object is run
# immediately and its Brier tables are collected per (model, nfeat) group.
job_obj_list = []
for i in range(len(models)):
    for j in range(len(nfeat)):
        brier_df_list = []
        cox_brier_df_list = []
        for k in range(len(maf)):
            (g, mtype, d, m, o, e, brier_df, cox_brier_df,
             m0_path, m0_type, m1_path, m1_type) = get_parameters(models, nfeat, maf, i, j, k)
            slcs = survivalLCS(g, mtype, d, m, o, e, brier_df, cox_brier_df,
                               m0_path, m0_type, m1_path, m1_type,
                               c=c, iterations=iterations, cv=cv_splits)
            if HPC:
                job_obj_list.append(slcs)
            else:
                ibs, cox_ibs, perm = run_slcs(slcs)
                brier_df_list.append(ibs)
                cox_brier_df_list.append(cox_ibs)
        if not HPC:
            if lcs_run:
                # Read the attributes from the last *instance* of the group,
                # not from the class, and give the Cox output its own list.
                # NOTE(review): assumes instances expose `output_path` and
                # `model_type` (the latter is read the same way in run_slcs)
                # — confirm against the pipeline class.
                make_breir_output(brier_df_list, slcs.output_path, slcs.model_type,
                                  models, dtype_list, i, 'ibs')
                make_breir_output(cox_brier_df_list, slcs.output_path, slcs.model_type,
                                  models, dtype_list, i, 'cox_ibs')
            else:
                print('LCS not run, no brier scores available')

/home/bandheyh/common/survival-lcs/pipeline_moreiter/simulated_datasets/EDM-1_one_of_each/me_f100_maf0.2_EDM-1_01.txt
me_f100_maf0.2
None
/home/bandheyh/common/survival-lcs/pipeline_moreiter/simulated_datasets/EDM-1_one_of_each/me_f100_maf0.4_EDM-1_01.txt
me_f100_maf0.4
None
/home/bandheyh/common/survival-lcs/pipeline_moreiter/simulated_datasets/EDM-1_one_of_each/me_f1000_maf0.2_EDM-1_01.txt
me_f1000_maf0.2
None
/home/bandheyh/common/survival-lcs/pipeline_moreiter/simulated_datasets/EDM-1_one_of_each/me_f1000_maf0.4_EDM-1_01.txt
me_f1000_maf0.4
None
/home/bandheyh/common/survival-lcs/pipeline_moreiter/simulated_datasets/EDM-1_one_of_each/me_f10000_maf0.2_EDM-1_01.txt
me_f10000_maf0.2
None
/home/bandheyh/common/survival-lcs/pipeline_moreiter/simulated_datasets/EDM-1_one_of_each/me_f10000_maf0.4_EDM-1_01.txt
me_f10000_maf0.4
None
/home/bandheyh/common/survival-lcs/pipeline_moreiter/simulated_datasets/EDM-1_one_of_each/epi_f100_maf0.2_EDM-1_01.txt
epi_f100_maf0.2
None
/home/bandheyh/commo

## HPC Code

In [10]:
import dask
from dask.distributed import Client
from dask_jobqueue import SLURMCluster, LSFCluster, SGECluster

In [11]:
def get_cluster(cluster_type='SLURM', output_path=".", queue='defq', memory=4):
    """Start a dask cluster of the requested kind and return a connected client.

    Parameters
    ----------
    cluster_type : str
        One of 'SLURM', 'LSF', 'UGE' or 'Local'.
    output_path : str
        Directory under which the job-queue clusters write 'dask_logs/'.
    queue : str
        Scheduler queue name (ignored for 'Local').
    memory : int
        Memory per job; passed to the cluster classes as '<memory>G'.

    Returns
    -------
    dask.distributed.Client
        A client connected to the new cluster (note: a client, not the
        cluster object itself).

    Raises
    ------
    Exception
        With the message "Exception: Unknown Exception" for any failure,
        including an unsupported ``cluster_type``. The underlying error is
        printed and attached as ``__cause__``.
    """
    log_directory = output_path + "/dask_logs/"
    memory_spec = str(memory) + "G"
    client = None
    try:
        if cluster_type == 'Local':
            # Client() already starts and owns a local cluster; wrapping that
            # cluster in a second Client would leave the first one dangling.
            client = Client()
        else:
            if cluster_type == 'SLURM':
                cluster = SLURMCluster(queue=queue,
                                       cores=1,
                                       memory=memory_spec,
                                       walltime="24:00:00",
                                       log_directory=log_directory)
            elif cluster_type == "LSF":
                cluster = LSFCluster(queue=queue,
                                     cores=1,
                                     mem=memory * 1000000000,
                                     memory=memory_spec,
                                     walltime="24:00",
                                     log_directory=log_directory)
            elif cluster_type == 'UGE':
                cluster = SGECluster(queue=queue,
                                     cores=1,
                                     memory=memory_spec,
                                     resource_spec="mem_free=" + memory_spec,
                                     walltime="24:00:00",
                                     log_directory=log_directory)
            else:
                raise Exception("Unknown or Unsupported Cluster Type")
            # Let the job-queue cluster scale itself up to 400 jobs.
            cluster.adapt(maximum_jobs=400)
            client = Client(cluster)
    except Exception as e:
        print(e)
        # Keep the original message and type for callers, but chain the cause
        # so the real error is visible in the traceback.
        raise Exception("Exception: Unknown Exception") from e
    print("Running dask-cluster")
    print(client.scheduler_info())
    return client

In [12]:
# NOTE: despite its name, `cluster` holds the dask Client returned by
# get_cluster, not a cluster object. The job-queue logs go under homedir.
cluster = get_cluster(output_path=homedir)

Running dask-cluster
{'type': 'Scheduler', 'id': 'Scheduler-3926f782-fd4a-4d8a-9d2a-57aac3f7d004', 'address': 'tcp://10.17.134.112:40753', 'services': {'dashboard': 33415}, 'started': 1712768517.1279297, 'workers': {}}


Perhaps you already have a cluster running?
Hosting the HTTP server on port 33415 instead


In [13]:
def run_parallel(model):
    """Task body for dask: run one pipeline object through ``run_slcs``.

    Returns the ``(ibs, cox_ibs, perm)`` tuple from ``run_slcs``. Any
    exception raised there propagates to the caller unchanged; it is not
    converted into a return value.
    """
    ibs, cox_ibs, perm = run_slcs(model)
    return ibs, cox_ibs, perm

In [14]:
# Display the queued pipeline objects (the driver loop fills this list only
# when HPC is True).
job_obj_list

[<survival_LCS_coxchecks.survivalLCS_coxChecks at 0x15552478a940>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555247a8130>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x15552478adc0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555247a1e20>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555247a6ca0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555247a6fd0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x15552478a3a0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555247509a0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555501c16d0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x155524755a00>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x155524750d60>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x155524759a60>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555247a13a0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x15552475dcd0>,
 <survival_LCS_coxchecks.survivalLCS_coxChecks at 0x1555247593

In [15]:
# Wrap every queued pipeline object in a lazy dask task and evaluate them all
# at once; `results` holds one run_parallel return value per job, in queue
# order. The tasks are kept in their own list so the module-level `brier_df`
# DataFrame is not overwritten with a Delayed object.
if HPC:
    delayed_results = [dask.delayed(run_parallel)(model) for model in job_obj_list]
    results = dask.compute(*delayed_results)

In [16]:
# if HPC:
#     results = dask.compute([dask.delayed(run_parallel)(model) for model in job_obj_list])

In [17]:
# Persist the raw per-job results so the tables below can be rebuilt without
# re-running the jobs.
results_path = homedir + '/results_cox.pkl'
with open(results_path, 'wb') as out_file:
    pickle.dump(results, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### Error Checking

In [18]:
# Collect the flat indices of jobs whose result is an exception object rather
# than a result tuple. Any Exception subclass counts, not only ValueError.
# NOTE(review): run_parallel propagates exceptions instead of returning them,
# so with the code as written a failed job aborts dask.compute and nothing is
# expected to show up here.
error_idxs = []
for idx, result in enumerate(results):
    if isinstance(result, Exception):
        print(idx, result)
        error_idxs.append(idx)


In [19]:
# Lay the flat job indices out on the (model, nfeat, maf) grid; the reshape
# fails if the number of results does not match the grid size.
arr = np.arange(len(results)).reshape(len(models), len(nfeat), len(maf))

# Translate each failing flat index into the combination it belongs to.
grid_axes = (models, nfeat, maf)
for x in error_idxs:
    positions = np.unravel_index(x, arr.shape)
    print(*(labels[pos] for labels, pos in zip(grid_axes, positions)))

### IBS Tables

In [20]:
# Build one wide table from the first element (ibs) of every job result.
# The 'mean' / 'ci_lower' / 'ci_upper' columns of each job are renamed with
# that job's '<model>_<nfeat>_<maf>' prefix before the frames are joined
# column-wise.
brier_df_list = []
arr = np.arange(len(results)).reshape(len(models), len(nfeat), len(maf))
for x, job_result in enumerate(results):
    i, j, k = np.unravel_index(x, arr.shape)
    print(models[i], nfeat[j], maf[k])
    prefix = '_'.join([str(models[i]), str(nfeat[j]), str(maf[k])])
    renamed_columns = {
        "mean": prefix,
        "ci_lower": prefix + '_ci_lower',
        "ci_upper": prefix + '_ci_upper',
    }
    brier_df_list.append(job_result[0].rename(columns=renamed_columns))
brier_df = pd.concat(brier_df_list, axis=1, sort=False).reset_index()
brier_df.to_csv(homedir + '/ibs_data_all.csv', index=False)
brier_df

me f100 maf0.2
me f100 maf0.4
me f1000 maf0.2
me f1000 maf0.4
me f10000 maf0.2
me f10000 maf0.4
epi f100 maf0.2
epi f100 maf0.4
epi f1000 maf0.2
epi f1000 maf0.4
epi f10000 maf0.2
epi f10000 maf0.4
het f100 maf0.2
het f100 maf0.4
het f1000 maf0.2
het f1000 maf0.4
het f10000 maf0.2
het f10000 maf0.4
add f100 maf0.2
add f100 maf0.4
add f1000 maf0.2
add f1000 maf0.4
add f10000 maf0.2
add f10000 maf0.4


Unnamed: 0,times,me_f100_maf0.2_cens0.1,me_f100_maf0.2_cens0.1_ci_lower,me_f100_maf0.2_cens0.1_ci_upper,me_f100_maf0.2_cens0.4,me_f100_maf0.2_cens0.4_ci_lower,me_f100_maf0.2_cens0.4_ci_upper,me_f100_maf0.2_cens0.8,me_f100_maf0.2_cens0.8_ci_lower,me_f100_maf0.2_cens0.8_ci_upper,...,add_f10000_maf0.2_cens0.8_ci_upper,add_f10000_maf0.4_cens0.1,add_f10000_maf0.4_cens0.1_ci_lower,add_f10000_maf0.4_cens0.1_ci_upper,add_f10000_maf0.4_cens0.4,add_f10000_maf0.4_cens0.4_ci_lower,add_f10000_maf0.4_cens0.4_ci_upper,add_f10000_maf0.4_cens0.8,add_f10000_maf0.4_cens0.8_ci_lower,add_f10000_maf0.4_cens0.8_ci_upper
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,,,,,,,,,,...,,,,,,,,,,
97,97,,,,,,,,,,...,,,,,,,,,,
98,98,,,,,,,,,,...,,,,,,,,,,
99,99,,,,,,,,,,...,,,,,,,,,,


In [21]:
# results[x][1]

In [22]:
# Build one wide table from the second element (cox_ibs) of every job result.
# The 'mean' / 'ci_lower' / 'ci_upper' columns of each job are renamed with
# that job's '<model>_<nfeat>_<maf>' prefix before the frames are joined
# column-wise.
brier_df_list = []
arr = np.arange(len(results)).reshape(len(models), len(nfeat), len(maf))
for x, job_result in enumerate(results):
    i, j, k = np.unravel_index(x, arr.shape)
    print(models[i], nfeat[j], maf[k])
    prefix = '_'.join([str(models[i]), str(nfeat[j]), str(maf[k])])
    renamed_columns = {
        "mean": prefix,
        "ci_lower": prefix + '_ci_lower',
        "ci_upper": prefix + '_ci_upper',
    }
    brier_df_list.append(job_result[1].rename(columns=renamed_columns))
brier_df = pd.concat(brier_df_list, axis=1, sort=False).reset_index()
brier_df.to_csv(homedir + '/cox_ibs_data_all.csv', index=False)
brier_df

me f100 maf0.2
me f100 maf0.4
me f1000 maf0.2
me f1000 maf0.4
me f10000 maf0.2
me f10000 maf0.4
epi f100 maf0.2
epi f100 maf0.4
epi f1000 maf0.2
epi f1000 maf0.4
epi f10000 maf0.2
epi f10000 maf0.4
het f100 maf0.2
het f100 maf0.4
het f1000 maf0.2
het f1000 maf0.4
het f10000 maf0.2
het f10000 maf0.4
add f100 maf0.2
add f100 maf0.4
add f1000 maf0.2
add f1000 maf0.4
add f10000 maf0.2
add f10000 maf0.4


Unnamed: 0,times,me_f100_maf0.2,me_f100_maf0.2_ci_lower,me_f100_maf0.2_ci_upper,me_f100_maf0.2.1,me_f100_maf0.2_ci_lower.1,me_f100_maf0.2_ci_upper.1,me_f100_maf0.2.2,me_f100_maf0.2_ci_lower.2,me_f100_maf0.2_ci_upper.2,...,add_f10000_maf0.2_ci_upper,add_f10000_maf0.4,add_f10000_maf0.4_ci_lower,add_f10000_maf0.4_ci_upper,add_f10000_maf0.4.1,add_f10000_maf0.4_ci_lower.1,add_f10000_maf0.4_ci_upper.1,add_f10000_maf0.4.2,add_f10000_maf0.4_ci_lower.2,add_f10000_maf0.4_ci_upper.2
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,0.282613,0.240720,0.332266,0.002015,-0.002896,0.007835,0.014068,0.003247,0.026894,...,,,,,,,,,,
2,2,0.271127,0.225276,0.325470,0.004030,-0.003417,0.012856,0.084255,0.036581,0.140759,...,,,,,,,,,,
3,3,0.223917,0.172209,0.285204,0.005045,-0.003864,0.015605,0.131183,0.071481,0.201943,...,,,,,,,,,,
4,4,0.179916,0.124200,0.245953,0.008056,-0.003819,0.022130,0.226504,0.144499,0.323697,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,,,,,,,,,,...,,,,,,,,,,
97,97,,,,,,,,,,...,,,,,,,,,,
98,98,,,,,,,,,,...,,,,,,,,,,
99,99,,,,,,,,,,...,,,,,,,,,,


In [23]:
# Column means of the third element (perm) of the first job's result, sorted
# from largest to smallest and shown as a single-row frame.
perm_means = results[0][2].mean()
perm_means.sort_values(ascending=False).to_frame().T

Unnamed: 0,M0P1,N98,N70,N67,N65,N14,N17,N7,N48,N90,...,N0,N21,N34,N2,N37,N49,N41,N93,N42,N29
0,0.065439,0.005568,0.004366,0.003822,0.003797,0.003447,0.003044,0.002698,0.002657,0.002633,...,0.000206,0.000173,9.3e-05,9e-05,7.4e-05,1.1e-05,-3.1e-05,-5.8e-05,-0.000193,-0.000201
