# Making Datasets for PPSN

## Imports

In [1]:
import os
import random
import numpy as np
import pandas as pd
from src.skfibers.experiments.survival_sim_simple import survival_data_simulation

# Record and display the directory the kernel was started in; the relative
# output path used later in the notebook is resolved against it.
print(current_working_directory := os.getcwd())

/Users/harshbandhey/Local/Cedars/Urbslab/scikit-FIBERS-ryan_dev


## Setting up Local Parameters

In [2]:
# Destination for the generated CSV files.
#   local_save = True  -> write to ./PPSNDatasets/ (relative to the working directory)
#   local_save = False -> write to folder_path, which must then be set explicitly
local_save = True
folder_path = None
if local_save:
    output_folder = './PPSNDatasets/'
else:
    output_folder = folder_path
# Fail early with a clear message instead of letting os.path / os.makedirs
# choke on None or an empty string further down.
if not output_folder:
    raise ValueError("folder_path must be set to a directory when local_save is False")
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(output_folder, exist_ok=True)
print(output_folder)

./PPSNDatasets/


In [3]:
# Seed the stdlib and NumPy global generators with the same value, in that
# order, so anything drawing from them is repeatable after a kernel restart.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(42)

## Making Datasets

In [4]:
# 'standard_no_noise': threshold=0, noise_frequency=0.0.
data_name = 'standard_no_noise'
# The settings are defined once so that the dataset and its reproducibility
# twin below are guaranteed to be generated from identical arguments.
sim_params = dict(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=0, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.0, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
data = survival_data_simulation(**sim_params)
# Second run with the same seed: the two frames must match exactly.
data_conf = survival_data_simulation(**sim_params)
assert data.equals(data_conf), "simulation is not reproducible for a fixed random_seed"
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 1023
Unique LR Combos: 1
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 96950
Unique binary numbers: 1024
Unique HR Combos: 1023
Unique LR Combos: 1
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 96950
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [5]:
# 'standard_with_noise': threshold=0, noise_frequency=0.2.
data_name = 'standard_with_noise'
# The settings are defined once so that the dataset and its reproducibility
# twin below are guaranteed to be generated from identical arguments.
sim_params = dict(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=0, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.2, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
data = survival_data_simulation(**sim_params)
# Second run with the same seed: the two frames must match exactly.
data_conf = survival_data_simulation(**sim_params)
assert data.equals(data_conf), "simulation is not reproducible for a fixed random_seed"
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 1023
Unique LR Combos: 1
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 47038
Unique binary numbers: 1024
Unique HR Combos: 1023
Unique LR Combos: 1
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 47038
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [6]:
# Threshold series, no noise: threshold=0, noise_frequency=0.0.
# These are the same arguments as the 'standard_no_noise' dataset.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_0_no_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=0, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.0, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 1023
Unique LR Combos: 1
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 96950
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [7]:
# Threshold series, no noise: threshold=1, noise_frequency=0.0.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_1_no_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=1, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.0, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 1013
Unique LR Combos: 11
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 62924
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [8]:
# Threshold series, no noise: threshold=2, noise_frequency=0.0.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_2_no_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=2, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.0, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 968
Unique LR Combos: 56
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 29973
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [9]:
# Threshold series, no noise: threshold=4, noise_frequency=0.0.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_4_no_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=4, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.0, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 638
Unique LR Combos: 386
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 53432
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [10]:
# Threshold series, with noise: threshold=0, noise_frequency=0.2.
# These are the same arguments as the 'standard_with_noise' dataset.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_0_with_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=0, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.2, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 1023
Unique LR Combos: 1
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 47038
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [11]:
# Threshold series, with noise: threshold=1, noise_frequency=0.2.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_1_with_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=1, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.2, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 1013
Unique LR Combos: 11
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 16841
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [12]:
# Threshold series, with noise: threshold=2, noise_frequency=0.2.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_2_with_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=2, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.2, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 968
Unique LR Combos: 56
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 60566
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64


In [13]:
# Threshold series, with noise: threshold=4, noise_frequency=0.2.
# NOTE(review): 'thershold' is a misspelling of 'threshold'. It is kept because
# the name becomes the CSV file name and other code may load it by that name.
data_name = 'thershold_4_with_noise'
data = survival_data_simulation(
    instances=10000, total_features=100, predictive_features=10,
    low_risk_proportion=0.5, threshold=4, feature_frequency_range=(0.1, 0.4),
    noise_frequency=0.2, class0_time_to_event_range=(1.5, 0.2),
    class1_time_to_event_range=(1, 0.2), censoring_frequency=0.2,
    covariates_to_sim=0, covariates_signal_range=(0.2, 0.4), random_seed=42,
)
# os.path.join avoids the doubled separator that plain concatenation produces
# when output_folder already ends with '/'.
data.to_csv(os.path.join(output_folder, data_name + '.csv'), index=False)
# Show how many instances fall into each value of the TrueRiskGroup column.
true_risk_group = data[['TrueRiskGroup']]
value_counts = true_risk_group['TrueRiskGroup'].value_counts()
print(value_counts)

Unique binary numbers: 1024
Unique HR Combos: 638
Unique LR Combos: 386
Target predictive feature(s) 'one's counts: [2918, 1075, 1825, 1669, 3209, 3030, 3676, 1260, 2265, 1089]
Random Number Check: 29610
TrueRiskGroup
0    5000
1    5000
Name: count, dtype: int64
