In [None]:
# general libraries
import os
import pandas as pd
import numpy as np
import scipy.stats as stats
import warnings
import multiprocessing
import time

# sampling code
import sys
sys.path.append('..')
import wismut.basics as basics
from wismut.MCMC import MCMC
# Current working directory with a trailing slash; used below as the prefix
# for the relative location of the input data file.
path = os.getcwd() + "/"

In [None]:
# Input table for the analysis, located relative to the working directory.
# Alternative input that was used before (file name without the "M4" suffix):
#   ../data/real_data/Daten_Wismut_M0M1aM1bM2M2_ExpertM3.csv
data_file = path + "../data/real_data/Daten_Wismut_M0M1aM1bM2M2_ExpertM3M4.csv"
data = pd.read_csv(data_file, engine="pyarrow")

In [None]:
# Harmonise the raw table with the column names used in the rest of this notebook.
#
# This cell is written to be safe to re-run: the rename + rescale step is only
# executed while the raw column names are still present. Without the guard a
# second execution would silently divide Z / Zcum by 100 again, because
# ``rename`` ignores already-renamed columns while the division does not.

# Rows labelled "M1b" are relabelled as "M1a" (idempotent).
data.loc[data.model == "M1b", "model"] = "M1a"

column_renames = {"ID": "Ident",
                  "start": "truncation",
                  "stop": "t",
                  "WLM_calculated": "Z",
                  "cum_WLM_calculated": "Zcum",
                  }
column_renames["delta"] = "event"

if "WLM_calculated" in data.columns:
    # Work on a renamed copy and rebind ``data`` only after both steps
    # succeeded, so a failure cannot leave a half-processed frame behind.
    data_renamed = data.rename(columns=column_renames)
    # Exposure columns are rescaled by a factor of 1/100.
    data_renamed[["Z", "Zcum"]] = data_renamed[["Z", "Zcum"]] / 100
    data = data_renamed
elif "Z" not in data.columns:
    # Neither the raw nor the processed exposure column exists: fail loudly
    # (the unguarded code would also have raised a KeyError here).
    raise KeyError("Expected column 'WLM_calculated' (raw) or 'Z' (processed) in data")

In [None]:
def generate_proposal_sds(disease_model='cox'):
    """Return the initial proposal standard deviations, keyed by parameter name.

    Parameters
    ----------
    disease_model : str, default 'cox'
        Only influences the entry for 'beta': 0.011 when the value is exactly
        'cox', otherwise 0.00011*10.

    Returns
    -------
    dict
        Mapping parameter name -> proposal standard deviation.

    NOTE(review): the comparison is against 'cox', while the other cells of
    this notebook name the disease models "cox_like" and "ERR", and
    ``run_chain`` calls this function without an argument - so the 'cox'
    value is what is used in practice. Confirm that this is intended.
    """
    if disease_model == 'cox':
        beta_sd = 0.011
    else:
        beta_sd = 0.00011*10

    sds = {'beta': beta_sd}

    # piecewise parameters lambda1..lambda4
    sds.update({'lambda1': 0.000211,
                'lambda2': 0.000611,
                'lambda3': 0.000611,
                'lambda4': 0.000211})

    # mu / sigma pairs that all share the value 0.2
    sds.update({'C_Rn_old_mu': 0.2, 'C_Rn_old_sigma': 0.2,
                'C_Rn_ref_mu': 0.2, 'C_Rn_ref_sigma': 0.2,
                'C_Rn_mu': 0.2, 'C_Rn_sigma': 0.2,
                'C_RPD_mu': 0.2, 'C_RPD_sigma': 0.2})

    sds.update({'C_Exp_mu': 0.4, 'C_Exp_sigma': 0.5})
    sds.update({'zeta_alpha': 0.5, 'zeta_beta': 0.5})
    sds.update({'E_Rn_mu': 0.2, 'E_Rn_sigma': 0.2})

    # alpha / beta pairs that all share the value 0.5
    sds.update({'gamma_alpha': 0.5, 'gamma_beta': 0.5,
                'phi_alpha': 0.5, 'phi_beta': 0.5,
                'omega_alpha': 0.5, 'omega_beta': 0.5})
    return sds


In [None]:
def generate_prior_parameters():
    """Return the prior specification for every sampled parameter.

    Returns
    -------
    dict
        Mapping parameter name -> prior description. Entries with
        ``'dist': "normal"`` carry 'mean' and 'sd'; entries with
        ``'dist': "gamma"`` carry 'shape', 'scale' and the bounds
        'min' = 0 and 'max' = 200. Each entry is a separate dict object.
    """
    def normal(mean, sd):
        # prior entry in the "normal" layout
        return {'dist': "normal", 'mean': mean, 'sd': sd}

    def gamma(shape, scale):
        # prior entry in the "gamma" layout; the bounds are the same for all
        return {'dist': "gamma", 'shape': shape, 'scale': scale,
                'min': 0, 'max': 200}

    priors = {}
    priors['beta'] = normal(0, 100)

    priors['lambda1'] = gamma(600, 1 / 10000000)
    priors['lambda2'] = gamma(12000, 1 / 1000000)
    priors['lambda3'] = gamma(46000, 1 / 1000000)
    priors['lambda4'] = gamma(1000, 1 / 100000)

    priors['C_Rn_mu'] = normal(6, 5)
    priors['C_Rn_sigma'] = normal(8, 0.5)
    priors['C_Exp_mu'] = normal(1.78, 3)
    priors['C_Exp_sigma'] = normal(0.79, 2)
    priors['C_RPD_mu'] = normal(0.15, 0.03)
    priors['C_RPD_sigma'] = normal(0.2, 0.03)
    priors['zeta_alpha'] = normal(3, 2)
    priors['zeta_beta'] = normal(3, 2)
    priors['E_Rn_mu'] = normal(2, 3)
    priors['E_Rn_sigma'] = normal(0.8, 2)
    priors['gamma_alpha'] = normal(3, 2)
    priors['gamma_beta'] = normal(3, 2)
    priors['phi_alpha'] = normal(3, 2)
    priors['phi_beta'] = normal(3, 2)
    priors['omega_alpha'] = normal(3, 2)
    priors['omega_beta'] = normal(3, 2)
    return priors


In [None]:
def generate_start_values(seed, chain, disease_model="cox_like", me_correction=True):
    """Draw randomly perturbed start values for one chain.

    Every start value is a fixed centre multiplied by an independent draw
    from a uniform distribution on [0.9, 1.1]. The global NumPy random state
    is seeded with ``seed`` first, so the result is reproducible for a given
    seed; as a side effect the global random state is modified.

    Parameters
    ----------
    seed : int
        Passed to ``np.random.seed``.
    chain : hashable
        Key under which the start values are stored in the returned dict.
    disease_model : str, default "cox_like"
        Centre of 'beta' is 0.3 for "cox_like" and 1.0 for anything else.
    me_correction : bool, default True
        If falsy, the returned values contain no 'prior_parameters' entry
        (the corresponding random draws are still consumed).

    Returns
    -------
    dict
        ``{chain: {...start values...}}``.
    """
    np.random.seed(seed)
    perturbation = stats.uniform(loc=0.9, scale=0.2)

    def jittered(centre):
        # One fresh draw per call. The order of the calls below determines
        # which draw every parameter receives - do not reorder them.
        return centre * perturbation.rvs(1)[0]

    if disease_model == "cox_like":
        beta_centre = 1.0 * 0.3
    else:
        beta_centre = 1.0

    values = {}
    values['beta'] = jittered(beta_centre)
    values['lambda1'] = jittered(0.00006)
    values['lambda2'] = jittered(0.00120)
    values['lambda3'] = jittered(0.00460)
    values['lambda4'] = jittered(0.01000)

    # values for truncnorm
    priors = {}
    priors['M2'] = {'C_Rn': {'mu': jittered(6), 'sigma': jittered(8)}}
    priors['M2_Expert'] = {'C_Exp': {'mu': jittered(1.78), 'sigma': jittered(0.79)}}

    c_rpd = {'mu': jittered(0.15), 'sigma': jittered(0.2)}
    zeta = {'alpha': jittered(3), 'beta': jittered(3)}
    priors['M3'] = {'C_RPD': c_rpd, 'zeta': zeta}

    priors['M4'] = {'E_Rn': {'mu': jittered(2), 'sigma': jittered(0.8)}}
    priors['equilibrium'] = {'gamma': {'alpha': jittered(3), 'beta': jittered(3)}}
    priors['activity'] = {'phi': {'alpha': jittered(3), 'beta': jittered(3)}}
    priors['working_time'] = {'omega': {'alpha': jittered(3), 'beta': jittered(3)}}

    # The draws above are always made; the block is only attached when
    # measurement-error correction is requested.
    if me_correction:
        values['prior_parameters'] = priors

    return {chain: values}

In [None]:
def generate_uncertainty_characteristics(me_correction):
    """Build the nested configuration describing the uncertain exposure factors.

    The returned dict has the layout ``{group: {factor: settings}}``. The
    groups defined here are 'M1a', 'M2', 'M2_Expert', 'M3', 'M4', 'activity',
    'working_time' and 'equilibrium'. Every factor carries a 'classical_error'
    and a 'Berkson_error' entry (each with at least an 'sd') and the name of
    the observed-value column ('name_obs_values'); most factors additionally
    carry an exposure-model distribution with parameters and truncation
    bounds, and the identifiers used to map the error components.

    Parameters
    ----------
    me_correction : bool
        If falsy, the 'sd' of every classical and Berkson error component is
        set to 0 before the dict is returned; all other entries are kept.

    Returns
    -------
    dict
        The configuration dict described above (a new object on every call).

    NOTE(review): how the individual keys are interpreted is decided by the
    code consuming this dict, which is not part of this notebook - confirm
    there before changing any key name.
    """
    uncertainty_characteristics = {
            # --- group 'M1a' ---
            'M1a': {'C_Rn_old': {'classical_error': {'sd': 6.56, 'structure': 'additive', 'proposal_sd': 1.0},
                                 'Berkson_error': {'sd': 0},
                                 'exposure_model_distribution': 'norm',
                                 'exposure_model_parameters': {'mu': 22.5, 'sigma': 4},
                                 'exposure_model_truncation': {'lower': 1e-10},
                                 'mapping_identifier_classical': ['cluster_C_Rn_old'],
                                 'name_obs_values': 'C_Rn_old'
                                 },
                    'C_Rn_ref': {'classical_error': {'sd': 5.29, 'structure': 'additive', 'proposal_sd': 1.0},
                                 'Berkson_error': {'sd': 0},
                                 'exposure_model_distribution': 'norm',
                                 'exposure_model_parameters': {'mu': 34.09, 'sigma': 10},
                                 'exposure_model_truncation': {'lower': 1e-10},
                                 'mapping_identifier_classical': ['cluster_C_Rn_obs_ref'],
                                 'name_obs_values': 'C_Rn_obs_ref'
                                 },
                    'b': {'classical_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                          'Berkson_error': {'sd': 0.69, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                          'exposure_model_distribution': 'beta',
                          'exposure_model_parameters': {'alpha': 1, 'beta': 1},
                          'exposure_model_truncation': {'lower': 0.15, 'upper': 1.1},
                          'mapping_identifier_classical': ['b_period'],
                          'mapping_identifier_Berkson': ['object'],
                          'name_obs_values': 'b'
                          },
                    'tau_e': {'classical_error': {'sd': 0.37, 'structure': 'multiplicative', 'proposal_sd': 0.9},
                              'Berkson_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.9},
                              'exposure_model_distribution': 'beta',
                              'exposure_model_parameters': {'alpha': 1, 'beta': 1},
                              'exposure_model_truncation': {'lower': 0.3, 'upper': 1},
                              'mapping_identifier_classical': ['tau_e_period'],
                              'mapping_identifier_Berkson': ['object', 'tau_e_period'],
                              'name_obs_values': 'tau_e'
                              },
                    # 'A', 'A_ref' and 'r' have both error sds set to 0 and no exposure model
                    'A': {'classical_error': {'sd': 0},
                          'Berkson_error': {'sd': 0},
                          'name_obs_values': 'A_calculated'
                          },
                    'A_ref': {'classical_error': {'sd': 0},
                              'Berkson_error': {'sd': 0},
                              'name_obs_values': 'A_ref'
                              },
                    'r': {'classical_error': {'sd': 0},
                          'Berkson_error': {'sd': 0},
                          'name_obs_values': 'r'
                          },
                    },
            # --- group 'M2' ---
            'M2': {'C_Rn': {'classical_error': {'sd': 0.59, 'structure': 'additive', 'proposal_sd': 0.1},
                            'Berkson_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.05},
                            'exposure_model_distribution': 'norm',
                            'exposure_model_parameters': {'mu': 6, 'sigma': 8},
                            'exposure_model_truncation': {'lower': 1e-10},
                            'mapping_identifier_classical': ['cluster'],
                            'mapping_identifier_Berkson': ['year', 'OBJECT_GROUP', 'cluster'],
                            'reference_Berkson': 'transfer_reference',
                            'selection_Berkson': 'transfer_cluster',
                            'name_obs_values': 'C_Rn_obs',
                            'vectorized_exposure': True
                            },
                   },
            # --- group 'M2_Expert' ---
            'M2_Expert': {'C_Exp': {'classical_error': {'sd': 0.936, 'structure': 'multiplicative', 'proposal_sd': 0.2},
                            'Berkson_error': {'sd': 0},
                            'exposure_model_distribution': 'lognorm',
                            'exposure_model_parameters': {'mu': 1.7, 'sigma': 1},
                            'exposure_model_truncation': {'lower': 1e-10},
                            'mapping_identifier_classical': ['cluster'],
                            'name_obs_values': 'C_Rn_obs',
                            'vectorized_exposure': True
                            },
                   },
            # --- group 'M3' ---
            'M3': {'C_RPD': {'classical_error': {'sd': 0.03, 'structure': 'additive', 'proposal_sd': 0.005},
                            'Berkson_error': {'sd': 0.13, 'structure': 'multiplicative', 'proposal_sd': 0.005},
                            'exposure_model_distribution': 'norm',
                            'exposure_model_parameters': {'mu': 0.15, 'sigma': 0.2},
                            'exposure_model_truncation': {'lower': 1e-10},
                            'mapping_identifier_classical': ['cluster'],
                            'mapping_identifier_Berkson': ['year', 'OBJECT_GROUP', 'cluster'],
                            'reference_Berkson': 'transfer_reference',
                            'selection_Berkson': 'transfer_cluster',
                            'name_obs_values': 'C_Rn_obs',
                            'vectorized_exposure': True
                            },
                   'zeta': {'classical_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                            'Berkson_error': {'sd': 1.45, 'structure': 'multiplicative', 'proposal_sd': 0.3},
                            'exposure_model_distribution': 'beta',
                            'exposure_model_parameters': {'alpha': 1, 'beta': 1},
                            'exposure_model_truncation': {'lower': 1, 'upper': 1.7},
                            'mapping_identifier_classical': ['object'],
                            'mapping_identifier_Berkson': ['object'],
                            'name_obs_values': 'c_classical'
                            },
                   },
            # --- group 'M4' ---
            # NOTE(review): the distribution is 'norm' with a negative 'mu' and a
            # positive lower truncation bound - confirm that this combination is intended.
            'M4': {'E_Rn': {'classical_error': {'sd': 0.936, 'structure': 'multiplicative', 'proposal_sd': 0.12},
                            'Berkson_error': {'sd': 0.18, 'structure': 'multiplicative', 'proposal_sd': 0.02},
                            'exposure_model_distribution': 'norm',
                            'exposure_model_parameters': {'mu': -16.21057, 'sigma': 2.851629},  # calculated using script "est_params_M4"
                            'exposure_model_truncation': {'lower': 1e-10},
                            'mapping_identifier_classical': ['cluster'],
                            'mapping_identifier_Berkson': ['year', 'OBJECT_GROUP'],
                            'reference_Berkson': 'transfer_reference',
                            'selection_Berkson': 'transfer_cluster',
                            'name_obs_values': 'C_Rn_obs',
                            'vectorized_exposure': False
                            },
                   },
            # --- group 'activity' ---
            'activity': {'phi': {'classical_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                                 'Berkson_error': {'sd': 0.69, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                                 'exposure_model_distribution': 'beta',
                                 'exposure_model_parameters': {'alpha': 1, 'beta': 1},
                                 'exposure_model_truncation': {'lower': 1e-10, 'upper': 1.3},
                                 'mapping_identifier_classical': ['object', 'activity'],
                                 'mapping_identifier_Berkson': ['year', 'object', 'activity'],
                                 'name_obs_values': 'f_classical'
                                 },

                         },
            # --- group 'working_time' ---
            'working_time': {'omega': {'classical_error': {'sd': 0.04, 'structure': 'multiplicative', 'proposal_sd': 0.005},
                                       'Berkson_error': {'sd': 0.12, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                                       'exposure_model_distribution': 'beta',
                                       'exposure_model_parameters': {'alpha': 1, 'beta': 1},
                                       'exposure_model_truncation': {'lower': 0.6, 'upper': 1.5},
                                       'mapping_identifier_classical': ['w_period'],
                                       'mapping_identifier_Berkson': ['year', 'object'],
                                       'name_obs_values': 'w_classical'
                                       }
                             },
            # --- group 'equilibrium' ---
            'equilibrium': {'gamma': {'classical_error': {'sd': 0.23, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                                      'Berkson_error': {'sd': 0.69, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                                      'exposure_model_distribution': 'beta',
                                      'exposure_model_parameters': {'alpha': 1, 'beta': 1},
                                      'exposure_model_truncation': {'lower': 0.05, 'upper': 0.8},
                                      'mapping_identifier_classical': ['g_period', 'object'],
                                      'mapping_identifier_Berkson': ['year', 'OBJECT_GROUP'],
                                      'name_obs_values': 'g_classical'
                                      },
                            }
            }

    # Without measurement-error correction every error component gets sd = 0;
    # the remaining settings ('structure', 'proposal_sd', ...) stay in place.
    if not me_correction:
        for model in uncertainty_characteristics:
            for factor in uncertainty_characteristics[model]:
                uncertainty_characteristics[model][factor]['classical_error']['sd'] = 0
                uncertainty_characteristics[model][factor]['Berkson_error']['sd'] = 0

    return uncertainty_characteristics


In [None]:
def run_chain(seed, me_correction, chain, disease_model, path_results, iterations, burnin, phases, thin, data=data):
    """Set up and run one MCMC chain and report how long sampling took.

    Parameters
    ----------
    seed : int
        Seed used when drawing the start values.
    me_correction : bool
        Whether measurement-error correction is active; forwarded to the
        start-value and uncertainty-characteristics generators.
    chain : str
        Chain identifier (key of the start values, also passed to ``MCMC``).
    disease_model : str
        Disease model name, forwarded to the start values and to ``MCMC``.
    path_results : str
        Directory for the results; created if it does not exist yet.
    iterations, burnin, phases, thin : int
        Forwarded to ``MCMC.run_adaptive_algorithm`` as ``iterations``,
        ``burnin``, ``adaptive_phases`` and ``thin``.
    data : pandas.DataFrame, optional
        Data set handed to ``MCMC``. The default is the module-level ``data``
        object that exists when this function is defined.

    Returns
    -------
    tuple
        ``(True, elapsed)`` where ``elapsed`` is the wall-clock time in
        seconds spent inside ``run_adaptive_algorithm``.
    """
    # NOTE(review): called without ``disease_model``, so the proposal sd of
    # 'beta' is always the default-branch value - confirm this is intended.
    proposal_sds = generate_proposal_sds()
    prior_parameters = generate_prior_parameters()
    start_values = generate_start_values(seed, chain, disease_model, me_correction)
    uncertainty_characteristics = generate_uncertainty_characteristics(me_correction)

    if os.path.exists(path_results):
        print('Attention! Result path exists. Results may get overwritten!')
    else:
        # Several worker processes share one result directory. Another worker
        # may create it between the check above and this call; exist_ok=True
        # keeps that race from raising FileExistsError.
        os.makedirs(path_results, exist_ok=True)

    if me_correction:
        # Replace the randomly drawn start values of the exposure-model
        # parameters by the configured 'exposure_model_parameters'. The dict
        # object itself is assigned (no copy), so start_values and
        # uncertainty_characteristics share it afterwards.
        chain_priors = start_values[chain]['prior_parameters']
        for measurement_model in chain_priors:
            for uncertain_factor in chain_priors[measurement_model]:
                chain_priors[measurement_model][uncertain_factor] = uncertainty_characteristics[measurement_model][uncertain_factor]['exposure_model_parameters']

    mcmc = MCMC(data=data,
                uncertainty_characteristics=uncertainty_characteristics,
                # presumably the interval boundaries belonging to lambda1..lambda4 - TODO confirm
                s=np.array([0, 40, 55, 75, 104]),
                path_results=path_results,
                proposal_sd=proposal_sds,
                prior_parameters=prior_parameters,
                start_values=start_values,
                informative_priors=False,
                chain=chain,
                disease_model=disease_model,
                fixed_parameters=False)

    time_mcmc = time.time()
    mcmc.run_adaptive_algorithm(iterations=iterations,
                                burnin=burnin,
                                adaptive_phases=phases,
                                save_chains=True,
                                thin=thin,
                                clear_display=True)

    return (True, time.time() - time_mcmc)

In [None]:
if __name__ == '__main__':
    # Driver: runs `number_chains` chains for each of the four combinations of
    # disease model ("ERR", "cox_like") and measurement-error correction
    # (on / off) in a process pool.
    # NOTE(review): with the "spawn" start method (default on Windows/macOS)
    # worker processes must be able to import `run_chain`; functions defined
    # in an interactive notebook session may not be available there.

    # cores = 32
    cores = 2
    number_chains = 8
    # corrected / uncorrected  x  two disease models
    number_runs = number_chains * 2 * 2

    np.random.seed(123)
    # dtype is pinned to int64: the upper bound 2**32-1 does not fit into a
    # 32-bit default integer (Windows with NumPy < 2) and would raise there.
    # Where the default integer already is int64 the drawn seeds are unchanged.
    seeds = np.random.randint(2**32-1, size=number_runs, dtype=np.int64)
    me_correction = np.repeat([True, False], number_chains).tolist() * 2
    chains = [f"chain{i}" for i in range(1, number_chains+1)] * 2 * 2
    # disease_model = ["cox_like"] * number_chains * 2

    disease_model = ["ERR"] * number_chains * 2
    disease_model.extend(["cox_like"] * number_chains * 2)

    # One result directory per (correction, disease model) combination, shared
    # by the chains of that combination.
    # NOTE(review): the directories of the "ERR" runs are named "..._EHR" - confirm.
    path_results = [os.getcwd() + f"/../results/application/main_results/{corr}/" for corr in np.repeat(["corrected_EHR", "uncorrected_EHR"], number_chains)]
    path_results.extend([os.getcwd() + f"/../results/application/main_results/{corr}/" for corr in np.repeat(["corrected_cox", "uncorrected_cox"], number_chains)])

    # Minimal settings for a quick smoke run.
    iterations = [2] * number_runs
    burnin = [1] * number_runs
    phases = [0] * number_runs
    thin = [1] * number_runs

    #### Uncomment, to reproduce results from paper
    # iterations = [100_000] * number_chains * 2 * 2
    # burnin = [50_000] * number_chains * 2 * 2
    # phases = [100] * number_chains * 2 * 2
    # thin = [200] * number_chains * 2 * 2

    # One argument tuple per run, in the positional order of run_chain.
    args = zip(seeds, me_correction, chains, disease_model, path_results, iterations, burnin, phases, thin)

    warnings.filterwarnings('ignore')
    t = time.time()
    with multiprocessing.Pool(cores) as pool:
        res = pool.starmap(run_chain, args)
    print("Full calculation time: " + str(time.time() - t))
