# Simulation study
### Setting:
- $\beta = 0.003$
- 5000 miners
- Measurement error correction
- Misspecification: lognorm instead of truncnorm and vice versa

Written to `missspec1`

In [1]:
# general libraries
import os
import numpy as np
import scipy.stats as stats
import warnings
import multiprocessing
import time
import matplotlib.pyplot as plt
import seaborn as sns

# sampling code
import sys
sys.path.append('..')
import wismut.basics as basics
from wismut.MCMC import MCMC
import wismut.analyze_chains as ac
# Current working directory with a trailing slash; run_dataset prefixes it to
# the relative "../data/..." and "../results/..." locations.
path = os.getcwd() + "/"

### Define prior parameters

In [2]:
def generate_prior_parameters():
    """Return the prior specification of every model parameter.

    Returns
    -------
    dict
        Maps a parameter name to a dict describing its prior. Normal priors
        carry 'dist', 'mean' and 'sd'; gamma priors carry 'dist', 'shape',
        'scale' and the bounds 'min' and 'max'.
    """
    def _normal_prior(mean, sd):
        return {'dist': "normal", 'mean': mean, 'sd': sd}

    def _gamma_prior(shape, scale):
        # all gamma priors in this study use the same bounds
        return {'dist': "gamma", 'shape': shape, 'scale': scale,
                'min': 0, 'max': 200}

    priors = {
        'beta': _normal_prior(0, 200),
        'lambda1': _gamma_prior(600, 1 / 10000000),
        'lambda2': _gamma_prior(12000, 1 / 1000000),
        'lambda3': _gamma_prior(46000, 1 / 1000000),
        'lambda4': _gamma_prior(1000, 1 / 100000),
        'C_Rn_mu': _normal_prior(1.887864, 2),
        'C_Rn_sigma': _normal_prior(0.9852445, 0.1),
        'C_Exp_mu': _normal_prior(-77.12443, 50),
        'C_Exp_sigma': _normal_prior(27.34967, 5),
        'C_RPD_mu': _normal_prior(-1.80017, 0.5),
        'C_RPD_sigma': _normal_prior(0.989060, 0.02),
    }

    # the alpha/beta parameters of zeta, gamma, phi and omega all get the
    # same normal prior (mean 3, sd 2); each entry is its own dict
    for variable in ('zeta', 'gamma', 'phi', 'omega'):
        for shape_name in ('alpha', 'beta'):
            priors[f'{variable}_{shape_name}'] = _normal_prior(3, 2)

    return priors



### Define proposal sds for MCMC

In [3]:
def generate_proposal_sds(disease_model='cox'):
    """Return the proposal standard deviation for each sampled parameter.

    Parameters
    ----------
    disease_model : str
        Only the entry for 'beta' depends on it: 0.011 when the value is
        exactly 'cox', 0.00011*10 for any other value.

    Returns
    -------
    dict
        Maps a parameter name to its proposal standard deviation.
    """
    if disease_model == 'cox':
        beta_sd = 0.011
    else:
        beta_sd = 0.00011*10

    sds = {
        'beta': beta_sd,
        'lambda1': 0.000211,
        'lambda2': 0.000611,
        'lambda3': 0.000611,
        'lambda4': 0.000211,
    }

    # location/scale parameters of the exposure models
    sds.update(dict.fromkeys(
        ('C_Rn_old_mu', 'C_Rn_old_sigma', 'C_Rn_ref_mu', 'C_Rn_ref_sigma'),
        0.2,
    ))
    sds.update(
        C_Rn_mu=0.1,
        C_Rn_sigma=0.2,
        C_RPD_mu=0.2,
        C_RPD_sigma=0.2,
        C_Exp_mu=0.4,
        C_Exp_sigma=0.5,
    )

    # alpha/beta parameters; omega_alpha is the only one that is not 0.5
    sds.update(dict.fromkeys(
        ('zeta_alpha', 'zeta_beta',
         'gamma_alpha', 'gamma_beta',
         'phi_alpha', 'phi_beta'),
        0.5,
    ))
    sds['omega_alpha'] = 0.2
    sds['omega_beta'] = 0.5

    return sds

### Define Start values of Markov chains

In [4]:
def generate_start_values(seed, chain, disease_model="cox_like", me_correction=True):
    """Build randomly jittered start values for one Markov chain.

    Seeds NumPy's global random state with ``seed`` and multiplies each
    base value by its own draw from a uniform distribution on [0.9, 1.1].

    Parameters
    ----------
    seed : int
        Seed handed to ``np.random.seed``.
    chain : hashable
        Key under which the start values are stored in the result.
    disease_model : str
        The base value for 'beta' is 0.3 for "cox_like" and 1.0 otherwise.
    me_correction : bool
        If False, the 'prior_parameters' entry is left out of the result.
        Its random draws are consumed either way.

    Returns
    -------
    dict
        ``{chain: {...start values...}}``.
    """
    np.random.seed(seed)
    jitter = stats.uniform(loc=0.9, scale=0.2)

    def jittered(center):
        # one fresh draw per call; the order of the calls below decides
        # which draw ends up on which parameter
        return center * jitter.rvs(1)[0]

    def mu_sigma(mu_center, sigma_center):
        return {'mu': jittered(mu_center), 'sigma': jittered(sigma_center)}

    def alpha_beta():
        return {'alpha': jittered(3), 'beta': jittered(3)}

    if disease_model == "cox_like":
        beta_center = 0.3
    else:
        beta_center = 1.0

    values = {
        'beta': jittered(beta_center),
        'lambda1': jittered(0.00006),
        'lambda2': jittered(0.00120),
        'lambda3': jittered(0.00460),
        'lambda4': jittered(0.01000),
    }

    # values for truncnorm (note kept from the original code);
    # always drawn so the random stream does not depend on me_correction
    exposure_starts = {
        'M2': {'C_Rn': mu_sigma(2, 1)},
        'M2_Expert': {'C_Exp': mu_sigma(-77, 27)},
        'M3': {'C_RPD': mu_sigma(-2, 1),
               'zeta': alpha_beta()},
        'equilibrium': {'gamma': alpha_beta()},
        'activity': {'phi': alpha_beta()},
        'working_time': {'omega': alpha_beta()},
    }

    if me_correction:
        values['prior_parameters'] = exposure_starts

    return {chain: values}


### Define uncertainty characteristics

In [5]:
###########################
# M1a M2 M2_Expert M3 M4
###########################
# Nested configuration that run_dataset passes to MCMC through the
# `uncertainty_characteristics` argument.
#   level 1: group key ('M1a', 'M2', 'M2_Expert', 'M3', 'M4', 'activity',
#            'working_time', 'equilibrium')
#   level 2: name of the uncertain variable within that group
#   level 3: settings for that variable:
#     'classical_error' / 'Berkson_error': 'sd' plus, where sd is non-zero,
#         'structure' ('additive' or 'multiplicative') and 'proposal_sd'
#     'exposure_model_distribution': 'lognorm', 'norm' or 'beta'
#     'exposure_model_parameters': 'mu'/'sigma' or 'alpha'/'beta'
#     'exposure_model_truncation': optional 'lower' / 'upper' bounds
#     'mapping_identifier_classical' / 'mapping_identifier_Berkson': lists of
#         names (presumably columns of the data set that define which rows
#         share an error term; verify against wismut.MCMC)
#     'name_obs_values': presumably the data column holding the observed
#         values of the variable; verify against wismut.MCMC
uncertainty_characteristics = {
        'M1a': {'C_Rn_old': {'classical_error': {'sd': 6.56, 'structure': 'additive', 'proposal_sd': 1.5},
                             'Berkson_error': {'sd': 0},
                             'exposure_model_distribution': 'lognorm',
                             'exposure_model_parameters': {'mu': 3.097054, 'sigma': 0.1857366},
                             'exposure_model_truncation': {'lower': 1e-10},
                             'mapping_identifier_classical': ['cluster_C_Rn_old'],
                             'name_obs_values': 'C_Rn_old'
                             },
                'C_Rn_ref': {'classical_error': {'sd': 5.29, 'structure': 'additive', 'proposal_sd': 1.0},
                             'Berkson_error': {'sd': 0},
                             'exposure_model_distribution': 'lognorm',
                             'exposure_model_parameters': {'mu': 3.296464, 'sigma': 0.1513905},

                             'exposure_model_truncation': {'lower': 1e-10},
                             'mapping_identifier_classical': ['cluster_C_Rn_obs_ref'],
                             'name_obs_values': 'C_Rn_obs_ref'
                             },
                'b': {'classical_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                      'Berkson_error': {'sd': 0.69, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                      'exposure_model_distribution': 'beta',
                      'exposure_model_parameters': {'alpha': 3, 'beta': 3},
                      'exposure_model_truncation': {'lower': 0.17, 'upper': 1},
                      'mapping_identifier_classical': ['b_period'],
                      'mapping_identifier_Berkson': ['year', 'object'],
                      'name_obs_values': 'b'
                      },
                'tau_e': {'classical_error': {'sd': 0.37, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                          'Berkson_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                          'exposure_model_distribution': 'beta',
                          'exposure_model_parameters': {'alpha': 3, 'beta': 3},
                          'exposure_model_truncation': {'lower': 0.46, 'upper': 1},
                          'mapping_identifier_classical': ['tau_e_period'],
                          'mapping_identifier_Berkson': ['year','object'],
                          'name_obs_values': 'tau_e'
                          },
                # A, A_ref and r have both error sds set to 0 and carry no
                # exposure-model settings.
                'A': {'classical_error': {'sd': 0},
                      'Berkson_error': {'sd': 0},
                      'name_obs_values': 'A_calculated'
                      },
                'A_ref': {'classical_error': {'sd': 0},
                          'Berkson_error': {'sd': 0},
                          'name_obs_values': 'A_ref'
                          },
                'r': {'classical_error': {'sd': 0},
                      'Berkson_error': {'sd': 0},
                      'name_obs_values': 'r'
                      },
                },
        'M2': {'C_Rn': {'classical_error': {'sd': 0.59, 'structure': 'additive', 'proposal_sd': 0.1},
                        'Berkson_error': {'sd': 0},
                        'exposure_model_distribution': 'lognorm',
                        'exposure_model_parameters': {'mu': 1.887864, 'sigma': 0.9852445},
                        'exposure_model_truncation': {'lower': 1e-10},
                        'mapping_identifier_classical': ['year', 'object'],
                        'name_obs_values': 'C_Rn_obs'
                        },
               },
        # 'norm' with a lower truncation bound just above zero
        'M2_Expert': {'C_Exp': {'classical_error': {'sd': 0.936, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                                'Berkson_error': {'sd': 0},
                                'exposure_model_distribution': 'norm',
                                'exposure_model_parameters': {'mu': -77.12443, 'sigma': 27.34967},
                                'exposure_model_truncation': {'lower': 1e-10},
                                'mapping_identifier_classical': ['year', 'object'],
                                'name_obs_values': 'C_Rn_obs',
                                },
                   },
        'M3': {'C_RPD': {'classical_error': {'sd': 0.03, 'structure': 'additive', 'proposal_sd': 0.001},
                         'Berkson_error': {'sd': 0},
                         'exposure_model_distribution': 'lognorm',
                         'exposure_model_parameters': {'mu': -1.80017, 'sigma': 0.9890602},
                         'exposure_model_truncation': {'lower': 1e-10},
                         'mapping_identifier_classical': ['year', 'object'],
                         'name_obs_values': 'C_Rn_obs'
                         },
               'zeta': {'classical_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                        'Berkson_error': {'sd': 0.69, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                        'exposure_model_distribution': 'beta',
                        'exposure_model_parameters': {'alpha': 3, 'beta': 3},
                        'exposure_model_truncation': {'lower': 1.2, 'upper': 1.5},
                        'mapping_identifier_classical': ['object'],
                        'mapping_identifier_Berkson': ['year','object'],
                        'name_obs_values': 'c_classical'
                        },
               },
        # the only entry with an empty truncation dict
        'M4': {'E_Rn': {'classical_error': {'sd': 0.936, 'structure': 'multiplicative', 'proposal_sd': 0.12},
                        'Berkson_error': {'sd': 0},
                        'exposure_model_distribution': 'norm',
                        'exposure_model_parameters': {'mu': -138.846, 'sigma': 40.11283},
                        'exposure_model_truncation': {},
                        'mapping_identifier_classical': ['year', 'object'],
                        'name_obs_values': 'C_Rn_obs'
                        },
               },
        'activity': {'phi': {'classical_error': {'sd': 0.33, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                             'Berkson_error': {'sd': 0.69, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                             'exposure_model_distribution': 'beta',
                             'exposure_model_parameters': {'alpha': 3, 'beta': 3},
                             'exposure_model_truncation': {'lower': 1e-10, 'upper': 1.0},
                             'mapping_identifier_classical': ['object', 'activity'],
                             'mapping_identifier_Berkson': ['year', 'object', 'activity'],
                             'name_obs_values': 'f_classical'
                             },
                     },
        'working_time': {'omega': {'classical_error': {'sd': 0.04, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                                   'Berkson_error': {'sd': 0.12, 'structure': 'multiplicative', 'proposal_sd': 0.03},
                                   'exposure_model_distribution': 'beta',
                                   'exposure_model_parameters': {'alpha': 3, 'beta': 3},
                                   'exposure_model_truncation': {'lower': 0.88, 'upper': 1.2},
                                   'mapping_identifier_classical': ['w_period'],
                                   'mapping_identifier_Berkson': ['year', 'object'],
                                   'name_obs_values': 'w_classical'
                                   }
                         },
        'equilibrium': {'gamma': {'classical_error': {'sd': 0.23, 'structure': 'multiplicative', 'proposal_sd': 0.01},
                                  'Berkson_error': {'sd': 0.69, 'structure': 'multiplicative', 'proposal_sd': 0.1},
                                  'exposure_model_distribution': 'beta',
                                  'exposure_model_parameters': {'alpha': 3, 'beta': 3},
                                  'exposure_model_truncation': {'lower': 0.2, 'upper': 0.6},
                                  'mapping_identifier_classical': ['g_period', 'object'],
                                  'mapping_identifier_Berkson': ['year', 'object'], 
                                  'name_obs_values': 'g_classical'
                                  },
                        },
        }

### Define function to run one chain on one dataset

In [6]:
def run_dataset(nb, seed, chain):
    """Run one Markov chain on simulated data set number ``nb``.

    Reads ``Data_{nb}.csv``, creates the results directory for this data
    set if needed, builds an ``MCMC`` object and runs its adaptive
    algorithm. Nothing is returned.

    Parameters
    ----------
    nb : int
        Number of the data set; used in the input file name and in the
        results directory.
    seed : int
        Seed forwarded to ``generate_start_values``.
    chain : str
        Chain label forwarded to ``generate_start_values`` and ``MCMC``.
    """
    model = 'cox_like'

    dataset = basics.read_data(path + f"../data/M1M2M2_ExpertM3M4-b3-5/Data_{nb}.csv")
    dataset['tau'] = 1

    results_dir = path + f'../results/simulation_study/missspec1/{nb}/'
    os.makedirs(results_dir, exist_ok=True)

    # NOTE(review): generate_proposal_sds() is called with its default
    # disease_model ('cox') although the model here is 'cox_like', so the
    # 'beta' proposal sd is 0.011 - confirm this is intended.
    sampler_settings = {
        'start_values': generate_start_values(seed, chain, model),
        'proposal_sd': generate_proposal_sds(),
        'prior_parameters': generate_prior_parameters(),
        's': np.array([0, 40, 55, 75, 104]),
    }

    sampler = MCMC(
        data=dataset,
        uncertainty_characteristics=uncertainty_characteristics,
        path_results=results_dir,
        informative_priors=False,
        chain=chain,
        disease_model=model,
        fixed_parameters=False,
        **sampler_settings,
    )

    # Alternative schedule that is disabled in the original code:
    # iterations = 100_000; burnin = 20_000; phases = 100; thin = 100
    schedule = {
        'iterations': 2,
        'burnin': 1,
        'adaptive_phases': 0,
        'thin': 1,
    }
    sampler.run_adaptive_algorithm(clear_display=True, **schedule)

### Run 4 chains on 100 datasets

In [None]:
# Settings matching the "4 chains on 100 datasets" heading ...
nb_data_sets = 100
cores = 50

# ... which are overridden here, so only 2 data sets on 2 worker processes
# are actually run.
nb_data_sets = 2
cores = 2


# One task per (data set, chain): the data set numbers 1..nb_data_sets are
# repeated four times, and each block of nb_data_sets tasks shares one chain
# label. Every task gets its own seed.
datasets = list(range(1, nb_data_sets + 1)) * 4
seeds = np.random.randint(2**32 - 1, size=nb_data_sets * 4)
chains = [f"chain{k}" for k in range(1, 5) for _ in range(nb_data_sets)]
arg_list = list(zip(datasets, seeds, chains))
warnings.filterwarnings('ignore')

t = time.time()
with multiprocessing.Pool(cores) as pool:
    res = pool.starmap(run_dataset, arg_list)
# From here on t is the elapsed wall-clock time in seconds. The previous
# version printed time.time() - t after this line, i.e. "now minus elapsed",
# which is not a duration.
t = time.time() - t
print("Full calculation time: " + str(t))
print(f"Time in hours: {t/3600}")