In [None]:
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tableone import TableOne

from utils import read_data, plot_covariate_distributions, plot_match, compare_balance, sizeof_fmt, optimize_memory_df, plot_categorical_proportional_diff, compute_mean_differences_and_proportions, love_plot, sensitivity_analysis_k_neighbors

## CHECK VARIABLES INPATIENT AND OUTPATIENT HOSPITALISATION !!

Oddly, the inpatient and outpatient hospitalisation variables appear to be identical. This has been checked and is correct.

In [None]:
main_folder = Path('../Manuscript/Economic analysis of integrative medicine/')
data_folder = Path('../Data')
results_folder = main_folder/'Results'
figures_folder = main_folder/'Figures'

In [None]:
import os
# Fitted-model outputs (metrics, parameters, diagnostic figures) are written below Results/Models.
model_folder = results_folder/'Models'
if not model_folder.exists():
    model_folder.mkdir(parents=True)

## Load data

In [None]:
# Resolve the file through `data_folder` so the data location is configured in one place.
df_treated_filtered = pd.read_parquet(data_folder/'processed'/'df_treated_filtered.parquet.gzip')

In [None]:
df_treated_filtered.shape[0]

In [None]:
df_treated_filtered[df_treated_filtered.gp.isnull()]

In [None]:
df_treated_filtered.groupby(['NOANNEE','gp']).size()

In [None]:
df_treated_filtered.groupby('treatment_lca_cam').size()

In [None]:
df_treated_filtered['PRESTATIONS_BRUTES_AOS'].sum()

## Table 1 

In [None]:
# Lookup table from raw column names ("old") to the labels shown in tables ("new").
# The two lists are positional pairs and must stay the same length.
variable_names = pd.DataFrame({
    "old": [
        'multimorbidity', 'ssep3_q', 'Urbanicity_simple', 'n_atc', 'n_flags', 'NBAGE', "NBAGE_std", 'age_group',
        "ssep3_std", 'ssep3', 'region_DE', 'region_FR', 'region_IT', 'urb_Urbain', 'urb_Périurbain',
        'Asthma_PCG', 'Cancer_PCG', 'Diabetes_PCG', 'Epilepsy_PCG', 'Glaucoma_PCG', 'HIV_AIDS_PCG',
        'Heart_disease_PCG', 'Hypertension_related_PCG', 'Immune_PCG', 'Inflammatory_PCG',
        'Mental_PCG', 'Other_PCG', 'Pain_PCG', 'Parkinson_PCG', 'Thyroid_PCG',
        "SEX_F", 'SEX', 'CDPHYSSEXE', 'LANG', "cds_std", 'cds', 'LANG_FR',
        'D_MEDIC_B', 'D_MEDIC_S', 'D_MEDIC_B_std', 'D_MEDIC_S_std', 'DEDUCTIBLE_above_500',
        'E_std', 'N_std', 'E_std:N_std',
        'PRESTATIONS_BRUTES_ATC', 'PRESTATIONS_TOTAL', 'PRESTATIONS_BRUTES_AOS', 'PRESTATIONS_BRUTES_LCA',
        'PRESTATIONS_BRUTES_CAM', 'PRESTATIONS_BRUTES_AMBULATOIRE', 'PRESTATIONS_BRUTES_STATIONNAIRE',
        'PRESTATIONS_ACCIDENT', 'PRESTATIONS_DISEASE', 'PRESTATIONS_BIRTH', 'MTFRANCHISECOUV',
        'mean_pm10', 'mean_no2', 'mean_pm25', 'mean_ndvi', 'mean_lst', 'mean_carnight',
    ],
    "new": [
        'Multimorbidity', 'Swiss-SEP', 'Urbanicity', 'Number of ATC', 'Number of PCG flags', 'Age', "Age", 'Age Group',
        "SES index", 'SES index', 'German', 'French', 'Italian', 'Urban', 'Periurban',
        'Asthma', 'Cancer', 'Diabetes', 'Epilepsy', 'Glaucoma', 'HIV/AIDS',
        'Heart disease', 'Hypertension related', 'Immune', 'Inflammatory',
        'Mental', 'Other', 'Pain', 'Parkinson', 'Thyroid',
        # 'Language' was previously misspelled 'Langage'.
        "Sex (Female)", 'Sex', 'Sex', 'Language', "CDS", 'CDS', 'French speaker',
        'Access to prim. care med.', 'Access to spec. med.', 'Access to prim. care med.', 'Access to spec. med.', 'Deductible (>500)',
        'E', 'N', 'E:N',
        'Drug-related claims (CHF)', 'Total claims amount (CHF)', 'CM claims (MHI) (CHF)', 'CAM claims (SI) (CHF)',
        'CAM claims (MHI) (CHF)', 'Ambulatory claims (CHF)', 'Stationary claims (CHF)',
        'Accident-related claims (CHF)', 'Disease-related claims (CHF)', 'Birth-related claims (CHF)', 'Deductible',
        'PM10', 'NO2', 'PM25', 'NDVI', 'LST', 'Nighttime car noise',
    ],
})
def update_variable_names(summary_table, variable_names, table_type):
    """Replace raw column names in a TableOne index with their display labels.

    TableOne labels each row "<variable>, <statistic>", so the lookup is
    expanded with the statistic suffix before renaming.

    Parameters
    ----------
    summary_table : pandas.DataFrame
        Table whose index holds the row labels (e.g. ``TableOne(...).tableone``).
    variable_names : pandas.DataFrame
        Lookup with an 'old' column (raw names) and a 'new' column (labels).
    table_type : str
        'summary' renames continuous rows, both "mean (SD)" and
        "median [Q1,Q3]" (the latter is the label of non-normal variables,
        which were previously left unrenamed); 'categorical' renames "n (%)"
        rows; any other value renames index entries equal to the bare raw name.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; labels without a match are left as they are.
    """
    suffixes_by_type = {
        'summary': ('mean (SD)', 'median [Q1,Q3]'),
        'categorical': ('n (%)',),
    }
    base_mapper = variable_names.set_index('old')['new'].to_dict()
    suffixes = suffixes_by_type.get(table_type)
    if suffixes is None:
        name_mapper = base_mapper
    else:
        name_mapper = {
            f"{old}, {suffix}": f"{new}, {suffix}"
            for old, new in base_mapper.items()
            for suffix in suffixes
        }
    return summary_table.rename(index=name_mapper)

In [None]:
# Continuous cost and count variables, summarised per year (NOANNEE).
columns = ['PRESTATIONS_TOTAL','PRESTATIONS_BRUTES_AOS','PRESTATIONS_BRUTES_CAM','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_ATC','PRESTATIONS_BRUTES_AMBULATOIRE','PRESTATIONS_BRUTES_STATIONNAIRE','n_flags','n_atc']
# Every column is declared non-normal. The list is reused rather than retyped:
# the retyped copy contained 'n_flag' (missing "s"), so 'n_flags' was not
# treated like the other variables.
mytable = TableOne(df_treated_filtered, columns, groupby='NOANNEE',
                   categorical=[],
                   pval=True,
                   pval_adjust='bonferroni',
                   htest_name=False,
                   nonnormal=columns,
                   missing=False,
                   normal_test=True,
                   tukey_test=True)
summary_table_year = update_variable_names(mytable.tableone,variable_names,'summary')
summary_table_year

In [None]:
summary_table_year.to_clipboard()

In [None]:
df_treated_filtered.n_atc.plot.kde()

In [None]:
# Same continuous cost and count variables, this time compared by sex (CDPHYSSEXE).
columns = ['PRESTATIONS_TOTAL','PRESTATIONS_BRUTES_AOS','PRESTATIONS_BRUTES_CAM','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_ATC','PRESTATIONS_BRUTES_AMBULATOIRE','PRESTATIONS_BRUTES_STATIONNAIRE','n_flags','n_atc']
table_options = dict(
    categorical=[],
    pval=True,
    pval_adjust='bonferroni',
    htest_name=False,
    nonnormal=list(columns),  # every listed variable is declared non-normal
    missing=False,
    normal_test=True,
    tukey_test=True,
)
mytable = TableOne(df_treated_filtered, columns, groupby='CDPHYSSEXE', **table_options)
summary_table_year = update_variable_names(mytable.tableone, variable_names, 'summary')
summary_table_year

In [None]:
summary_table_year.to_clipboard()

In [None]:
# PRESTATIONS_BRUTES_CAM per year, restricted to rows with a strictly positive amount.
columns = ['PRESTATIONS_BRUTES_CAM']
has_cam_claims = df_treated_filtered.PRESTATIONS_BRUTES_CAM > 0
mytable = TableOne(
    df_treated_filtered[has_cam_claims],
    columns,
    groupby='NOANNEE',
    categorical=[],
    pval=True,
    pval_adjust='bonferroni',
    htest_name=False,
    nonnormal=list(columns),
    missing=False,
    normal_test=True,
    tukey_test=True,
)
summary_table_year = update_variable_names(mytable.tableone, variable_names, 'summary')
summary_table_year

In [None]:
summary_table_year.to_clipboard()

In [None]:
# PRESTATIONS_BRUTES_CAM by sex, restricted to rows with a strictly positive amount.
columns = ['PRESTATIONS_BRUTES_CAM']
has_cam_claims = df_treated_filtered.PRESTATIONS_BRUTES_CAM > 0
mytable = TableOne(
    df_treated_filtered[has_cam_claims],
    columns,
    groupby='CDPHYSSEXE',
    categorical=[],
    pval=True,
    pval_adjust='bonferroni',
    htest_name=False,
    nonnormal=list(columns),
    missing=False,
    normal_test=True,
    tukey_test=True,
)
summary_table_year = update_variable_names(mytable.tableone, variable_names, 'summary')
summary_table_year

In [None]:
summary_table_year.to_clipboard()

In [None]:
# Categorical characteristics per year; every listed column is treated as categorical.
columns = [
    'CDPHYSSEXE', 'age_group', 'Language', 'Urbanicity_simple', 'ssep3_q',
    'Asthma_PCG', 'Cancer_PCG', 'Diabetes_PCG', 'Epilepsy_PCG',
    'Glaucoma_PCG', 'HIV_AIDS_PCG', 'Heart_disease_PCG',
    'Hypertension_related_PCG', 'Immune_PCG', 'Inflammatory_PCG',
    'Mental_PCG', 'Other_PCG', 'Pain_PCG', 'Parkinson_PCG', 'Thyroid_PCG',
    'n_inpatient_hosp', 'multimorbidity',
]
mytable = TableOne(
    df_treated_filtered,
    columns,
    groupby='NOANNEE',
    categorical=list(columns),
    pval=True,
    pval_adjust='bonferroni',
    htest_name=False,
    missing=False,
    normal_test=True,
    tukey_test=True,
)
summary_table_year = update_variable_names(mytable.tableone, variable_names, 'categorical')
summary_table_year

In [None]:
summary_table_year.to_clipboard()

In [None]:
n_sex_uuid = df_treated_filtered.groupby('uuid').CDPHYSSEXE.nunique().sort_values()

In [None]:
# Table 1 by treatment group: continuous cost/count variables followed by categorical characteristics.
continuous_columns = ['PRESTATIONS_TOTAL','PRESTATIONS_BRUTES_AOS','PRESTATIONS_BRUTES_CAM','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_ATC','PRESTATIONS_BRUTES_AMBULATOIRE','PRESTATIONS_BRUTES_STATIONNAIRE','n_flags','n_atc']
categorical_columns = ['CDPHYSSEXE','age_group', 'Language','Urbanicity_simple','ssep3_q','Asthma_PCG', 'Cancer_PCG', 'Diabetes_PCG', 'Epilepsy_PCG',
       'Glaucoma_PCG', 'HIV_AIDS_PCG', 'Heart_disease_PCG',
       'Hypertension_related_PCG', 'Immune_PCG', 'Inflammatory_PCG',
       'Mental_PCG', 'Other_PCG', 'Pain_PCG', 'Parkinson_PCG', 'Thyroid_PCG','multimorbidity']
columns = continuous_columns + categorical_columns
# NOTE(review): unlike the per-year and per-sex tables, the continuous variables
# are not declared `nonnormal` here - confirm whether that is intended.
mytable = TableOne(df_treated_filtered, columns, groupby='treatment_lca_cam',
                   categorical=categorical_columns,
                   pval=True,
                   pval_adjust='bonferroni',
                   htest_name=False,
                   missing=False,
                   normal_test=True,
                   tukey_test=True)
# The table mixes both row kinds, so apply both renaming passes; with only the
# 'categorical' pass the continuous rows kept their raw column names.
summary_table_year = update_variable_names(mytable.tableone,variable_names,'categorical')
summary_table_year = update_variable_names(summary_table_year,variable_names,'summary')
summary_table_year

In [None]:
df_treated_filtered['treatment_lca_cam']

In [None]:
summary_table_year.to_clipboard()

## Multilevel model

### Prestations LCA

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
sample_uuid = df_treated_filtered.sample(1000, random_state=42).uuid.unique()

In [None]:
df_sample = df_treated_filtered[df_treated_filtered.uuid.isin(sample_uuid)]

In [None]:
# Write next to the other processed data instead of a user-specific absolute path.
# NOTE(review): assumes `data_folder` ('../Data') is the repository's Data directory - confirm.
df_sample.to_csv(data_folder/'processed'/'df_sample.csv')

## Comparison between R and Python code

In [None]:
Fribourg_data = pd.read_stata('/Users/david/Dropbox/PhD/Course material/Multilevel modelling of clustered data/data_for_participants/Fribourg_data_new.dta')

In [None]:
model_ri_frib = smf.mixedlm(formula='q1km ~ sex + generati + wend + C(otime)', 
                             data=Fribourg_data, 
                             groups=Fribourg_data['code'], 
                             re_formula='~1').fit(reml=False)

In [None]:
print(model_ri_frib.summary())

In [None]:
# Obtain the residuals
residuals = model_ri_frib.resid
# Obtain the fitted values  
fitted_values = model_ri_frib.fittedvalues
random_intercepts = pd.DataFrame({k: v['Group'] for k, v in  model_ri_frib.random_effects.items()}, index=['random_intercept']).T


In [None]:
model_rs_frib = smf.mixedlm(formula='q1km ~ sex + generati + wend + C(otime)', 
                             data=Fribourg_data, 
                             groups=Fribourg_data['code'], 
                             re_formula='~wend').fit(reml=False)

In [None]:
print(model_rs_frib.summary())

In [None]:
# Ensure 'Fribourg_data' is a pandas DataFrame and that 'otime' is treated as a categorical variable
Fribourg_data['otime'] = Fribourg_data['otime'].astype('category')

# Define the mixed effects model formula
# 'C()' is used to specify categorical variables in statsmodels
model_formula = 'q1km ~ sex + generati + wend + C(otime)'

# Fit the mixed effects model
# Specify the random intercepts and slopes for 'code' and random intercepts for 'famcode'
model_rIs_frib = smf.mixedlm(model_formula, Fribourg_data, 
                          groups=Fribourg_data['code'], 
                          re_formula='1 + wend', 
                          vc_formula={'famcode': '0 + C(famcode)'}).fit(reml=False)


In [None]:
print(model_rIs_frib.summary())

In [None]:
# random.famcode <- ranef(model_rIs_frib)$famcode
# random.famcode$`(Intercept)`[row.names(random.famcode) == "2726"]

## Multilevel Modeling 

In [None]:
formula_1 = 'ihs_cost_aos ~ treatment*year'
formula_2 = 'ihs_cost_aos ~ treatment*year + NBAGE + SEX_F'
formula_3 = 'ihs_cost_aos ~ treatment*year + NBAGE + SEX_F + cds + ssep3'
formula_4 = 'ihs_cost_aos ~ treatment*year + NBAGE + SEX_F + cds + ssep3 + CDLANGUE + D_MEDIC_S + D_MEDIC_B + mean_lst+mean_pm10 + mean_ndvi'

In [None]:
def output_model(model, model_n, model_type):
    """Write a fitted model's summary tables to CSV files.

    Files go to ``model_folder/Model<model_n>/<model_type>/``, which is created
    if it does not exist.

    Parameters
    ----------
    model : fitted statsmodels results object
        Must provide ``summary()``; for non-OLS models also ``aic`` and ``bic``.
    model_n : int or str
        Model number used in the folder name.
    model_type : str
        'OLS' additionally saves the third summary table as
        'model_diagnostics.csv'; any other value appends AIC and BIC rows to
        the metrics table instead.
    """
    output_dir = model_folder/f"Model{model_n}"/model_type
    # makedirs creates intermediate folders as well, so one call is enough.
    os.makedirs(output_dir, exist_ok=True)

    # Build the summary once; it was previously regenerated for every table.
    summary = model.summary()
    quality_metrics = pd.DataFrame(summary.tables[0])
    parameters = pd.DataFrame(summary.tables[1])
    if model_type == 'OLS':
        diagnostics = pd.DataFrame(summary.tables[2])
        diagnostics.to_csv(output_dir/'model_diagnostics.csv')
    else:
        # Append after the last existing row rather than at fixed positions 6 and 7,
        # and pad to the table's actual width.
        padding = [''] * (quality_metrics.shape[1] - 2)
        for label, value in (('AIC', model.aic), ('BIC', model.bic)):
            quality_metrics.loc[len(quality_metrics)] = [label, value] + padding
    # Save to CSV
    quality_metrics.to_csv(output_dir/'model_metrics.csv')
    parameters.to_csv(output_dir/'model_parameters.csv')

In [None]:
from statsmodels.graphics.gofplots import qqplot

def plot_model_diagnostics(model, model_type, fitted_values, residuals, random_intercepts=None, figsize=(10, 6), data=None):
    """
    Plot and save diagnostics for a fitted regression model.

    Figures are saved as PNG files in ``model_folder/<model>/<model_type>/``,
    which is created if needed.

    Parameters:
    model -- name of the model's output folder, e.g. 'Model1' (not a fitted model object)
    model_type -- name of the sub-folder, e.g. 'OLS', 'RI', 'RS', 'RS2'
    fitted_values -- an array-like object of fitted values from the model
    residuals -- an array-like object of residuals from the model
    random_intercepts -- array-like of random intercepts for the level 2 QQ plot;
        None (e.g. for OLS) skips that plot
    figsize -- a tuple defining the figure size for the plots
    data -- DataFrame with a 'year' column for the residuals-by-year plot;
        defaults to the notebook-level `df`, as before
    """
    frame = df if data is None else data
    output_dir = model_folder/model/model_type
    os.makedirs(output_dir, exist_ok=True)

    def _save(fig, filename):
        # Common export settings for every diagnostic figure.
        fig.savefig(output_dir/filename, dpi=300, bbox_inches='tight')

    # Residuals vs Fitted plot
    fig, ax = plt.subplots(figsize=figsize)
    ax.scatter(fitted_values, residuals, alpha=0.5)
    ax.axhline(y=0, color='r', linestyle='--')
    ax.set(xlabel='Fitted Values', ylabel='Residuals', title='Residuals vs Fitted')
    _save(fig, 'residuals_vs_fitted.png')

    # Residuals across years
    fig, ax = plt.subplots(figsize=figsize)
    sns.stripplot(x='year', y=residuals, data=frame, jitter=True, alpha=0.5, ax=ax)
    sns.despine(ax=ax)
    ax.set_title('Residuals across years')
    _save(fig, 'res_over_years.png')

    # QQ plot for level 1 residuals (qqplot creates its own figure)
    fig = qqplot(residuals, line='s')
    fig.axes[0].set_title('QQ Plot of level 1 Residuals')
    _save(fig, 'qqplot_lvl1.png')

    # QQ plot for level 2 residuals
    # NOTE(review): callers pass one value per observation, so each group's
    # intercept is repeated once per row - confirm this weighting is intended.
    if random_intercepts is not None:
        fig = qqplot(random_intercepts, line='s')
        fig.axes[0].set_title('QQ Plot of level 2 Residuals')
        _save(fig, 'qqplot_lvl2.png')

    # Histogram of residuals
    fig, ax = plt.subplots(figsize=figsize)
    sns.histplot(residuals, kde=True, ax=ax)
    ax.set(xlabel='Residuals', title='Histogram of Residuals')
    _save(fig, 'residuals_hist.png')

    # Scale-Location plot (Absolute Residuals vs Fitted Values)
    abs_residuals = np.abs(residuals)
    fig, ax = plt.subplots(figsize=figsize)
    ax.scatter(fitted_values, abs_residuals, alpha=0.5)
    ax.axhline(y=np.mean(abs_residuals), color='r', linestyle='--')
    ax.set(xlabel='Fitted Values', ylabel='Absolute Residuals', title='Scale-Location Plot')
    _save(fig, 'abs_residuals_vs_fitted.png')

## Prototyping on data sample

In [None]:
df = df_sample.copy()

### Naive OLS model

In [None]:
def run_ols(df, formula, n_formula, sample=False):
    """Fit an OLS model and store its residuals and fitted values in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling data. Modified in place: the columns
        'model<n_formula>_ols_residuals' and 'model<n_formula>_ols_fitted_values'
        are added or overwritten.
    formula : str
        Model formula passed to ``smf.ols``.
    n_formula : int or str
        Model number used in column names and output folders.
    sample : bool
        True for prototyping runs: the summary tables are not written to disk.

    Returns
    -------
    The fitted results object. It was previously discarded, so later cells
    could not reuse the model.
    """
    print('Starting OLS modeling')
    ols_model = smf.ols(formula, data=df).fit()
    if not sample:
        output_model(ols_model, n_formula, 'OLS')
    print(ols_model.summary())
    # Store the residuals and fitted values in the original dataframe
    df[f'model{n_formula}_ols_residuals'] = ols_model.resid
    df[f'model{n_formula}_ols_fitted_values'] = ols_model.fittedvalues
    return ols_model

In [None]:
run_ols(df, formula_1, 1, True)
run_ols(df, formula_2, 2, True)
run_ols(df, formula_3, 3, True)
run_ols(df, formula_4, 4, True)

### Random intercept model

In [None]:
def run_mixedlm_ri(df, formula, n_formula, sample=False):
    """Fit a mixed model with a random intercept per 'uuid' (ML, not REML).

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling data with a 'uuid' column. Modified in place: the columns
        'model<n_formula>_ri_residuals', '..._ri_fitted_values' and
        '..._ri_random_intercepts' are added or overwritten.
    formula : str
        Fixed-effects formula passed to ``smf.mixedlm``.
    n_formula : int or str
        Model number used in column names and output folders.
    sample : bool
        True for prototyping runs: the summary tables are not written to disk.

    Returns
    -------
    The fitted results object. It was previously discarded, so later cells
    could not reuse the model.
    """
    print('Starting Mixed effect modeling with Random Intercepts for UUID')
    me_intercept_model = smf.mixedlm(formula, data=df, groups=df['uuid']).fit(reml=False)
    if not sample:
        output_model(me_intercept_model, n_formula, 'RI')
    print(me_intercept_model.summary())

    # One random intercept per group, keyed by uuid
    intercept_by_group = {group: effects['Group'] for group, effects in me_intercept_model.random_effects.items()}

    # Store residuals, fitted values and each row's group intercept in the original dataframe
    df[f'model{n_formula}_ri_residuals'] = me_intercept_model.resid
    df[f'model{n_formula}_ri_fitted_values'] = me_intercept_model.fittedvalues
    df[f'model{n_formula}_ri_random_intercepts'] = df['uuid'].map(intercept_by_group)
    # The linear predictor (fitted values minus random intercept) was computed
    # here but never stored or returned, so that dead assignment was dropped.
    return me_intercept_model

In [None]:
run_mixedlm_ri(df, formula_1, 1, True)
run_mixedlm_ri(df, formula_2, 2, True)
run_mixedlm_ri(df, formula_3, 3, True)
run_mixedlm_ri(df, formula_4, 4, True)

### Random slope model

In [None]:
def run_mixedlm_rs(df, formula, n_formula, sample=False):
    """Fit a mixed model with a random intercept per 'uuid' and a random slope for 'year' (ML, not REML).

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling data with 'uuid' and 'year' columns. Modified in place: the
        columns 'model<n_formula>_rs_residuals', '..._rs_fitted_values' and
        '..._rs_random_intercepts' are added or overwritten.
    formula : str
        Fixed-effects formula passed to ``smf.mixedlm``.
    n_formula : int or str
        Model number used in column names and output folders.
    sample : bool
        True for prototyping runs: the summary tables are not written to disk.

    Returns
    -------
    The fitted results object. It was previously discarded, so later cells
    could not reuse the model.
    """
    print('Starting Mixed effect modeling with Random Intercepts for UUID and Random Slopes for YEAR')
    me_slope_model = smf.mixedlm(formula, data=df,
                                 groups=df['uuid'],
                                 re_formula="~year").fit(reml=False)
    if not sample:
        output_model(me_slope_model, n_formula, 'RS')

    print(me_slope_model.summary())

    # One random intercept per group, keyed by uuid
    intercept_by_group = {group: effects['Group'] for group, effects in me_slope_model.random_effects.items()}

    # Store residuals, fitted values and each row's group intercept in the original dataframe
    df[f'model{n_formula}_rs_residuals'] = me_slope_model.resid
    df[f'model{n_formula}_rs_fitted_values'] = me_slope_model.fittedvalues
    df[f'model{n_formula}_rs_random_intercepts'] = df['uuid'].map(intercept_by_group)
    # An unused "linear predictor" line copied from the random-intercept function
    # read the '..._ri_random_intercepts' column here, which raises KeyError when
    # the RI model has not been run on this dataframe. It was removed.
    return me_slope_model

In [None]:
run_mixedlm_rs(df, formula_1, 1, True)
run_mixedlm_rs(df, formula_2, 2, True)
run_mixedlm_rs(df, formula_3, 3, True)
run_mixedlm_rs(df, formula_4, 4, True)

### Random slope model - More complex

In [None]:
def run_mixedlm_rs_complex(df, formula, n_formula, sample=False):
    """Fit a random-slope mixed model with an extra variance component for 'CDLANGUE' (ML, not REML).

    Groups are 'uuid' with a random intercept and a random slope for 'year';
    'CDLANGUE' enters through ``vc_formula``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling data with 'uuid', 'year' and 'CDLANGUE' columns. Modified in
        place: the columns 'model<n_formula>_rs_complex_residuals',
        '..._rs_complex_fitted_values' and '..._rs_complex_random_intercepts'
        are added or overwritten.
    formula : str
        Fixed-effects formula passed to ``smf.mixedlm``.
    n_formula : int or str
        Model number used in column names and output folders.
    sample : bool
        True for prototyping runs: the summary tables are not written to disk.

    Returns
    -------
    The fitted results object, consistent with the other run_* helpers.
    """
    print('Starting Mixed effect modeling with Random Intercepts for UUID and CDLANGUE and Random Slopes for YEAR')
    # Random intercept and 'year' slope for 'uuid', plus a variance component for 'CDLANGUE'
    me_slope_2_model = smf.mixedlm(formula, df,
                                   groups=df['uuid'],
                                   re_formula='1 + year',
                                   vc_formula={'CDLANGUE': '0 + C(CDLANGUE)'}).fit(reml=False)
    if not sample:
        output_model(me_slope_2_model, n_formula, 'RS2')

    print(me_slope_2_model.summary())

    # One random intercept per group, keyed by uuid
    intercept_by_group = {group: effects['Group'] for group, effects in me_slope_2_model.random_effects.items()}

    # Store residuals, fitted values and each row's group intercept in the original dataframe
    df[f'model{n_formula}_rs_complex_residuals'] = me_slope_2_model.resid
    df[f'model{n_formula}_rs_complex_fitted_values'] = me_slope_2_model.fittedvalues
    df[f'model{n_formula}_rs_complex_random_intercepts'] = df['uuid'].map(intercept_by_group)
    return me_slope_2_model

In [None]:
run_mixedlm_rs_complex(df, formula_1, 1, True)
run_mixedlm_rs_complex(df, formula_2, 2, True)
run_mixedlm_rs_complex(df, formula_3, 3, True)
run_mixedlm_rs_complex(df, formula_4, 4, True)

### Two-part Mixed-Effect Model
#### Part 1 : Logistic Regression for Zero vs Non-Zero Healthcare Spending

In [None]:
# Part-1 formulas of the two-part model: the outcome is the zero/non-zero spending indicator.
# NOTE(review): these use `ssep2`, while formula_3 / formula_4 of the linear models use
# `ssep3` - confirm whether `ssep2` is intended or a typo.
formula_1_part1 = 'non_zero_ihs_cost_aos ~ treatment*year'
formula_2_part1 = 'non_zero_ihs_cost_aos ~ treatment*year + NBAGE + SEX_F'
formula_3_part1 = 'non_zero_ihs_cost_aos ~ treatment*year + NBAGE + SEX_F + cds + ssep2'
formula_4_part1 = 'non_zero_ihs_cost_aos ~ treatment*year + NBAGE + SEX_F + cds + ssep2 + CDLANGUE + D_MEDIC_S + D_MEDIC_B + mean_lst+mean_pm10 + mean_ndvi'

In [None]:
from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM

# Binary indicator: 1 when the transformed cost is strictly positive, else 0.
df['non_zero_ihs_cost_aos'] = (df['ihs_cost_aos'] > 0).astype(int)
# NOTE(review): assumes `data_folder` ('../Data') is the repository's Data directory - confirm.
df.to_csv(data_folder/'processed'/'df_sample.csv', index=False)

# Fit the logistic regression model.
# smf.mixedlm fits a *linear* mixed model and has no `family` argument, so the
# previous call could not produce a logistic fit. Use statsmodels' binomial
# mixed GLM instead, with a random intercept per uuid declared as a variance component.
logistic_model = BinomialBayesMixedGLM.from_formula(
    'non_zero_ihs_cost_aos ~ NBAGE',
    {'uuid': '0 + C(uuid)'},
    df,
).fit_vb()

# Summary of the logistic model
print(logistic_model.summary())

In [None]:
df[df.non_zero_ihs_cost_aos != 1]

## Actual models

In [None]:
df = df_treated_filtered.copy()

### Naive OLS model

In [None]:
run_ols(df, formula_1, 1)
run_ols(df, formula_2, 2)
run_ols(df, formula_3, 3)
run_ols(df, formula_4, 4)

In [None]:
# plot_model_diagnostics('Model1','OLS',df['model1_ols_fitted_values'], df['model1_ols_residuals'], df['model1_random_intercepts'])
# plot_model_diagnostics('Model2','OLS',df['model2_ols_fitted_values'], df['model2_ols_residuals'], df['model2_random_intercepts'])
# plot_model_diagnostics('Model3','OLS',df['model3_ols_fitted_values'], df['model3_ols_residuals'], df['model3_random_intercepts'])
# plot_model_diagnostics('Model4','OLS',df['model4_ols_fitted_values'], df['model4_ols_residuals'], df['model4_random_intercepts'])

### Random intercept model

In [None]:
run_mixedlm_ri(df, formula_1, 1)
run_mixedlm_ri(df, formula_2, 2)
run_mixedlm_ri(df, formula_3, 3)
run_mixedlm_ri(df, formula_4, 4)

In [None]:
# Diagnostic figures for the four random-intercept models.
for model_n in range(1, 5):
    column_prefix = f'model{model_n}_ri'
    plot_model_diagnostics(
        f'Model{model_n}', 'RI',
        df[f'{column_prefix}_fitted_values'],
        df[f'{column_prefix}_residuals'],
        df[f'{column_prefix}_random_intercepts'],
    )

### Random slope model

In [None]:
run_mixedlm_rs(df, formula_1, 1)
run_mixedlm_rs(df, formula_2, 2)
run_mixedlm_rs(df, formula_3, 3)
run_mixedlm_rs(df, formula_4, 4)

In [None]:
# Diagnostic figures for the four random-slope models.
for model_n in range(1, 5):
    column_prefix = f'model{model_n}_rs'
    plot_model_diagnostics(
        f'Model{model_n}', 'RS',
        df[f'{column_prefix}_fitted_values'],
        df[f'{column_prefix}_residuals'],
        df[f'{column_prefix}_random_intercepts'],
    )

### Random slope model - More complex

In [None]:
run_mixedlm_rs_complex(df, formula_1, 1)
run_mixedlm_rs_complex(df, formula_2, 2)
run_mixedlm_rs_complex(df, formula_3, 3)
run_mixedlm_rs_complex(df, formula_4, 4)

In [None]:
# Diagnostic figures for the four complex random-slope models.
for model_n in range(1, 5):
    column_prefix = f'model{model_n}_rs_complex'
    plot_model_diagnostics(
        f'Model{model_n}', 'RS2',
        df[f'{column_prefix}_fitted_values'],
        df[f'{column_prefix}_residuals'],
        df[f'{column_prefix}_random_intercepts'],
    )

## Model comparison

In [None]:
# Compare Models
print('Starting model comparision')
# `ols_model`, `me_intercept_model` and `me_slope_model` were only ever local
# variables inside the run_* helpers, so on a fresh kernel this cell raised
# NameError. Fit the compared models explicitly here.
# NOTE(review): formula_1 is assumed to be the specification being compared - confirm.
# This refits on the current `df`, which may be slow on the full data set.
comparison_formula = formula_1
ols_model = smf.ols(comparison_formula, data=df).fit()
me_intercept_model = smf.mixedlm(comparison_formula, data=df, groups=df['uuid']).fit(reml=False)
me_slope_model = smf.mixedlm(comparison_formula, data=df, groups=df['uuid'], re_formula="~year").fit(reml=False)
models = [('OLS', ols_model), ('Mixed_Intercepts', me_intercept_model), ('Mixed_Slopes', me_slope_model)]
for name, model in models:
    print(f"{name} AIC: {model.aic}")

In [None]:
from scipy.stats import chi2

# Function to calculate Likelihood Ratio Test
def lr_test(model1, model2, dof=None):
    """Likelihood ratio test of a restricted model against a more general, nested one.

    Parameters
    ----------
    model1 : fitted results object
        The restricted (simpler) model; must expose ``llf`` and ``params``.
    model2 : fitted results object
        The more general model; must expose ``llf`` and ``params``.
    dof : int, optional
        Degrees of freedom of the test. Defaults to the difference in the
        number of estimated parameters, ``len(model2.params) - len(model1.params)``.

    Returns
    -------
    (lr, p) : the LR statistic ``2 * (llf2 - llf1)`` and its chi-square
    upper-tail p-value.
    """
    lr = 2 * (model2.llf - model1.llf)
    if dof is None:
        # The previous `model2.df_resid - model1.df_resid` had the wrong sign
        # (the larger model has fewer residual df), which gives a non-positive df.
        # NOTE(review): for mixed models df_resid is understood to reflect only the
        # fixed effects, hence the switch to parameter counts - confirm against
        # the statsmodels documentation.
        dof = len(model2.params) - len(model1.params)
    p = chi2.sf(lr, df=dof)
    return lr, p

# Compare Models
models = [('OLS', ols_model), ('Mixed_Intercepts', me_intercept_model), ('Mixed_Slopes', me_slope_model)]

for name, model in models:
    print(f"{name} AIC: {model.aic}, BIC: {model.bic}")

# Likelihood Ratio Test between mixed models
lr, p = lr_test(me_intercept_model, me_slope_model)
print(f"\nLikelihood Ratio Test between Mixed_Intercepts and Mixed_Slopes: LR = {lr}, p-value = {p}")


In [None]:
# Likelihood ratio statistic of the random-slope model against the random-intercept model.
lr = 2 * (me_slope_model.llf - me_intercept_model.llf)
# Degrees of freedom = number of additional estimated parameters in the slope model.
dof_difference = len(me_slope_model.params) - len(me_intercept_model.params)
# Use the computed degrees of freedom; `df=100` was hard-coded before.
p = chi2.sf(lr, df=dof_difference)

## Model diagnostics

In [None]:
# Example usage of the function with your dataframe 'df':
# plot_model_diagnostics('Model1','OLS',df['model1_ols_fitted_values'], df['model1_ols_residuals'])
plot_model_diagnostics('Model1','RI',df['model1_ri_fitted_values'], df['model1_ri_residuals'], df['model1_ri_random_intercepts'])
# plot_model_diagnostics('Model1','RS',df['model1_rs_fitted_values'], df['model1_rs_residuals'])

## Compute the intraclass correlation coefficient (ICC) for the random-intercept model

In [None]:
# Extract the variance components
variance_components = me_intercept_model.cov_re
group_variance = variance_components.iloc[0, 0]  # Variance due to grouping
# Take the residual variance from the same random-intercept model. `model` was
# the leftover loop variable of the comparison cell (the last model in `models`),
# so the ICC mixed variance components from two different models.
residual_variance = me_intercept_model.scale  # Residual variance

# Calculate ICC
icc = group_variance / (group_variance + residual_variance)

print(f"Intraclass Correlation Coefficient (ICC): {icc}")

In [None]:
lr