In [None]:
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import networkx as nx
from pathlib import Path
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import sys
sys.path.append('/Users/david/Dropbox/PhD/Scripts/Spatial analyses')
import pyspace
import libpysal as lps
from scipy.spatial import cKDTree
from libpysal.weights.distance import get_points_array
from esda import fdr
from importlib import reload
pd.set_option('display.max_rows', 500)
reload(pyspace)
import seaborn as sns
from esda.moran import Moran
# sns.set_theme(font = 'Helvetica')
%matplotlib inline
from numba import NumbaDeprecationWarning
from matplotlib.patheffects import withStroke
import pyogrio
import warnings
import esda
# Suppress NumbaDeprecationWarning
warnings.filterwarnings("ignore", category=NumbaDeprecationWarning)

In [None]:
# Project folders, given relative to the notebook's working directory.
# (The inputs were previously read from Path('../data').)
results_folder = Path('../output')
data_folder = Path('../../SanteIntegra/Data/')

In [None]:
# Read the processed records and build point geometries (EPSG:4326) from the
# masked longitude / latitude columns.
parquet_path = data_folder / 'processed' / 'df_treated_filtered_nominors.parquet.gzip'
data = pd.read_parquet(parquet_path)
masked_points = gpd.points_from_xy(data.lon_masked, data.lat_masked)
data = gpd.GeoDataFrame(data, crs=4326, geometry=masked_points)

# Reproject to EPSG:2056 and keep the projected coordinates as plain columns.
data = data.to_crs(2056)
data['E'] = data['geometry'].x
data['N'] = data['geometry'].y

# Only rows that have a treatment value are analysed.
data_final = data[data.treatment.notnull()]

In [None]:
def _year_subset(frame, year):
    """Return the rows of `frame` with NOANNEE == year, re-wrapped as an EPSG:2056 GeoDataFrame."""
    subset = frame[frame.NOANNEE == year]
    return gpd.GeoDataFrame(subset, crs=2056, geometry=subset['geometry'])


# One GeoDataFrame per year; the individual names are used by later cells.
data_2017, data_2018, data_2019, data_2020, data_2021 = (
    _year_subset(data_final, yr) for yr in (2017, 2018, 2019, 2020, 2021)
)

In [None]:
# Canton and commune boundaries (swissBOUNDARIES3D shapefiles).
# The folder can be overridden with the SWISSBOUNDARIES_DIR environment variable so the
# notebook is not tied to a single machine; the default is the original hard-coded location.
boundaries_dir = Path(os.environ.get(
    'SWISSBOUNDARIES_DIR',
    '/Users/david/Dropbox/PhD/Data/Databases/SITG/SHAPEFILE_LV95_LN02'))
cantons = gpd.read_file(boundaries_dir / 'swissBOUNDARIES3D_1_3_TLM_KANTONSGEBIET.shp')
communes = gpd.read_file(boundaries_dir / 'swissBOUNDARIES3D_1_3_TLM_HOHEITSGEBIET.shp')

# Spatial analyses
## Data as individual points
### Global autocorrelation of main features

- AOS Yearly Spending
- LCA Yearly Spending

In [None]:
# Root folder for every output of the point-level (individual) analyses.
individual_results_folder = results_folder.joinpath('Individual')

In [None]:
# Folder receiving the global autocorrelation figures of the point-level analyses.
# exist_ok=True replaces the separate exists-check, which could race with another process.
globalautocorr_result_folder = individual_results_folder/'Global Autocorrelation'
os.makedirs(globalautocorr_result_folder, exist_ok=True)

In [None]:
def GlobalMoranI(db, col, year, distance, w, result_folder, seed=12345):
    """Compute global Moran's I for one column and save a plot of its simulated distribution.

    Parameters
    ----------
    db : DataFrame-like
        Table holding the variable; ``db[col]`` is passed to ``esda.moran.Moran``.
    col : str
        Name of the column to analyse.
    year, distance
        Only used to label the output (``distance`` is rendered as ``{distance}NN``).
    w : spatial weights object
        Passed unchanged to ``esda.moran.Moran``.
    result_folder : str or pathlib.Path
        Folder receiving the PDF; it is created if it does not exist yet.
    seed : int, default 12345
        Value given to ``np.random.seed`` right before the statistic is computed.

    Returns
    -------
    The ``Moran`` result object, or ``None`` when the PDF already exists
    (in that case nothing is recomputed).
    """
    xlabel = f"Global Moran's I - {col} - {year} - {distance}NN"
    result_folder = Path(result_folder)  # also accept a plain string
    file_path = result_folder / f'{xlabel}.pdf'

    # The saved PDF doubles as a "done" marker: skip all work if it is already there.
    if file_path.exists():
        print(f"File '{file_path}' already exists. Skipping execution.")
        return None

    # Seed NumPy's global RNG first so NumPy-based randomness inside Moran is repeatable.
    y = db[col]
    np.random.seed(seed)
    mi = esda.moran.Moran(y, w)
    print(col, year, distance, mi.I, mi.p_sim, mi.z_sim)

    # Draw on a dedicated figure: relying on pyplot's implicit "current axes" would draw on
    # (and later close) whatever figure another call happened to leave open.
    fig, ax = plt.subplots()
    sns.kdeplot(mi.sim, fill=True, ax=ax)
    marker_height = ax.get_ylim()[1] * 0.1  # short ticks at the bottom of the density
    ax.vlines(mi.I, 0, marker_height, color='r', label='Moran\'s I')
    ax.vlines(mi.EI, 0, marker_height, label='Expected I')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')
    ax.set_title('Moran\'s I Distribution')
    ax.legend()

    # Save, then release the figure so repeated calls do not accumulate open figures.
    result_folder.mkdir(parents=True, exist_ok=True)
    fig.savefig(file_path, dpi=320, bbox_inches='tight')
    plt.close(fig)

    return mi

In [None]:
col_names = ['PRESTATIONS_BRUTES_AOS', 'PRESTATIONS_BRUTES_LCA']
years = [2017, 2018, 2019, 2020, 2021]
nn = 32

yearly_frames = [data_2017, data_2018, data_2019, data_2020, data_2021]

# Build the KNN weights once per year; they only depend on the point locations,
# so every column can reuse them.
weights_by_year = {}
for year, df in zip(years, yearly_frames):
    if year in weights_by_year:
        continue
    points = get_points_array(df.geometry.centroid)
    weights_by_year[year] = lps.weights.KNN(cKDTree(points), nn)

# Global Moran's I for every (column, year) pair, using the stored weights.
for col_name in col_names:
    for year, df in zip(years, yearly_frames):
        w = weights_by_year[year]
        mi = GlobalMoranI(
            db=df,
            col=col_name,
            year=year,
            distance=nn,
            w=w,
            result_folder=globalautocorr_result_folder,
        )

### Local autocorrelation using the Getis-Ord Gi* statistic

In [None]:
# Folder receiving the local autocorrelation figures of the point-level analyses.
# exist_ok=True replaces the separate exists-check, which could race with another process.
localautocorr_result_folder = individual_results_folder/'Local Autocorrelation'
os.makedirs(localautocorr_result_folder, exist_ok=True)

In [None]:
# Getis map for every (column, year) pair, reusing the per-year KNN weights.
yearly_frames = [data_2017, data_2018, data_2019, data_2020, data_2021]
for col_name in col_names:
    for year, df in zip(years, yearly_frames):
        w = weights_by_year[year]
        # The map below reads the f"{col_name}_G_cl" column of df, so compute_getis is
        # expected to have added it — confirm in pyspace.
        getis_values = pyspace.compute_getis(df, col_name, w, 999, transform_type='B', p_001=False)
        fig, ax = pyspace.plotGetisMap(
            df,
            f"{col_name}_G_cl",
            markersize_s=0.01,
            markersize_l=0.1,
            p_001=False,
            commune_name=False,
        )
        xlabel = f"Getis - {col_name} - {year} - {nn}NN"
        file_path = localautocorr_result_folder / f'{xlabel}.png'
        plt.savefig(file_path, dpi=1000, bbox_inches='tight')

In [None]:
# Human-readable labels for the analysed columns.
dict_labels = dict([
    ('PRESTATIONS_BRUTES_AOS', 'Conv. Med. Yearly Spending (CHF)'),
    ('PRESTATIONS_BRUTES_LCA', 'Compl. Med. Yearly Spending (CHF)'),
    ('ihs_cost_lca', 'IHS transformed CAM claims amount'),
])

In [None]:
# Analysis subsets of data_final, one per spending type:
#   AOS       : spending above MTFRANCHISECOUV (presumably the deductible — confirm);
#   LCA / CAM : strictly positive spending.
aos_above_threshold = data_final.PRESTATIONS_BRUTES_AOS > data_final.MTFRANCHISECOUV
has_lca_spending = data_final.PRESTATIONS_BRUTES_LCA > 0
has_cam_spending = data_final.PRESTATIONS_BRUTES_CAM > 0

df_aos_costs = data_final[aos_above_threshold]
df_lca_costs = data_final[has_lca_spending]
df_cam_costs = data_final[has_cam_spending]

### AOS USE

In [None]:
# Median AOS spending per canton (the variable name says "avg", the statistic is the median).
avg_prestation_aos_by_canton = (
    df_aos_costs
    .groupby('CANTON_NAME', observed=True)['PRESTATIONS_BRUTES_AOS']
    .median()
)
# Double argsort gives each canton's rank (http://stackoverflow.com/a/6266510/1628638);
# sorting by that rank lists the cantons in ascending order of their median.
rank = avg_prestation_aos_by_canton.argsort().argsort()
rank = rank.sort_values().index.tolist()
# NOTE(review): `pal` is not passed to the plot below.
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_canton))

fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(
    data=df_aos_costs,
    x='CANTON_NAME',
    y="PRESTATIONS_BRUTES_AOS",
    hue='lang_region',
    order=rank,
    dodge=False,
    showfliers=False,
    ax=ax,
)
plt.xticks(rotation=90)
plt.legend(title='')
sns.despine()
plt.grid(axis='y')
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Annual CM (MHI) expenditures (CHF)', fontsize=12)
plt.savefig(results_folder/'Avg_CM_MHI_by_canton.png', dpi=300, bbox_inches='tight')

In [None]:
# Communes whose names are written on the maps as city labels.
label_cities = ['Lausanne','Genève','Zürich','Basel','Bern']
gdf_names = communes[communes.NAME.isin(label_cities)]

In [None]:
# Keep only the matched rows that carry an EINWOHNERZ value.
gdf_names = gdf_names[gdf_names.EINWOHNERZ.notnull()]

In [None]:
col_name = 'ihs_cost_aos'
nn = 32
# Rows of the AOS subset that have a value for the analysed column.
df = df_aos_costs[df_aos_costs[col_name].notnull()]

# KNN weights on the point coordinates, then the Getis computation and its map.
points = get_points_array(df.geometry.centroid)
w = lps.weights.KNN(cKDTree(points), nn)
getis_values = pyspace.compute_getis(df, col_name, w, 999, transform_type='B', p_001=False)
fig, ax = pyspace.plotGetisMap(
    df,
    f"{col_name}_G_cl",
    markersize_s=0.08,
    markersize_l=1,
    p_001=False,
    commune_name=False,
)

# City names with a white outline so they stay readable over the points.
city_centroids = gdf_names.geometry.centroid
for x, y, label in zip(city_centroids.x, city_centroids.y, gdf_names['NAME']):
    ax.text(
        x, y, label,
        fontsize=8, ha='right', va='bottom', zorder=8,
        path_effects=[withStroke(linewidth=3, foreground='white')],
    )

xlabel = f"Getis - {col_name} - {nn}NN"
file_path = localautocorr_result_folder / f'{xlabel}.png'
plt.savefig(file_path, dpi=640, bbox_inches='tight')

In [None]:
# AOS spending summarised per Getis class (uses `df`, `col_name`, `nn` from the previous cell).
xlabel = f"Getis Bar Plot - {col_name} - {nn}NN"
file_path = Path(localautocorr_result_folder).joinpath(f'{xlabel}.png')
fig, ax = pyspace.plot_getis_by_class(
    df=df,
    x=f'{col_name}_G_cl',
    y='PRESTATIONS_BRUTES_AOS',
    label='Annual CM (MHI) expenditures (CHF)',
    xtick_size=8,
    title_size=12,
    xlabel_size=8,
    ylabel_size=8,
    p_001=False,
    showfliers=False,
)
plt.savefig(file_path, dpi=320, bbox_inches='tight')

### CAM - SI USE

In [None]:
# Median LCA spending per canton (the variable keeps its historical "aos"/"avg" name).
avg_prestation_aos_by_canton = (
    df_lca_costs
    .groupby('CANTON_NAME', observed=True)['PRESTATIONS_BRUTES_LCA']
    .median()
)
# Double argsort gives each canton's rank (http://stackoverflow.com/a/6266510/1628638);
# sorting by that rank lists the cantons in ascending order of their median.
rank = avg_prestation_aos_by_canton.argsort().argsort()
rank = rank.sort_values().index.tolist()
# NOTE(review): `pal` is not passed to the plot below.
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_canton))

fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(
    data=df_lca_costs,
    x='CANTON_NAME',
    y="PRESTATIONS_BRUTES_LCA",
    hue='lang_region',
    order=rank,
    dodge=False,
    showfliers=False,
    ax=ax,
)
plt.xticks(rotation=90)
plt.legend(title='Language regions')
sns.despine()
plt.grid(axis='y')
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Annual CAM (SI) expenditures (CHF)', fontsize=12)
plt.savefig(results_folder/'Avg_CAM_SI_by_canton.png', dpi=300, bbox_inches='tight')

### CAM - MHI USE

In [None]:
# Median CAM spending per canton (the variable keeps its historical "aos"/"avg" name).
avg_prestation_aos_by_canton = (
    df_cam_costs
    .groupby('CANTON_NAME', observed=True)['PRESTATIONS_BRUTES_CAM']
    .median()
)
# Double argsort gives each canton's rank (http://stackoverflow.com/a/6266510/1628638);
# sorting by that rank lists the cantons in ascending order of their median.
rank = avg_prestation_aos_by_canton.argsort().argsort()
rank = rank.sort_values().index.tolist()
# NOTE(review): `pal` is not passed to the plot below.
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_canton))

fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(
    data=df_cam_costs,
    x='CANTON_NAME',
    y="PRESTATIONS_BRUTES_CAM",
    hue='lang_region',
    order=rank,
    dodge=False,
    showfliers=False,
    ax=ax,
)
plt.xticks(rotation=90)
plt.legend(title='Language regions')
sns.despine()
plt.grid(axis='y')
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Annual CAM (MHI) expenditures (CHF)', fontsize=12)
plt.savefig(results_folder/'Avg_CAM_MHI_by_canton.png', dpi=300, bbox_inches='tight')

In [None]:
col_name = 'ihs_cost_cam'
nn = 32
# Rows of the CAM subset that have a value for the analysed column.
df = df_cam_costs[df_cam_costs[col_name].notnull()]

# KNN weights on the point coordinates, then the Getis computation and its map.
points = get_points_array(df.geometry.centroid)
w = lps.weights.KNN(cKDTree(points), nn)
getis_values = pyspace.compute_getis(df, col_name, w, 999, transform_type='B', p_001=False)
fig, ax = pyspace.plotGetisMap(
    df,
    f"{col_name}_G_cl",
    markersize_s=0.08,
    markersize_l=1,
    p_001=False,
    commune_name=False,
)

# City names with a white outline so they stay readable over the points.
city_centroids = gdf_names.geometry.centroid
for x, y, label in zip(city_centroids.x, city_centroids.y, gdf_names['NAME']):
    ax.text(
        x, y, label,
        fontsize=8, ha='right', va='bottom', zorder=8,
        path_effects=[withStroke(linewidth=3, foreground='white')],
    )

xlabel = f"Getis - {col_name} - {nn}NN"
file_path = localautocorr_result_folder / f'{xlabel}.png'
plt.savefig(file_path, dpi=640, bbox_inches='tight')

In [None]:
# CAM spending summarised per Getis class (uses `df`, `col_name`, `nn` from the previous cell).
xlabel = f"Getis Bar Plot - {col_name} - {nn}NN"
file_path = Path(localautocorr_result_folder).joinpath(f'{xlabel}.png')
fig, ax = pyspace.plot_getis_by_class(
    df=df,
    x=f'{col_name}_G_cl',
    y='PRESTATIONS_BRUTES_CAM',
    label='Annual CAM (MHI) expenditures (CHF)',
    xtick_size=8,
    title_size=12,
    xlabel_size=8,
    ylabel_size=8,
    p_001=False,
    showfliers=False,
)
plt.savefig(file_path, dpi=320, bbox_inches='tight')

### LCA USE

In [None]:
col_name = 'ihs_cost_lca'
nn = 32
# Rows of the LCA subset that have a value for the analysed column.
df = df_lca_costs[df_lca_costs[col_name].notnull()]

# KNN weights on the point coordinates, then the Getis computation and its map.
points = get_points_array(df.geometry.centroid)
w = lps.weights.KNN(cKDTree(points), nn)
getis_values = pyspace.compute_getis(df, col_name, w, 999, transform_type='B', p_001=False)
fig, ax = pyspace.plotGetisMap(
    df,
    f"{col_name}_G_cl",
    markersize_s=0.08,
    markersize_l=1,
    p_001=False,
    commune_name=False,
)

# City names with a white outline so they stay readable over the points.
city_centroids = gdf_names.geometry.centroid
for x, y, label in zip(city_centroids.x, city_centroids.y, gdf_names['NAME']):
    ax.text(
        x, y, label,
        fontsize=8, ha='right', va='bottom', zorder=8,
        path_effects=[withStroke(linewidth=3, foreground='white')],
    )
# Optional panel letter: ax.set_title('A', loc = 'left', size= 16)
ax.set_axis_off()  # this map is saved without axes

xlabel = f"Getis - {col_name} - {nn}NN"
file_path = localautocorr_result_folder / f'{xlabel}.png'
plt.savefig(file_path, dpi=640, bbox_inches='tight')

In [None]:
# LCA spending summarised per Getis class (uses `df`, `col_name`, `nn` from the previous cell).
xlabel = f"Getis Bar Plot - {col_name} - {nn}NN"
file_path = Path(localautocorr_result_folder).joinpath(f'{xlabel}.png')
fig, ax = pyspace.plot_getis_by_class(
    df=df,
    x=f'{col_name}_G_cl',
    y='PRESTATIONS_BRUTES_LCA',
    label='Annual CAM (SI) expenditures (CHF)',
    xtick_size=8,
    title_size=12,
    xlabel_size=8,
    ylabel_size=8,
    p_001=False,
    showfliers=False,
)
plt.savefig(file_path, dpi=320, bbox_inches='tight')

### Global autocorrelation of main features - Moran's I

In [None]:
from libpysal.weights import Queen, Rook, KNN
from libpysal.weights import lat2W, higher_order

In [None]:
# Root folder for every output of the aggregated analyses.
agg_results_folder = results_folder.joinpath('Aggregated')

In [None]:
# Folder receiving the global autocorrelation figures of the aggregated analyses.
# exist_ok=True replaces the separate exists-check, which could race with another process.
globalautocorr_agg_result_folder = agg_results_folder/'Global Autocorrelation'
os.makedirs(globalautocorr_agg_result_folder, exist_ok=True)

In [None]:
# Folder receiving the local autocorrelation figures of the aggregated analyses.
# exist_ok=True replaces the separate exists-check, which could race with another process.
localautocorr_agg_result_folder = agg_results_folder/'Local Autocorrelation'
os.makedirs(localautocorr_agg_result_folder, exist_ok=True)

In [None]:
# Caches of spatial weights, filled later as {column name: {year: weights}}:
# one for the plain KNN weights, one for the population-scaled variant.
weights_by_year_h3, weights_by_year_h3_pop_scaled = {}, {}

In [None]:
def get_weights(df, nn, pop_scaled=False):
    """Build k-nearest-neighbour spatial weights for the geometries of ``df``.

    Parameters
    ----------
    df : GeoDataFrame
        Its ``geometry.centroid`` coordinates define the neighbours. When ``pop_scaled`` is
        true it must also have an ``n`` column (presumably the population or number of
        records per unit — confirm), read by position.
    nn : int
        Number of nearest neighbours.
    pop_scaled : bool, default False
        If true, each link i -> j is multiplied by ``n`` of the neighbour j and the result
        is row-standardized.

    Returns
    -------
    ``libpysal.weights.KNN`` when ``pop_scaled`` is false; otherwise a ``libpysal.weights.W``
    with the same neighbours, scaled weights and ``transform == 'R'``.
    """
    print('Calculating weights')
    w = lps.weights.KNN(cKDTree(get_points_array(df.geometry.centroid)), nn)
    if not pop_scaled:
        return w

    print('Scaling weights')
    # Neighbour ids are used as row positions of df, as the original code did with .iloc[j].
    population = df['n'].to_numpy()
    scaled_weights = {
        i: [wij * population[j] for j, wij in zip(neighbors, w.weights[i])]
        for i, neighbors in w.neighbors.items()
    }
    # Build a fresh W from the scaled values and standardize ONCE. Setting
    # `w.transform = 'R'` inside the scaling loop (as before) standardized the weights
    # after the first multiplication only; all later multiplications then altered the
    # cached 'R' table, so the rows no longer summed to 1.
    # Warnings are silenced here because the KNN constructor above already reported them.
    w_scaled = lps.weights.W(w.neighbors, scaled_weights, id_order=w.id_order, silence_warnings=True)
    w_scaled.transform = 'R'
    return w_scaled

In [None]:
# NOTE(review): `h3_500_GM_merged_wfeatures` is not defined anywhere in the visible part of
# this notebook — it has to exist in the kernel already for this cell to run.
col_names = ['PRESTATIONS_BRUTES_AOS', 'PRESTATIONS_BRUTES_LCA', 'cds']  # replace with your actual column names
years = [2017, 2018, 2019, 2020, 2021]
nn = 18

# Weights are built per column AND year because rows with a missing value for the
# column are dropped first. Each (column, year) pair is only computed once.
for col_name in col_names:
    if col_name not in weights_by_year_h3:
        weights_by_year_h3[col_name] = {}
        weights_by_year_h3_pop_scaled[col_name] = {}
    for year in years:
        print(col_name, year)
        df = h3_500_GM_merged_wfeatures[h3_500_GM_merged_wfeatures.NOANNEE == year]
        df_nonull = df[df[col_name].notnull()]
        if year in weights_by_year_h3[col_name]:
            continue
        weights_by_year_h3[col_name][year] = get_weights(df_nonull, nn, pop_scaled=False)
        weights_by_year_h3_pop_scaled[col_name][year] = get_weights(df_nonull, nn, pop_scaled=True)


# Global Moran's I for every (column, year) pair with the plain (unscaled) weights.
for col_name in col_names:
    for year in years:
        df = h3_500_GM_merged_wfeatures[h3_500_GM_merged_wfeatures.NOANNEE == year]
        df_nonull = df[df[col_name].notnull()]
        w = weights_by_year_h3[col_name][year]
        mi = GlobalMoranI(
            db=df_nonull,
            col=col_name,
            year=year,
            distance=nn,
            w=w,
            result_folder=globalautocorr_agg_result_folder,
        )

In [None]:
# Sensitivity of global Moran's I to the number of neighbours, for one column and one year.
# NOTE(review): `h3_500_GM_merged_wfeatures` is not defined in the visible part of this notebook.
year = 2018
col_name = 'PRESTATIONS_BRUTES_AOS'

# The subset and its KD-tree do not depend on nn, so they are built once, outside the loop.
df = h3_500_GM_merged_wfeatures[h3_500_GM_merged_wfeatures.NOANNEE == year]
df_nonull = df[df[col_name].notnull()]
kdtree = cKDTree(get_points_array(df_nonull.geometry.centroid))

for nn in [6, 18, 36, 72, 144, 288]:
    # Weights are rebuilt for every nn (nothing is cached here).
    # Note: `nn` keeps the last value of this list once the loop ends.
    w = lps.weights.KNN(kdtree, nn)
    mi = GlobalMoranI(db=df_nonull, col=col_name, year=year, distance=nn, w=w, result_folder=globalautocorr_agg_result_folder)

### Local autocorrelation with Getis-Ord Gi

In [None]:
from esda import fdr

In [None]:
# Number of nearest neighbours passed to compute_plot_getis and used in the file names
# of the Getis outputs below (overrides the values assigned to `nn` in earlier cells).
nn = 36

In [None]:
def compute_plot_getis(df_merged, col_names, years, nn, pop_scaled, result_folder):
    """Run the Getis analysis per column and year, save the maps, and return the annotated frames.

    For each ``(col_name, year)`` pair, the rows of ``df_merged`` with ``NOANNEE == year`` and
    a non-null ``col_name`` are selected, weights are built with ``get_weights`` and handed to
    ``pyspace.compute_getis``. Two maps are written to ``result_folder`` (each one is skipped
    when its PNG already exists): the classification ``{col_name}_G_cl`` and a variant
    ``{col_name}_G_cl_fdr`` in which rows failing the FDR cut-off are labelled
    ``'Not significant'``.

    Parameters
    ----------
    df_merged : DataFrame
        Needs a ``NOANNEE`` column and the columns in ``col_names``.
    col_names, years : iterables
        Columns and years to process.
    nn : int
        Number of nearest neighbours for the weights.
    pop_scaled : bool
        If true, use population-scaled weights and ``transform_type='R'``; otherwise plain
        weights and ``transform_type='B'``.
    result_folder : str or pathlib.Path
        Output folder; created (with parents) if it does not exist.

    Returns
    -------
    dict
        ``{col_name: {year: frame}}`` where ``frame`` is the filtered copy carrying the Getis
        columns and ``{col_name}_G_cl_fdr``.
    """
    # Without this, savefig fails for folders nobody created (e.g. the
    # 'Population Scaled weights' sub-folder, which the notebook never makes).
    result_folder = Path(result_folder)
    result_folder.mkdir(parents=True, exist_ok=True)

    transform_type = 'R' if pop_scaled else 'B'

    df_getis = {}
    for col_name in col_names:
        df_getis[col_name] = {}
        for year in years:
            df = df_merged[df_merged.NOANNEE == year]
            # Work on an independent copy: columns are added below, and assigning to a
            # filtered slice triggers pandas' SettingWithCopyWarning.
            df_nonull = df[df[col_name].notnull()].copy()

            w = get_weights(df_nonull, nn, pop_scaled=pop_scaled)
            # compute_getis is expected to add the f"{col_name}_G_cl" and f"{col_name}_G_psim"
            # columns to df_nonull (both are read below) — confirm in pyspace.
            getis_values = pyspace.compute_getis(
                df_nonull, col_name, w, 9999,
                star=False, transform_type=transform_type, p_001=True,
            )

            xlabel = f"Getis - {col_name} - {year} - {nn}NN"
            file_path = result_folder / f'{xlabel}.png'
            if not file_path.exists():
                fig, ax = pyspace.plotGetisMap(df_nonull, f"{col_name}_G_cl", markersize_s=0.01, markersize_l=0.1, p_001=True, commune_name=False)
                plt.savefig(file_path, dpi=1000, bbox_inches='tight')

            # FDR adjustment: rows whose simulated p-value is at or above the cut-off
            # returned by esda's fdr() are relabelled as not significant.
            fdr_pvalue = fdr(getis_values.p_sim, 0.05)
            df_nonull[f"{col_name}_G_cl_fdr"] = df_nonull[f"{col_name}_G_cl"]
            df_nonull.loc[df_nonull[f"{col_name}_G_psim"] >= fdr_pvalue, f"{col_name}_G_cl_fdr"] = 'Not significant'

            xlabel_fdr = f"Getis - {col_name} - {year} - {nn}NN_fdr"
            file_path_fdr = result_folder / f'{xlabel_fdr}.png'
            if not file_path_fdr.exists():
                fig, ax = pyspace.plotGetisMap(df_nonull, f"{col_name}_G_cl_fdr", markersize_s=0.01, markersize_l=0.1, p_001=True, commune_name=False)
                plt.savefig(file_path_fdr, dpi=1000, bbox_inches='tight')

            df_getis[col_name][year] = df_nonull

    return df_getis

In [None]:
# Binary weighting matrix
df_getis_results = compute_plot_getis(
    df_merged=h3_500_GM_merged_wfeatures,
    col_names=col_names,
    years=years,
    nn=nn,
    pop_scaled=False,
    result_folder=localautocorr_agg_result_folder,
)

In [None]:
# Population scaled weights (more populated h3 have more weights) then row-standardized
df_getis_results_pop_scaled = compute_plot_getis(
    df_merged=h3_500_GM_merged_wfeatures,
    col_names=col_names,
    years=years,
    nn=nn,
    pop_scaled=True,
    result_folder=localautocorr_agg_result_folder/'Population Scaled weights',
)

In [None]:
# Human-readable labels for the analysed columns (now including the CDS).
dict_labels = dict([
    ('PRESTATIONS_BRUTES_AOS', 'Conv. Med. Yearly Spending (CHF)'),
    ('PRESTATIONS_BRUTES_LCA', 'Compl. Med. Yearly Spending (CHF)'),
    ('cds', 'Chronic Disease Score (CDS)'),
    ('ihs_cost_lca', 'IHS transformed CAM claims amount'),
])

### Avg by Getis class - Bar plots

In [None]:
# One class plot per (column, year) from the binary-weight Getis results.
output_dir = Path(localautocorr_agg_result_folder)
for col_name in col_names:
    class_col = f'{col_name}_G_cl'
    for year in years:
        xlabel = f"Getis Bar Plot - {col_name} - {year} - {nn}NN"
        file_path = output_dir / f'{xlabel}.png'

        getis_frame = df_getis_results[col_name][year]
        # The four trailing numbers are passed positionally, exactly as before.
        fig, ax = pyspace.plot_getis_by_class(
            getis_frame, class_col, col_name, dict_labels[col_name],
            8, 12, 8, 8,
            p_001=True, showfliers=False,
        )
        plt.savefig(file_path, dpi=320, bbox_inches='tight')

In [None]:
# One class plot per (column, year) from the population-scaled Getis results.
# The sub-folder is created here because no other cell of the notebook creates it,
# and savefig does not create missing directories.
pop_scaled_folder = Path(localautocorr_agg_result_folder/'Population Scaled weights')
pop_scaled_folder.mkdir(parents=True, exist_ok=True)

for col_name in col_names:
    for year in years:
        xlabel = f"Getis Bar Plot - {col_name} - {year} - {nn}NN"
        file_path = pop_scaled_folder / f'{xlabel}.png'

        fig, ax = pyspace.plot_getis_by_class(df_getis_results_pop_scaled[col_name][year],f'{col_name}_G_cl', col_name, dict_labels[col_name], 8, 12, 8, 8, p_001=True, showfliers = False)
        plt.savefig(file_path, dpi=320, bbox_inches='tight')