In [None]:
import pandas as pd
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import networkx as nx
from pathlib import Path
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import sys
sys.path.append('/Users/david/Dropbox/PhD/Scripts/Spatial analyses')
import pyspace
import libpysal as lps
from scipy.spatial import cKDTree
from libpysal.weights.distance import get_points_array
from esda import fdr
from importlib import reload
pd.set_option('display.max_rows', 500)
reload(pyspace)
import seaborn as sns
from esda.moran import Moran
# sns.set_theme(font = 'Helvetica')
%matplotlib inline
from numba import NumbaDeprecationWarning
import warnings
# Suppress NumbaDeprecationWarning
warnings.filterwarnings("ignore", category=NumbaDeprecationWarning)

In [None]:
# Project directories, all expressed relative to the notebook's location.
data_folder = Path('../data/')
main_folder = Path('../manuscript/')
results_folder = main_folder/'output'
# `output_folder` is read by later cells (Getis HDF store, adjusted Getis map,
# network figures) but was never assigned. '../output/' matches the hard-coded
# '../output/Getis/' prefix used when the Getis maps are written.
output_folder = Path('../output/')

In [None]:
cantons = gpd.read_file(data_folder/'raw/Linkage/swissBOUNDARIES3D_1_3_TLM_KANTONSGEBIET.shp', engine='pyogrio')
cantons['dummies'] = 1
ch_polygon = cantons.dissolve('dummies')

In [None]:
ch_polygon.to_file(data_folder/'raw/CH/ch_contour.shp', engine='pyogrio')
ch_polygon = ch_polygon.to_crs(2056)
ch_polygon.to_file(data_folder/'raw/CH/ch_contour_2056.shp', engine='pyogrio')

In [None]:
# Load the two LCA dispenser tables (the `nodupli` and `exploded` variants) from parquet files.
# NOTE(review): the second file name uses 'dispensateur' (singular) while the first uses
# 'dispensateurs' — confirm this matches the file on disk.
df_dispensateurs_lca_nodupli = pd.read_parquet(
    data_folder/'processed'/'df_dispensateurs_lca_nodupli.parquet.gzip')

df_dispensateurs_lca_exploded = pd.read_parquet(data_folder/'processed'/'df_dispensateur_lca_exploded.parquet.gzip')

## Import LCA data

In [None]:
# NOTE(review): this file name ends in '.parquet_gzip' (underscore) whereas the other
# inputs in this notebook end in '.parquet.gzip' — confirm the name on disk.
df_prestation_lca = pd.read_parquet(data_folder/'processed'/'df_prestation_lca_processed.parquet_gzip')

In [None]:
df_prestation_lca_1spe = df_prestation_lca[df_prestation_lca.n_therapies == 1].explode('THERAPIES_SIMPLIFIED_SET')
df_prestation_lca_exploded = df_prestation_lca.explode('THERAPIES_SIMPLIFIED_SET')

In [None]:
(df_prestation_lca_1spe.groupby(['NOANNEE','THERAPIES_SIMPLIFIED_SET']).uuid.nunique()/df_prestation_lca_1spe.groupby(['NOANNEE']).uuid.nunique()).mul(100)

In [None]:
prev_therapies_by_year = (df_prestation_lca_exploded.groupby(['NOANNEE','THERAPIES_SIMPLIFIED_SET']).uuid.nunique()/df_prestation_lca_exploded.groupby(['NOANNEE']).uuid.nunique()).mul(100)

In [None]:
prev_therapies_by_year = prev_therapies_by_year.reset_index()

In [None]:
prev_therapies_by_year[prev_therapies_by_year.NOANNEE == 2021].sort_values('uuid').tail(40).set_index('THERAPIES_SIMPLIFIED_SET')['uuid'].plot.bar()

## Analyses Getis - Outcomes principaux

In [None]:
# Load the filtered table and convert it to a point GeoDataFrame.
# The path segments must not start with '/': with pathlib, joining an absolute
# segment discards `data_folder` and resolves the file from the filesystem root.
df_treated_filtered = pd.read_parquet(data_folder/'processed'/'df_treated_filtered.parquet.gzip')
# Points are built from the masked lon/lat columns, declared as EPSG:4326 ...
df_treated_filtered = gpd.GeoDataFrame(
    df_treated_filtered,
    crs=4326,
    geometry=gpd.points_from_xy(df_treated_filtered.lon_masked, df_treated_filtered.lat_masked),
)
# ... then reprojected to EPSG:2056, the CRS used for every spatial computation below.
df_treated_filtered = df_treated_filtered.to_crs(2056)

In [None]:
df_treated_filtered['CAREMODEL'] = df_treated_filtered.filter(regex='MODEL_').idxmax(axis=1).str.replace('MODEL_', '')

In [None]:
canton_to_language = {
    'Zürich': 'German',
    'Bern': 'German',
    'Luzern': 'German',
    'Uri': 'German',
    'Schwyz': 'German',
    'Obwalden': 'German',
    'Nidwalden': 'German',
    'Glarus': 'German',
    'Zug': 'German',
    'Fribourg': 'Mix',
    'Solothurn': 'German',
    'Basel-Stadt': 'German',
    'Basel-Landschaft': 'German',
    'Schaffhausen': 'German',
    'Appenzell Ausserrhoden': 'German',
    'Appenzell Innerrhoden': 'German',
    'St. Gallen': 'German',
    'Graubünden': 'German', # Note: Graubünden is trilingual with German, Romansh and Italian communities
    'Aargau': 'German',
    'Thurgau': 'German',
    'Ticino': 'Italian',
    'Vaud': 'French',
    'Valais': 'Mix', # Note: Valais is bilingual with French and German communities
    'Neuchâtel': 'French',
    'Genève': 'French',
    'Jura': 'French',
}

In [None]:
df_treated_filtered['lang_region'] = df_treated_filtered['CANTON_NAME'].map(canton_to_language)

In [None]:
def compute_and_save_getis(data, outcome, weights, year, store, threshold=0.05):
    """Run the Getis analysis for one outcome/year, save its map and store its table.

    Parameters
    ----------
    data : GeoDataFrame
        Observations to analyse. It is modified in place: the column
        ``{outcome}_G_cl_fdr`` is added here, and ``pyspace.compute_getis`` is
        expected to add ``{outcome}_G_cl`` and ``{outcome}_G_psim`` (both are
        read right after the call).
    outcome : str
        Name of the column to analyse.
    weights
        Spatial weights object forwarded to ``pyspace.compute_getis``.
    year
        Label used in the PNG file name and in the store key.
    store
        Mapping-like store (a ``pandas.HDFStore`` in this notebook) that receives
        ``data`` without its geometry under ``df_getis_{outcome}_{year}``.
    threshold : float, default 0.05
        Level passed to ``esda.fdr`` to derive the FDR-adjusted p-value cut-off.
    """
    # 999 is presumably the number of permutations — see pyspace.compute_getis.
    getis_values = pyspace.compute_getis(data, outcome, weights, 999, transform_type='B', p_001=False)
    fdr_pvalue = fdr(getis_values.p_sim, threshold)

    # FDR-corrected classes: same as the raw classes, except that observations whose
    # simulated p-value is not below the FDR cut-off are relabelled.
    cluster_col = f'{outcome}_G_cl'
    data[f'{outcome}_G_cl_fdr'] = data[cluster_col]
    data.loc[data[f'{outcome}_G_psim'] >= fdr_pvalue, f'{outcome}_G_cl_fdr'] = 'Not significant'

    # Plot and save.
    # NOTE(review): the map shows the uncorrected classes (`_G_cl`); the FDR-corrected
    # column is only stored — confirm this is intended.
    fig, ax = pyspace.plotGetisMap(data, cluster_col, markersize_s=0.01, markersize_l=0.1, p_001=False, commune_name=False)
    file_path = f'../output/Getis/{outcome}_{year}.png'
    fig.savefig(file_path, dpi=480, bbox_inches='tight')
    # This function is called once per outcome and year; without closing, every
    # high-resolution figure stays alive in memory for the whole run.
    plt.close(fig)

    store[f'df_getis_{outcome}_{year}'] = data.drop('geometry', axis=1)

def main():
    """Run the Getis analysis for every outcome and every year in ``df_treated_filtered``.

    For each year, a 32-nearest-neighbour weights object is built from that year's
    points, and ``compute_and_save_getis`` is called for every outcome whose map
    PNG does not exist yet (the PNG acts as the "already computed" marker).

    Returns
    -------
    dict
        Mapping ``year -> KNN weights`` built for that year.
    """
    outcomes = ['PRESTATIONS_TOTAL','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_AOS','cds','Methodes de massage_amount','Methodes energetiques_amount','Methodes occidentales_amount','Methodes orientales_amount','Methodes hydrotherapeutiques_amount']
    _weights = {}

    # Same directory as the one compute_and_save_getis writes its PNGs to.
    getis_dir = Path('../output/Getis')
    getis_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes the HDF store even if a computation fails midway.
    with pd.HDFStore(getis_dir/'dfs_getis.h5') as store:
        for year in df_treated_filtered.NOANNEE.unique():
            # .copy(): compute_and_save_getis adds columns to the frame it receives,
            # which must not be a view on df_treated_filtered.
            data_year = (
                df_treated_filtered[df_treated_filtered.NOANNEE == year]
                .set_index('uuid')[outcomes + ['E', 'N', 'geometry']]
                .copy()
            )

            wnn32 = lps.weights.KNN(cKDTree(get_points_array(data_year.geometry.centroid)), 32)
            _weights[year] = wnn32

            for outcome in outcomes:
                file_path = getis_dir/f'{outcome}_{year}.png'
                if file_path.is_file():
                    # Already computed in a previous run; the table is in the store.
                    # `data_year` is deliberately NOT replaced by the stored table:
                    # that table has no geometry and would break the remaining outcomes.
                    continue
                compute_and_save_getis(data_year, outcome, wnn32, year, store)
    return _weights

if __name__ == "__main__":
    weights_dict = main()

In [None]:
store = pd.HDFStore(output_folder/'Getis/dfs_getis.h5')
outcome = 'PRESTATIONS_BRUTES_AOS'
year='2017'
df_treated_filtered_year = store[f'df_getis_{outcome}_{str(year)}']

In [None]:
pyspace.plot_getis_by_class(df_treated_filtered_year,'PRESTATIONS_BRUTES_AOS_G_cl', 'PRESTATIONS_BRUTES_AOS', 'Average AOS spending amount (CHF) in 2017', 8, 12, 8, 8, p_001=False,showfliers = False)

In [None]:
# NOTE(review): this loads the LCA results into `data_final_year`, but the plotting
# cell that follows reads `df_treated_filtered_year` (loaded earlier for
# PRESTATIONS_BRUTES_AOS) — confirm which frame is intended.
outcome = 'PRESTATIONS_BRUTES_LCA'
year='2017'
data_final_year = store[f'df_getis_{outcome}_{str(year)}']

In [None]:
pyspace.plot_getis_by_class(df_treated_filtered_year,'PRESTATIONS_BRUTES_LCA_G_cl', 'PRESTATIONS_BRUTES_LCA', 'Average LCA spending amount (CHF) in 2017', 8, 12, 8, 8, p_001=False,showfliers = False)

In [None]:
outcome = 'PRESTATIONS_BRUTES_LCA'
year=2019
df_treated_filtered_year = store[f'df_getis_{outcome}_{str(year)}']

In [None]:
pyspace.plot_getis_by_class(df_treated_filtered_year,'PRESTATIONS_BRUTES_LCA_G_cl', 'PRESTATIONS_BRUTES_LCA', 'Avg CM spending amount (CHF) in 2019', 8, 12, 8, 8, p_001=False,showfliers = False)

## Adjustment models and Getis on residuals

In [None]:
outcome = 'PRESTATIONS_BRUTES_LCA'
year='2017'
data_final_year = store[f'df_getis_{outcome}_{str(year)}']

In [None]:
def _geo_subset_for_year(year):
    """Return the rows of df_treated_filtered for `year` as a GeoDataFrame in EPSG:2056."""
    subset = df_treated_filtered[df_treated_filtered.NOANNEE == year]
    return gpd.GeoDataFrame(subset, crs = 2056, geometry = subset['geometry'])


# One GeoDataFrame per study year.
data_2017, data_2018, data_2019, data_2020, data_2021 = (
    _geo_subset_for_year(study_year) for study_year in (2017, 2018, 2019, 2020, 2021)
)

In [None]:
data_2018.CAREMODEL.unique()

In [None]:
data_2018 = data_2018[data_2018.CANTON_ACRONYM.isnull()==False]

In [None]:
chronic_diseases = data_2018.filter(regex='_PCG').columns.tolist()

In [None]:
import statsmodels.api as sm

# Step 1: Prepare the data
X = data_2018[['cds','SEX_F','NBAGE','ssep3','CANTON_ACRONYM','E','N']]  # Covariates
y = data_2018['PRESTATIONS_BRUTES_AOS']  # Dependent variable

formula = "PRESTATIONS_BRUTES_AOS ~  CDPHYSSEXE + CANTON_ACRONYM + NBAGE + CAREMODEL + CDLANGUE + MTFRANCHISECOUV"
model = sm.formula.ols(formula, data=data_2018).fit()
print(model.summary())

In [None]:
data_2018['residuals'] = model.resid

In [None]:
wnn32 = lps.weights.KNN(cKDTree(get_points_array(data_2018.geometry.centroid)), 32)

In [None]:
getis_values = pyspace.compute_getis(data_2018, 'residuals', wnn32, 999, transform_type='B', p_001=False)

In [None]:
fig, ax = pyspace.plotGetisMap(data_2018, 'residuals_G_cl', markersize_s=0.01, markersize_l=0.1, p_001=False, commune_name=False)
file_path = output_folder/'Getis/PRESTATIONS_BRUTES_AOS_2018_adjusted.png'
plt.savefig(file_path, dpi=1000, bbox_inches='tight')

## Claims amount by sex

In [None]:
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered, y="PRESTATIONS_BRUTES_AOS",x = 'CDPHYSSEXE',showfliers = False, dodge = False, ax = ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered, y="PRESTATIONS_BRUTES_LCA",x = 'CDPHYSSEXE',showfliers = False, dodge = False, ax = ax)
plt.show()

## Claims amount by canton / region

In [None]:
# Canton order on the x axis: ascending mean of the plotted variable (`treatment`).
# The order was previously derived from PRESTATIONS_BRUTES_LCA, which does not match
# the bars; the sibling `treatment_cam_only` cell ranks by its own plotted column.
avg_prestation_lca_by_canton = df_treated_filtered.groupby('CANTON_NAME')['treatment'].mean()
rank = avg_prestation_lca_by_canton.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_lca_by_canton))
fig, ax = plt.subplots(figsize = (8,8))
sns.barplot(data=df_treated_filtered, y="treatment",x = 'CANTON_NAME',hue = 'lang_region', dodge = False, order = rank, ax = ax)
plt.legend(title='')
plt.xticks(rotation = 90)
sns.despine()
plt.xlabel('Cantons', fontsize=12)
# NOTE(review): the label says "claims amount (CHF)" but the bars show the mean of
# `treatment` and the file is named "Prevalence_..." — confirm the intended label.
plt.ylabel('Annual CAM claims amount (SI) (CHF)', fontsize=12)
plt.grid(axis = 'y')
# NOTE(review): 'Figures_rapport' is capitalised here but 'figures_rapport' elsewhere;
# these are different directories on a case-sensitive filesystem.
plt.savefig(results_folder/'Figures_rapport/Prevalence_CAM_SI_by_canton.png', dpi = 300, bbox_inches='tight')

In [None]:
avg_prestation_lca_by_canton = df_treated_filtered.groupby('CANTON_NAME')['PRESTATIONS_BRUTES_LCA'].mean()
rank = avg_prestation_lca_by_canton.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_lca_by_canton))
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered, y="PRESTATIONS_BRUTES_LCA",x = 'CANTON_NAME',hue = 'lang_region',showfliers = False, dodge = False, order = rank, ax = ax)
# ax.set_ylim(0,5000)
plt.legend(title='')
plt.xticks(rotation = 90)
sns.despine()
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Annual CAM claims amount (SI) (CHF)', fontsize=12)
plt.grid(axis = 'y')
plt.savefig(results_folder/'Figures_rapport/Avg_CAM_SI_by_canton.png', dpi = 300, bbox_inches='tight')

In [None]:
avg_prestation_aos_by_canton = df_treated_filtered.groupby('CANTON_NAME')['PRESTATIONS_BRUTES_AOS'].median()
rank = avg_prestation_aos_by_canton.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_canton))
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered, y="PRESTATIONS_BRUTES_AOS",x = 'CANTON_NAME',hue = 'lang_region',showfliers = False,dodge = False, order = rank, ax = ax)
# ax.set_ylim(0,5000)
plt.xticks(rotation = 90)
plt.legend(title='')
sns.despine()
plt.grid(axis = 'y')
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Annual CM claims amount (MHI) (CHF)', fontsize=12)
plt.savefig(results_folder/'figures_rapport/Avg_CM_MHI_by_canton.png', dpi = 300, bbox_inches='tight')

In [None]:
avg_prestation_lca_by_canton = df_treated_filtered.groupby('CANTON_NAME')['treatment_cam_only'].mean()
rank = avg_prestation_lca_by_canton.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_lca_by_canton))
fig, ax = plt.subplots(figsize = (8,8))
sns.barplot(data=df_treated_filtered, y="treatment_cam_only",x = 'CANTON_NAME',hue = 'lang_region', dodge = False, order = rank, ax = ax)
# ax.set_ylim(0,5000)
plt.legend(title='')
plt.xticks(rotation = 90)
sns.despine()
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Annual CAM claims amount (SI) (CHF)', fontsize=12)
plt.grid(axis = 'y')
plt.savefig(results_folder/'figures_rapport/Prevalence_CAM_MHI_by_canton.png', dpi = 300, bbox_inches='tight')

In [None]:
avg_prestation_cam_by_canton = df_treated_filtered.groupby('CANTON_NAME')['PRESTATIONS_BRUTES_CAM'].mean()
rank = avg_prestation_cam_by_canton.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_cam_by_canton))
fig, ax = plt.subplots(figsize = (8,8))
sns.barplot(data=df_treated_filtered, y="PRESTATIONS_BRUTES_CAM",x = 'CANTON_NAME',hue = 'lang_region',dodge = False, order = rank, ax = ax)
# ax.set_ylim(0,5000)
plt.legend(title='', loc = 'upper left')
plt.xticks(rotation = 90)
sns.despine()
plt.grid(axis = 'y')
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Annual CAM claims amount (MHI) (CHF)', fontsize=12)
plt.savefig(results_folder/'figures_rapport/Avg_CAM_MHI_by_canton.png', dpi = 300, bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize = (8,8))
ordered_labels = df_treated_filtered.groupby('CANTON_NAME')['PRESTATIONS_BRUTES_LCA'].median().sort_values().index
# Plot the orbital period with horizontal boxes
g = sns.boxplot(data=df_treated_filtered, x="PRESTATIONS_BRUTES_LCA",y = 'CANTON_NAME', order = ordered_labels, showfliers=False,width=.6, palette="vlag", ax = ax)
# ax.set_xlim(0,5000)
g.set_xlabel('Annual CAM claims (SI)')
g.set_ylabel('Canton')
plt.savefig(results_folder/'figures_rapport/Avg_CAM_SI_by_canton_cmap.png', dpi = 300, bbox_inches='tight')

## Claims amount by age group

In [None]:
fig, ax = plt.subplots(figsize = (8,8))

g = sns.boxplot(data=df_treated_filtered, x="PRESTATIONS_BRUTES_LCA",y = 'age_group', width=.6, palette="vlag", showfliers=False, ax = ax)
g.set_xlabel('Annual CAM claims amount (SI) (CHF) ')
g.set_ylabel('Age group')
plt.savefig(results_folder/'figures_rapport/Avg_CAM_SI_by_age_group_cmap.png', dpi = 300, bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize = (8,8))

g = sns.boxplot(data=df_treated_filtered, x="PRESTATIONS_BRUTES_AOS",y = 'age_group', width=.6, palette="vlag", showfliers=False, ax = ax)
g.set_xlabel('Annual CM claims amount (MHI) (CHF) ')
g.set_ylabel('Age group')
plt.savefig(results_folder/'figures_rapport/Avg_CM_MHI_by_age_group_cmap.png', dpi = 300, bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize = (8,8))

g = sns.barplot(data=df_treated_filtered, x="PRESTATIONS_BRUTES_CAM",y = 'age_group', width=.6, palette="vlag", ax = ax)
g.set_xlabel('Annual CAM claims amount (MHI) (CHF) ')
g.set_ylabel('Age group')
plt.savefig(results_folder/'figures_rapport/Avg_CAM_MHI_by_age_group_cmap.png', dpi = 300, bbox_inches='tight')

### Prop of minors by canton

In [None]:
dict_region_colors = {'German': '#55a868', 'French': '#4c72b0', 'Italian': '#c44e52', 'Mix': '#dd8452'}

In [None]:
df_treated_filtered['is_minor'] = df_treated_filtered['NBAGE'].apply(lambda x: 1 if x < 19 else 0)
counts = df_treated_filtered.groupby('CANTON_NAME')['is_minor'].agg(['sum', 'count'])
counts['proportion_of_minors'] =( counts['sum'] / counts['count']) *100
counts['LANG_REGION'] = counts.index.map(canton_to_language).map(dict_region_colors)
# Plotting the proportions
plt.figure(figsize=(12,8))
counts['proportion_of_minors'].sort_values().plot(kind='bar', color = 'grey')
plt.ylabel('Percentage of age 0 to 18')
plt.xlabel('Canton')

# plt.title('Proportion of minors by canton')
plt.savefig(results_folder/'figures_rapport/Perc_minors_by_canton.png', dpi = 300, bbox_inches='tight')

### Avg age by canton

In [None]:
avg_age_by_canton = df_treated_filtered.groupby('CANTON_NAME')['NBAGE'].mean()
rank = avg_age_by_canton.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_age_by_canton))
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered, y="NBAGE",x = 'CANTON_NAME',hue = 'lang_region',showfliers = False,dodge = False, order = rank, ax = ax)
# ax.set_ylim(0,5000)
plt.xticks(rotation = 90)
sns.despine()
plt.grid(axis = 'y')
plt.xlabel('Cantons', fontsize=12)
plt.ylabel('Average age', fontsize=12)
plt.show()

## Claims by franchise amount
### For minors

In [None]:
avg_prestation_aos_by_franchise = df_treated_filtered[df_treated_filtered.NBAGE <= 18].groupby('MTFRANCHISECOUV')['PRESTATIONS_BRUTES_AOS'].mean()
rank = avg_prestation_aos_by_franchise.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_franchise))
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered[df_treated_filtered.NBAGE <= 18], y="PRESTATIONS_BRUTES_AOS",x = 'MTFRANCHISECOUV',color = '#4E79A7', showfliers = False,dodge = False, ax = ax)
# ax.set_ylim(0,5000)
plt.xticks(rotation = 90)
sns.despine()
plt.grid(axis = 'y')
plt.ylabel('Annual CM claims amount (MHI) (CHF)', fontsize=12)
plt.xlabel('Annual deductible amount (CHF)', fontsize=12)
plt.savefig(results_folder/'figures_rapport/Avg_CM_MHI_by_franchise_minors.png', dpi = 300, bbox_inches='tight')

### For adults

In [None]:
avg_prestation_aos_by_franchise = df_treated_filtered[df_treated_filtered.NBAGE > 18].groupby('MTFRANCHISECOUV')['PRESTATIONS_BRUTES_AOS'].mean()
rank = avg_prestation_aos_by_franchise.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_franchise))
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered[df_treated_filtered.NBAGE > 18], y="PRESTATIONS_BRUTES_AOS",x = 'MTFRANCHISECOUV',color = '#4E79A7',showfliers = False, dodge = False, ax = ax)
# ax.set_ylim(0,5000)
plt.xticks(rotation = 90)
sns.despine()
plt.grid(axis = 'y')
plt.xlabel('Annual deductible amount (CHF)', fontsize=12)
plt.ylabel('Annual CM claims amount (MHI) (CHF)', fontsize=12)
# plt.title('')
plt.savefig(results_folder/'figures_rapport/Avg_CM_MHI_by_franchise_adults.png', dpi = 300, bbox_inches='tight')

In [None]:
avg_prestation_aos_by_franchise = df_treated_filtered[df_treated_filtered.NBAGE > 18].groupby('MTFRANCHISECOUV')['PRESTATIONS_BRUTES_LCA'].mean()
rank = avg_prestation_aos_by_franchise.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_franchise))
fig, ax = plt.subplots(figsize = (8,8))
sns.barplot(data=df_treated_filtered[df_treated_filtered.NBAGE > 18], y="PRESTATIONS_BRUTES_LCA",x = 'MTFRANCHISECOUV',color = '#F28E2B', dodge = False, ax = ax)
# ax.set_ylim(0,5000)
plt.xticks(rotation = 90)
sns.despine()
plt.grid(axis = 'y')
plt.xlabel('Annual deductible amount (CHF)', fontsize=12)
plt.ylabel('Annual CAM claims amount (SI) (CHF)', fontsize=12)
# plt.title('')
plt.savefig(results_folder/'figures_rapport/Avg_CAM_SI_by_franchise_adults.png', dpi = 300, bbox_inches='tight')

## Claims by model type

In [None]:
avg_prestation_lca_by_model = df_treated_filtered.groupby('CAREMODEL')['PRESTATIONS_BRUTES_LCA'].mean()
rank = avg_prestation_lca_by_model.argsort().argsort()   # http://stackoverflow.com/a/6266510/1628638
rank = rank.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_lca_by_model))
fig, ax = plt.subplots(figsize = (8,8))
sns.boxplot(data=df_treated_filtered, y="PRESTATIONS_BRUTES_LCA",x = 'CAREMODEL',showfliers = False, dodge = False, order = rank, ax = ax)
# ax.set_ylim(0,5000)
plt.xticks(rotation = 90)
sns.despine()
plt.xlabel('Healthcare Models', fontsize=12)
plt.ylabel('Average gross amount (CHF) \nof complementary health insurance claims', fontsize=12)
plt.grid(axis = 'y')
plt.show()

In [None]:
# Distribution of mandatory-insurance (AOS) claims per care model, with the models
# ordered by ascending mean AOS amount.
avg_prestation_aos_by_model = df_treated_filtered.groupby('CAREMODEL')['PRESTATIONS_BRUTES_AOS'].mean()
rank = avg_prestation_aos_by_model.sort_values().index.tolist()
pal = sns.color_palette("Greens", len(avg_prestation_aos_by_model))
fig, ax = plt.subplots(figsize = (8,8))
# The plotted column must be the AOS one: the ranking and the y label both refer to
# mandatory insurance (this cell previously plotted PRESTATIONS_BRUTES_LCA).
sns.boxplot(data=df_treated_filtered, y="PRESTATIONS_BRUTES_AOS",x = 'CAREMODEL',showfliers = False, dodge = False, order = rank, ax = ax)
plt.xticks(rotation = 90)
sns.despine()
plt.xlabel('Healthcare Models', fontsize=12)
plt.ylabel('Average gross amount (CHF) \nof mandatory health insurance claims', fontsize=12)
plt.grid(axis = 'y')
plt.show()

In [None]:
# Share of minors (`is_minor` == 1) within each care model.
counts = df_treated_filtered.groupby('CAREMODEL')['is_minor'].agg(['sum', 'count'])
counts['proportion_of_minors'] = counts['sum'] / counts['count']
plt.figure(figsize=(12,8))
counts['proportion_of_minors'].sort_values().plot(kind='barh')
plt.xlabel('Proportion of Minors')
plt.ylabel('Healthcare model')
# The grouping variable is CAREMODEL, not the canton.
plt.title('Proportion of Minors by Healthcare Model')
plt.show()

In [None]:
test = df_treated_filtered.filter(regex = 'CARE|CANTON|LANG|PRESTATIONS')
test = pd.crosstab(test['CANTON_NAME'], test['CAREMODEL'], normalize='index')
test.sort_values('AH_STD').plot.bar(stacked=True, figsize = (8,8))

In [None]:
test = df_treated_filtered.filter(regex = 'CARE|CANTON|LANG|PRESTATIONS')
test = pd.crosstab(test['CANTON_NAME'], test['CAREMODEL'])
test.sort_values('AH_STD').plot.bar(stacked=True, figsize = (8,8))

In [None]:
test.sort_values('AH_STD').plot.bar(stacked=True, figsize = (8,8))

## Quelles thérapies complémentaires sont les plus "co-pratiquées" par les thérapeutes

In [None]:
therapies_serie = df_dispensateurs_lca_exploded.groupby('ID_DISPENSATEUR').THERAPIES_SIMPLIFIED.apply(set)

In [None]:
node_weights = df_dispensateurs_lca_exploded['THERAPIES_SIMPLIFIED'].value_counts().to_dict()

In [None]:
import itertools
from collections import Counter

# Count, over all dispensers, how often each unordered pair of therapies is
# practised by the same dispenser.
co_occurrences = Counter()

for s in therapies_serie:
    # Ignore None sets
    if s is None:
        continue

    # Sets have no defined iteration order, so the same pair could come out as
    # (a, b) for one dispenser and (b, a) for another. These would be counted as two
    # different keys, and the undirected graph built later would keep only one of
    # the two counts. Sorting gives every pair a single canonical orientation.
    # Missing therapies are dropped first: they cannot be ordered against strings
    # and pairs containing them are filtered out downstream anyway.
    therapies = sorted(t for t in s if pd.notna(t))
    co_occurrences.update(itertools.combinations(therapies, 2))

In [None]:
df = pd.DataFrame(co_occurrences.items(), columns=['combo', 'count'])
# Split the combo column into two columns
df[['node1', 'node2']] = pd.DataFrame(df['combo'].tolist())

# Drop the combo column
df = df.drop('combo', axis=1)

# Reset the index
df = df.reset_index(drop=True)

# Rename the columns
df = df.rename(columns={'count': 'weight'})

# Filter out rows where node1 or node2 is None
df = df[df['node1'].notnull() & df['node2'].notnull()]


### Full network

In [None]:
# scaler = MinMaxScaler()
# df[['weight']] = scaler.fit_transform(df[['weight']])*5

In [None]:
# Build the co-occurrence graph: one node per therapy, one weighted edge per pair.
G = nx.Graph()

nodes = set(df['node1']).union(set(df['node2']))
G.add_nodes_from(nodes)
# Node size is proportional to how often the therapy appears in the dispenser table.
node_sizes = [node_weights[node] / 10 for node in G.nodes()]

edges = list(df[['node1', 'node2', 'weight']].itertuples(index=False, name=None))
G.add_weighted_edges_from(edges)

# Layout; the seed makes the figure reproducible between runs.
pos = nx.spring_layout(G, k=5, iterations=50, weight = 'weight', seed=42)

# Edge widths must follow the graph's own edge order. `df` row order is not the order
# in which networkx iterates over edges, so passing df['weight'] would attach widths
# to the wrong edges.
edge_widths = [G.edges[u, v]['weight'] / 10000 for u, v in G.edges()]

# `pos` has to be passed explicitly, otherwise nx.draw computes its own default layout.
nx.draw(G, pos=pos, with_labels=True, font_size = 5, node_color='lightblue', width = edge_widths, edge_color='gray', node_size=node_sizes)

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])

plt.show()

### Identifying communities of therapies

In [None]:
import networkx.algorithms.community as nxcom

In [None]:
communities = nxcom.greedy_modularity_communities(G, weight = 'weight')

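I used greedy modularity maximization (the Clauset–Newman–Moore algorithm, which is what NetworkX's `greedy_modularity_communities` implements) for community detection. Starting with every node in its own community, the algorithm repeatedly merges the pair of communities that most increases modularity, a function that measures the strength of division of a network into communities. Note that this is not the Louvain method, which NetworkX exposes separately as `louvain_communities`.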

In terms of parameters, `greedy_modularity_communities` defaults to `weight=None` and `resolution=1.0`. The weight parameter allows you to specify an edge attribute to use as the weight for the modularity calculation, while the resolution parameter controls the size of the communities (higher resolution leads to smaller communities). Since our network has edge weights, I passed `weight='weight'` so that the co-occurrence counts are used. And since we don't have any prior knowledge about the optimal community size, I left resolution at its default value of 1.0.

In general, the choice of parameters for community detection depends on the specific characteristics of the network and the goals of the analysis. For example, if you have a very large network, you might want to use a more scalable algorithm that can handle large amounts of data. Alternatively, if you have prior knowledge about the community structure of your network, you might want to use an algorithm that can incorporate that information into the analysis. Additionally, you may want to experiment with different values of the resolution parameter to see how it affects the number and size of the communities detected.

In [None]:
# Print the number of communities
print(f"Number of communities: {len(communities)}")

# Color the nodes based on community
color_map = {}
for i, comm in enumerate(communities):
    for node in comm:
        color_map[node] = i

In [None]:
def set_node_community(G, communities):
    """Tag every node with the 1-based index of the community that contains it.

    Writes the result to the node attribute 'community'. Numbering starts at 1
    because 0 is reserved for edges that cross communities.
    """
    for label, members in enumerate(communities, start=1):
        for node in members:
            G.nodes[node]['community'] = label
def set_edge_community(G):
    """Label each edge with its community, or 0 when it links two communities.

    Reads the 'community' attribute of both endpoints (so every node must already
    carry one) and writes the edge attribute 'community'.
    """
    for src, dst in G.edges:
        src_community = G.nodes[src]['community']
        is_internal = src_community == G.nodes[dst]['community']
        # Internal edges inherit the shared community; external edges get 0.
        G.edges[src, dst]['community'] = src_community if is_internal else 0
def get_color(i, r_off=1, g_off=1, b_off=1):
    """Map an integer index to an (r, g, b) tuple with channels in [0.1, 0.9].

    Each channel steps through 16 evenly spaced levels, using a different
    multiplier per channel (3, 5, 7) so that consecutive indices give distinct
    colours. The pattern repeats every 16 indices.
    """
    levels = 16
    low, high = 0.1, 0.9
    span = high - low

    def _channel(offset, multiplier):
        return low + span * (((i + offset) * multiplier) % levels) / (levels - 1)

    return (_channel(r_off, 3), _channel(g_off, 5), _channel(b_off, 7))

Found a cool blog post : https://graphsandnetworks.com/community-detection-using-networkx/

In [None]:
from pyvis.network import Network

In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})

# Node -> 0-based community index.
color_map = {node: i for i, comm in enumerate(communities) for node in comm}

# Store the (1-based) community on nodes and edges, then colour nodes accordingly.
set_node_community(G, communities)
set_edge_community(G)
node_color = [get_color(G.nodes[v]['community']) for v in G.nodes]

node_sizes = [node_weights[node] / 20 for node in G.nodes()]

# Seeded layout so that the saved figure is reproducible.
pos = nx.spring_layout(G, k=5, iterations=100, weight = 'weight', seed=42)
# Widths listed in the graph's own edge order (df row order does not match it).
edge_widths = [G.edges[u, v]['weight'] / 4000 for u, v in G.edges()]
nx.draw(G, pos, with_labels=True, font_size = 4, edge_color="#444444", node_color=node_color, width=edge_widths, alpha=0.5, node_size=node_sizes)

# Legend: one marker per community, coloured exactly like that community's nodes.
# Nodes of community i (0-based) carry the label i + 1, hence get_color(i + 1).
# Taking colours from list(set(node_color)) gave them in arbitrary order and could
# run out of entries when two communities share a colour.
handles = []
labels = []
for i, community in enumerate(communities):
    handles.append(plt.Line2D([], [], linewidth=0, marker='o', color=get_color(i + 1)))
    labels.append(f'Community {i+1}')
plt.legend(handles, labels, loc='upper right',fontsize = 10, labelspacing=0.5, title='Communities')

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])

# Save the network to PDF
plt.savefig(output_folder/'Network analyses/Therapies - White.pdf', format='pdf')

plt.show()

In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (19.20,10.80)})
plt.style.use('dark_background')

# Split edges into cross-community (community == 0) and within-community (> 0).
external = [(v, w) for v, w in G.edges if G.edges[v, w]['community'] == 0]
internal = [(v, w) for v, w in G.edges if G.edges[v, w]['community'] > 0]
internal_color = ["red" for e in internal]
node_color = [get_color(G.nodes[v]['community']) for v in G.nodes]

# Each width list is built from the edgelist it is drawn with, so every line gets
# the weight of its own edge. df['weight'] covers all edges in df row order and
# does not line up with either subset.
external_widths = [G.edges[e]['weight'] / 4000 for e in external]
internal_widths = [G.edges[e]['weight'] / 4000 for e in internal]

# external edges
nx.draw_networkx(
    G,
    pos=pos,
    node_size=0,
    edgelist=external,
    edge_color="green",
    node_color=node_color,
    alpha=0.4,
    width = external_widths,
    with_labels=False)
# internal edges
nx.draw_networkx(
    G, pos=pos,
    node_size=node_sizes,
    edgelist=internal,
    edge_color=internal_color,
    node_color=node_color,
    alpha=0.4,
    width = internal_widths,
    with_labels=False)

# Legend entries (the legend itself is currently not drawn on this figure).
# Community i (0-based) is coloured with get_color(i + 1), like its nodes.
handles = []
labels = []
for i, community in enumerate(communities):
    handles.append(plt.Line2D([], [], linewidth=0, marker='o', color=get_color(i + 1)))
    labels.append(f'Community {i+1}')
# plt.legend(handles, labels, loc='upper right',fontsize = 6, labelspacing=0.5, title='Communities')

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])
plt.savefig(output_folder/'Network analyses/Therapies - Dark.png', format='png', dpi = 360)
plt.show()


### K-CORES

A k-core of a graph G is a maximal connected subgraph of G in which all vertices have degree at least k. Equivalently, it is one of the connected components of the subgraph of G formed by repeatedly deleting all vertices of degree less than k. If a non-empty k-core exists, then, clearly, G has degeneracy at least k, and the degeneracy of G is the largest k for which G has a k-core.



In [None]:
# k-core with every node having degree at least 5
# NOTE(review): the variable names say 30 / 60 but the cores are computed for k = 5 / 10.
G_core_30 = nx.k_core(G, 5)
# similarly, with degree at least 10
G_core_60 = nx.k_core(G, 10)


def _edge_widths(graph, divisor):
    """Return weight / divisor for every edge of `graph`, in the graph's own edge order."""
    return [attrs['weight'] / divisor for _, _, attrs in graph.edges(data=True)]


# Visualize network and k-cores
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
plt.style.use('dark_background')
pos = nx.spring_layout(G, k=0.1)
# Widths are taken from the graph being drawn: the cores have fewer edges than G, so
# the full df['weight'] column cannot be aligned with them.
nx.draw_networkx(
    G, pos=pos, node_size=node_sizes, edge_color="#333333", width = _edge_widths(G, 4000), alpha=0.2, with_labels=False)
nx.draw_networkx(
    G_core_30, pos=pos, node_size=0, edge_color="green",width = _edge_widths(G_core_30, 2000), alpha=0.2, with_labels=False)
nx.draw_networkx(
    G_core_60, pos=pos, node_size=0, edge_color="red",width = _edge_widths(G_core_60, 2000), alpha=0.2, with_labels=False)
plt.show()

### GIRVAN-NEWMAN COMMUNITY DETECTION
The Girvan–Newman algorithm detects communities by progressively removing edges from the original network. The connected components of the remaining network are the communities. Instead of trying to construct a measure that tells us which edges are the most central to communities, the Girvan–Newman algorithm focuses on edges that are most likely “between” communities.



In [None]:
result = nxcom.girvan_newman(G)
communities = next(result)
len(communities)

In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
# Set node and edge communities
set_node_community(G, communities)
set_edge_community(G)
# Set community color for nodes
node_color = [get_color(G.nodes[v]['community']) for v in G.nodes]
# Set community color for internal edges
external = [(v, w) for v, w in G.edges if G.edges[v, w]['community'] == 0]
internal = [(v, w) for v, w in G.edges if G.edges[v, w]['community'] > 0]
internal_color = [get_color(G.edges[e]['community']) for e in internal]
pos = nx.spring_layout(G)
# Draw external edges
nx.draw_networkx(
    G, pos=pos, node_size=0,
    edgelist=external, edge_color="#333333", with_labels=False)
# Draw nodes and internal edges
nx.draw_networkx(
    G, pos=pos, node_color=node_color,
    edgelist=internal, edge_color=internal_color)
plt.show()

Having only two communities is not particularly helpful, especially when visualizing them... We can discard this method

### CLIQUES
A clique is a subset of vertices of an undirected graph such that every two distinct vertices in the clique are adjacent; that is, its induced subgraph is complete. Cliques are, in a way, tight communities in which every node is connected to every other node.

In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
# Enumerate the maximal cliques and keep one of the largest.
cliques = list(nx.find_cliques(G))
max_clique = max(cliques, key=len)
# Highlight the members of that clique; every other node gets the base colour.
clique_members = set(max_clique)
node_color = [(0.5, 0.5, 0.9) if v in clique_members else (0.5, 0.3, 0.5) for v in G.nodes()]
# Widths in the graph's own edge order (df row order does not match it).
edge_widths = [G.edges[u, v]['weight'] / 4000 for u, v in G.edges()]
nx.draw_networkx(G, node_color=node_color,edge_color="silver", width = edge_widths, pos=pos, node_size = node_sizes, alpha = 0.3, with_labels = False)
plt.show()

### Network of the 150 highest-degree nodes

In [None]:
def network_more_stats(g):
    """Compute per-node metrics, store them on ``g`` and tabulate them.

    Metrics: betweenness, degree centrality, closeness, clustering coefficient
    and (unweighted) eigenvector centrality.

    Parameters
    ----------
    g : networkx graph
        Modified in place: each metric is written as a node attribute
        ('betweenness', 'degree_centrality', 'closeness', 'clustering',
        'eigenvector').

    Returns
    -------
    tuple
        ``(g, metrics_df)`` where ``metrics_df`` has one row per node and the
        columns 'Node', 'Betweenness', 'Degree Centrality', 'Closeness',
        'Clustering', 'Eigenvector'.
    """
    # (node-attribute name, DataFrame column, {node: value}) in reporting order
    metrics = [
        ('betweenness', 'Betweenness', nx.betweenness_centrality(g)),
        ('degree_centrality', 'Degree Centrality', nx.degree_centrality(g)),
        ('closeness', 'Closeness', nx.closeness_centrality(g)),
        ('clustering', 'Clustering', nx.clustering(g)),
        ('eigenvector', 'Eigenvector',
         nx.eigenvector_centrality(g, max_iter=1000, tol=1e-06, nstart=None, weight=None)),
    ]

    # Attach every metric to the graph nodes
    for attr_name, _, values in metrics:
        nx.set_node_attributes(g, values, attr_name)

    # Build the table column by column, keeping the graph's node order
    node_list = list(g.nodes())
    table = {'Node': node_list}
    for _, column, values in metrics:
        table[column] = [values[node] for node in node_list]
    metrics_df = pd.DataFrame(table)
    return g, metrics_df


In [None]:
# (node, degree) pairs, then the same pairs ordered from highest to lowest degree
degrees = list(G.degree())
degrees_sorted = sorted(degrees, key=lambda pair: pair[1], reverse=True)
# Rank/degree curve on linear axes
pd.DataFrame(
    degrees_sorted, columns=['Thérapie', 'degree']
).plot(marker='.', logy=False, logx=False)
plt.grid()
plt.show()

In [None]:
# Annotate G's nodes with the metrics (in place) and keep the per-node table
G, df_G_metrics = network_more_stats(G)

In [None]:
# Keep the k highest-degree nodes (degrees_sorted is in descending degree order)
k = 150
node_sel = [n for (n, deg) in degrees_sorted[:k]]
node_deg = [deg for (n, deg) in degrees_sorted[:k]]
G2 = G.subgraph(node_sel).copy() # independent copy of the subgraph induced by the k highest-degree nodes
# compute network "core" (2-core of G2)
G3 = nx.k_core(G2, k=2)

In [None]:
# Define the layout (edge 'weight' pulls strongly connected nodes together)
pos = nx.spring_layout(G2, k=0.1, iterations=50, weight = 'weight')
# Plot the graph with the layout computed above; without pos=, nx.draw
# silently recomputes its own default layout and `pos` goes unused.
# NOTE(review): `width` comes from `df`, defined outside this section — confirm
# its row order matches G2.edges, otherwise widths land on the wrong edges.
nx.draw(G2, pos=pos, with_labels=True, font_size = 5, node_color='lightblue', width = df['weight']/4000, edge_color='gray', node_size=100)

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])

plt.show()

In [None]:
# Greedy modularity communities on the top-k subgraph, using the 'weight' edge attribute
communities = nxcom.greedy_modularity_communities(G2, weight = 'weight')

In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
# Map every node to the index of its community
color_map = {}
for i, comm in enumerate(communities):
    for node in comm:
        color_map[node] = i
# Set node and edge communities (helpers defined outside this section)
set_node_community(G2, communities)
set_edge_community(G2)
node_color = [get_color(G2.nodes[v]['community']) for v in G2.nodes]

# Node size scales with node_weights (defined outside this section)
node_sizes = [node_weights[node] / 20 for node in G2.nodes()]

pos = nx.spring_layout(G2, k=0.4, iterations=50, weight = 'weight')
# pos = nx.nx_agraph.graphviz_layout(G2)
# NOTE(review): `width` comes from `df`, defined outside this section — confirm
# its row order matches G2.edges.
nx.draw(G2, pos, with_labels=True, font_size = 6, edge_color="#444444", node_color=node_color, width=df['weight']/4000,alpha=0.5, node_size=node_sizes)

# Add a legend for the communities.
# Each legend colour is taken from a member of the community, so entry i really
# shows community i's colour. list(set(node_color))[i] has arbitrary order and
# fails when two communities share a colour.
handles = []
labels = []
for i, community in enumerate(communities):
    member = next(iter(community))
    member_color = get_color(G2.nodes[member]['community'])
    handles.append(plt.Line2D([], [], linewidth=0, marker='o', color=member_color))
    labels.append(f'Community {i+1}')
plt.legend(handles, labels, loc='upper right',fontsize = 6, labelspacing=0.5, title='Communities')

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])

# Save the network to PDF
plt.savefig(output_folder/'Network analyses/Therapies - Top 100 - White.pdf', format='pdf')

plt.show()

In [None]:
# Dark-background version of the community plot.
# Reuses `pos`, `node_sizes` and `communities` from the previous cells.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
plt.style.use('dark_background')

# Split edges on their 'community' attribute: 0 is treated as external, > 0 as internal
external = [(v, w) for v, w in G2.edges if G2.edges[v, w]['community'] == 0]
internal = [(v, w) for v, w in G2.edges if G2.edges[v, w]['community'] > 0]
internal_color = ["red" for e in internal]
node_color = [get_color(G2.nodes[v]['community']) for v in G2.nodes]
# external edges (node_size=0 hides the nodes in this pass)
nx.draw_networkx(
    G2,
    pos=pos,
    node_size=0,
    edgelist=external,
    edge_color="green",
    node_color=node_color,
    alpha=0.2,
    with_labels=False)
# internal edges
nx.draw_networkx(
    G2, pos=pos,
    node_size=node_sizes,
    edgelist=internal,
    edge_color=internal_color,
    node_color=node_color,
    alpha=0.2,
    with_labels=False)
# Add a legend for the communities.
# Each legend colour is taken from a member of the community, so entry i really
# shows community i's colour. list(set(node_color))[i] has arbitrary order and
# fails when two communities share a colour.
handles = []
labels = []
for i, community in enumerate(communities):
    member = next(iter(community))
    member_color = get_color(G2.nodes[member]['community'])
    handles.append(plt.Line2D([], [], linewidth=0, marker='o', color=member_color))
    labels.append(f'Community {i+1}')
plt.legend(handles, labels, loc='upper right',fontsize = 6, labelspacing=0.5, title='Communities')

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])
plt.savefig(output_folder/'Network analyses/Therapies - Top 100 - Dark.pdf', format='pdf')

plt.show()


In [None]:
# Highlight the largest maximal clique of the top-k subgraph G2.
# Reuses `pos` and `node_sizes` from the community-plot cell above.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})
cliques = list(nx.find_cliques(G2))
max_clique = max(cliques, key=len)
# Default colour for every node; clique members are recoloured below
node_color = [(0.5, 0.3, 0.5) for v in G2.nodes()]
for i, v in enumerate(G2.nodes()):
    if v in max_clique:
        node_color[i] = (0.5, 0.5, 0.9)
# NOTE(review): `width` comes from `df`, defined outside this section — confirm its row order matches G2.edges
nx.draw_networkx(G2, node_color=node_color, font_size = 6, edge_color="silver", width = df['weight']/4000, pos=pos, node_size = node_sizes, alpha = 0.3, with_labels = True)
plt.savefig(output_folder/'Network analyses/Therapies - Top 100 - Cliques.pdf', format = 'pdf', bbox_inches = 'tight')
plt.show()

In [None]:
# create the network visualization
# (`Network` is presumably pyvis.network.Network, imported outside this section)
net = Network(width='800px', height='800px')
net.from_nx(G2)
net.show_buttons()

# export the visualization to HTML
# The target is passed as a string: the file name is handled with string
# operations on the pyvis side, which a pathlib.Path does not support.
net.show(str(output_folder/'EDA/Interactive - Top 100.html'))

#### Insights

- Communities exist
- The identified communities make sense in the context of practice
- ...

## Quels thérapeutes se ressemblent le plus en termes de pratique ? Observe-t-on des cliques/communautés ?

## Is there any community of patients using similar complementary medicine?

In [None]:
from utils import optimize_memory_df, feature_map, show_values, sizeof_fmt, find_intersection, read_data
from pathlib import Path
%matplotlib inline
# Define base data folder
# NOTE(review): the notebook's first cells set data_folder to '../data/' (lowercase).
# This rebinds it to '../Data/', a different directory on case-sensitive
# filesystems — confirm which one is intended.
data_folder  = Path('../Data/')
# Define base result folder
# NOTE(review): the cells below write to `output_folder`, not `result_folder` —
# confirm this variable is still needed.
result_folder = Path('../Results')

In [None]:
# Load the processed LCA claims. The components are joined without a leading '/':
# pathlib treats '/processed/...' as absolute and would silently discard data_folder.
df_prestation_lca = read_data(data_folder/'processed'/'df_prestation_lca_processed.parquet_gzip')

In [None]:
# Keep only rows whose medcomp_status is 'Clear' (overwrites the loaded frame)
df_prestation_lca = df_prestation_lca[df_prestation_lca.medcomp_status == 'Clear']

In [None]:
# Per uuid: collect the distinct n_therapies values, add them up, sort ascending.
# NOTE(review): this is the sum of the *distinct* n_therapies values seen for a
# patient, not a count of therapies — confirm this is the intended quantity.
df_n_therapies_by_patient = df_prestation_lca.groupby('uuid').n_therapies.apply(set).apply(sum).sort_values()

In [None]:
# Claims billed for exactly one therapy. .copy() makes this an independent frame,
# so the column assignments in the following cells do not write into a slice of
# df_prestation_lca (SettingWithCopyWarning / ambiguous result).
df_prestation_lca_restricted = df_prestation_lca[df_prestation_lca.n_therapies == 1].copy()

In [None]:
# Replace each *_SET value by its first element (rows were restricted to n_therapies == 1 above).
# NOTE: not idempotent — rerunning this cell indexes into the already-unwrapped value.
df_prestation_lca_restricted['DISCIPLINES_SIMPLIFIED_SET'] = df_prestation_lca_restricted.DISCIPLINES_SIMPLIFIED_SET.apply(lambda x: x[0])
df_prestation_lca_restricted['THERAPIES_SIMPLIFIED_SET'] = df_prestation_lca_restricted.THERAPIES_SIMPLIFIED_SET.apply(lambda x: x[0])

In [None]:
# Total PRESTATIONS_BRUTES per uuid, as a plain dict {uuid: total} (despite the df_ prefix)
df_amount_by_uuid = df_prestation_lca_restricted.groupby('uuid', observed = True)['PRESTATIONS_BRUTES'].sum().to_dict()

In [None]:
# One row per uuid holding the set of distinct therapies that patient was billed for
df_patient_lca_similarity = pd.DataFrame(df_prestation_lca_restricted.groupby('uuid',observed=True)['THERAPIES_SIMPLIFIED_SET'].apply(set)).reset_index()

In [None]:
# Number of distinct therapies per patient (size of the set built above)
df_patient_lca_similarity['n_therapies'] = df_patient_lca_similarity['THERAPIES_SIMPLIFIED_SET'].apply(len)

In [None]:
# Patients with at least three distinct therapies. .copy() makes this an
# independent frame, because new columns are assigned to it in the following cells.
df_patient_lca_similarity_3plus = df_patient_lca_similarity[df_patient_lca_similarity.n_therapies>2].copy()

In [None]:
# Distinct uuids of the 3+ therapy patients (an array, despite the df_ prefix)
df_uuid = df_patient_lca_similarity_3plus.uuid.unique()

In [None]:
from itertools import combinations

# Define Jaccard similarity function
def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    Parameters
    ----------
    set1, set2 : set
        The collections to compare.

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets are treated as identical and give
        1.0 rather than raising ZeroDivisionError.
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: the ratio is 0/0, so define it as "identical"
        return 1.0
    return len(set1.intersection(set2)) / union

In [None]:
# Remove spaces inside therapy names so that each name stays a single token
# when the set is later joined with ' ' and split on whitespace again
df_patient_lca_similarity_3plus['THERAPIES_SIMPLIFIED_SET_TEST'] = df_patient_lca_similarity_3plus['THERAPIES_SIMPLIFIED_SET'].apply(lambda x: set(i.replace(' ','') for i in x))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances

# Sort elements of the sets
# df_patient_lca_similarity_3plus['THERAPIES_SIMPLIFIED_SET'] = df_patient_lca_similarity_3plus['THERAPIES_SIMPLIFIED_SET'].apply(lambda x: sorted(x))

# Convert the set into string format which will be treated as "documents" for Count Vectorizer
df_patient_lca_similarity_3plus['THERAPIES_SIMPLIFIED_STRING'] = df_patient_lca_similarity_3plus['THERAPIES_SIMPLIFIED_SET_TEST'].apply(lambda x: ' '.join(x))

# Use Count Vectorizer to create a matrix representation:
# whitespace tokenizer, binary presence/absence, cast to bool (one row per patient)
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), binary=True)
matrix = vectorizer.fit_transform(df_patient_lca_similarity_3plus['THERAPIES_SIMPLIFIED_STRING']).astype('bool')

# Jaccard similarity between all rows, computed block by block
def compute_jaccard_in_chunks(matrix, chunk_size=500):
    """Return the dense row-by-row Jaccard similarity matrix of ``matrix``.

    Only two ``chunk_size``-row blocks of ``matrix`` are densified at a time;
    the result itself is a full (n_rows, n_rows) float array.

    Parameters
    ----------
    matrix : sparse matrix
        Must support row slicing and ``.toarray()``.
    chunk_size : int, default 500
        Number of rows per block.

    Returns
    -------
    numpy.ndarray
        ``1 - jaccard distance`` for every pair of rows.
    """
    total = matrix.shape[0]
    similarity = np.zeros((total, total))
    block_starts = range(0, total, chunk_size)

    for row_start in block_starts:
        row_stop = min(row_start + chunk_size, total)
        # Densify only the current row block
        row_block = matrix[row_start:row_stop].toarray()

        for col_start in block_starts:
            col_stop = min(col_start + chunk_size, total)
            col_block = matrix[col_start:col_stop].toarray()
            block_distance = pairwise_distances(row_block, col_block, metric='jaccard')
            similarity[row_start:row_stop, col_start:col_stop] = 1 - block_distance

    return similarity

# Calculate the Jaccard similarity
# NOTE: this rebinds the name `jaccard_similarity` to the result matrix, shadowing
# the function of the same name defined in an earlier cell.
jaccard_similarity = compute_jaccard_in_chunks(matrix)

In [None]:
# df_patient_lca_similarity_3plus = df_patient_lca_similarity_3plus.set_index('uuid')

# jaccard_df_similarity_3plus = pd.DataFrame()
# # Compute the Jaccard index for all combinations
# for row1, row2 in combinations(df_patient_lca_similarity_3plus.index, 2):
#     set1 = df_patient_lca_similarity_3plus.loc[row1, 'THERAPIES_SIMPLIFIED_SET']
#     set2 = df_patient_lca_similarity_3plus.loc[row2, 'THERAPIES_SIMPLIFIED_SET']
#     intersection = len(set1 & set2)
#     union = len(set1 | set2)
#     jaccard_index = intersection / union
    
#     # Fill the DataFrame
#     jaccard_df_similarity_3plus.loc[row1, row2] = jaccard_index
#     jaccard_df_similarity_3plus.loc[row2, row1] = jaccard_index

# # Fill diagonal with 1s, as they are identical
# for row in df.index:
#     jaccard_df_similarity_3plus.loc[row, row] = 1.0

In [None]:
# melted_jaccard_df_3plus = pd.melt(jaccard_df_similarity_3plus.reset_index(), id_vars="index", var_name="Method_2", value_name="jaccard_index")

# melted_jaccard_df = melted_jaccard_df[melted_jaccard_df.jaccard_index.isnull()==False]

# melted_jaccard_df.columns = ['cat_dispensateur_A','cat_dispensateur_B', 'jaccard_index']


# melted_jaccard_df['cat_dispensateur_A'] = melted_jaccard_df.apply(lambda x: x.cat_dispensateur_B if x.cat_dispensateur_A > x.cat_dispensateur_B else x.cat_dispensateur_A ,axis =1)
# melted_jaccard_df['cat_dispensateur_B'] = melted_jaccard_df.apply(lambda x: x.cat_dispensateur_A if x.cat_dispensateur_A > x.cat_dispensateur_B else x.cat_dispensateur_B ,axis =1)

# melted_jaccard_df['set_cat'] = melted_jaccard_df[['cat_dispensateur_A','cat_dispensateur_B']].apply(set, axis = 1)

# melted_jaccard_df = melted_jaccard_df.drop_duplicates('set_cat')

# melted_jaccard_df['len_set'] = melted_jaccard_df['set_cat'].apply(len)

# melted_jaccard_df = melted_jaccard_df[melted_jaccard_df.len_set != 1]

# # melted_jaccard_df = melted_jaccard_df[melted_jaccard_df.jaccard_index > 0.05]

In [None]:
# Label the similarity matrix rows/columns with the patient uuids (same row order as the vectorized frame)
uuids = df_patient_lca_similarity_3plus['uuid'].tolist()
similarity_df = pd.DataFrame(jaccard_similarity, index=uuids, columns=uuids)

In [None]:
# Mask entries <= 0.5 to NaN, then reshape to long format: one row per remaining
# (row uuid, column uuid) pair, with columns level_0, level_1 and the value column 0
similar_pairs = similarity_df[similarity_df>0.5].stack().reset_index().drop_duplicates()

In [None]:
# Put each pair in canonical order (uuid_A <= uuid_B) so (a, b) and (b, a) become
# the same row. Vectorised replacement for a row-wise apply, with the same result.
_swap = similar_pairs['level_0'] > similar_pairs['level_1']
similar_pairs['uuid_A'] = similar_pairs['level_1'].where(_swap, similar_pairs['level_0'])
similar_pairs['uuid_B'] = similar_pairs['level_0'].where(_swap, similar_pairs['level_1'])

In [None]:
# Keep one row per ordered pair and drop the raw level_0/level_1 columns
similar_pairs = similar_pairs.drop_duplicates(subset = ['uuid_A','uuid_B']).drop(['level_0','level_1'], axis = 1)

In [None]:
# Remove self-pairs (the diagonal of the similarity matrix)
similar_pairs  = similar_pairs[similar_pairs.uuid_A !=  similar_pairs.uuid_B]

In [None]:
# The remaining columns are the stacked value followed by uuid_A and uuid_B; give the value a name
similar_pairs.columns = ['jaccard_score','uuid_A','uuid_B'] 

In [None]:
# Persist all pairs with Jaccard score > 0.5
similar_pairs.to_csv(output_folder/'Network analyses/patient_CAM_similarity_05.csv', index = False)

In [None]:
# Reproducible random sample of at most 50000 pairs. The min() guards against
# the ValueError that sample() raises when fewer rows are available.
similar_pairs_sample = similar_pairs.sample(min(50000, len(similar_pairs)), random_state = 42)

In [None]:
# NOTE(review): the file name says 'sample_500' but the sample is drawn with n=50000 — confirm the name
similar_pairs_sample.to_csv(output_folder/'Network analyses/patient_CAM_similarity_sample_500.csv', index = False)

In [None]:
# Rescale the sampled scores by 20 (used further down as an edge-width source).
# NOTE: overwrites the column in place, so rerunning this cell multiplies again.
similar_pairs_sample['jaccard_score'] = similar_pairs_sample['jaccard_score']*20

In [None]:
# Create an empty graph
G = nx.Graph()

# Add nodes to the graph: every uuid that appears in at least one similar pair
nodes = set(similar_pairs['uuid_A']).union(set(similar_pairs['uuid_B']))
G.add_nodes_from(nodes)
# node_sizes = [node_weights[node] / 10 for node in G.nodes()]

# Add edges to the graph as (uuid_A, uuid_B, score) tuples.
# itertuples avoids building one Series per row as iterrows does.
# add_weighted_edges_from stores the score under the edge attribute 'weight'.
edges = list(similar_pairs[['uuid_A', 'uuid_B', 'jaccard_score']].itertuples(index=False, name=None))
G.add_weighted_edges_from(edges)

In [None]:
# Rank/degree curve of the patient similarity graph on log-log axes.
# NOTE(review): the column labels 'User'/'mentions' look inherited from another
# analysis; the nodes here are patient uuids and the value is the node degree.
degrees = [(node,deg) for (node, deg) in G.degree()]
degrees_sorted = sorted(degrees, key=lambda x: x[1], reverse=True)
pd.DataFrame(degrees_sorted,columns = ['User','mentions']).plot(marker = '.',logy = True,logx = True)
plt.grid()

In [None]:
# Same rank/degree curve on linear axes (recomputes degrees and degrees_sorted).
# NOTE(review): the 'Thérapie' label is carried over from the therapy network; nodes here are patient uuids.
degrees = [(node,deg) for (node, deg) in G.degree()]
degrees_sorted = sorted(degrees, key=lambda x: x[1], reverse=True)
pd.DataFrame(degrees_sorted,columns = ['Thérapie','degree']).plot(marker = '.',logy = False,logx = False)
plt.grid()
plt.show()

In [None]:
# Degree table, one row per patient uuid, highest degree first
df_node_deg = pd.DataFrame(degrees_sorted, columns = ['uuid','degree'])

In [None]:
# Keep the k highest-degree patients (degrees_sorted is in descending degree order)
k = 5000
node_sel = [n for (n,deg) in degrees_sorted[:k]]
node_deg = [deg for (n,deg) in degrees_sorted[:k]]
G2 = G.subgraph(node_sel).copy() #Returns a subgraph of the 5000 most connected patients 

In [None]:
# Edges were added with add_weighted_edges_from, which stores the Jaccard score
# under the edge attribute 'weight'. No 'jaccard_score' attribute exists on the
# edges, so that key would make every edge count as weight 1.
communities = nxcom.greedy_modularity_communities(G2, weight = 'weight')

In [None]:
# Report how many communities were detected
print(f"Number of communities: {len(communities)}")

# Map every node to the index of the community that contains it
color_map = {
    node: i
    for i, comm in enumerate(communities)
    for node in comm
}

In [None]:
def get_color_highrange(i, r_off=1, g_off=1, b_off=1):
    '''Assign a color to a vertex.

    Maps the integer ``i`` to an (r, g, b) tuple. Each channel cycles with
    period 53 using a different multiplier (3, 5, 7), and values stay within
    [0.1, 0.9].

    Parameters
    ----------
    i : int
        Index to colour (e.g. a community id).
    r_off, g_off, b_off : int, default 1
        Per-channel offsets added to ``i`` before the modular step.

    Returns
    -------
    tuple of float
        ``(r, g, b)``.
    '''
    n = 53
    low, high = 0.1, 0.9
    span = high - low

    def channel(offset, step):
        # Position of (i + offset) * step on a cycle of n steps, rescaled to [low, high]
        return low + span * (((i + offset) * step) % n) / (n - 1)

    return (channel(r_off, 3), channel(g_off, 5), channel(b_off, 7))

In [None]:
# Map every node to the index of its community
color_map = {}
for i, comm in enumerate(communities):
    for node in comm:
        color_map[node] = i
# Set node and edge communities (helpers defined outside this section)
set_node_community(G2, communities)
set_edge_community(G2)
node_color = [get_color(G2.nodes[v]['community']) for v in G2.nodes]
# Node size scales with the patient's total PRESTATIONS_BRUTES
node_sizes = [df_amount_by_uuid[node] / 20 for node in G2.nodes()]


# Adjust edge transparency and thickness
edge_alpha = 0.2  # Increased transparency
# One width per edge of G2, in G2.edges order, read from the edge's own weight.
# The previous source (similar_pairs_sample) was a random sample in unrelated
# row order, so its widths did not correspond to the drawn edges. The scale is
# unchanged: score * 20 / 1000.
edge_width = [G2.edges[u, v]['weight'] * 20 / 1000 for u, v in G2.edges()]  # Reduced thickness

pos = nx.spring_layout(G2, k=0.5, iterations=100, weight = 'weight')
# pos = nx.nx_agraph.graphviz_layout(G2)
nx.draw(G2, pos, font_size = 0,with_labels=False, edge_color="#444444", node_color=node_color, width=edge_width, alpha= edge_alpha, node_size=node_sizes)

# Add a legend for the communities.
# Each legend colour is taken from a member of the community, so entry i really
# shows community i's colour. list(set(node_color))[i] has arbitrary order and
# fails when two communities share a colour.
handles = []
labels = []
for i, community in enumerate(communities):
    member = next(iter(community))
    member_color = get_color(G2.nodes[member]['community'])
    handles.append(plt.Line2D([], [], linewidth=0, marker='o', color=member_color))
    labels.append(f'Community {i+1}')
plt.legend(handles, labels, loc='upper right',fontsize = 8, labelspacing=0.5, title='Communities')

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])

# Save the network as a high-resolution PNG
plt.savefig(output_folder/'Network analyses/Patients - Top 5000 - Jaccard.png', dpi = 1000, format='png')

plt.show()

In [None]:
# {uuid: 0-based index of the community containing it}
community_dict = {
    element: index
    for index, fs in enumerate(communities)
    for element in fs
}

In [None]:
# Debug helper: print the 20 names in locals() with the largest sys.getsizeof value
# (a shallow size); sizeof_fmt is imported from utils above
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in list(
                          locals().items())), key= lambda x: -x[1])[:20]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
# Attach the community index to each claim; uuids missing from community_dict get NaN
df_prestation_lca_restricted['community'] = df_prestation_lca_restricted['uuid'].map(community_dict)

In [None]:
# Same mapping for df_treated_filtered (defined outside this section)
df_treated_filtered['community'] = df_treated_filtered['uuid'].map(community_dict)

In [None]:
# Mean per-patient spend by community and discipline:
# 1) total PRESTATIONS_BRUTES per (patient, community, discipline);
# 2) average of those totals within each (community, discipline).
# Only the needed column is aggregated, so unrelated columns are never summed.
# observed=True matches the other uuid groupbys in this notebook and avoids
# expanding unobserved category combinations.
df_heatmap_communities = (
    df_prestation_lca_restricted
    .groupby(['uuid','community','DISCIPLINES_SIMPLIFIED_SET'], observed=True)['PRESTATIONS_BRUTES']
    .sum()
    .groupby(level=['community','DISCIPLINES_SIMPLIFIED_SET'], observed=True)
    .mean()
)
# Communities as rows, disciplines as columns; absent combinations become 0
df_heatmap_communities = df_heatmap_communities.unstack().fillna(0)
# Show communities 1-based, as string labels
df_heatmap_communities.index = df_heatmap_communities.index+1
df_heatmap_communities.index = df_heatmap_communities.index.astype(int).astype(str)

In [None]:
# Same aggregation as for df_heatmap_communities, on df_treated_filtered:
# total spend per (patient, community, discipline), then the mean of those
# totals per (community, discipline). Only the needed column is aggregated.
df_heatmap_communities_diseases = (
    df_treated_filtered
    .groupby(['uuid','community','DISCIPLINES_SIMPLIFIED_SET'], observed=True)['PRESTATIONS_BRUTES']
    .sum()
    .groupby(level=['community','DISCIPLINES_SIMPLIFIED_SET'], observed=True)
    .mean()
)
# Communities as rows, disciplines as columns; absent combinations become 0
df_heatmap_communities_diseases = df_heatmap_communities_diseases.unstack().fillna(0)
# Show communities 1-based, as string labels
df_heatmap_communities_diseases.index = df_heatmap_communities_diseases.index+1
df_heatmap_communities_diseases.index = df_heatmap_communities_diseases.index.astype(int).astype(str)

In [None]:
# Reshape every column whose name matches 'PCG' into long format:
# one row per (uuid, year, community, column), value stored in 'Prevalence'
methods_regex = 'PCG'
methods_long_df = df_treated_filtered.melt(id_vars=['uuid','year','community'], 
                          value_vars=df_treated_filtered.filter(regex=methods_regex).columns, 
                          var_name='Disease', value_name='Prevalence')
# Drop the '_PCG' part of the column name, then keep only rows flagged with 1
methods_long_df['Disease'] = methods_long_df['Disease'].str.replace('_PCG','')
methods_long_df = methods_long_df[methods_long_df.Prevalence == 1]

In [None]:
# Rows whose patient was assigned to a community
methods_long_df[methods_long_df.community.notnull()]

In [None]:
# Clustered heatmap, each column rescaled to [0, 1] (standard_scale=1).
# clustermap builds its own multi-axes figure, so the title and labels are set on
# the returned grid; plt.title/xlabel/ylabel would target whichever axes is
# current instead of the heatmap.
cluster_grid = sns.clustermap(df_heatmap_communities, standard_scale = 1, cmap = 'coolwarm', figsize=(25, 12))
cluster_grid.ax_heatmap.figure.suptitle('Typology of communities based on Therapies')
cluster_grid.ax_heatmap.set_xlabel('Méthodes thérapeutiques')
cluster_grid.ax_heatmap.set_ylabel('Community')
plt.show()

In [None]:
# Clustered heatmap, each column z-scored (z_score=1).
# clustermap builds its own multi-axes figure, so the title and labels are set on
# the returned grid; plt.title/xlabel/ylabel would target whichever axes is
# current instead of the heatmap.
cluster_grid = sns.clustermap(df_heatmap_communities, z_score = 1, cmap = 'coolwarm', figsize=(25, 12))
cluster_grid.ax_heatmap.figure.suptitle('Typology of communities based on Therapies')
cluster_grid.ax_heatmap.set_xlabel('Méthodes thérapeutiques')
cluster_grid.ax_heatmap.set_ylabel('Community')
plt.show()

In [None]:
# Plotting the heatmap (raw mean spend, no clustering or rescaling)
plt.figure(figsize=(12, 4))
sns.heatmap(df_heatmap_communities, annot=False, cmap='coolwarm')
plt.title('Typology of communities based on Therapies')
plt.xlabel('Méthodes thérapeutiques')
plt.ylabel('Community')
plt.show()

## Other approach : compute distance (cosine similarity) based on spending patterns

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Patient x therapy spend matrix: total PRESTATIONS_BRUTES per (uuid, therapy),
# pivoted so each therapy is a column; missing combinations become 0
df_tsne = pd.DataFrame((df_prestation_lca_restricted.groupby(['uuid','THERAPIES_SIMPLIFIED_SET'])['PRESTATIONS_BRUTES'].sum()))
df_tsne = df_tsne.unstack('THERAPIES_SIMPLIFIED_SET').fillna(0)['PRESTATIONS_BRUTES']

In [None]:
# Restrict to the patients selected in node_sel (the highest-degree nodes chosen above)
df_tsne_samp = df_tsne[df_tsne.index.isin(node_sel)]

In [None]:
# Rescale every therapy column to [0, 1]; the result is a bare NumPy array
scaler = MinMaxScaler()
df_tsne_scaled = scaler.fit_transform(df_tsne_samp)
# df_tsne_scaled = df_tsne_samp.copy()

In [None]:
# Put the scaled array back into a DataFrame with the original therapy columns and patient index
df_tsne_scaled = pd.DataFrame(df_tsne_scaled, columns = df_tsne.columns, index = df_tsne_samp.index)

In [None]:
# {uuid: sum of that patient's *scaled* therapy values}
dict_spending = df_tsne_scaled.sum(axis=1).to_dict()

In [None]:
# Pairwise cosine similarity between patients' scaled spend vectors, labelled by patient on both axes
cosine_sim = cosine_similarity(df_tsne_scaled)
cosine_sim_df = pd.DataFrame(cosine_sim, index=df_tsne_scaled.index, columns=df_tsne_scaled.index)


In [None]:
# Long format: one row per (uuid, uuid_b) pair with its cosine similarity; missing values removed
melted_tsne_df = cosine_sim_df.reset_index().melt(
    id_vars="uuid", var_name="uuid_b", value_name="cosine_similarity"
)
melted_tsne_df = melted_tsne_df[melted_tsne_df.cosine_similarity.notnull()]


In [None]:
# Pairs with cosine similarity above 0.99. .copy() gives an independent frame so
# the column edits below do not write into a slice of melted_tsne_df.
melted_tsne_df_99 = melted_tsne_df[melted_tsne_df.cosine_similarity > 0.99].copy()
melted_tsne_df_99.columns = ['uuid_a','uuid_b', 'cosine_similarity']
# Canonical order inside each pair so (a, b) and (b, a) collapse into one row
melted_tsne_df_99[['uuid_a', 'uuid_b']] = melted_tsne_df_99.apply(lambda row: sorted([row['uuid_a'], row['uuid_b']]), axis=1, result_type='expand')
# Reassign instead of inplace=True, which is ambiguous on a filtered frame
melted_tsne_df_99 = melted_tsne_df_99.drop_duplicates(subset=['uuid_a', 'uuid_b'])
# Remove self-pairs
melted_tsne_df_99 = melted_tsne_df_99[melted_tsne_df_99.uuid_a != melted_tsne_df_99.uuid_b]

In [None]:
# Create an empty graph
G = nx.Graph()

# Add nodes to the graph: every uuid that appears in at least one near-identical pair
nodes = set(melted_tsne_df_99['uuid_a']).union(set(melted_tsne_df_99['uuid_b']))
G.add_nodes_from(nodes)
# node_sizes = [node_weights[node] / 10 for node in G.nodes()]

# Add edges to the graph as (uuid_a, uuid_b, similarity) tuples.
# add_weighted_edges_from stores the similarity under the edge attribute 'weight'.
edges = list(melted_tsne_df_99[['uuid_a', 'uuid_b', 'cosine_similarity']].itertuples(index=False, name=None))
G.add_weighted_edges_from(edges)

# Define the layout. The attribute holding the similarity is 'weight'; the key
# 'cosine_similarity' does not exist on the edges and would give every edge weight 1.
pos = nx.spring_layout(G, k=5, iterations=10, weight = 'weight')
# One width per edge, in G.edges order (a DataFrame column is not aligned with it)
edge_width = [d['weight'] * 10 for _, _, d in G.edges(data=True)]
# Plot the graph with the layout computed above (nx.draw ignores it unless pos= is passed)
nx.draw(G, pos=pos, with_labels=False, node_color='lightblue', width = edge_width, edge_color='gray')

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])

plt.show()

In [None]:
# Edges were added with add_weighted_edges_from, which stores the cosine
# similarity under the edge attribute 'weight'. No 'cosine_similarity' attribute
# exists on the edges, so that key would make every edge count as weight 1.
communities = nxcom.greedy_modularity_communities(G, weight = 'weight')

In [None]:
# Report how many communities were detected
print(f"Number of communities: {len(communities)}")

# Map every node to the index of the community that contains it
color_map = {
    node: i
    for i, comm in enumerate(communities)
    for node in comm
}

In [None]:
# Map every node to the index of its community
color_map = {}
for i, comm in enumerate(communities):
    for node in comm:
        color_map[node] = i
# Set node and edge communities (helpers defined outside this section)
set_node_community(G, communities)
set_edge_community(G)
node_color = [get_color(G.nodes[v]['community']) for v in G.nodes]
# Node size scales with the patient's summed scaled spend
node_sizes = [dict_spending[node] / 20 for node in G.nodes()]


# Adjust edge transparency and thickness
edge_alpha = 0.3  # Increased transparency
# One width per edge, in G.edges order, read from the edge's own weight.
# The DataFrame column used before is not aligned with G.edges.
edge_width = [d['weight'] * 10 for _, _, d in G.edges(data=True)]

pos = nx.spring_layout(G, k=1, iterations=100, weight = 'weight')
# pos = nx.nx_agraph.graphviz_layout(G2)
nx.draw(G, pos, font_size = 0,with_labels=False, edge_color="#444444", node_color=node_color, width=edge_width, alpha= edge_alpha, node_size=node_sizes)

# Legend for the communities (currently disabled)
handles = []
labels = []
# for i, community in enumerate(communities):
#     handles.append(plt.Line2D([], [], linewidth=0, marker='o', color=list(set(node_color))[i]))
#     labels.append(f'Community {i+1}')
# plt.legend(handles, labels, loc='upper right',fontsize = 8, labelspacing=0.5, title='Communities')

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])
plt.show()

## t-SNE

In [None]:
# Total PRESTATIONS_BRUTES per (uuid, therapy); rebuilds df_tsne from scratch
df_tsne = pd.DataFrame((df_prestation_lca_restricted.groupby(['uuid','THERAPIES_SIMPLIFIED_SET'])['PRESTATIONS_BRUTES'].sum()))

In [None]:
# Pivot to one column per therapy (missing combinations become 0) and drop the outer column level
df_tsne = df_tsne.unstack('THERAPIES_SIMPLIFIED_SET').fillna(0)
df_tsne = df_tsne['PRESTATIONS_BRUTES']

In [None]:
# df_tsne_samp = df_tsne[df_tsne.index.isin(node_sel)]
# Reproducible 20% random sample of patients
df_tsne_samp = df_tsne.sample(frac=0.2, random_state=42).copy()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Input: df_tsne_samp, one row per insured individual and one column per
# therapy, holding the amount spent.

# Standardize the data (important for t-SNE)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df_tsne_samp)

# Perform t-SNE
# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to `max_iter` —
# check against the installed version.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=5000)
tsne_results = tsne.fit_transform(scaled_df)

# Create a DataFrame to hold the t-SNE results
# (default integer index here; the patient index is attached in the next cell)
tsne_df = pd.DataFrame(data=tsne_results, columns=['Dimension 1', 'Dimension 2'])

# Plot the t-SNE results
plt.figure(figsize=(8, 8))
plt.scatter(tsne_df['Dimension 1'], tsne_df['Dimension 2'], alpha=0.5)
plt.title('t-SNE of Amount Spent on Complementary Medicine')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()

In [None]:
# Re-attach the patient index (same row order as the sample fed to t-SNE)
tsne_df = tsne_df.set_index(df_tsne_samp.index)
# Community index per patient; patients absent from community_dict get NaN
tsne_df['community'] = tsne_df.index.map(community_dict)
# Shift to 1-based labels and make categorical for plotting
tsne_df['community'] = tsne_df['community']+1
tsne_df['community'] = tsne_df['community'].astype('category')

In [None]:
# t-SNE embedding coloured by graph community
plt.figure(figsize=(8, 8))
sns.scatterplot(data=tsne_df, hue='community',  x='Dimension 1', y='Dimension 2', alpha=0.5, linewidth = 0)
plt.show()

## Using OpenTSNE

In [None]:
import openTSNE
from openTSNE import utils
import pickle
import gzip
import numpy as np
from sklearn.model_selection import train_test_split
import utils_opentsne
import matplotlib.pyplot as plt
reload(utils_opentsne)

In [None]:
# with gzip.open("/Users/david/Downloads/macosko_2015.pkl.gz", "rb") as f:
#     data = pickle.load(f)

In [None]:
# x = data["pca_50"]
# y = data["CellType1"].astype(str)

In [None]:
# Total spend per patient across the therapy columns only. Derived columns added
# by this and later cells are excluded, so rerunning the cell gives the same
# result instead of adding 'sum' (and 'kmeans_group') into the total.
df_tsne['sum'] = df_tsne.drop(columns=['sum', 'sum_q', 'kmeans_group'], errors='ignore').sum(axis = 1)

In [None]:
# Split patients into three equal-sized groups by total spend
df_tsne['sum_q'] = pd.qcut(df_tsne['sum'], q=3, labels=['Low', 'Medium', 'High'])

In [None]:
# Standardise every remaining column of df_tsne.
# NOTE: only 'sum_q' is dropped, so 'sum' is kept and standardised too. If this cell
# is rerun after 'kmeans_group' has been added to df_tsne, that column is scaled as well.
scaler = StandardScaler()
df_tsne_scaled = df_tsne.drop(['sum_q'],axis = 1)
# df_tsne_samp_scaled = scaler.fit_transform(df_tsne_samp.drop(['sum','sum_q'],axis = 1))
df_tsne_scaled[df_tsne_scaled.columns] = scaler.fit_transform(df_tsne_scaled[df_tsne_scaled.columns])

In [None]:
# Feature matrix for openTSNE.
# NOTE(review): df_tsne_scaled still contains the standardised 'sum' column, so
# total spend is fed to t-SNE as an extra feature — confirm this is intended.
x = df_tsne_scaled.to_numpy()
# Class labels used for colouring: the Low/Medium/High spend tercile, whose values
# match the keys of the colors_tsne palette. The standardised continuous 'sum'
# matches none of those keys.
y = df_tsne['sum_q'].to_numpy()

In [None]:
# One colour per spend tercile label
colors_tsne = {'Low':'#91bfdb','Medium':'#ffffbf','High':'#fc8d59'}

In [None]:
def plot_tsne(x, **kwargs):
    """Plot a 2-D embedding with the notebook's labels and palette.

    Thin wrapper around ``utils_opentsne.plot`` that always uses the
    module-level ``y`` as labels and ``colors_tsne`` as the palette.

    Parameters
    ----------
    x : array-like
        Embedding coordinates, passed through unchanged.
    **kwargs
        Forwarded to ``utils_opentsne.plot``.

    Returns
    -------
    Whatever ``utils_opentsne.plot`` returns, so ``ax = plot_tsne(...)`` receives
    it instead of always getting None.
    """
    return utils_opentsne.plot(x, y, colors=colors_tsne, **kwargs)

In [None]:
%%time
# openTSNE embedding: random initialisation, Euclidean metric, perplexity 30, fixed seed
embedding_standard = openTSNE.TSNE(
    perplexity=30,
    initialization="random",
    metric="euclidean",
    n_jobs=10,
    random_state=3,
).fit(x)

In [None]:
%%time
# openTSNE embedding: PCA initialisation, Euclidean metric, perplexity 100, fixed seed.
# This is the embedding stored as Dim1/Dim2 and clustered with k-means below.
embedding_pca = openTSNE.TSNE(
    perplexity=100,
    initialization="pca",
    metric="euclidean",
    n_jobs=10,
    random_state=3,
).fit(x)

In [None]:
# openTSNE embedding: PCA initialisation, cosine metric, perplexity 300, fixed seed
embedding_pca_cosine = openTSNE.TSNE(
    perplexity=300,
    initialization="pca",
    metric="cosine",
    n_jobs=8,
    random_state=3,
).fit(x)

In [None]:
reload(utils_opentsne)

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# ax = plot_tsne(embedding_pca)
# plt.show()

In [None]:
# ax = plot_tsne(embedding_pca_cosine)
# plt.show()

In [None]:
# Store the two embedding coordinates next to the scaled features (row order follows x)
df_tsne_scaled[['Dim1','Dim2']] = embedding_pca
# df_tsne_samp_scaled[['Dim1','Dim2']] = embedding_pca

In [None]:
# Cluster the 2-D embedding coordinates (not the original features) into 15 groups
kmeans = KMeans(n_clusters=15, random_state=42)
kmeans_labels = kmeans.fit_predict(df_tsne_scaled[['Dim1', 'Dim2']])

In [None]:
# Attach the 0-based cluster label to both the scaled and the raw spend table
# (df_tsne_scaled was derived from df_tsne, so rows are in the same order)
df_tsne_scaled['kmeans_group'] = kmeans_labels
df_tsne['kmeans_group'] = kmeans_labels

In [None]:

# List to store the inertia (within-cluster sum of squares)
inertia_list = []

# Number of clusters to try
cluster_range = range(1, 18)

# Calculate inertia for different number of clusters.
# The scan uses its own variable so the fitted `kmeans` model that produced
# `kmeans_labels` is not overwritten by the last model of the scan.
for num_clusters in cluster_range:
    elbow_model = KMeans(n_clusters=num_clusters, random_state=0).fit(df_tsne_scaled[['Dim1', 'Dim2']])
    inertia_list.append(elbow_model.inertia_)

# Plotting the Elbow Method Graph
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, inertia_list, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.grid(True)
plt.show()

In [None]:
# df_tsne_samp['kmeans_group'] = df_tsne_samp['kmeans_group'].astype('string')

In [None]:
# 15 colours (one per k-means cluster) sampled from the tab20 colormap at positions i/19
cmap = plt.get_cmap('tab20')
colors = [cmap(i/19) for i in range(15)] 

In [None]:
from matplotlib.patches import Patch

In [None]:
# One legend patch per cluster (labels are 1-based, cluster ids 0-based)
legend_handles = [Patch(facecolor=colors[i], edgecolor='gray', label=f'Cluster {i+1}') for i in range(15)]

fig, ax = plt.subplots(figsize=(15, 12))
# Split the points once: cluster id 6 is emphasised, everything else is faded
is_highlighted = df_tsne_scaled.kmeans_group == 6
faded_points = df_tsne_scaled[~is_highlighted]
highlighted_points = df_tsne_scaled[is_highlighted]
mapped_colors = [colors[label] for label in faded_points.kmeans_group]
plt.scatter(faded_points['Dim1'], faded_points['Dim2'], c=mapped_colors, alpha=0.05, s=1)
mapped_colors = [colors[label] for label in highlighted_points.kmeans_group]
plt.scatter(highlighted_points['Dim1'], highlighted_points['Dim2'], c=mapped_colors, alpha=1, s=3)
plt.title('t-SNE with K-means Clusters')
ax.set_axis_off()

# Add legend
ax.legend(handles=legend_handles, title='Clusters', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()

In [None]:
# One legend patch per cluster (labels are 1-based, cluster ids 0-based)
legend_handles = [Patch(facecolor=colors[i], edgecolor='gray', label=f'Cluster {i+1}') for i in range(15)]

fig, ax = plt.subplots(figsize=(15, 12))
# Split the points once: cluster id 0 is emphasised, everything else is faded
is_highlighted = df_tsne_scaled.kmeans_group == 0
faded_points = df_tsne_scaled[~is_highlighted]
highlighted_points = df_tsne_scaled[is_highlighted]
mapped_colors = [colors[label] for label in faded_points.kmeans_group]
plt.scatter(faded_points['Dim1'], faded_points['Dim2'], c=mapped_colors, alpha=0.05, s=1)
mapped_colors = [colors[label] for label in highlighted_points.kmeans_group]
plt.scatter(highlighted_points['Dim1'], highlighted_points['Dim2'], c=mapped_colors, alpha=1, s=3)
plt.title('t-SNE with K-means Clusters')
ax.set_axis_off()

# Add legend
ax.legend(handles=legend_handles, title='Clusters', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()

In [None]:
# All points of the embedding, coloured by k-means cluster
fig, ax = plt.subplots(figsize = (15,12))
mapped_colors = [colors[label] for label in df_tsne_scaled.kmeans_group]

plt.scatter(df_tsne_scaled['Dim1'], df_tsne_scaled['Dim2'], c=mapped_colors, alpha=0.3, s=3)
# plt.colorbar()
plt.title('t-SNE with K-means Clusters')
# plt.xlabel('Dimension 1')
# plt.ylabel('Dimension 2')
ax.set_axis_off()
plt.show()

In [None]:
# Mean spend per therapy within each k-means cluster.
# numeric_only=True skips the categorical 'sum_q' column, which has no mean;
# recent pandas raises on it instead of silently dropping it.
df_heatmap_kmeans = df_tsne.groupby('kmeans_group').mean(numeric_only=True)
df_heatmap_kmeans_scaled = df_tsne_scaled.groupby('kmeans_group').mean()


# Show clusters 1-based, as string labels
df_heatmap_kmeans.index = df_heatmap_kmeans.index+1

df_heatmap_kmeans.index = df_heatmap_kmeans.index.astype(int).astype(str)

# Plotting the heatmap
# NOTE(review): the table still contains the 'sum' column, which is on a larger
# scale than the individual therapies and dominates the colour range.
plt.figure(figsize=(32, 4))
sns.heatmap(df_heatmap_kmeans, annot=False, cmap='coolwarm')
plt.title('Typology of communities based on Therapies')
plt.xlabel('Méthodes thérapeutiques')
plt.ylabel('Community')
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist

# Per-group profiles without the 'sum' column, which is not part of the profile itself.
kmeans_profiles = df_heatmap_kmeans.drop(['sum'], axis=1)

# Condensed pairwise distance matrix between the group profiles
distance_matrix = pdist(kmeans_profiles)

# Ward hierarchical clustering on the condensed distances
Z = linkage(distance_matrix, method='ward')

# Label the leaves with the group labels from the index; without `labels`
# the dendrogram shows positional indices 0..n-1, which do not match the
# labels used in the heatmap.
dendrogram(Z, labels=kmeans_profiles.index.tolist())
plt.title('Hierarchical Clustering Dendrogram')
# Leaves (x) are the k-means groups; the height (y) is the linkage distance.
plt.xlabel('K-means group')
plt.ylabel('Ward linkage distance')
plt.show()

In [None]:
# Per-group mean of the 'Relaxation' column, smallest first.
df_heatmap_kmeans['Relaxation'].sort_values()

In [None]:
# Total PRESTATIONS_BRUTES over the rows whose therapy is 'Relaxation'.
df_prestation_lca_restricted.loc[
    df_prestation_lca_restricted.THERAPIES_SIMPLIFIED_SET == 'Relaxation',
    'PRESTATIONS_BRUTES',
].sum()

In [None]:
# Clustered heatmap of the group profiles ('sum' excluded); z_score=1 standardises each column.
sns.clustermap(
    df_heatmap_kmeans.drop(columns='sum'),
    z_score=1,
    cmap='viridis',
    figsize=(25, 12),
)
plt.show()

In [None]:
# Same clustered heatmap on the raw (non-standardised) group means.
sns.clustermap(
    df_heatmap_kmeans.drop(columns='sum'),
    cmap='viridis',
    figsize=(25, 12),
)
plt.show()

In [None]:
# Mean of the 'sum' column within each k-means group, smallest first.
df_tsne_scaled['sum'].groupby(df_tsne_scaled['kmeans_group']).mean().sort_values()

## K-means

In [None]:
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler
# # scaler = StandardScaler()
# # scaled_df = scaler.fit_transform(df_tsne)
# kmeans = KMeans(n_clusters=10, random_state=42)
# kmeans_labels = kmeans.fit_predict(scaled_df)

In [None]:
# Fit a seeded 4-cluster k-means on the two t-SNE coordinates and keep the
# label assigned to each row.
kmeans = KMeans(n_clusters=4, random_state=42).fit(tsne_df[['Dimension 1', 'Dimension 2']])
kmeans_labels = kmeans.labels_

In [None]:

# Candidate numbers of clusters (1 to 17 inclusive)
cluster_range = range(1, 18)

# Within-cluster sum of squares (inertia) for each candidate
inertia_list = []
for num_clusters in cluster_range:
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    kmeans.fit(tsne_df[['Dimension 1', 'Dimension 2']])
    inertia_list.append(kmeans.inertia_)

# Elbow curve: inertia against number of clusters
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, inertia_list, marker='o')
elbow_axes = plt.gca()
elbow_axes.set_xlabel('Number of Clusters')
elbow_axes.set_ylabel('SSE')
elbow_axes.set_title('Elbow Method for Optimal Number of Clusters')
elbow_axes.grid(True)
plt.show()

In [None]:
# Store the k-means labels as a column of the sampled table.
# NOTE(review): kmeans_labels was predicted on tsne_df; this assumes df_tsne_samp
# has the same rows in the same order - confirm.
df_tsne_samp['kmeans_group'] = kmeans_labels

In [None]:
# Scatter of the t-SNE embedding coloured by k-means label.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; it takes the same (name, lut) arguments.
plt.scatter(tsne_df['Dimension 1'], tsne_df['Dimension 2'], c=kmeans_labels, cmap=plt.get_cmap('tab10', 10), alpha=1)
plt.colorbar()
plt.title('t-SNE with K-means Clusters')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()

In [None]:
# Look up each row's community through community_dict (keyed by the index
# values), shift the ids by one, and store them as a categorical column.
community_ids_shifted = df_tsne_samp.index.map(community_dict) + 1
df_tsne_samp['community'] = community_ids_shifted
df_tsne_samp['community'] = df_tsne_samp['community'].astype('category')

In [None]:
# Mean 'PRESTATIONS_BRUTES' per k-means group and per community.
# numeric_only=True is required because df_tsne_samp now carries the categorical
# 'community' column: with the pandas >= 2.0 default (numeric_only=False),
# groupby().mean() raises on it instead of silently skipping it.
df_heatmap_kmeans = df_tsne_samp.groupby('kmeans_group').mean(numeric_only=True)['PRESTATIONS_BRUTES']
df_heatmap_community = df_tsne_samp.groupby('community').mean(numeric_only=True)['PRESTATIONS_BRUTES']

In [None]:
# Heatmap of the per-k-means-group means
plt.figure(figsize=(12, 4))
kmeans_heatmap_axes = sns.heatmap(df_heatmap_kmeans, annot=False, cmap='coolwarm')
kmeans_heatmap_axes.set_title('Typology of kmeans_group based on Therapies')
kmeans_heatmap_axes.set_xlabel('Therapies')
kmeans_heatmap_axes.set_ylabel('kmeans_group')
plt.show()

In [None]:
# Heatmap of the per-community means
plt.figure(figsize=(12, 4))
community_heatmap_axes = sns.heatmap(df_heatmap_community, annot=False, cmap='coolwarm')
community_heatmap_axes.set_title('Typology of communities based on Therapies')
community_heatmap_axes.set_xlabel('Therapies')
community_heatmap_axes.set_ylabel('Communities')
plt.show()

In [None]:
def get_top_therapies(group, n_top=3):
    """Return the index labels of the `n_top` largest values of `group`.

    Parameters
    ----------
    group : pandas.Series
        Values to rank; the index labels are what is returned.
    n_top : int, default 3
        Number of labels to keep from the top of the descending ranking.

    Returns
    -------
    list
        Index labels ordered from largest to smallest value.
    """
    ranked_labels = group.sort_values(ascending=False).index
    return ranked_labels[:n_top].tolist()

In [None]:
# Top therapies (default top 3) for every k-means group, keyed by group label.
top_therapies_dict = {
    group_label: get_top_therapies(df_heatmap_kmeans.loc[group_label])
    for group_label in df_heatmap_kmeans.index
}

top_therapies_dict

In [None]:
# Top therapies (default top 3) for every community, keyed by community label.
# This rebinds top_therapies_dict, replacing the per-k-means-group result.
top_therapies_dict = {
    community_label: get_top_therapies(df_heatmap_community.loc[community_label])
    for community_label in df_heatmap_community.index
}

top_therapies_dict

In [None]:
# Ratio-based ranking: each group's mean per therapy divided by the overall
# total for that therapy.
# The total is taken over the same 'PRESTATIONS_BRUTES' selection that
# df_heatmap_kmeans was built from, so both sides of the division share the
# same labels. Summing the whole of df_tsne_samp would also include the
# 'kmeans_group' / 'community' label columns.
# NOTE(review): assumes df_tsne_samp['PRESTATIONS_BRUTES'] yields one column per therapy - confirm.
total_spent_each_therapy = df_tsne_samp['PRESTATIONS_BRUTES'].sum(axis=0)

# Keep every group's result instead of overwriting it on each iteration.
top_ratio_therapies_dict = {}
for group in df_heatmap_kmeans.index:
    group_data = df_heatmap_kmeans.loc[group]
    ratio = group_data / total_spent_each_therapy
    top_therapies = get_top_therapies(ratio)
    top_ratio_therapies_dict[group] = top_therapies

top_ratio_therapies_dict

In [None]:
# Display the row of df_heatmap_kmeans left over from the last iteration of the loop above.
group_data

## Network of therapies and medicine - How do they collaborate?

In [None]:
# Load the preprocessed AOS table from the processed-data folder.
# NOTE(review): `read_data` is not defined in the visible part of this notebook,
# whose other loads use pd.read_parquet - confirm it exists on a fresh kernel.
df_prestation_aos = read_data(data_folder/'processed'/'df_prestation_aos_preprocessed.parquet.gzip')

In [None]:
# One row per DISCIPLINES_SIMPLIFIED_SET value, holding the set of uuids seen for it.
df1_lca = (
    df_prestation_lca_restricted
    .groupby('DISCIPLINES_SIMPLIFIED_SET')['uuid']
    .apply(set)
    .to_frame()
)

In [None]:
# Tag every LCA row as complementary medicine.
df1_lca = df1_lca.assign(med_type='Complementary')

In [None]:
# One row per SOUS_CATEGORIE_DISPENSATEUR value, holding the set of uuids seen for it.
df2_aos = (
    df_prestation_aos
    .groupby('SOUS_CATEGORIE_DISPENSATEUR')['uuid']
    .apply(set)
    .to_frame()
)

In [None]:
# Tag every AOS row as conventional medicine.
df2_aos = df2_aos.assign(med_type='Conventional')

In [None]:
# Stack the complementary (LCA) and conventional (AOS) tables; the index holds the
# category labels of both sources.
# NOTE(review): later cells use .loc on this index as if labels were unique across
# the two sources - confirm no label appears in both.
df_uuid_sets = pd.concat([df1_lca, df2_aos])

In [None]:
# Number of distinct uuids in each category's set.
df_uuid_sets['len_set'] = df_uuid_sets['uuid'].map(len)

In [None]:
# `combinations` is not imported in the visible import cell, so import it here.
from itertools import combinations

# Square matrix of Jaccard indices between the uuid sets of every pair of
# categories. Created as float so the result is numeric rather than object dtype.
jaccard_df = pd.DataFrame(index=df_uuid_sets.index, columns=df_uuid_sets.index, dtype=float)

# Jaccard index = |A & B| / |A | B| for every unordered pair of categories
for row1, row2 in combinations(df_uuid_sets.index, 2):
    set1 = df_uuid_sets.loc[row1, 'uuid']
    set2 = df_uuid_sets.loc[row2, 'uuid']
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    # Two empty sets have an empty union; report 0 instead of dividing by zero.
    jaccard_index = intersection / union if union else 0.0

    # The index is symmetric, so fill both triangles.
    jaccard_df.loc[row1, row2] = jaccard_index
    jaccard_df.loc[row2, row1] = jaccard_index

# A set is identical to itself, so the diagonal is 1. Iterate the matrix's own
# labels: the original looped over `df.index`, which is a different frame.
for row in df_uuid_sets.index:
    jaccard_df.loc[row, row] = 1.0

In [None]:
# Long-format edge list built from the square Jaccard matrix: one row per
# unordered pair of distinct categories.
melted_jaccard_df = pd.melt(jaccard_df.reset_index(), id_vars="index", var_name="Method_2", value_name="jaccard_index")

melted_jaccard_df = melted_jaccard_df[melted_jaccard_df.jaccard_index.notnull()].copy()

melted_jaccard_df.columns = ['cat_dispensateur_A', 'cat_dispensateur_B', 'jaccard_index']

# Put each pair in alphabetical order (A <= B). Both new columns are computed
# from the ORIGINAL values before either is written back; the previous
# two-step swap read the already-overwritten A column, so reversed pairs
# collapsed to (B, B).
original_a = melted_jaccard_df['cat_dispensateur_A']
original_b = melted_jaccard_df['cat_dispensateur_B']
already_ordered = original_a <= original_b
ordered_a = original_a.where(already_ordered, original_b)
ordered_b = original_b.where(already_ordered, original_a)
melted_jaccard_df['cat_dispensateur_A'] = ordered_a
melted_jaccard_df['cat_dispensateur_B'] = ordered_b

# Drop self-pairs (the matrix diagonal).
melted_jaccard_df = melted_jaccard_df[melted_jaccard_df.cat_dispensateur_A != melted_jaccard_df.cat_dispensateur_B]

# Keep one row per unordered pair. De-duplicate on the two ordered string
# columns: `set` objects are unhashable, so drop_duplicates cannot run on a
# column of sets.
melted_jaccard_df = melted_jaccard_df.drop_duplicates(subset=['cat_dispensateur_A', 'cat_dispensateur_B']).copy()

# Same helper columns as before, so downstream cells and CSV exports keep their layout.
melted_jaccard_df['set_cat'] = [
    {cat_a, cat_b}
    for cat_a, cat_b in zip(melted_jaccard_df['cat_dispensateur_A'], melted_jaccard_df['cat_dispensateur_B'])
]
melted_jaccard_df['len_set'] = melted_jaccard_df['set_cat'].map(len)

# Optional threshold on weak links, left disabled as in the original:
# melted_jaccard_df = melted_jaccard_df[melted_jaccard_df.jaccard_index > 0.05]

In [None]:
# Category label -> uuid-set size divided by 100, used later for node sizes.
node_weights = (df_uuid_sets['len_set'] / 100).to_dict()

In [None]:
def get_color(type_of_medicine):
    """Return the plot colour associated with a type of medicine.

    'Conventional' maps to 'red', 'Complementary' to 'blue' and 'Type3' to
    'green'; any other value (including None) maps to 'gray'.
    """
    palette = dict(Conventional='red', Complementary='blue', Type3='green')
    try:
        return palette[type_of_medicine]
    except KeyError:
        # Unknown type: fall back to the neutral colour.
        return 'gray'


In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})

# Undirected graph: nodes are provider categories, edges carry the Jaccard
# index between their uuid sets as 'weight'.
G = nx.Graph()

# Nodes: every category that appears on either side of a pair
nodes = set(melted_jaccard_df['cat_dispensateur_A']).union(set(melted_jaccard_df['cat_dispensateur_B']))
G.add_nodes_from(nodes)
node_sizes = [node_weights[node] / 10 for node in G.nodes()]
nx.set_node_attributes(G, df_uuid_sets['med_type'].to_dict(), 'med_type')
node_colors = [get_color(G.nodes[node].get('med_type', None)) for node in G.nodes()]

# Edges: (A, B, jaccard_index) triples
edges = list(
    melted_jaccard_df[['cat_dispensateur_A', 'cat_dispensateur_B', 'jaccard_index']]
    .itertuples(index=False, name=None)
)
G.add_weighted_edges_from(edges)

# Edge widths are read back in G.edges() order, which is the order networkx
# draws in. The dataframe row order differs, so passing the column directly
# put widths on the wrong edges.
edge_widths = [weight / 10 for _, _, weight in G.edges(data='weight')]

# Seeded layout so the figure is reproducible.
pos = nx.spring_layout(G, k=0.1, iterations=100, weight='weight', seed=42)
# pos must be passed explicitly; otherwise nx.draw computes its own default
# layout and the k / iterations / weight settings above are ignored.
nx.draw(G, pos=pos, with_labels=True, font_size=1, node_color=node_colors, width=edge_widths, edge_color='gray', alpha=0.7, node_size=node_sizes)

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])
plt.savefig(output_folder/'Network analyses/Collaboration - Conventional vs Complementary.png', format = 'png', dpi = 1200, bbox_inches = 'tight')

plt.show()

In [None]:
# Pairs in which either side is the 'Methodes de massage' category.
melted_jaccard_df_massage = melted_jaccard_df[
    melted_jaccard_df[['cat_dispensateur_A', 'cat_dispensateur_B']]
    .eq('Methodes de massage')
    .any(axis=1)
]

In [None]:
# Number of distinct uuids in the AOS table.
df_prestation_aos['uuid'].nunique()

In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})

# Undirected graph restricted to the pairs that involve 'Methodes de massage';
# edges carry the Jaccard index as 'weight'.
G = nx.Graph()

# Nodes: every category that appears on either side of a pair
nodes = set(melted_jaccard_df_massage['cat_dispensateur_A']).union(set(melted_jaccard_df_massage['cat_dispensateur_B']))
G.add_nodes_from(nodes)
node_sizes = [node_weights[node] / 10 for node in G.nodes()]
nx.set_node_attributes(G, df_uuid_sets['med_type'].to_dict(), 'med_type')
node_colors = [get_color(G.nodes[node].get('med_type', None)) for node in G.nodes()]

# Edges: (A, B, jaccard_index) triples
edges = list(
    melted_jaccard_df_massage[['cat_dispensateur_A', 'cat_dispensateur_B', 'jaccard_index']]
    .itertuples(index=False, name=None)
)
G.add_weighted_edges_from(edges)

# Edge widths are read back in G.edges() order, which is the order networkx
# draws in. The dataframe row order differs, so passing the column directly
# put widths on the wrong edges.
edge_widths = [weight * 10 for _, _, weight in G.edges(data='weight')]

# Seeded layout so the figure is reproducible.
pos = nx.spring_layout(G, k=5, iterations=100, weight='weight', seed=42)
# pos must be passed explicitly; otherwise nx.draw computes its own default
# layout and the k / iterations / weight settings above are ignored.
nx.draw(G, pos=pos, with_labels=True, font_size=3, node_color=node_colors, width=edge_widths, edge_color='gray', alpha=0.7, node_size=node_sizes)

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])
plt.savefig(output_folder/'Network analyses/Collaboration - Conventional vs Complementary - Massages.pdf', format = 'pdf', bbox_inches = 'tight')

plt.show()

In [None]:
# Pairs in which either category label contains 'onco' or 'Onco', weakest link first.
melted_jaccard_df_onco = (
    melted_jaccard_df
    .loc[
        lambda pairs: pairs.cat_dispensateur_B.str.contains('onco|Onco')
        | pairs.cat_dispensateur_A.str.contains('onco|Onco')
    ]
    .sort_values('jaccard_index')
)

In [None]:
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update({'figure.figsize': (15, 10)})

# Undirected graph restricted to the pairs that involve an 'onco' category;
# edges carry the Jaccard index as 'weight'.
G = nx.Graph()

# Nodes: every category that appears on either side of a pair
nodes = set(melted_jaccard_df_onco['cat_dispensateur_A']).union(set(melted_jaccard_df_onco['cat_dispensateur_B']))
G.add_nodes_from(nodes)
node_sizes = [node_weights[node] / 10 for node in G.nodes()]
nx.set_node_attributes(G, df_uuid_sets['med_type'].to_dict(), 'med_type')
node_colors = [get_color(G.nodes[node].get('med_type', None)) for node in G.nodes()]

# Edges: (A, B, jaccard_index) triples
edges = list(
    melted_jaccard_df_onco[['cat_dispensateur_A', 'cat_dispensateur_B', 'jaccard_index']]
    .itertuples(index=False, name=None)
)
G.add_weighted_edges_from(edges)

# Edge widths are read back in G.edges() order, which is the order networkx
# draws in. The dataframe row order differs, so passing the column directly
# put widths on the wrong edges.
edge_widths = [weight * 10 for _, _, weight in G.edges(data='weight')]

# Seeded layout so the figure is reproducible.
pos = nx.spring_layout(G, k=5, iterations=100, weight='weight', seed=42)
# pos must be passed explicitly; otherwise nx.draw computes its own default
# layout and the k / iterations / weight settings above are ignored.
nx.draw(G, pos=pos, with_labels=True, font_size=3, node_color=node_colors, width=edge_widths, edge_color='gray', alpha=0.7, node_size=node_sizes)

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])
plt.savefig(output_folder/'Network analyses/Collaboration - Conventional vs Complementary - Onco.pdf', format = 'pdf', bbox_inches = 'tight')

plt.show()

In [None]:
# Pairs in which at least one side is a complementary (LCA) category, i.e. a label of df1_lca's index.
melted_jaccard_df_complementary = melted_jaccard_df.loc[
    lambda pairs: pairs.cat_dispensateur_A.isin(df1_lca.index)
    | pairs.cat_dispensateur_B.isin(df1_lca.index)
]

In [None]:
# Display the complementary pairs ordered by increasing Jaccard index (the result is not stored).
melted_jaccard_df_complementary.sort_values('jaccard_index')

In [None]:
# Keep only the links whose Jaccard index is above 0.001.
melted_jaccard_df_complementary = melted_jaccard_df_complementary.loc[
    lambda pairs: pairs.jaccard_index > 0.001
]

In [None]:
# Undirected graph restricted to the pairs that involve a complementary
# category; edges carry the Jaccard index as 'weight'.
G = nx.Graph()

# Nodes: every category that appears on either side of a pair
nodes = set(melted_jaccard_df_complementary['cat_dispensateur_A']).union(set(melted_jaccard_df_complementary['cat_dispensateur_B']))
G.add_nodes_from(nodes)
node_sizes = [node_weights[node] / 10 for node in G.nodes()]
nx.set_node_attributes(G, df_uuid_sets['med_type'].to_dict(), 'med_type')
node_colors = [get_color(G.nodes[node].get('med_type', None)) for node in G.nodes()]

# Edges: (A, B, jaccard_index) triples
edges = list(
    melted_jaccard_df_complementary[['cat_dispensateur_A', 'cat_dispensateur_B', 'jaccard_index']]
    .itertuples(index=False, name=None)
)
G.add_weighted_edges_from(edges)

# Edge widths are read back in G.edges() order, which is the order networkx
# draws in. The dataframe row order differs, so passing the column directly
# put widths on the wrong edges.
edge_widths = [weight for _, _, weight in G.edges(data='weight')]

# Seeded layout so the figure is reproducible.
pos = nx.spring_layout(G, k=5, iterations=100, weight='weight', seed=42)
# pos must be passed explicitly; otherwise nx.draw computes its own default
# layout and the k / iterations / weight settings above are ignored.
nx.draw(G, pos=pos, with_labels=True, font_size=3, node_color=node_colors, width=edge_widths, edge_color='gray', alpha=0.7, node_size=node_sizes)

# Adjust plot limits to account for label spacing
xmin, xmax, ymin, ymax = plt.axis()
plt.axis([xmin - 0.1, xmax + 0.1, ymin - 0.1, ymax + 0.1])
# Save under the `output_folder` path variable, like the other network figures;
# the previous string literal pointed at a directory literally named
# 'output_folder' relative to the working directory.
plt.savefig(output_folder/'Network analyses/Collaboration - Conventional vs Complementary - Complementary links.pdf', format = 'pdf', bbox_inches = 'tight')

plt.show()

In [None]:
# Export the node table (medicine type and uuid-set size per category).
# Written under the `output_folder` path variable, like the network figures; the
# previous string literal pointed at a directory literally named 'output_folder'.
df_uuid_sets[['med_type','len_set']].to_csv(output_folder/'Network analyses/df_nodes_complementary_collab.csv', sep = ';')

In [None]:
# Export the edge table of complementary links.
# Written under the `output_folder` path variable, like the network figures; the
# previous string literal pointed at a directory literally named 'output_folder'.
melted_jaccard_df_complementary.to_csv(output_folder/'Network analyses/df_network_complementary_collab.csv', sep=';', index=False)

## Geographically weighted k-means