In [None]:
import os
import glob
import sys
from pathlib import Path
import psycopg2

os.environ['USE_PYGEOS'] = '0'  
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import geoplot
import geoplot.crs as gcrs
import rasterio
from rasterio.mask import mask
from rasterio.plot import show
import fiona
import libpysal as lps
from scipy.spatial import cKDTree
from libpysal.weights.distance import get_points_array
from esda import fdr
import contextily as ctx
from shapely.geometry import Point, Polygon

# NOTE(review): absolute, machine-specific path — the notebook only runs where this
# folder exists; consider a location relative to the repository instead.
sys.path.append('/Users/david/Dropbox/PhD/Scripts/Spatial analyses')
# Project modules (presumably located in the folder added to sys.path above — confirm)
import pyspace
import utils
from utils import optimize_memory_df, feature_map, show_values, sizeof_fmt, find_intersection, read_data

# Standard-library helper to re-import a module after editing it
from importlib import reload  # NOTE(review): not used in the visible part of this notebook — confirm it is still needed

plt.rc('font', family='Helvetica')  # Plot configuration (font, seaborn theme and context) is kept together here
sns.set_theme(style="white")
sns.set_context("paper")

In [None]:
# Base folders (relative paths): input data and generated results
data_folder = Path('../Data/')
result_folder = Path('../Results')

# Aims of the notebook

## LCA

### Multiple therapies problem

For each patient-month pair, we have the amount spent per therapist, but a therapist can have multiple specialties, so an amount cannot be attributed directly to a single therapy.

- The aggregation based on string similarity improved the situation by reducing the total number of distinct therapies (it collapses the many therapies that differ only in spelling or wording).


In [None]:
# Load the preprocessed LCA "prestation" table through the project's `read_data` helper
df_prestation_lca = read_data(
    data_folder / 'processed' / 'df_prestation_lca_preprocessed.parquet.gzip'
)

### Multiple therapies problem

Now solved and included in the main Feature engineering notebook


In [None]:
# Therapist ("dispensateur") tables saved by a previous run.
# Both names are recomputed further down in this notebook, so these loaded versions
# are only the ones in use until those cells run.
# NOTE(review): the two file names differ ("dispensateurs" vs "dispensateur") — confirm both are intended.
df_dispensateurs_lca_nodupli = pd.read_parquet(
    data_folder / 'processed' / 'df_dispensateurs_lca_nodupli.parquet.gzip'
)

df_dispensateurs_lca_exploded = pd.read_parquet(
    data_folder / 'processed' / 'df_dispensateur_lca_exploded.parquet.gzip'
)


In [None]:
# Inspect the therapist table loaded from parquet
df_dispensateurs_lca_nodupli

In [None]:
# Attach each therapist's simplified therapy set and number of therapies to the
# prestation table (left join: rows without a matching therapist get NaN).
# The merged-in columns are dropped first when already present, so re-running this
# cell does not produce suffixed duplicates (`*_x` / `*_y`).
_therapist_cols = ['THERAPIES_SIMPLIFIED_SET', 'n_therapies']
df_prestation_lca = pd.merge(
    df_prestation_lca.drop(columns=_therapist_cols, errors='ignore'),
    df_dispensateurs_lca_nodupli[['ID_DISPENSATEUR'] + _therapist_cols],
    on='ID_DISPENSATEUR',
    how='left',
)


In [None]:
# Number of distinct therapists in the therapist table
print("Nombre de thérapeutes = ", df_dispensateurs_lca_nodupli['ID_DISPENSATEUR'].nunique())

In [None]:
# Number of distinct therapists with more than one therapy type
print(
    "Nombre de thérapeutes avec > 1 type de thérapie =",
    df_dispensateurs_lca_nodupli.loc[
        df_dispensateurs_lca_nodupli['n_therapies'] > 1, 'ID_DISPENSATEUR'
    ].nunique(),
)

On a donc à peu près 50% des dispensateurs qui posent problème...

In [None]:
# Rebuild the therapist table from the prestations: one row per distinct
# combination of the selected therapist-level columns
df_dispensateurs_lca = df_prestation_lca.loc[
    :,
    ['ID_DISPENSATEUR', 'CODES_THERAPIES', 'THERAPIES', 'TXGENREFRAISLGFR', 'CATEGORIE_DISPENSATEUR'],
].drop_duplicates()

In [None]:
# Therapy ontology table (columns used below: 'Code', 'therapie_lvl2', 'Methode').
# The path is built from `data_folder`, like the other inputs, instead of a
# hard-coded string; the file is read as ISO-8859-1.
filtered_therap_clean = pd.read_csv(
    data_folder / 'processed' / '20230223_Therapies_ontology.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Number of distinct level-2 therapy labels in the ontology (missing values not counted)
filtered_therap_clean['therapie_lvl2'].nunique(dropna=True)

In [None]:
# Normalise the ontology codes: remove every space character, then trim any
# remaining surrounding whitespace
filtered_therap_clean['Code'] = (
    filtered_therap_clean['Code']
    .str.replace(' ', '', regex=False)
    .str.strip()
)

In [None]:
# Code -> level-2 therapy label (for a duplicated code, the last row wins)
dict_ontology_lca = dict(
    zip(filtered_therap_clean['Code'], filtered_therap_clean['therapie_lvl2'])
)

In [None]:
# Code -> 'Methode' value (for a duplicated code, the last row wins)
dict_ontology_lca_by_disc = dict(
    zip(filtered_therap_clean['Code'], filtered_therap_clean['Methode'])
)

On trouve la liste complète des codes EMR ici : https://www.rme.ch/dl/documents/rme-reglement-8-methodes.pdf  
On trouve la liste complète des codes ASCA ici : http://www.asca.ch/therapies.aspx

In [None]:
# dict_ontology_lca['EMR-235'] = "Art-thérapeute avec diplôme fédéral Spécialisation Thérapie intermédiale"
# dict_ontology_lca['EMR-231'] = "Art-thérapeute avec diplôme fédéral Spécialisation Thérapie par le mouvement et la danse"
# dict_ontology_lca['EMR-43'] = "Thérapie Dorn, qualification supplémentaire"
# dict_ontology_lca['EMR-232'] = "Art-thérapeute avec diplôme fédéral Spécialisation Thérapie par le drame et la parole"
# dict_ontology_lca['EMR-233'] = "Art-thérapeute avec diplôme fédéral Spécialisation Musicothérapie"
# dict_ontology_lca['EMR-234'] = "Art-thérapeute avec diplôme fédéral Spécialisation Thérapie à médiation plastique et visuelle"
# dict_ontology_lca['EMR-128'] = 'Musicothérapie, anthroposophique'
# dict_ontology_lca['EMR-4221'] = "Certificat de branche OrTra TC - méthode Massage Rythmique Thérapeutique"
# dict_ontology_lca['EMR-4201'] = "Certificat de branche OrTra TC - méthode Thérapie Biodynamique"
# dict_ontology_lca['EMR-4211'] = "Certificat de branche OrTra TC - méthode Fasciathérapie"
# dict_ontology_lca['EMR-177'] = "Art de la parole thérapeutique, anthroposophique"
# dict_ontology_lca['EMR-216'] = "Thérapie de développement et de l‘apprentissage selon PäPKi pour enfants d'âge préscolaire et scolaire"
# dict_ontology_lca['EMR-215'] = "Thérapie de développement et de l‘apprentissage selon PäPKi pour les nourrissons et les jeunes enfants"
# dict_ontology_lca['EMR-32'] = "Thérapie par le mouvement, intégral/clinique"

In [None]:
def simplify_ontology(x):
    """Map a collection of therapy codes to their simplified ontology labels.

    Each code is stripped of surrounding whitespace and looked up in the
    module-level ``dict_ontology_lca`` mapping; a code that is not in the
    mapping is kept as is (stripped).

    Parameters
    ----------
    x : iterable of str, or a missing value
        Therapy codes of one row.

    Returns
    -------
    list or float
        The labels in the same order as the input codes, or ``np.nan`` when
        ``x`` is not an iterable of strings (e.g. NaN for a row without codes).
    """
    try:
        codes = [code.strip() for code in x]
    except (TypeError, AttributeError):
        # TypeError: x is not iterable (NaN); AttributeError: an element is not a string.
        # Only these bad-input cases become NaN: any other error (for instance a
        # missing ontology dictionary) is a real bug and must not be hidden.
        return np.nan
    return [dict_ontology_lca.get(code, code) for code in codes]

In [None]:
# Turn the comma-separated code string of each row into a list of codes.
# Only strings are split, so re-running this cell leaves already-split lists
# untouched (`.str.split` applied to a list would replace it with NaN).
df_dispensateurs_lca['CODES_THERAPIES'] = df_dispensateurs_lca['CODES_THERAPIES'].apply(
    lambda codes: codes.split(',') if isinstance(codes, str) else codes
)

def strip_if_list(item):
    """Strip surrounding whitespace from every element of a list.

    Parameters
    ----------
    item : list of str, or any other object
        Value of one cell, e.g. the list of therapy codes of a row.

    Returns
    -------
    list of str, or the input unchanged
        A new list with each element stripped on both sides when ``item`` is
        a list; any non-list value (e.g. NaN) is returned as is.
    """
    if isinstance(item, list):
        # Strip both ends: a trailing space would break exact-match lookups
        # on the code just like a leading one.
        return [str_elem.strip() for str_elem in item]
    return item

# Clean the whitespace of every code list, element by element
df_dispensateurs_lca['CODES_THERAPIES'] = (
    df_dispensateurs_lca['CODES_THERAPIES'].map(strip_if_list)
)

In [None]:
# Translate each row's list of codes into simplified therapy labels.
# Applied to the column directly: a row-wise `DataFrame.apply(axis=1)` is slower and
# does not return a Series when the frame is empty.
df_dispensateurs_lca['THERAPIES_SIMPLIFIED'] = df_dispensateurs_lca['CODES_THERAPIES'].apply(simplify_ontology)

In [None]:
# Every distinct therapy code found across all therapists (one entry per code)
unique_codes_lca = pd.unique(df_dispensateurs_lca['CODES_THERAPIES'].explode())

In [None]:
# Collect the codes that have no entry in the ontology dictionary.
# An explicit membership test replaces the former bare `except`, which also hid
# unrelated errors. Non-string entries (e.g. NaN from rows without codes) cannot
# be looked up and are reported as missing, as before.
codes_lca_manquants = []
for code_lca in unique_codes_lca:
    if isinstance(code_lca, str) and code_lca.strip() in dict_ontology_lca:
        print(dict_ontology_lca[code_lca.strip()])
    else:
        print('Code manquant', code_lca)
        codes_lca_manquants.append(code_lca)

In [None]:
# Number of therapy codes without an entry in the ontology dictionary
len(codes_lca_manquants)

In [None]:
# One row per (therapist row, therapy): the code list and the label list are
# exploded together so that each code stays next to its simplified label
df_dispensateurs_lca_exploded = df_dispensateurs_lca.explode(
    column=['CODES_THERAPIES', 'THERAPIES_SIMPLIFIED']
)

In [None]:
# Look up the 'Methode' value of each exploded therapy code.
# Codes are stripped first, as `simplify_ontology` does before its own lookup, so
# that stray surrounding whitespace does not turn a known code into a missing value.
df_dispensateurs_lca_exploded['discipline_therap'] = (
    df_dispensateurs_lca_exploded['CODES_THERAPIES']
    .map(lambda code: code.strip() if isinstance(code, str) else code)
    .map(dict_ontology_lca_by_disc)
)

In [None]:
# Inspect the exploded table, now including the `discipline_therap` column
df_dispensateurs_lca_exploded

In [None]:
# Number of distinct therapists with a simplified therapy containing 'Sangsues'
df_dispensateurs_lca_exploded.loc[
    df_dispensateurs_lca_exploded['THERAPIES_SIMPLIFIED'].str.contains('Sangsues', na=False),
    'ID_DISPENSATEUR',
].nunique()

In [None]:
# Rows whose simplified therapy contains 'Sangsues' (missing labels are excluded)
df_dispensateurs_lca_exploded.loc[
    df_dispensateurs_lca_exploded['THERAPIES_SIMPLIFIED'].str.contains('Sangsues', na=False)
]

In [None]:
def set_or_nan(x):
    """Return ``set(x)``, or NaN when ``x`` is a float.

    Floats (in practice the NaN of a missing value) cannot be turned into a
    set, so they are all mapped to a fresh ``float('nan')``.
    """
    return float('nan') if isinstance(x, float) else set(x)

# De-duplicate the labels of each row: list -> set (float/NaN cells stay NaN)
df_dispensateurs_lca['THERAPIES_SIMPLIFIED_SET'] = (
    df_dispensateurs_lca['THERAPIES_SIMPLIFIED'].map(set_or_nan)
)


In [None]:
def _union_of_sets(values):
    """Merge every set found in `values` into one set, skipping non-set entries (e.g. NaN)."""
    return set().union(*(v for v in values if isinstance(v, set)))


# One row per therapist, holding the union of all its simplified therapy sets
df_dispensateurs_lca_nodupli = (
    df_dispensateurs_lca
    .groupby('ID_DISPENSATEUR')
    .agg({'THERAPIES_SIMPLIFIED_SET': _union_of_sets})
)


In [None]:
# Add the size of each therapist's therapy set, then move ID_DISPENSATEUR
# from the index back to a regular column
df_dispensateurs_lca_nodupli = (
    df_dispensateurs_lca_nodupli
    .assign(n_therapies=lambda d: d['THERAPIES_SIMPLIFIED_SET'].map(len))
    .reset_index()
)


In [None]:
# Sum of PRESTATIONS_BRUTES per therapist, looked up by ID_DISPENSATEUR
# (therapists absent from the prestation table get NaN)
df_dispensateurs_lca_nodupli['PRESTATION_SUM'] = df_dispensateurs_lca_nodupli['ID_DISPENSATEUR'].map(
    df_prestation_lca.groupby('ID_DISPENSATEUR')['PRESTATIONS_BRUTES'].sum()
)

In [None]:
# Check whether dispensateurs with a lot of therapy types represent important amounts

In [None]:
# Therapists left with an empty therapy set, ordered by their total amount
df_dispensateurs_lca_nodupli.loc[
    df_dispensateurs_lca_nodupli['n_therapies'] == 0
].sort_values(by='PRESTATION_SUM')

In [None]:
# Rows whose simplified therapy is exactly 'Massage classique'
df_dispensateurs_lca_exploded.loc[
    df_dispensateurs_lca_exploded['THERAPIES_SIMPLIFIED'].eq('Massage classique')
]

In [None]:
# Bar chart of the 20 most frequent simplified therapies, each therapist being
# counted once per therapy
(
    df_dispensateurs_lca_exploded
    .drop_duplicates(subset=['ID_DISPENSATEUR', 'THERAPIES_SIMPLIFIED'])
    ['THERAPIES_SIMPLIFIED']
    .value_counts()
    .head(20)
    .plot.bar()
)

In [None]:
# For each therapist, the set of `discipline_therap` values found on its rows
therapies_serie = (
    df_dispensateurs_lca_exploded
    .groupby('ID_DISPENSATEUR')['discipline_therap']
    .apply(set)
)

In [None]:
# Inspect the exploded therapist table again
df_dispensateurs_lca_exploded

In [None]:
# Therapists with exactly two distinct simplified therapies
df_dispensateurs_lca_nodupli.loc[df_dispensateurs_lca_nodupli['n_therapies'] == 2]

In [None]:

# Therapists whose therapy set is exactly these two massage therapies
# (each cell is compared to the target set, element by element)
df_dispensateurs_lca_nodupli[
    df_dispensateurs_lca_nodupli['THERAPIES_SIMPLIFIED_SET'].map(
        lambda therapies: therapies == {'Massage classique', 'Massage therapeutique'}
    )
]

In [None]:
import itertools
from collections import Counter

# Count, over all therapists, how often each unordered pair of disciplines
# appears together in one therapist's set.
co_occurrences = Counter()

# Loop through each set in the series
for s in therapies_serie:
    # Ignore None sets
    if s is None:
        continue

    # A set has no defined iteration order, so the same pair could come out as
    # (a, b) for one therapist and (b, a) for another and be counted under two
    # different keys. Sorting gives every pair a single canonical key.
    # Missing disciplines (NaN from unmapped codes) are dropped: they are not real nodes.
    disciplines = sorted((d for d in s if pd.notna(d)), key=str)
    co_occurrences.update(itertools.combinations(disciplines, 2))

In [None]:
# Edge list of the co-occurrence graph: one row per pair, columns
# 'weight' (number of co-occurrences), 'node1' and 'node2'.
# Built directly from records so that an empty counter gives an empty frame with
# the expected columns instead of failing on the column split.
df = pd.DataFrame(
    [(count, node1, node2) for (node1, node2), count in co_occurrences.items()],
    columns=['weight', 'node1', 'node2'],
)

# Filter out rows where node1 or node2 is None
df = df[df['node1'].notnull() & df['node2'].notnull()]


In [None]:
# The 50 heaviest pairs, in increasing order of weight
df.sort_values(by='weight').tail(50)