In [None]:
# Required Libraries
import pandas as pd
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects
from statsmodels.datasets import grunfeld
import statsmodels.api as sm
import statsmodels.formula.api as smf
from utils import read_data, plot_covariate_distributions, plot_match, compare_balance, sizeof_fmt, optimize_memory_df, plot_categorical_proportional_diff, compute_mean_differences_and_proportions, love_plot, sensitivity_analysis_k_neighbors
import spreg
import libpysal
import numpy as np

## Data filtering

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

fig, ax = plt.subplots()

# Cohort flow chart: four stacked boxes, created and added in the original order
# (rect4 is the lowest box of the chart).
rect1, rect2, rect3, rect4 = (
    mpatches.Rectangle((0, bottom), 0.3, 0.2, alpha=0.3, edgecolor='black', facecolor='white')
    for bottom in (0.1, 0.4, 0.7, -0.2)
)
for box in (rect1, rect2, rect3, rect4):
    ax.add_patch(box)

# Per-year sample sizes shown inside the second box
yearly_counts = """
2017    (n=215263)
2018    (n=199964)
2019    (n=191651)
2020    (n=190652)
2021    (n=191133)"""

# (y position, caption, font size), listed from the top of the chart to the bottom
captions = [
    (0.775, 'Raw yearly data\n(n = 260,551/year)', 9),
    (0.52, """Excluding individuals not part of treatment/control\n within a year""", 6),
    (0.4, yearly_counts, 5),
    (0.135, 'Excluding individuals not part of treatment/control\n for the 5-year period\n(n=143,765)', 9),
    (-0.16, 'Excluding individuals not in the PSM matched\n baseline (2017) dataset\n(n=125,077)', 9),
]
for y_pos, caption, font_size in captions:
    ax.text(0.15, y_pos, caption, color='black', size=font_size, horizontalalignment='center')

# One downward arrow between each pair of consecutive boxes
for arrow_start in (0.7125, 0.4125, 0.11):
    ax.arrow(0.15, arrow_start, 0.0, -0.07, head_width=0.005, head_length=0.03,
             fc='k', ec='k', head_starts_at_zero=True)

# The chart has no meaningful axes
ax.axis('off')

plt.show()

## Data preparation

In [None]:
# Load the full processed dataset, then keep only the three insurance groups of interest.
data = read_data("../Data/processed/full_dataset_nonull.parquet.gzip")
groups_of_interest = ['LCA & AOS','AOS only','LCA only']
data_final = data[data['gp'].isin(groups_of_interest)]

In [None]:
# NOTE(review): disabled loaders for the per-year PSM-matched datasets. In the visible
# notebook, df_matched_2017 is only ever assigned here, yet the "With replication" cells
# near the end read it, so those cells cannot run on a fresh kernel. The paths are also
# absolute to one machine; a path relative to the repository would be portable.
# df_matched_2017 = pd.read_parquet('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2017_filtered.parquet.gzip')
# df_matched_2018 = pd.read_parquet('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2018_filtered.parquet.gzip')
# df_matched_2019 = pd.read_parquet('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2019_filtered.parquet.gzip')
# df_matched_2020 = pd.read_parquet('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2020_filtered.parquet.gzip')
# df_matched_2021 = pd.read_parquet('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2021_filtered.parquet.gzip')

In [None]:
# NOTE(review): disabled construction of df_matched_full (the yearly matched frames
# stacked with a `year` column). The "Number of years treated" section further down
# reads df_matched_full, which is not assigned anywhere else in the visible notebook.
# dfs = {
#     2017: df_matched_2017,
#     2018: df_matched_2018,
#     2019: df_matched_2019,
#     2020: df_matched_2020,
#     2021: df_matched_2021,
# }

# df_matched_full = pd.concat(
#     [df.assign(year=year) for year, df in dfs.items()],
#     ignore_index=True
# )

In [None]:
# df_matched_2017_filtered = read_data('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2017_filtered.parquet.gzip')
# df_matched_2018_filtered = read_data('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2018_filtered.parquet.gzip')
# df_matched_2019_filtered = read_data('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2019_filtered.parquet.gzip')
# df_matched_2020_filtered = read_data('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2020_filtered.parquet.gzip')
# df_matched_2021_filtered = read_data('/Users/david/Dropbox/PhD/GitHub/SanteIntegra/Data/processed/PSM/df_matched_2021_filtered.parquet.gzip')

In [None]:
# del df_matched_2017_filtered, df_matched_2018_filtered, df_matched_2019_filtered, df_matched_2020_filtered, df_matched_2021_filtered

In [None]:
del data

### Filter out individuals that belong to neither the treatment nor the control group

In [None]:
# Keep only rows with a treatment/control label, then shrink dtypes to save memory.
has_treatment_label = data_final['treatment'].notna()
df_treated = optimize_memory_df(data_final[has_treatment_label])

### Filter out individuals that are not in the treatment or control group in all 5 years

In [None]:
# All years present in the labelled data
unique_years = set(df_treated['NOANNEE'])

# Keep only individuals observed in every year. A group's years are always a subset of
# unique_years, so "same set of years" is equivalent to "same number of distinct years".
# The vectorised transform avoids calling a Python lambda once per individual, which is
# what made groupby(...).filter(...) slow on this many groups.
years_per_uuid = df_treated.groupby('uuid')['NOANNEE'].transform('nunique')
df_treated_filtered = df_treated[years_per_uuid == len(unique_years)]

In [None]:
df_treated_filtered.groupby(['NOANNEE','treatment']).uuid.nunique()

In [None]:
# df_treated is intentionally kept in memory: the uuid_int mapping and the MixedLM
# cells further down still read it, and deleting it here made them fail with a
# NameError on "Restart & Run All". Delete it after those cells if memory is tight.

### Associate baseline (2017) PSM dataset to the following 4 years

1. One approach is to simply concatenate the following years without replicating the duplicated individuals created by the PSM of the baseline year.
2. Another approach is to replicate the duplication pattern created by the PSM in each of the 4 following years, then concatenate the baseline with the following years.

FYI: there is an interesting package that seems to be able to conduct PSM on a continuous treatment variable (TODO: add the package name and link).

#### Approach 1 : Without replication

In [None]:
# NOTE(review): disabled construction of df_did_wo_replication (matched 2017 baseline
# stacked with the following years, plus `time`, `interaction` and `const` columns).
# The "Difference-in-differences model" cells further down read df_did_wo_replication,
# which is not assigned anywhere else in the visible notebook.
# df_matched_2017_filtered['NOANNEE'] = 2017

# df_did_wo_replication = pd.concat([df_matched_2017_filtered[['uuid','NOANNEE','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_AOS','treatment']],df_treated_filtered[(df_treated_filtered.NOANNEE != 2017)][['uuid','NOANNEE','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_AOS','treatment']]]) 

# df_did_wo_replication = df_did_wo_replication.sort_values(['uuid', 'NOANNEE'])

# df_did_wo_replication['time'] = df_did_wo_replication['NOANNEE'] - 2017

# # Create an interaction term between the year and treatment variables.
# df_did_wo_replication['interaction'] = df_did_wo_replication['time'] * df_did_wo_replication['treatment']

# # Add constant term for intercept
# df_did_wo_replication['const'] = 1

## Panel data models - Toy example

In [None]:
# Loading Grunfeld Investment data
data = grunfeld.load_pandas().data

In [None]:
# Keep the year as a categorical column: set_index() below moves `year` into the index.
year = pd.Categorical(data.year)

# Preparing data for panel model: linearmodels expects an (entity, time) MultiIndex
data = data.set_index(['firm','year'])
data['year'] = year

# Pooled OLS model: PanelOLS without any effects term. The previous formula contained
# EntityEffects, which made this fit identical to the fixed-effects model below.
# linearmodels formulas do not add an intercept implicitly, hence the explicit `1 +`.
pols = PanelOLS.from_formula('invest ~ 1 + value + capital', data=data)
pols_result = pols.fit()
print(pols_result)

# Fixed effects model: one intercept per firm, absorbed through EntityEffects
fem = PanelOLS.from_formula('invest ~ value + capital + EntityEffects', data=data)
fem_result = fem.fit()
print(fem_result)

# Random effects model: firm effects are modelled as random, so the common
# intercept has to be requested explicitly as well.
rem = RandomEffects.from_formula('invest ~ 1 + value + capital', data=data)
rem_result = rem.fit()
print(rem_result)

In [None]:
a = [1,2,3,4,6,6,6]

In [None]:
a.count(max(a))

In [None]:
import random

# Fixed seed so the toy dataset is identical on every "Restart & Run All"
SEED = 42
random.seed(SEED)

# Generate random sample data
n_individuals = 10  # Number of individuals
n_years = 5  # Number of years

# Create lists to hold the data: one row per (individual, year), individuals in blocks
uuids = [f"indiv_{i+1}" for i in range(n_individuals) for _ in range(n_years)]
years = [year for _ in range(n_individuals) for year in range(2017, 2017 + n_years)]
treatments = [random.choice([0, 1]) for _ in range(n_individuals * n_years)]
# Complementary (LCA) spending is only drawn for treated rows; untreated rows get 0
prestations_LCA = [random.uniform(0, 500) if t == 1 else 0 for t in treatments]
prestations_AOS = [random.uniform(1000, 5000) for _ in range(n_individuals * n_years)]
ages = [random.randint(30, 60) for _ in range(n_individuals * n_years)]
sexes = [random.choice(['M', 'F']) for _ in range(n_individuals * n_years)]
deductibles = [random.randint(200, 1000) for _ in range(n_individuals * n_years)]
zipcodes = [random.randint(1000, 9999) for _ in range(n_individuals * n_years)]

# Create the dataframe
sample_df = pd.DataFrame({
    'uuid': uuids,
    'NOANNEE': years,
    'treatment': treatments,
    'PRESTATIONS_BRUTES_LCA': prestations_LCA,
    'PRESTATIONS_BRUTES_AOS': prestations_AOS,
    'age': ages,
    'sex': sexes,
    'deductible': deductibles,
    'zipcode': zipcodes
})

# Running count of treated years per individual (rows are already in year order)
sample_df['years_treated'] = sample_df.groupby('uuid')['treatment'].cumsum()

sample_df.head(15)  # Show the first 15 rows of the sample dataframe

## Panel data models

In [None]:
from linearmodels import PanelOLS

## Effect of treatment_status on PRESTATIONS_BRUTES_AOS

In [None]:
df_treated_filtered['year'] = df_treated_filtered['NOANNEE']-2016
df_panel_model = df_treated_filtered.set_index(['uuid','NOANNEE'])

In [None]:
Y = df_panel_model['PRESTATIONS_BRUTES_AOS']

In [None]:
# Running total of `treatment` within each individual: rows are ordered by (uuid, year),
# cumulatively summed per uuid, and the result is aligned back on the panel index.
# Presumably `treatment` is a 0/1 flag, which makes this "number of treated years so far"
# — TODO confirm.
df_panel_model['years_treated'] = df_panel_model.sort_values(['uuid','year']).groupby('uuid')['treatment'].cumsum()

In [None]:
# Independent variables, adding constant for the intercept
X = sm.add_constant(df_panel_model[['treatment']])

In [None]:
def plot_OLS_coefficients(result, var):
    """Plot the coefficients of a fitted model with their confidence intervals.

    Each term except ``'Intercept'`` is drawn as a point with a horizontal error bar,
    ordered by coefficient value. Significant positive terms (p <= 0.05) are red,
    significant non-positive terms are blue, and non-significant terms are grey.

    Parameters
    ----------
    result : fitted results object
        Must expose ``params``, ``pvalues`` and ``conf_int()``; the first two columns
        returned by ``conf_int()`` are read as the lower and upper bounds.
    var : str
        Label appended to the figure title.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # One row per term: CI bounds, point estimate, p-value and significance label
    summary = result.conf_int()
    # Bounds are looked up by column label. The previous row[0] / row[1] relied on the
    # deprecated positional fallback of integer keys on a label-indexed Series.
    lower_col, upper_col = summary.columns[0], summary.columns[1]
    summary['coef'] = result.params
    summary['pvalues'] = result.pvalues
    summary['significant?'] = ['significant' if pval <= 0.05 else 'not significant' for pval in result.pvalues]

    fig, ax = plt.subplots(figsize=(6, 6))

    for term, row in summary.sort_values(by='coef').iterrows():
        if term == 'Intercept':
            # The intercept is on a different scale and would dwarf the other terms
            continue

        coef = row['coef']
        # errorbar expects distances from the point, not absolute bounds
        distances = [[coef - row[lower_col]], [row[upper_col] - coef]]

        if row['significant?'] != 'significant':
            color = 'tab:gray'
        elif coef > 0:
            color = 'tab:red'
        else:
            color = 'tab:blue'

        ax.errorbar(x=[coef], y=[term], xerr=distances, ecolor=color, capsize=3, linestyle='None',
                    linewidth=1, marker="o", markersize=5, mfc=color, mec=color)

    # Reference line at "no effect"
    ax.axvline(x=0, color='grey', linestyle='--')
    ax.set_xlabel('Coefficient Value (CHF)')
    ax.set_ylabel('Coefficient Name')
    ax.set_title('Panel Data Model with Fixed Effects -  Coefficient Plot - {}'.format(var))
    plt.show()

In [None]:
# All nine specifications share the outcome, the intercept and the entity effects;
# only the right-hand-side terms differ, so the formulas are assembled from fragments.
_LHS = 'PRESTATIONS_BRUTES_AOS ~ 1 + '
_ENTITY_FE = ' + EntityEffects'
# Covariates that follow age and cds in the adjusted models (3, 6 and 9)
_COVARIATE_TAIL = ('SEX_F + MTFRANCHISECOUV + ssep2 + D_MEDIC_B + D_MEDIC_S'
                   ' + mean_ndvi + mean_lst + mean_pm10 + mean_no2')

# Models 1-3: binary treatment
model1 = _LHS + 'treatment*year + NBAGE_scaled' + _ENTITY_FE
model2 = _LHS + 'treatment + treatment*year' + _ENTITY_FE
model3 = _LHS + 'treatment + treatment*year + cds + NBAGE_scaled + ' + _COVARIATE_TAIL + _ENTITY_FE

# Models 4-6: amount spent on LCA as the exposure
model4 = _LHS + 'PRESTATIONS_BRUTES_LCA ' + _ENTITY_FE
model5 = _LHS + 'PRESTATIONS_BRUTES_LCA  + PRESTATIONS_BRUTES_LCA*year' + _ENTITY_FE
model6 = _LHS + 'PRESTATIONS_BRUTES_LCA  + PRESTATIONS_BRUTES_LCA*year + NBAGE_scaled + cds + ' + _COVARIATE_TAIL + _ENTITY_FE

# Models 7-9: cumulative number of treated years as the exposure
model7 = _LHS + 'years_treated' + _ENTITY_FE
model8 = _LHS + 'years_treated + years_treated*year' + _ENTITY_FE
model9 = _LHS + 'years_treated + years_treated*year + cds + NBAGE_scaled + ' + _COVARIATE_TAIL + _ENTITY_FE

In [None]:
# Fixed effects model 1
# model1 uses NBAGE_scaled, which the notebook otherwise only creates in the MinMaxScaler
# cell further down; on a fresh kernel this fit failed because the column was missing.
# When it is absent, create it here with the same min-max formula: (x - min) / (max - min).
if 'NBAGE_scaled' not in df_panel_model.columns:
    nbage = df_panel_model['NBAGE']
    df_panel_model['NBAGE_scaled'] = (nbage - nbage.min()) / (nbage.max() - nbage.min())

fem = PanelOLS.from_formula(model1, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
plot_OLS_coefficients(fem_result, 'Model 1')  # Assuming plot_OLS_coefficients function is defined

In [None]:
# Fixed effects model2
fem = PanelOLS.from_formula(model2, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
plot_OLS_coefficients(fem_result, 'Model 2')  # Assuming plot_OLS_coefficients function is defined

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Min-max scaling of age (MinMaxScaler's default output range is [0, 1])
scaler = MinMaxScaler()

# fit_transform is given a one-column frame (double brackets); its output is stored as
# the NBAGE_scaled column referenced by the model formulas that adjust for age.
df_panel_model['NBAGE_scaled'] =  scaler.fit_transform(df_panel_model[['NBAGE']])


In [None]:
df_panel_model[['NBAGE_scaled','year']].corr()

In [None]:
# Fixed effects model3
fem = PanelOLS.from_formula(model3, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
plot_OLS_coefficients(fem_result, 'Model 3')  # Assuming plot_OLS_coefficients function is defined

In [None]:
# Fixed effects model4
fem = PanelOLS.from_formula(model4, data=df_panel_model)
fem_result = fem.fit()
print(fem_result)

In [None]:
plot_OLS_coefficients(fem_result, 'Model 4')  # Assuming plot_OLS_coefficients function is defined

In [None]:
# Fixed effects model5
fem = PanelOLS.from_formula(model5, data=df_panel_model)
fem_result = fem.fit()
print(fem_result)

In [None]:
plot_OLS_coefficients(fem_result, 'Model 5')  # Assuming plot_OLS_coefficients function is defined

In [None]:
# Fixed effects model 6 (LCA spending, its interaction with year, and covariates);
# standard errors are clustered by entity.
fem = PanelOLS.from_formula(model6, data=df_panel_model, drop_absorbed= True)
fem_result = fem.fit(cov_type='clustered', cluster_entity=True)
print(fem_result)

In [None]:
plot_OLS_coefficients(fem_result, 'Model 6')  # Assuming plot_OLS_coefficients function is defined

In [None]:
# Fixed effects model 7 (years_treated only)
fem = PanelOLS.from_formula(model7, data=df_panel_model)
fem_result = fem.fit()
print(fem_result)

In [None]:
# Fixed effects model 8 (years_treated and its interaction with year)
fem = PanelOLS.from_formula(model8, data=df_panel_model)
fem_result = fem.fit()
print(fem_result)

In [None]:
# Fixed effects model 9 (years_treated, its interaction with year, and covariates)
fem = PanelOLS.from_formula(model9, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
df_panel_model.columns = [col.replace(' ', '_') for col in df_panel_model.columns]
df_panel_model.columns = [col.replace("'", '') for col in df_panel_model.columns]

In [None]:
' + '.join(df_panel_model.filter(regex='_binary').columns.tolist())

In [None]:
model10 = "PRESTATIONS_BRUTES_AOS ~ 1 + treatment + year + cds + age_group + SEX_F + ssep2 + MTFRANCHISECOUV + mean_ndvi + D_MEDIC_B + D_MEDIC_S + EntityEffects"

In [None]:
# Fixed effects model 10 (treatment and year as separate terms, age_group instead of scaled age)
fem = PanelOLS.from_formula(model10, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
dict_age_2017 = df_treated_filtered[df_treated_filtered.NOANNEE == 2017].set_index('uuid')['NBAGE'].to_dict()

df_panel_model['NBAGE_2017'] = df_panel_model.index.get_level_values('uuid').map(dict_age_2017)

In [None]:
time_dummies = pd.get_dummies(df_panel_model['year'], prefix='year',dtype = int)

In [None]:
df_panel_model = pd.concat([df_panel_model, time_dummies], axis=1)

In [None]:
# Year dummies are built from `year` (= NOANNEE - 2016), so the columns are named
# year_1 ... year_5; the previous year_2017 ... year_2020 names do not exist in
# df_panel_model. year_5 (2021) stays the omitted reference year, as before.
model11 = "PRESTATIONS_BRUTES_AOS ~ 1 + PRESTATIONS_BRUTES_LCA + year_1 + year_2 + year_3 + year_4 + NBAGE + EntityEffects"

In [None]:
# Fixed effects model 11 (LCA spending with year dummies and age)
fem = PanelOLS.from_formula(model11, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
# Fixed effects model with SEX_F as the only regressor.
# NOTE(review): if SEX_F is constant within an individual it is absorbed by the entity
# effects and dropped (drop_absorbed=True) — confirm this is the intended check.
fem = PanelOLS.from_formula('PRESTATIONS_BRUTES_AOS ~ 1 + SEX_F + EntityEffects', data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

## Association between treatment and CDS

In [None]:
df_panel_model_test = df_panel_model[df_panel_model.age_group == '45-64']

In [None]:
# Outcome is cds; regressors are treatment, year and their interaction
model1_cds = 'cds ~ 1 + treatment*year + EntityEffects'

# Fixed effects model for cds (unadjusted)
fem = PanelOLS.from_formula(model1_cds, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
# Same as model1_cds with ssep2 and SEX_F added
model2_cds = 'cds ~ 1 + treatment*year + ssep2 + SEX_F +  EntityEffects'

# Fixed effects model for cds (adjusted for ssep2 and SEX_F)
fem = PanelOLS.from_formula(model2_cds, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
# Same outcome with the full covariate set (age, sex, deductible, ssep2, D_MEDIC_* and environmental means)
model3_cds = 'cds ~ 1 + treatment*year + NBAGE_scaled + SEX_F + MTFRANCHISECOUV + ssep2 + D_MEDIC_B + D_MEDIC_S + mean_ndvi + mean_lst + mean_pm10 + mean_no2 + EntityEffects'

# Fixed effects model for cds (fully adjusted)
fem = PanelOLS.from_formula(model3_cds, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
# # Fixed effects model5
# fem = PanelOLS.from_formula(model1_cds, data=df_panel_model_cancer, drop_absorbed=True)
# fem_result = fem.fit()
# print(fem_result)

## Association between treatment/PRESTATIONS_BRUTES_LCA and N_ATC

In [None]:
test=df_panel_model.reset_index()

In [None]:
# del test

In [None]:
df_panel_model['NBAGE_min'] = df_panel_model.groupby(level='uuid')['NBAGE'].transform('min')

In [None]:
# n_atc on treatment interacted with the year dummies year_2 ... year_5 (year_1 is the
# omitted reference), adjusted for age, ssep2, sex and deductible
model1_natc = 'n_atc ~ 1 +  treatment*year_2 + treatment*year_3 + treatment*year_4 + treatment*year_5 + NBAGE + ssep2 + SEX_F + MTFRANCHISECOUV +  EntityEffects'
# Fixed effects model for n_atc on the full panel
fem = PanelOLS.from_formula(model1_natc, data=df_panel_model, drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
# n_atc on treatment x year (numeric year); overwrites the previous model1_natc formula
model1_natc = 'n_atc ~ 1 + treatment*year + ssep2 + SEX_F + MTFRANCHISECOUV +  EntityEffects'
# Fixed effects model for n_atc, restricted to rows with NBAGE >= 65
fem = PanelOLS.from_formula(model1_natc, data=df_panel_model[df_panel_model.NBAGE >=65], drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
# NOTE(review): the variable is still called model1_natc, but the outcome here is
# DRUGAMOUNT_BRUT, not n_atc.
model1_natc = 'DRUGAMOUNT_BRUT ~ 1 + treatment*year + ssep2 + SEX_F + MTFRANCHISECOUV +  EntityEffects'
# Fixed effects model for DRUGAMOUNT_BRUT, restricted to rows with NBAGE >= 65
fem = PanelOLS.from_formula(model1_natc, data=df_panel_model[df_panel_model.NBAGE >=65], drop_absorbed=True)
fem_result = fem.fit()
print(fem_result)

In [None]:
# model1_natc = 'PRESTATIONS_BRUTES_AOS ~ 1 + treatment*NBAGE + ssep2 + CDPHYSSEXE + CANTON_NAME + MTFRANCHISECOUV +  EntityEffects'
# # Fixed effects model5
# fem = PanelOLS.from_formula(model1_natc, data=df_panel_model, drop_absorbed=True)
# fem_result = fem.fit(cov_type='clustered', cluster_entity=True)
# print(fem_result)

### Random effects model

Fitting a Random Effects model for model2 allows you to examine the interaction between treatment and time while also accounting for both within-entity and between-entity variations in spending on conventional medicine.

**Pros:**
Efficiency: Compared to Fixed Effects models, Random Effects can be more efficient (smaller standard errors) if the individual entity effects are not strongly correlated with the predictors.

Between-Entity Information: Captures both within-entity and between-entity variations, thereby utilizing more information in the data.

Model Complexity: Allows for the inclusion of time-invariant predictors if needed (though not in this specific formula).

**Cons:**

Assumptions: Assumes that the entity-specific effects are uncorrelated with the independent variables. Violation of this assumption can lead to biased estimates.

Generalizability: The random effects are assumed to be drawn from a larger population, and this assumption may not always hold or be of interest.

**Interpretation:**
Similar to a Fixed Effects model, you'll get estimates for the impact of treatment and its interaction with time on spending on conventional medicine (PRESTATIONS_BRUTES_AOS). However, the coefficients will represent both within-entity and between-entity effects, giving a more holistic view of the treatment's impact.

In summary, using a Random Effects model for model2 can be a good strategy if you believe that the unobserved entity-specific effects are not strongly correlated with the treatment and time variables in your model.

In [None]:
# Random effects model
rem = RandomEffects.from_formula(model2, data=df_panel_model)
rem_result = rem.fit()
print(rem_result)

In [None]:
# PanelOLS without an effects term in the formula: this is a pooled regression of AOS
# spending on LCA spending and age, not a fixed- or random-effects fit.
# NOTE(review): the result is stored in rem / rem_result and therefore replaces the
# random-effects result assigned to those names in the cell above.
rem = PanelOLS.from_formula('PRESTATIONS_BRUTES_AOS ~ 1 + PRESTATIONS_BRUTES_LCA + NBAGE', data=df_panel_model, drop_absorbed=True)
rem_result = rem.fit()
print(rem_result)

### Pooled OLS model

Using a Pooled OLS (Ordinary Least Squares) model for model2 is essentially treating your data as one big dataset without paying special attention to the differences between entities (like patients) or time periods.

**Pros:**
- Simplicity: Easy to understand and implement.
- More Degrees of Freedom: Uses all the data without any partitioning, which could be beneficial when you have fewer observations.

**Cons:**
- Ignores Structure: Does not account for any differences between patients or changes over time. This might lead to incorrect conclusions.
- Inefficiency: If there are significant differences between patients or over time, ignoring them can make your estimates less precise.

**Assumptions:**
- Linear Relationship: Assumes a linear relationship between the dependent and independent variables.
- Independence: Observations should be independent of each other, which might not be the case in your panel data.

**Interpretation:**
The model will give you an estimate of the average effect of treatment and its interaction with time on spending on conventional medicine (PRESTATIONS_BRUTES_AOS). However, the results might be misleading if the differences between patients or changes over time are important for understanding your question.

In short, a Pooled OLS model is like a "one-size-fits-all" approach. It's easier but might not be as accurate if you think that individual differences or changes over time are important.

In [None]:
# Pooled OLS model: model2 carries "+ EntityEffects", which would make this cell refit
# the fixed-effects model. The effects term is stripped so that the same regressors are
# estimated on the pooled data (PanelOLS without effects is pooled OLS).
pooled_formula = model2.replace(' + EntityEffects', '')
pols = PanelOLS.from_formula(pooled_formula, data=df_panel_model, drop_absorbed=True)
pols_result = pols.fit()
print(pols_result)

In [None]:
df_panel_model.reset_index()

In [None]:
uuid_mapping = {uuid: i for i, uuid in enumerate(df_treated_filtered['uuid'].unique())}


In [None]:
# Map every uuid to a 1-based integer id and attach it to both frames as `uuid_int`,
# the grouping column used by the mixed models below. The ids come from df_treated;
# df_treated_filtered was derived from df_treated, so its uuids are covered by the map.
# NOTE(review): this cell needs df_treated to be in scope — make sure no earlier cell
# has deleted it.
uuid_mapping = {uuid: i+1 for i, uuid in enumerate(df_treated['uuid'].unique())}
df_treated['uuid_int'] = df_treated['uuid'].map(uuid_mapping).astype(int)
df_treated_filtered['uuid_int'] = df_treated_filtered['uuid'].map(uuid_mapping).astype(int)

In [None]:


# Fixed-effects part only: treatment, year and their interaction.
# The random intercept per individual is requested through `groups=` below. statsmodels
# does not support lme4's "(1|id)" syntax: patsy evaluates "(1|uuid_int)" as the Python
# expression `1 | uuid_int` (bitwise OR) and adds it as a spurious regressor, so the
# term has been removed from the formula.
formula = "PRESTATIONS_BRUTES_AOS ~ treatment + year + treatment*year"

# Fit the model (random intercept for each uuid_int group)
model = smf.mixedlm(formula, df_treated_filtered, groups=df_treated_filtered["uuid_int"])
result = model.fit()

# Show the results
print(result.summary())

In [None]:
"cds + NBAGE_scaled + SEX_F + MTFRANCHISECOUV + ssep2 + D_MEDIC_B + D_MEDIC_S + mean_ndvi + mean_lst + mean_pm10 + mean_no2 "

In [None]:
model = sm.MixedLM.from_formula('PRESTATIONS_BRUTES_AOS ~ treatment*year + cds + NBAGE + SEX_F + MTFRANCHISECOUV + ssep2 + D_MEDIC_B + D_MEDIC_S + mean_ndvi + mean_lst + mean_pm10 + mean_no2', groups='uuid_int', data=df_treated_filtered)
result = model.fit()
# Show the results
print(result.summary())

In [None]:
df_treated = df_treated.reset_index(drop = True)

In [None]:
n_years_uuid = df_treated.groupby('uuid').size().sort_values()

In [None]:
n_years_uuid_2plus = n_years_uuid[n_years_uuid > 1].index

In [None]:
test = df_treated[df_treated.uuid.isin(n_years_uuid_2plus)]

In [None]:
df_treated

In [None]:
model = sm.MixedLM.from_formula('PRESTATIONS_BRUTES_AOS ~ treatment', groups='uuid_int', data=df_treated)
result = model.fit()
# Show the results
print(result.summary())

In [None]:
# Show the results
print(result.summary())

In [None]:
# Fixed-effects part only: treatment and year. The random intercept per individual is
# requested through `groups=` below. statsmodels does not support lme4's "(1|id)" syntax:
# patsy evaluates "(1|uuid_int)" as the Python expression `1 | uuid_int` (bitwise OR) and
# adds it as a spurious regressor, so the term has been removed from the formula.
# The unused `vc` dict (a 'classroom' variance component copied from an example) is gone too.
formula = "PRESTATIONS_BRUTES_AOS ~ treatment + year"
# Fit the model on all labelled individuals (random intercept for each uuid_int group)
model = smf.mixedlm(formula, df_treated, groups=df_treated["uuid_int"])
result = model.fit()

# Show the results
print(result.summary())

## Spatial panel data models with fixed-effects

### Toy example

In [None]:
nat = libpysal.examples.load_example("NCOVR")
db = libpysal.io.open(nat.get_path("NAT.dbf"), "r")

# Create spatial weight matrix
nat_shp = libpysal.examples.get_path("NAT.shp")
w = libpysal.weights.Queen.from_shapefile(nat_shp)
w.transform = 'r'

# Define dependent variable
name_y = ["HR70", "HR80", "HR90"]
y = np.array([db.by_col(name) for name in name_y]).T

# Define independent variables
name_x = ["RD70", "RD80", "RD90", "PS70", "PS80", "PS90"]
x = np.array([db.by_col(name) for name in name_x]).T

In [None]:
fe_lag = spreg.Panel_FE_Lag(y, x, w, name_y=name_y, name_x=name_x, name_ds="NAT")

In [None]:
print(fe_lag.summary)

In [None]:
# Wide -> long: y is (n, t) and x is (n, t*k), with the columns of each covariate stored
# next to each other. Column-major ('F') reshaping stacks the periods on top of each
# other, giving (n*t, 1) and (n*t, k). t and k are derived from the arrays instead of
# being hard-coded as 3 and 2.
n_units, n_periods = y.shape
n_covariates = x.shape[1] // n_periods
y_long = y.reshape((n_units * n_periods, 1), order='F')
x_long = x.reshape((n_units * n_periods, n_covariates), order='F')

fe_lag_long = spreg.Panel_FE_Lag(y_long, x_long, w, name_y=name_y, name_x=name_x, name_ds="NAT")
print(fe_lag_long.summary)

### Actual model

In [None]:
import geopandas as gpd
from scipy.spatial import cKDTree
from libpysal.weights.distance import get_points_array

In [None]:
gdf_panel_model = gpd.GeoDataFrame(df_panel_model[['PRESTATIONS_BRUTES_AOS','PRESTATIONS_BRUTES_LCA','treatment','cds','lon_masked','lat_masked']], geometry = gpd.points_from_xy(df_panel_model.lon_masked, df_panel_model.lat_masked), crs = 4326)
gdf_panel_model = gdf_panel_model.to_crs(2056)

In [None]:
# Sample individuals, not rows: every uuid appears once per year in the panel, so
# sampling rows returned duplicate uuids and fewer than 10000 distinct individuals.
unique_uuids = gdf_panel_model.index.get_level_values('uuid').unique().to_series()
sample_uuids = unique_uuids.sample(n=min(10000, len(unique_uuids)), random_state=1).tolist()

In [None]:
sample_df_panel_model = gdf_panel_model[gdf_panel_model.index.get_level_values('uuid').isin(sample_uuids)]

In [None]:
# 8-nearest-neighbour spatial weights from the point geometries of the sampled rows.
# NOTE(review): the weights are built from every row of sample_df_panel_model, i.e. one
# point per (uuid, NOANNEE) observation. The spreg panel estimators presumably expect W
# to cover the cross-sectional units only (one point per uuid) — confirm before relying
# on the fit.
w = libpysal.weights.KNN(cKDTree(get_points_array(sample_df_panel_model.geometry.centroid)), 8)

# Row-standardise the weights
w.transform = 'r'

In [None]:
# Define dependent variable
y = sample_df_panel_model['PRESTATIONS_BRUTES_AOS'].values.reshape(-1, 1)


# Define independent variables
x = sample_df_panel_model[['PRESTATIONS_BRUTES_LCA']].values.reshape(-1, 1)


In [None]:
# name_y = ['amount_aos_2017','amount_aos_2018','amount_aos_2019','amount_aos_2020','amount_aos_2021']
# name_x = ['treatment_2017','treatment_2018','treatment_2019','treatment_2020','treatment_2021', 'cds_2017','cds_2018','cds_2019','cds_2020','cds_2021']

In [None]:
name_y = ['amount_aos']
name_x = ['amount_lca']

In [None]:
??spreg.Panel_FE_Lag

In [None]:
fe_lag_long = spreg.Panel_FE_Lag(y, x, w, name_y=name_y, name_x=name_x, name_ds="test")

In [None]:
fe_lag_long

## Number of years treated and PRESTATIONS_BRUTES_AOS

In [None]:
# Candidate outcome columns, grouped by kind; the concatenation keeps the original order.
_amount_outcomes = [
    'DRUGAMOUNT_BRUT',
    'PRESTATIONS_BRUTES_AOS',
    'PRESTATIONS_NETTES_AOS',
    'PRESTATIONS_BRUTES_LCA',
    'PRESTATIONS_DISEASE',
    'PRESTATIONS_BIRTH',
    'PRESTATIONS_ACCIDENT',
    'PRESTATIONS_TOTAL',
]
_count_outcomes = [
    'n_atc',
    'NBRE_FACTURES_LCA',
    'NBRE_FACTURES_AOS',
    'NBRE_FACTURES_TOTAL',
    'n_inpatient_hosp',
    'n_outpatient_hosp',
    'n_month_outpatienthosp',
    'n_month_inpatienthosp',
]
_rehosp_outcomes = [
    'time_to_rehosp_in',
    'time_to_rehosp_out',
]
outcome_vars = _amount_outcomes + _count_outcomes + _rehosp_outcomes

In [None]:
sample_df['years_treated'] = sample_df.groupby('uuid')['treatment'].cumsum()

In [None]:
# Loading Grunfeld Investment data
# Keep the year as a categorical so it can be re-attached as a column after indexing.
# NOTE(review): df_matched_full is only built in a commented-out cell of the visible
# notebook, so it must be defined before this cell can run.
year = pd.Categorical(df_matched_full['year'])

In [None]:
# Preparing data for panel model
df_matched_full = df_matched_full.set_index(['uuid','year'])
df_matched_full['year'] = year


In [None]:
# Pooled OLS model
pols = PanelOLS.from_formula('PRESTATIONS_BRUTES_AOS ~  n_atc + DRUGAMOUNT_BRUT + n_outpatient_hosp', data=df_matched_full)
pols_result = pols.fit()
print(pols_result)

In [None]:
# Fixed effects model
fem = PanelOLS.from_formula('treatment ~  n_atc + n_outpatient_hosp + n_inpatient_hosp + EntityEffects', data=df_matched_full)
fem_result = fem.fit(cov_type='clustered', cluster_entity = True)
print(fem_result)

In [None]:
# Random effects model
rem = RandomEffects.from_formula('PRESTATIONS_BRUTES_AOS ~ treatment + n_atc + n_outpatient_hosp + n_inpatient_hosp + EntityEffects', data=df_matched_full)
rem_result = rem.fit(cov_type='clustered', cluster_entity = True)
print(rem_result)

In [None]:
# Conduct the Fixed Effects Model Analysis
fe_model_formula = 'PRESTATIONS_BRUTES_AOS ~ years_treated + age + C(NOANNEE) + sex + deductible + zipcode + C(uuid)'
fe_model = smf.ols(formula=fe_model_formula, data=sample_df).fit()

# Display the Fixed Effects model summary
fe_model.summary()

## Difference-in-differences model

In [None]:
# Specify regression formula
reg_formula = 'PRESTATIONS_BRUTES_AOS ~ NOANNEE + treatment + interaction'

# Implement DiD model
model = sm.OLS.from_formula(reg_formula, df_did_wo_replication)
result = model.fit()

In [None]:
print(result.summary())

## Integrate differences in treatment "exposure"

- With this binary definition of treatment and outcome, we can't differentiate people who have used complementary medicine once within a year from the ones that have used it every week.
- This follow up analysis includes this information

In [None]:
# Same DiD specification with the amount spent on LCA added as a measure of exposure
reg_formula = 'PRESTATIONS_BRUTES_AOS ~ PRESTATIONS_BRUTES_LCA + NOANNEE + treatment + interaction'
# Implement DiD model
# NOTE(review): df_did_wo_replication is only built in a commented-out cell of the
# visible notebook, so it must be defined before this cell can run.
model = sm.OLS.from_formula(reg_formula, df_did_wo_replication)
result = model.fit()

In [None]:
print(result.summary())

#### Approach 2 : With replication

In [None]:
# Baseline: the 2017 matched rows. Subsequent years: all non-2017 rows of the five-year
# panel. Both are restricted to the same five columns.
# NOTE(review): df_matched_2017 is only loaded in a commented-out cell of the visible
# notebook, so it must be defined before this cell can run.
baseline_df = df_matched_2017[['uuid','NOANNEE','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_AOS','treatment']].copy()
subsequent_years_df = df_treated_filtered[(df_treated_filtered.NOANNEE != 2017)][['uuid','NOANNEE','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_AOS','treatment']]

In [None]:
baseline_df[baseline_df.uuid.duplicated()]

In [None]:
df_treated_filtered[df_treated_filtered.uuid == '18f8b703-f5e6-42fe-8843-afdb8727b0d9']

In [None]:
subsequent_years_df[subsequent_years_df.uuid == '18f8b703-f5e6-42fe-8843-afdb8727b0d9']

In [None]:
baseline_df['key'] = range(len(baseline_df))

In [None]:
subsequent_years_df['key'] = range(len(subsequent_years_df))
# Merge with the baseline data to replicate the duplication pattern
final_df = pd.merge(baseline_df[['key', 'uuid']], subsequent_years_df, on='uuid', how='left')

In [None]:
final_df

In [None]:
data_final[(data_final.NOANNEE != 2017)&(data_final.uuid.isin(df_matched_2017.uuid))]

In [None]:
# Toy two-period panel: three individuals, of which only individual 2 is treated.
df = pd.DataFrame(
    [
        (1, 1, 0, 100),
        (1, 2, 0, 110),
        (2, 1, 1, 50),
        (2, 2, 1, 45),
        (3, 1, 0, 130),
        (3, 2, 0, 125),
    ],
    columns=['individual_id', 'year', 'treatment', 'conventional_cost'],
)

In [None]:
df_did_init = df_treated_filtered[(df_treated_filtered.NOANNEE != 2017)&(df_treated_filtered.uuid.isin(df_matched_2017.uuid))][['uuid','NOANNEE','PRESTATIONS_BRUTES_LCA','PRESTATIONS_BRUTES_AOS','treatment']]

In [None]:
df_did = pd.merge(df_matched_2017, df_did_init, on = ['uuid'])

In [None]:
df