In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import os

own_path = os.getcwd()
#own_path = 'MAKai'

# get all sheet names
sheet_names = pd.ExcelFile(own_path+'\MasterarbeitenDatenAlleV2.xlsx').sheet_names

color_palette = {1: 'rgb(248, 246, 245)', 2: 'rgb(228, 227, 221)', 3: 'rgb(77, 75, 70)', 4: 'rgb(134, 0, 71)', 5: 'rgb(179, 6, 44)', 6: 'rgb(277, 186, 15)', 7: 'rgb(115, 124, 69)',
                 8: 'rgb(0, 97, 143)', 9: 'rgb(173, 59, 118)', 10: 'rgb(201, 98, 21)', 11: 'rgb(247, 217, 38)', 12: 'rgb(165, 171, 82)', 13: 'rgb(72, 169, 218)'}


dfs = []
for sheet_name in sheet_names:
    if sheet_name == 'Codierung':
        codes = pd.read_excel(own_path+'\MasterarbeitenDatenAlleV2.xlsx', sheet_name=sheet_name)
    else:
        df = pd.read_excel(own_path+'\MasterarbeitenDatenAlleV2.xlsx', sheet_name=sheet_name)
        df['DT'] = sheet_name
        dfs.append(df)

code_color = {
    0: 0,
    1: -1,
    2: -1,
    3: 0,
    4: 1,
    5: 0,
}

codes['code_color'] = codes['Codierung'].map(code_color)

# concatenate all dataframes
df = pd.concat(dfs, ignore_index=True)
df = df.set_index(['DT', 'Land']).melt(ignore_index=False).set_index('variable', append=True)
df = df.pivot_table(index=['Land', 'variable'], columns='DT', values='value')
df.index.names = ['Land', 'Jahr']
codes = codes.set_index('Land')
df = df.merge(codes, left_index=True, right_index=True).drop('Typ', axis=1)
df = df.set_index('Codierung', append=True).sort_index()

name_map = {
    'Kosten': ['Anteil  BIP Private', 'Anteil BIP Public ', 'Gesundheitsausgaben pro Kopf', 'Out of Pocket'],
    'Zugänglichkeit': ['Artzbesuche (pro Kopf)', 'Belegungsrate Akutpflegebet', 'Hospital beds', 'Practising doctors', 'Professional nurses'],
    'Qualität': ['Krebs M', 'Krebs W', 'Schlaganfall M', 'Schlaganfall W', 'Sterblichkeit ab 65 M', 'Sterblichkeit ab 65 W', 'Verhinderbare Sterblichkeitsrat']
}

score_map = {
    'Anteil  BIP Private': 1, 
    'Anteil BIP Public ': 1, 
    'Artzbesuche (pro Kopf)': 1,
    'Belegungsrate Akutpflegebet': -1, 
    'Gesundheitsausgaben pro Kopf': 1,
    'Hospital beds': 1, 
    'Krebs M': -1, 
    'Krebs W': -1, 
    'Out of Pocket': 1,
    'Practising doctors': 1,
    'Professional nurses': 1, 
    'Schlaganfall M': -1,
    'Schlaganfall W': -1,
    'Sterblichkeit ab 65 M': 1, 
    'Sterblichkeit ab 65 W': 1,
    'Verhinderbare Sterblichkeitsrat': -1
    }

# fill missing values
df = df.groupby(['Land']).ffill()

# z normalize over all years per variable
df = (df - df.mean()) / df.std()

# add columns for each category
for key in name_map.keys():
    tmp_sum = []
    for col in name_map[key]:
        tmp_sum.append(df[col] * score_map[col])

    tmp_sum = pd.concat(tmp_sum, axis=1).mean(axis=1)

    df[key] = tmp_sum

# filter year > 2010
df_f = df[df.index.get_level_values('Jahr') > 2010]

In [7]:
corr_data = df_f.copy()

corr_data['Krebs'] = (corr_data['Krebs M'] + corr_data['Krebs W']) / 2
corr_data['Schlaganfall'] = (corr_data['Schlaganfall M'] + corr_data['Schlaganfall W']) / 2
corr_data['Sterblichkeit'] = (corr_data['Sterblichkeit ab 65 M'] + corr_data['Sterblichkeit ab 65 W']) / 2
corr_data['Practising Medical Staff'] = (corr_data['Practising doctors'] + corr_data['Professional nurses']) / 2

corr_data = corr_data.drop(['Krebs M', 'Krebs W', 'Schlaganfall M', 'Schlaganfall W', 'Sterblichkeit ab 65 M', 'Sterblichkeit ab 65 W', 'Practising doctors', 'Professional nurses'], axis=1)
corr_data = corr_data[['Anteil  BIP Private', 'Anteil BIP Public ', 'Gesundheitsausgaben pro Kopf', 'Out of Pocket', 'Artzbesuche (pro Kopf)', 'Belegungsrate Akutpflegebet', 'Hospital beds', 'Practising Medical Staff', 'Krebs', 'Schlaganfall', 'Sterblichkeit', 'Verhinderbare Sterblichkeitsrat']]

colorscale = [[0, color_palette[13]], [0.5, color_palette[1]], [1, color_palette[5]]]

fig = px.imshow(corr_data.corr(), title='Correlation Matrix', labels=dict(x='Variable', y='Variable', color='Correlation'), color_continuous_scale=colorscale)
fig.update_layout(width=1000, height=1000)
fig.show()

In [90]:
# count data points per year
df.groupby('Jahr').count().mean(axis=1)


Jahr
2006     1.25
2007    10.40
2008    24.20
2009    18.75
2010    28.95
2011    29.90
2012    30.65
2013    32.00
2014    33.15
2015    33.35
2016    34.10
2017    34.75
2018    34.20
2019    34.30
2020    35.45
2021    32.70
2022    35.90
dtype: float64

In [12]:
# plot time series grouped by DT
plt_data = df_f.groupby(['Jahr', 'Codierung']).mean()

for col in name_map.keys():
    fig = px.line(plt_data[col].unstack().ffill(), title=col, color_discrete_sequence=[color_palette[3], color_palette[4], color_palette[5], color_palette[7], color_palette[12], color_palette[13]])
    fig.show()

In [92]:
# 3d scatter plot
plt_data = plt_data[np.logical_and(plt_data.index.get_level_values('Codierung') != 0,plt_data.index.get_level_values('Codierung') != 3) ]
fig = px.scatter_3d(plt_data.reset_index(), x='Kosten', y='Zugänglichkeit', z='Qualität', color='code_color')
fig.update_layout(width=800, height=800)
fig.show()


In [23]:
# use statsmodels to do a linear regression with fixed effects for year and group
import statsmodels.api as sm
import statsmodels.formula.api as smf

df_r = df_f.reset_index()
df_r['Jahr'] = df_r['Jahr'].astype('category')
df_r['Land'] = df_r['Land'].astype('category')
df_r['Codierung'] = df_r['Codierung'].astype('category')

# drop codings 0 and 3
df_r = df_r[np.logical_and(df_r['Codierung'] != 0, df_r['Codierung'] != 3)]

# regression for quality_i,j = a + b * Kosten_i,j + c * Zugänglichkeit_i,j + d * Jahr_i + e * Codierung_i
model = smf.ols('Qualität ~ Kosten + Zugänglichkeit + C(Jahr) + C(Codierung)', data=df_r).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               Qualität   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.580
Method:                 Least Squares   F-statistic:                     23.73
Date:                 Sa, 08 Mrz 2025   Prob (F-statistic):           3.92e-41
Time:                        13:17:30   Log-Likelihood:                -97.091
No. Observations:                 264   AIC:                             228.2
Df Residuals:                     247   BIC:                             289.0
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0194      0.06

In [24]:
# regression for quality_i,j = a + b * Kosten_i,j + c * Zugänglichkeit_i,j + d * Jahr_i + e * Codierung_i
model = smf.ols('Zugänglichkeit ~ Qualität + Kosten + C(Jahr) + C(Codierung)', data=df_r).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Zugänglichkeit   R-squared:                       0.475
Model:                            OLS   Adj. R-squared:                  0.441
Method:                 Least Squares   F-statistic:                     13.95
Date:                 Sa, 08 Mrz 2025   Prob (F-statistic):           1.88e-26
Time:                        13:17:41   Log-Likelihood:                -82.906
No. Observations:                 264   AIC:                             199.8
Df Residuals:                     247   BIC:                             260.6
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             0.0590      0.05

In [25]:
# regression for quality_i,j = a + b * Kosten_i,j + c * Zugänglichkeit_i,j + d * Jahr_i + e * Codierung_i
model = smf.ols('Kosten ~ Zugänglichkeit + Qualität + C(Jahr) + C(Codierung)', data=df_r).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Kosten   R-squared:                       0.473
Model:                            OLS   Adj. R-squared:                  0.439
Method:                 Least Squares   F-statistic:                     13.86
Date:                 Sa, 08 Mrz 2025   Prob (F-statistic):           2.63e-26
Time:                        13:17:53   Log-Likelihood:                -130.23
No. Observations:                 264   AIC:                             294.5
Df Residuals:                     247   BIC:                             355.2
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             0.0626      0.07

In [26]:
# regression for quality_i,j = a + b * Kosten_i,j + c * Zugänglichkeit_i,j + d * Jahr_i + e * Codierung_i
model = smf.ols('Qualität ~ Kosten + Zugänglichkeit + C(Jahr)', data=df_r).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               Qualität   R-squared:                       0.508
Model:                            OLS   Adj. R-squared:                  0.483
Method:                 Least Squares   F-statistic:                     19.88
Date:                 Sa, 08 Mrz 2025   Prob (F-statistic):           9.85e-32
Time:                        13:18:00   Log-Likelihood:                -126.26
No. Observations:                 264   AIC:                             280.5
Df Residuals:                     250   BIC:                             330.6
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.0776      0.086     

In [27]:
# regression for quality_i,j = a + b * Kosten_i,j + c * Zugänglichkeit_i,j + d * Jahr_i + e * Codierung_i
model = smf.ols('Zugänglichkeit ~ Qualität + Kosten + C(Jahr)', data=df_r).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Zugänglichkeit   R-squared:                       0.189
Model:                            OLS   Adj. R-squared:                  0.147
Method:                 Least Squares   F-statistic:                     4.493
Date:                 Sa, 08 Mrz 2025   Prob (F-statistic):           7.06e-07
Time:                        13:18:09   Log-Likelihood:                -140.16
No. Observations:                 264   AIC:                             308.3
Df Residuals:                     250   BIC:                             358.4
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0806      0.091     

In [28]:
# regression for quality_i,j = a + b * Kosten_i,j + c * Zugänglichkeit_i,j + d * Jahr_i + e * Codierung_i
model = smf.ols('Kosten ~ Zugänglichkeit + Qualität + C(Jahr)', data=df_r).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Kosten   R-squared:                       0.456
Model:                            OLS   Adj. R-squared:                  0.428
Method:                 Least Squares   F-statistic:                     16.12
Date:                 Sa, 08 Mrz 2025   Prob (F-statistic):           1.69e-26
Time:                        13:18:17   Log-Likelihood:                -134.44
No. Observations:                 264   AIC:                             296.9
Df Residuals:                     250   BIC:                             346.9
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0763      0.089     