# Only for the year 2021

In [34]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import io
import statsmodels.api as sm
from scipy import stats
import geopandas as gpd

In [35]:
#Import the data on the infrastructure of the hospital (2008-2021)
url = "https://spitalstatistik.bagapps.ch/data/download/kzp21_KZ_TimeSerie.xlsx?v=1678279041"
sheet = 'KZ2008-KZ2021'
hospital_data = pd.read_excel(url, sheet_name=sheet)

#Only the 2021 data
hospital_data = hospital_data[hospital_data['JAHR'] == 2021]

In [36]:
#Import the data on the 2021 cases, cleaned by Lenja and in the shared drive under data
file_path = 'C:/Users/Gaëlle/Documents/_CAS applied data science/3. Module 3 Data analysis and machine learning/Project/2021_clean.xlsx'
cases = pd.read_excel(file_path)

# Delete uninteresting variables
columns_to_drop = ['taux observé 2021', 'taux attendu 2021', 'SMR 2021']
cases = cases.drop(columns=columns_to_drop)

## Infrastructure of hospitals

In [37]:
hospital_data.tail()

Unnamed: 0,JAHR,KT,Status,Inst,Adr,Ort,Typ,Notfalldienst,Infrastruktur1,Infrastruktur2,...,ErlOKPAmbB,ErlStatB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,ErlLangB,PTageLang,AustLang,KostLangT
4069,2021,ZH,,Geburtshaus Delphys,Badenerstrasse 177,8003 Zürich,K232,,,,...,334453.0,,2097414.0,45.000033,0.0,,,0.0,0.0,0.0
4070,2021,ZH,,Universitäts-Kinderspital Zürich - Das Spital ...,Steinwiesstrasse 75,8032 Zürich,K233,,,,...,,,,,,,,0.0,0.0,0.0
4071,2021,ZH,,Klinik Lengg AG,Bleulerstrasse 60,8008 Zürich,K235,,,,...,,,,,,,,0.0,0.0,0.0
4072,2021,ZH,,Klinik Susenberg,Schreberweg 9,8044 Zürich,K235,,,,...,,,,,,,,0.0,0.0,0.0
4073,2021,ZH,,Sune-Egge,Konradstrasse 62,8005 Zürich,K235,,,,...,,,,,,,,0.0,0.0,0.0


In [38]:
# everyline is a unique hospital
duplicates = hospital_data.duplicated(['Inst'])
count_duplicates = duplicates.sum()
count_duplicates

0

## Cases

## Clean the "nombre de cas 2021"

In [39]:
cases.head()

Unnamed: 0,institution,indicator,nombre de cas 2021
0,Les Hôpitaux Universitaires de GenEve HUG - HU...,A Maladies cardiaques,
1,Les Hôpitaux Universitaires de GenEve HUG - HU...,A.1 Infarctus du myocarde,
2,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.1.M DP infarctus du myocarde (âge >19), mo...",510.0
3,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.2.M DP infarctus du myocarde, âge 20-44, m...",22.0
4,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.3.M DP infarctus du myocarde, âge 45-64, m...",237.0


Let's clean the "number_of_cases_2021" variable

In [40]:
## Information about the format of the data in the variable: 
# From https://spitalstatistik.bagapps.ch/data/download/qip21_publikation.pdf?v=1680713881, page 4

# Verwendete Zeichen, wenn keine Zahlenangabe erfolgt
# ... Zahl unbekannt, weil (noch) nicht erhoben oder (noch) nicht berechnet
# * entfällt, weil trivial oder Begriffe nicht anwendbar
# – wird bei gerundeten Zahlen verwendet und steht hier für den Wert absolut null

# Signes utilisés en l’absence de chiffres
# ... chiffre inconnu (pas [encore] relevé ou pas [encore] calculé)
# * non indiqué car évident ou non pertinent
# – utilisé pour les chiffres arrondis ; signifie zéro

In [41]:
valeur_nb_cases=(cases['nombre de cas 2021'].unique())
#valeur_nb_cases.tolist()

In [42]:
# Chang thousands separator
#Creation of a new variable called "number_of_cases_2" which is the cleaned variable
cases['number_of_cases_2'] = cases['nombre de cas 2021'].apply(lambda x: int(x.replace("'", "")) if isinstance(x, str) and x.replace("'", "").isdigit() else x)

In [43]:
# Identify cells which start with a name (ex: Fallzahl 2021) and delete
# And creation of a new dataset 

## Create a Boolean mask to filter the rows to keep
masque = ~(cases['number_of_cases_2'].str.startswith('nomb') | cases['number_of_cases_2'].str.startswith('nume') | cases['number_of_cases_2'].str.startswith('Fall'))

## Apply the mask to delete the corresponding lines and create a new dataset
cases_2 = cases[masque]

In [44]:
# Identify if some no digit value remain:
import re
non_digit_start = cases_2[cases_2['number_of_cases_2'].apply(lambda x: not re.match(r'^\d', str(x)))]
value_nb_cases_bizarre=(non_digit_start['number_of_cases_2'].unique())
value_nb_cases_bizarre
# there are still ' ', '-'

array([' ', '-'], dtype=object)

In [45]:
# For value "-", " " and empty, I delete the rows
masque = ~(cases_2['number_of_cases_2'].str.startswith(' ') | cases_2['number_of_cases_2'].str.startswith('-'))
cases_3 = cases_2[masque]

#cases_3.dropna(subset=['number_of_cases_2'], inplace=True)

In [46]:
# number_of_cases_2 should a float
float_value = cases_3['number_of_cases_2'].astype(float)

cases_3 = cases_3.assign(number_of_cases_2=float_value)

In [47]:
print('the shape of the initial dataset:', cases.shape)
print('the shape of the dataset after the cleaning:', cases_3.shape)

the shape of the initial dataset: (139895, 4)
the shape of the dataset after the cleaning: (117856, 4)


In [48]:
cases_3.drop(columns=['nombre de cas 2021'])

Unnamed: 0,institution,indicator,number_of_cases_2
2,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.1.M DP infarctus du myocarde (âge >19), mo...",510.0
3,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.2.M DP infarctus du myocarde, âge 20-44, m...",22.0
4,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.3.M DP infarctus du myocarde, âge 45-64, m...",237.0
5,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.4.M DP infarctus du myocarde, âge 65-84, m...",171.0
6,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.5.M DP infarctus du myocarde, âge >84, mor...",80.0
...,...,...,...
139890,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.41.F Plexuschirurgie,0.0
139891,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.42.F Gynäkologische Tumore,0.0
139892,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.43.F Anerkanntes zertifiziertes Brustzentrum,0.0
139893,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.44.F Geburtshilfe (ab GA 32 0/7 SSW und GG...,0.0


In [49]:
duplicates = cases_3.duplicated(subset=['institution', 'indicator'])
count_duplicates = duplicates.sum()
count_duplicates

0

## Clean the indicators of the cases

In [50]:
valeur_nb_cases=(cases_3['indicator'].unique())
valeur_nb_cases.tolist()

['A.1.1.M DP infarctus du myocarde (âge >19), mortalité',
 'A.1.2.M DP infarctus du myocarde, âge 20-44, mortalité',
 'A.1.3.M DP infarctus du myocarde, âge 45-64, mortalité',
 'A.1.4.M DP infarctus du myocarde, âge 65-84, mortalité',
 'A.1.5.M DP infarctus du myocarde, âge >84, mortalité',
 'A.1.7.M DP infarctus du myocarde (âge >19), admissions directes, mortalité',
 "A.1.8.M DP infarctus du myocarde (âge >19), transféré d'un autre hôpital, mortalité",
 'A.1.18.M DP infarctus du myocarde (âge >19), réanimation avant admission, mortalité',
 'A.1.14.P DP infarctus du myocarde avec cathétérisme des vaisseaux coronaires (âge >19), pourcentage',
 'A.1.15.P DP infarctus du myocarde avec cathétérisme des vaisseaux coronaires (âge >19), admissions directes, pourcentage',
 "A.1.16.P DP infarctus du myocarde avec cathétérisme des vaisseaux coronaires (âge >19), transféré d'un autre hôpital, pourcentage",
 'A.1.17.P DP infarctus du myocarde avec cathétérisme cardiaque ou OP coronarienne (âge >1

In [51]:
# Get only the codes: so keep everything before the first space
string = cases_3['indicator'].str.split(' ').str[0]
cases_3 = cases_3.assign(indicator_3=string)


In [52]:
cases_3

Unnamed: 0,institution,indicator,nombre de cas 2021,number_of_cases_2,indicator_3
2,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.1.M DP infarctus du myocarde (âge >19), mo...",510,510.0,A.1.1.M
3,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.2.M DP infarctus du myocarde, âge 20-44, m...",22,22.0,A.1.2.M
4,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.3.M DP infarctus du myocarde, âge 45-64, m...",237,237.0,A.1.3.M
5,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.4.M DP infarctus du myocarde, âge 65-84, m...",171,171.0,A.1.4.M
6,Les Hôpitaux Universitaires de GenEve HUG - HU...,"A.1.5.M DP infarctus du myocarde, âge >84, mor...",80,80.0,A.1.5.M
...,...,...,...,...,...
139890,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.41.F Plexuschirurgie,0,0.0,Z.4.41.F
139891,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.42.F Gynäkologische Tumore,0,0.0,Z.4.42.F
139892,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.43.F Anerkanntes zertifiziertes Brustzentrum,0,0.0,Z.4.43.F
139893,Stiftung Ostschweizer Kinderspital - Romerhuus...,Z.4.44.F Geburtshilfe (ab GA 32 0/7 SSW und GG...,0,0.0,Z.4.44.F


In [53]:
cases_3.drop(columns=["indicator", 'nombre de cas 2021'])

Unnamed: 0,institution,number_of_cases_2,indicator_3
2,Les Hôpitaux Universitaires de GenEve HUG - HU...,510.0,A.1.1.M
3,Les Hôpitaux Universitaires de GenEve HUG - HU...,22.0,A.1.2.M
4,Les Hôpitaux Universitaires de GenEve HUG - HU...,237.0,A.1.3.M
5,Les Hôpitaux Universitaires de GenEve HUG - HU...,171.0,A.1.4.M
6,Les Hôpitaux Universitaires de GenEve HUG - HU...,80.0,A.1.5.M
...,...,...,...
139890,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,Z.4.41.F
139891,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,Z.4.42.F
139892,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,Z.4.43.F
139893,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,Z.4.44.F


## Pivot the dataset

In [54]:
df_case = cases_3.pivot(index='institution', columns='indicator_3', values='number_of_cases_2')

In [55]:
cases_4=df_case.reset_index()
cases_4

indicator_3,institution,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,Z.4.41.F,Z.4.42.F,Z.4.43.F,Z.4.44.F,Z.4.45.F,Z.4.5.F,Z.4.6.F,Z.4.7.F,Z.4.8.F,Z.4.9.F
0,AMEOS Spital Einsiedeln AG - AMEOS Spital Eins...,34.0,11.0,16.0,16.0,18.0,34.0,28.0,4.0,4.0,...,1.0,0.0,7.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,"Adus Medica AG - Adus Medica AG, Breitestrasse...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Andreas Klinik - Andreas Klinik, Rigistrasse 1...",15.0,9.0,6.0,6.0,9.0,15.0,2.0,2.0,2.0,...,3.0,4.0,69.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Asana Spital Leuggern AG - Asana Spital Leugge...,43.0,38.0,17.0,17.0,22.0,43.0,5.0,0.0,0.0,...,0.0,3.0,17.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Asana Spital Menziken AG - Asana Spital Menzik...,48.0,21.0,16.0,16.0,31.0,48.0,6.0,3.0,3.0,...,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,Universitätsspital Basel - Universitätsspital ...,838.0,77.0,394.0,394.0,436.0,838.0,488.0,697.0,506.0,...,33.0,112.0,302.0,111.0,10.0,228.0,20.0,18.0,0.0,7.0
228,Universitätsspital Zürich - UniversitätsSpital...,714.0,138.0,314.0,314.0,389.0,707.0,314.0,653.0,400.0,...,9.0,99.0,170.0,303.0,68.0,413.0,72.0,88.0,13.0,5.0
229,"Uroviva Klinik AG - Uroviva Klinik AG, Zürichs...",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
230,Venenklinik Bellevue AG - Venenklinik Bellevue...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Include the correspondance name in the cases dataframe

In [56]:
file_path = 'C:/Users/Gaëlle/Documents/_CAS applied data science/3. Module 3 Data analysis and machine learning/Project/correspondances_2021.xlsx'
names_hosp = pd.read_excel(file_path)

In [57]:
names_hosp

Unnamed: 0,Nom_df1,Nom_df2
0,Les Hôpitaux Universitaires de GenEve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
1,Les Hôpitaux Universitaires de GenEve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
2,Les Hôpitaux Universitaires de GenEve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
3,Les Hôpitaux Universitaires de GenEve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
4,Les Hôpitaux Universitaires de GenEve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
...,...,...
229,Swiss Medical Network Hospitals SA Clinique de...,SMN - Clinique de Valère
230,Swiss Medical Network Hospitals SA Privatklini...,SMN - Privatklinik Obach
231,GSMN Schweiz AG Privatklinik Lindberg - Privat...,GSMN Schweiz AG - Privatklinik Lindberg
232,Swiss Medical Network Hospitals SA Privatklini...,SMN - Privatklinik Siloah


In [58]:
# Merge to obtain in the dataframe cases the right names of hospitals
cases_F = cases_4.merge(names_hosp, left_on='institution', right_on='Nom_df1', how='left')

In [59]:
cases_F

Unnamed: 0,institution,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,Z.4.43.F,Z.4.44.F,Z.4.45.F,Z.4.5.F,Z.4.6.F,Z.4.7.F,Z.4.8.F,Z.4.9.F,Nom_df1,Nom_df2
0,AMEOS Spital Einsiedeln AG - AMEOS Spital Eins...,34.0,11.0,16.0,16.0,18.0,34.0,28.0,4.0,4.0,...,7.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,AMEOS Spital Einsiedeln AG - AMEOS Spital Eins...,AMEOS Spital Einsiedeln AG
1,"Adus Medica AG - Adus Medica AG, Breitestrasse...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Adus Medica AG - Adus Medica AG, Breitestrasse...",Adus Medica AG
2,"Andreas Klinik - Andreas Klinik, Rigistrasse 1...",15.0,9.0,6.0,6.0,9.0,15.0,2.0,2.0,2.0,...,69.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"Andreas Klinik - Andreas Klinik, Rigistrasse 1...",Andreas Klinik
3,Asana Spital Leuggern AG - Asana Spital Leugge...,43.0,38.0,17.0,17.0,22.0,43.0,5.0,0.0,0.0,...,17.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Asana Spital Leuggern AG - Asana Spital Leugge...,Asana Spital Leuggern AG
4,Asana Spital Menziken AG - Asana Spital Menzik...,48.0,21.0,16.0,16.0,31.0,48.0,6.0,3.0,3.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,Asana Spital Menziken AG - Asana Spital Menzik...,Asana Spital Menziken AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,Universitätsspital Basel - Universitätsspital ...,838.0,77.0,394.0,394.0,436.0,838.0,488.0,697.0,506.0,...,302.0,111.0,10.0,228.0,20.0,18.0,0.0,7.0,Universitätsspital Basel - Universitätsspital ...,Universitätsspital Basel
230,Universitätsspital Zürich - UniversitätsSpital...,714.0,138.0,314.0,314.0,389.0,707.0,314.0,653.0,400.0,...,170.0,303.0,68.0,413.0,72.0,88.0,13.0,5.0,Universitätsspital Zürich - UniversitätsSpital...,Universitätsspital Zürich
231,"Uroviva Klinik AG - Uroviva Klinik AG, Zürichs...",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Uroviva Klinik AG - Uroviva Klinik AG, Zürichs...",Uroviva Klinik AG
232,Venenklinik Bellevue AG - Venenklinik Bellevue...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Venenklinik Bellevue AG - Venenklinik Bellevue...,Venenklinik Bellevue AG


In [60]:
# Here, before we merge the cases dataset with the institutions dataset, we need to be sure that there is only one row per hospital
# we can see that there are many duplicates
duplicates = cases_F.duplicated(['Nom_df2'])
count_duplicates = duplicates.sum()
count_duplicates

79

In [61]:
cases_F_0=cases_F.drop(columns=["institution", 'Nom_df1'])

In [62]:
# We need to group_by the name of institution and sum
result = cases_F_0.groupby('Nom_df2').agg('sum').reset_index()

In [63]:
# No more duplicate, the dataset is now ready to be merged with the institution dataset
duplicates = result.duplicated(['Nom_df2'])
count_duplicates = duplicates.sum()
count_duplicates

0

## Merge with the infrastructure dataset

In [64]:
df = result.merge(hospital_data, left_on='Nom_df2', right_on='Inst', how='left')

In [65]:
df

Unnamed: 0,Nom_df2,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,ErlOKPAmbB,ErlStatB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,ErlLangB,PTageLang,AustLang,KostLangT
0,AMEOS Spital Einsiedeln AG,34.0,11.0,16.0,16.0,18.0,34.0,28.0,4.0,4.0,...,,,,,,,,0.0,0.0,0.0
1,Adus Medica AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
2,Andreas Klinik,15.0,9.0,6.0,6.0,9.0,15.0,2.0,2.0,2.0,...,,,,,,,,0.0,0.0,0.0
3,Asana Spital Leuggern AG,43.0,38.0,17.0,17.0,22.0,43.0,5.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
4,Asana Spital Menziken AG,48.0,21.0,16.0,16.0,31.0,48.0,6.0,3.0,3.0,...,,,,,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,Universitätsspital Basel,838.0,77.0,394.0,394.0,436.0,838.0,488.0,697.0,506.0,...,,,,,,,,0.0,0.0,0.0
151,Universitätsspital Zürich,714.0,138.0,314.0,314.0,389.0,707.0,314.0,653.0,400.0,...,,,,,,,,47.0,12.0,33869.0
152,Uroviva Klinik AG,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
153,Venenklinik Bellevue AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0


# Explore the C-section data

In [66]:
df[['G.1.4.P', 'G.1.5.P', 'G.1.6.P', 'G.1.7.P']]

# G.1.4.P: césariennes
# G.1.5.P: Césariennes en cas d'accouchement à faible risque  
# G.1.6.P: Césariennes en cas d'accouchement à faible risque, âge <35  
# G.1.7.P: Césariennes en cas d'accouchement à faible risque, âge >34
#  G.1.7.P +  G.1.6.P = G.1.5.P
# Therefore, G.1.4.P is the total amont of c-section, the most important variable!

Unnamed: 0,G.1.4.P,G.1.5.P,G.1.6.P,G.1.7.P
0,116.0,96.0,68.0,28.0
1,0.0,0.0,0.0,0.0
2,286.0,248.0,149.0,99.0
3,200.0,168.0,109.0,59.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
150,987.0,645.0,355.0,290.0
151,1253.0,714.0,379.0,335.0
152,0.0,0.0,0.0,0.0
153,0.0,0.0,0.0,0.0


In [67]:
# Which hospitals haven't perform any c-section?
test=df[df['G.1.4.P']==0]
#print(test)

# We can observe that by doing this operation we are deleting automatically the maison de naissances, etc.

In [68]:
# let's delete then 
df_2=df[df['G.1.4.P']!=0]

In [69]:
df_2.shape

(81, 725)

In [70]:
df_2.to_excel('DF_2021_clean.xlsx', index=False) 