# Only for the year 2020

In [48]:
import io
import os
import re

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats

In [49]:
# Hospital infrastructure data: the workbook sheet holds the 2008-2021 time series
# and is read directly from the download URL.
sheet = 'KZ2008-KZ2021'
url = "https://spitalstatistik.bagapps.ch/data/download/kzp21_KZ_TimeSerie.xlsx?v=1678279041"

# Load the sheet and keep only the rows whose JAHR (year) column equals 2020.
hospital_data = pd.read_excel(url, sheet_name=sheet).loc[
    lambda frame: frame['JAHR'] == 2020
]

In [50]:
# Import the data on the 2020 cases (cleaned by Lenja; the workbook is in the shared drive under "data").
# The default location is the original author's local copy. On any other machine, set the
# CASES_2020_XLSX environment variable to the path of 2020_clean.xlsx instead of editing this cell.
file_path = os.environ.get(
    'CASES_2020_XLSX',
    'C:/Users/Gaëlle/Documents/_CAS applied data science/3. Module 3 Data analysis and machine learning/Project/2020_clean.xlsx',
)
cases = pd.read_excel(file_path)

In [51]:
# Discard the variables that are of no interest here (observed rate, expected rate, SMR).
# Note: this reassigns `cases`, so running the cell a second time raises a KeyError
# because the columns are already gone.
columns_to_drop = ['taux observE 2020', 'taux attendu 2020', 'SMR 2020']
cases = cases.drop(columns_to_drop, axis='columns')

## Infrastructure of hospitals

In [52]:
# Peek at the last five rows of the 2020 infrastructure data.
hospital_data.tail(5)

Unnamed: 0,JAHR,KT,Status,Inst,Adr,Ort,Typ,Notfalldienst,Infrastruktur1,Infrastruktur2,...,ErlOKPAmbB,ErlStatB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,ErlLangB,PTageLang,AustLang,KostLangT
3793,2020,ZH,,Geburtshaus Delphys,Badenerstrasse 177,8003 Zürich,K232,,,,...,285925.0,,1870137.0,45.000019,0.0,,,0.0,0.0,0.0
3794,2020,ZH,,Universitäts-Kinderspital Zürich - Das Spital ...,Steinwiesstrasse 75,8032 Zürich,K233,,,,...,,,,,,,,0.0,0.0,0.0
3795,2020,ZH,,Klinik Lengg AG,Bleulerstrasse 60,8008 Zürich,K235,,,,...,,,,,,,,0.0,0.0,0.0
3796,2020,ZH,,Klinik Susenberg,Schreberweg 9,8044 Zürich,K235,,,,...,,,,,,,,0.0,0.0,0.0
3797,2020,ZH,,Sune-Egge,Konradstrasse 62,8005 Zürich,K235,,,,...,,,,,,,,0.0,0.0,0.0


In [53]:
# Check that every line is a unique hospital: count the rows whose institution
# name ('Inst') already appeared in an earlier row (0 means all names are distinct).
duplicates = hospital_data['Inst'].duplicated()
count_duplicates = duplicates.sum()
count_duplicates

0

## Cases

## Clean the "nombre de cas 2020"

In [54]:
# Peek at the first five rows of the cases data.
cases.head(5)

Unnamed: 0,institution,indicator,nombre de cas 2020
0,Les HÙpitaux Universitaires de GenËve HUG - HU...,A Maladies cardiaques,
1,Les HÙpitaux Universitaires de GenËve HUG - HU...,A.1 Infarctus du myocarde,
2,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",19.0
3,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",0.0
4,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",3.0


Let's clean the "nombre de cas 2020" variable (the cleaned values are stored in a new column, "number_of_cases_2").

In [55]:
## Information about the format of the data in the variable:
# From https://spitalstatistik.bagapps.ch/data/download/qip21_publikation.pdf?v=1680713881, page 4
# (the legend is given there in German and in French; English translation below)

# Symbols used when no figure is given
# ... figure unknown because not (yet) collected or not (yet) calculated
# *   omitted because trivial / obvious, or because the terms do not apply
# –   used for rounded figures; stands for the value absolute zero

In [56]:
# Collect the distinct raw values of the case-count column to see which formats occur.
# (Call valeur_nb_cases.tolist() to print them all.)
valeur_nb_cases = pd.unique(cases['nombre de cas 2020'])

In [57]:
# Change the thousands separator: counts written like 1'234 become the integer 1234.
# The cleaned values are stored in a new column called "number_of_cases_2".
def _strip_thousands_separator(value):
    """Return ``value`` as an int if it is a string of digits once "'" is removed.

    Non-string values, and strings that are not purely digits after removing the
    apostrophes, are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    digits = value.replace("'", "")
    return int(digits) if digits.isdigit() else value


cases['number_of_cases_2'] = cases['nombre de cas 2020'].apply(_strip_thousands_separator)

In [58]:
# Some cells hold a repeated column title (e.g. "Fallzahl 2021") instead of a count.
# Find them by their first letters and leave those rows out of a new dataset.

## Flag the title-like cells. na=False makes every cell that is not a string
## (i.e. already a number) count as "not a title".
raw_counts = cases['number_of_cases_2']
is_header_row = (
    raw_counts.str.startswith('nomb', na=False)
    | raw_counts.str.startswith('nume', na=False)
    | raw_counts.str.startswith('Fall', na=False)
)

## Boolean mask of the rows to keep, applied to build the new dataset
masque = ~is_header_row
cases_2 = cases[masque]

In [59]:
# Identify whether some values that do not start with a digit remain.
# Each cell is converted to text first (numbers included), then tested against the
# pattern with the vectorised .str.match, which anchors at the start of the string.
# (`re` is no longer needed here; it is imported with the other modules in the first cell.)
starts_with_digit = cases_2['number_of_cases_2'].map(str).str.match(r'^\d')
non_digit_start = cases_2[~starts_with_digit]
value_nb_cases_bizarre = non_digit_start['number_of_cases_2'].unique()
value_nb_cases_bizarre
# there are still ' ', '-'

array([' ', '-'], dtype=object)

In [60]:
# Rows whose value starts with "-" or with a blank " " are deleted.
# Missing (NaN) cells are NOT removed here: na=False marks them as "keep".
# NOTE(review): the legend quoted above says "–" stands for zero; confirm that dropping
# these rows (rather than setting them to 0) is intended.
remaining_values = cases_2['number_of_cases_2']
masque = (
    ~remaining_values.str.startswith(' ', na=False)
    & ~remaining_values.str.startswith('-', na=False)
)
cases_3 = cases_2[masque]

In [61]:
# number_of_cases_2 should be a float: cast the cleaned column (still a mix of
# integers and numeric strings at this point) and write it back into the frame.
float_value = cases_3['number_of_cases_2'].astype('float64')
cases_3 = cases_3.assign(**{'number_of_cases_2': float_value})

In [62]:
# Compare the size of the dataset before and after cleaning.
# (`cases` already contains the added number_of_cases_2 column at this point.)
for label, frame in (
    ('the shape of the initial dataset:', cases),
    ('the shape of the dataset after the cleaning:', cases_3),
):
    print(label, frame.shape)

the shape of the initial dataset: (117233, 4)
the shape of the dataset after the cleaning: (97344, 4)


In [63]:
# Preview the cleaned data without the raw count column.
# The result is only displayed, not assigned: cases_3 itself keeps the column.
cases_3.drop('nombre de cas 2020', axis='columns')

Unnamed: 0,institution,indicator,number_of_cases_2
2,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",19.0
3,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",0.0
4,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",3.0
5,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...",7.0
6,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...",9.0
...,...,...,...
117228,Stiftung Ostschweizer Kinderspital - Romerhuus...,F.2.13.V Anteil Verlegungen bei Resektion/Ersa...,0.0
117229,Stiftung Ostschweizer Kinderspital - Romerhuus...,F.3.1.V Anteil Verlegungen bei OP an Becken≠/B...,0.0
117230,Stiftung Ostschweizer Kinderspital - Romerhuus...,I.1.8.V Anteil Verlegungen bei H¸ft≠Endoprothe...,0.0
117231,Stiftung Ostschweizer Kinderspital - Romerhuus...,J.1.1.V Anteil Verlegungen bei Beatmungsf‰llen...,0.0


In [64]:
# Each (institution, indicator) pair should occur only once: count the repeated pairs.
duplicates = cases_3[['institution', 'indicator']].duplicated()
count_duplicates = duplicates.sum()
count_duplicates

0

## Clean the indicators of the cases

In [65]:
# List every distinct indicator label.
# (The variable name is reused from the case-count cell; here it holds indicator labels.)
valeur_nb_cases = pd.unique(cases_3['indicator'])
valeur_nb_cases.tolist()

['A.1.1.M DP infarctus du myocarde (‚ge >19), mortalitE',
 'A.1.2.M DP infarctus du myocarde, ‚ge 20-44, mortalitE',
 'A.1.3.M DP infarctus du myocarde, ‚ge 45-64, mortalitE',
 'A.1.4.M DP infarctus du myocarde, ‚ge 65-84, mortalitE',
 'A.1.5.M DP infarctus du myocarde, ‚ge >84, mortalitE',
 'A.1.7.M DP infarctus du myocarde (‚ge >19), admissions directes, non transfErE, mortalitE',
 "A.1.8.M DP infarctus du myocarde (‚ge >19), transfErE d'un autre hÙpital, mortalitE",
 'A.1.14.P DP infarctus du myocarde avec cathEtErisme des vaisseaux coronaires (‚ge >19), pourcentage',
 'A.1.15.P DP infarctus du myocarde avec cathEtErisme des vaisseaux coronaires (‚ge >19), admissions directes, pourcentage',
 "A.1.16.P DP infarctus du myocarde avec cathEtErisme des vaisseaux coronaires (‚ge >19), transfErE d'un autre hÙpital, pourcentage",
 'A.1.17.P DP infarctus du myocarde avec cathEtErisme cardiaque ou OP coronarienne (‚ge >19), admissions directes, pourcentage',
 'A.1.9.P Infarctus aigu du myocar

In [66]:
# Each label starts with its indicator code followed by a space (e.g. 'A.1.1.M DP ...').
# Keep only the code, i.e. everything before the first space, in a new column.
string = cases_3['indicator'].str.split(' ', n=1).str.get(0)
cases_3 = cases_3.assign(**{'indicator_3': string})


In [67]:
# Display the frame, now including the 'indicator_3' code column.
cases_3

Unnamed: 0,institution,indicator,nombre de cas 2020,number_of_cases_2,indicator_3
2,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",19,19.0,A.1.1.M
3,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",0,0.0,A.1.2.M
4,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",3,3.0,A.1.3.M
5,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...",7,7.0,A.1.4.M
6,Les HÙpitaux Universitaires de GenËve HUG - HU...,"A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...",9,9.0,A.1.5.M
...,...,...,...,...,...
117228,Stiftung Ostschweizer Kinderspital - Romerhuus...,F.2.13.V Anteil Verlegungen bei Resektion/Ersa...,0,0.0,F.2.13.V
117229,Stiftung Ostschweizer Kinderspital - Romerhuus...,F.3.1.V Anteil Verlegungen bei OP an Becken≠/B...,0,0.0,F.3.1.V
117230,Stiftung Ostschweizer Kinderspital - Romerhuus...,I.1.8.V Anteil Verlegungen bei H¸ft≠Endoprothe...,0,0.0,I.1.8.V
117231,Stiftung Ostschweizer Kinderspital - Romerhuus...,J.1.1.V Anteil Verlegungen bei Beatmungsf‰llen...,0,0.0,J.1.1.V


In [68]:
# Preview the columns that matter from here on (institution, count, indicator code).
# The result is only displayed, not assigned: cases_3 itself keeps all its columns.
cases_3.drop(["indicator", 'nombre de cas 2020'], axis='columns')

Unnamed: 0,institution,number_of_cases_2,indicator_3
2,Les HÙpitaux Universitaires de GenËve HUG - HU...,19.0,A.1.1.M
3,Les HÙpitaux Universitaires de GenËve HUG - HU...,0.0,A.1.2.M
4,Les HÙpitaux Universitaires de GenËve HUG - HU...,3.0,A.1.3.M
5,Les HÙpitaux Universitaires de GenËve HUG - HU...,7.0,A.1.4.M
6,Les HÙpitaux Universitaires de GenËve HUG - HU...,9.0,A.1.5.M
...,...,...,...
117228,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,F.2.13.V
117229,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,F.3.1.V
117230,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,I.1.8.V
117231,Stiftung Ostschweizer Kinderspital - Romerhuus...,0.0,J.1.1.V


## Pivot the dataset

In [69]:
# Reshape from long to wide: one row per institution, one column per indicator code,
# each cell holding that institution's number of cases for the indicator.
# pivot raises a ValueError if an (institution, code) pair occurs more than once.
df_case = cases_3.pivot(
    values='number_of_cases_2',
    columns='indicator_3',
    index='institution',
)

In [70]:
# Turn the 'institution' index back into an ordinary column, then display the wide table.
cases_4 = df_case.reset_index(drop=False)
cases_4

indicator_3,institution,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,Y.4.3.M,Y.4.4.M,Y.4.5.M,Y.4.6.M,Y.5.1.M,Y.5.1.P,Y.5.2.P,Y.5.3.M,Y.5.4.P,Z.1.1.X
0,AMEOS Spital Einsiedeln AG - AMEOS Spital Eins...,34.0,7.0,13.0,13.0,20.0,34.0,34.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,20.0,0.0,19.0
1,"Adus Medica AG - Adus Medica AG, Breitestrasse...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0
2,"Andreas Klinik - Andreas Klinik, Rigistrasse 1...",9.0,6.0,4.0,4.0,5.0,9.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,7.0
3,Asana Spital Leuggern AG - Asana Spital Leugge...,27.0,23.0,7.0,7.0,16.0,27.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,18.0
4,Asana Spital Menziken AG - Asana Spital Menzik...,36.0,16.0,16.0,16.0,19.0,36.0,5.0,2.0,2.0,...,0.0,0.0,0.0,0.0,1.0,1.0,5.0,45.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,Universit‰tsspital Basel - Universit‰tsspital ...,823.0,72.0,387.0,387.0,432.0,823.0,507.0,711.0,490.0,...,1.0,3.0,4.0,71.0,49.0,49.0,64.0,123.0,4.0,84.0
230,Universit‰tsspital Z¸rich - Universit‰tsSpital...,612.0,142.0,264.0,264.0,342.0,608.0,348.0,558.0,350.0,...,6.0,6.0,15.0,11.0,203.0,203.0,54.0,252.0,12.0,133.0
231,"Uroviva Klinik AG - Uroviva Klinik AG, Z¸richs...",0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
232,Venenklinik Bellevue AG - Venenklinik Bellevue...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Add the corresponding hospital names to the cases dataframe

In [71]:
# Load the correspondence table between the institution names of the cases data (Nom_df1)
# and the hospital names to use instead (Nom_df2).
# The default location is the original author's local copy. On any other machine, set the
# CORRESPONDANCE_2020_XLSX environment variable to the path of correspondance_2020.xlsx.
file_path = os.environ.get(
    'CORRESPONDANCE_2020_XLSX',
    'C:/Users/Gaëlle/Documents/_CAS applied data science/3. Module 3 Data analysis and machine learning/Project/correspondance_2020.xlsx',
)
names_hosp = pd.read_excel(file_path)

In [72]:
# Display the correspondence table (columns Nom_df1 and Nom_df2).
names_hosp

Unnamed: 0,Nom_df1,Nom_df2
0,Les HÙpitaux Universitaires de GenËve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
1,Les HÙpitaux Universitaires de GenËve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
2,Les HÙpitaux Universitaires de GenËve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
3,Les HÙpitaux Universitaires de GenËve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
4,Les HÙpitaux Universitaires de GenËve HUG - HU...,Les Hôpitaux Universitaires de Genève HUG
...,...,...
229,Universit‰ts-Kinderspital Z¸rich das Spital de...,Universitäts-Kinderspital Zürich - Das Spital ...
230,Universit‰ts-Kinderspital Z¸rich das Spital de...,Universitäts-Kinderspital Zürich - Das Spital ...
231,Universit‰ts-Kinderspital beider Basel (UKBB) ...,Universitäts-Kinderspital beider Basel (UKBB)
232,Stiftung Ostschweizer Kinderspital - Ostschwei...,Stiftung Ostschweizer Kinderspital


In [73]:
# Merge to obtain, in the cases dataframe, the right names of hospitals (Nom_df2).
# validate='many_to_one' makes pandas raise a MergeError if a name occurs more than once in
# names_hosp['Nom_df1']; such duplicates would otherwise silently duplicate hospital rows.
cases_F = cases_4.merge(
    names_hosp,
    left_on='institution',
    right_on='Nom_df1',
    how='left',
    validate='many_to_one',
)

# how='left' keeps institutions that have no entry in the correspondence table and leaves
# their Nom_df2 missing. Stop here rather than carry hospitals without a name forward.
assert cases_F['Nom_df2'].notna().all(), (
    "institutions without a match in names_hosp: "
    f"{cases_F.loc[cases_F['Nom_df2'].isna(), 'institution'].tolist()}"
)

In [74]:
# Display the merged frame; Nom_df1 and Nom_df2 are the last two columns.
cases_F

Unnamed: 0,institution,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,Y.4.5.M,Y.4.6.M,Y.5.1.M,Y.5.1.P,Y.5.2.P,Y.5.3.M,Y.5.4.P,Z.1.1.X,Nom_df1,Nom_df2
0,AMEOS Spital Einsiedeln AG - AMEOS Spital Eins...,34.0,7.0,13.0,13.0,20.0,34.0,34.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,20.0,0.0,19.0,AMEOS Spital Einsiedeln AG - AMEOS Spital Eins...,AMEOS Spital Einsiedeln AG
1,"Adus Medica AG - Adus Medica AG, Breitestrasse...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,"Adus Medica AG - Adus Medica AG, Breitestrasse...",Adus Medica AG
2,"Andreas Klinik - Andreas Klinik, Rigistrasse 1...",9.0,6.0,4.0,4.0,5.0,9.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,7.0,"Andreas Klinik - Andreas Klinik, Rigistrasse 1...",Andreas Klinik
3,Asana Spital Leuggern AG - Asana Spital Leugge...,27.0,23.0,7.0,7.0,16.0,27.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.0,0.0,18.0,Asana Spital Leuggern AG - Asana Spital Leugge...,Asana Spital Leuggern AG
4,Asana Spital Menziken AG - Asana Spital Menzik...,36.0,16.0,16.0,16.0,19.0,36.0,5.0,2.0,2.0,...,0.0,0.0,1.0,1.0,5.0,45.0,0.0,0.0,Asana Spital Menziken AG - Asana Spital Menzik...,Asana Spital Menziken AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,Universit‰tsspital Basel - Universit‰tsspital ...,823.0,72.0,387.0,387.0,432.0,823.0,507.0,711.0,490.0,...,4.0,71.0,49.0,49.0,64.0,123.0,4.0,84.0,Universit‰tsspital Basel - Universit‰tsspital ...,Universitätsspital Basel
230,Universit‰tsspital Z¸rich - Universit‰tsSpital...,612.0,142.0,264.0,264.0,342.0,608.0,348.0,558.0,350.0,...,15.0,11.0,203.0,203.0,54.0,252.0,12.0,133.0,Universit‰tsspital Z¸rich - Universit‰tsSpital...,Universitätsspital Zürich
231,"Uroviva Klinik AG - Uroviva Klinik AG, Z¸richs...",0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Uroviva Klinik AG - Uroviva Klinik AG, Z¸richs...",Uroviva Klinik AG
232,Venenklinik Bellevue AG - Venenklinik Bellevue...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Venenklinik Bellevue AG - Venenklinik Bellevue...,Venenklinik Bellevue AG


In [75]:
# Before merging the cases dataset with the institutions dataset there must be exactly
# one row per hospital. Counting the repeated Nom_df2 values shows that there are many duplicates.
duplicates = cases_F['Nom_df2'].duplicated()
count_duplicates = duplicates.sum()
count_duplicates

78

In [76]:
# Keep Nom_df2 as the only name column; the two original-name columns are removed.
cases_F_0 = cases_F.drop(["institution", 'Nom_df1'], axis='columns')

In [77]:
# Collapse the rows sharing the same hospital name (Nom_df2) into one row by summing
# every indicator column, then turn Nom_df2 back into an ordinary column.
result = cases_F_0.groupby('Nom_df2').sum().reset_index()

In [78]:
# Display the aggregated frame (one row per Nom_df2).
result

Unnamed: 0,Nom_df2,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,Y.4.3.M,Y.4.4.M,Y.4.5.M,Y.4.6.M,Y.5.1.M,Y.5.1.P,Y.5.2.P,Y.5.3.M,Y.5.4.P,Z.1.1.X
0,AMEOS Spital Einsiedeln AG,34.0,7.0,13.0,13.0,20.0,34.0,34.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,20.0,0.0,19.0
1,Adus Medica AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0
2,Andreas Klinik,9.0,6.0,4.0,4.0,5.0,9.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,7.0
3,Asana Spital Leuggern AG,27.0,23.0,7.0,7.0,16.0,27.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,18.0
4,Asana Spital Menziken AG,36.0,16.0,16.0,16.0,19.0,36.0,5.0,2.0,2.0,...,0.0,0.0,0.0,0.0,1.0,1.0,5.0,45.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,Universitätsspital Basel,823.0,72.0,387.0,387.0,432.0,823.0,507.0,711.0,490.0,...,1.0,3.0,4.0,71.0,49.0,49.0,64.0,123.0,4.0,84.0
152,Universitätsspital Zürich,612.0,142.0,264.0,264.0,342.0,608.0,348.0,558.0,350.0,...,6.0,6.0,15.0,11.0,203.0,203.0,54.0,252.0,12.0,133.0
153,Uroviva Klinik AG,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154,Venenklinik Bellevue AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# Verify that no hospital name is repeated any more; the dataset is then ready to be
# merged with the institution dataset.
duplicates = result['Nom_df2'].duplicated()
count_duplicates = duplicates.sum()
count_duplicates

0

## Merge with the infrastructure dataset

In [80]:
# Attach the 2020 infrastructure figures to the aggregated case counts by matching
# result['Nom_df2'] against hospital_data['Inst']. how='left' keeps every row of `result`.
# validate='many_to_one' raises a MergeError if 'Inst' is not unique in hospital_data,
# which would otherwise silently duplicate hospital rows.
df = result.merge(
    hospital_data,
    left_on='Nom_df2',
    right_on='Inst',
    how='left',
    validate='many_to_one',
)

In [81]:
# Display the combined frame (case indicators followed by the infrastructure columns).
df

Unnamed: 0,Nom_df2,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,ErlOKPAmbB,ErlStatB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,ErlLangB,PTageLang,AustLang,KostLangT
0,AMEOS Spital Einsiedeln AG,34.0,7.0,13.0,13.0,20.0,34.0,34.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
1,Adus Medica AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
2,Andreas Klinik,9.0,6.0,4.0,4.0,5.0,9.0,1.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
3,Asana Spital Leuggern AG,27.0,23.0,7.0,7.0,16.0,27.0,8.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
4,Asana Spital Menziken AG,36.0,16.0,16.0,16.0,19.0,36.0,5.0,2.0,2.0,...,,,,,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,Universitätsspital Basel,823.0,72.0,387.0,387.0,432.0,823.0,507.0,711.0,490.0,...,,,,,,,,0.0,0.0,0.0
152,Universitätsspital Zürich,612.0,142.0,264.0,264.0,342.0,608.0,348.0,558.0,350.0,...,,,,,,,,37.0,6.0,26263.0
153,Uroviva Klinik AG,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0
154,Venenklinik Bellevue AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,0.0,0.0,0.0


# Explore the C-section data

In [82]:
# G.1.4.P: caesarean sections
# G.1.5.P: caesarean sections in low-risk deliveries
# G.1.6.P: caesarean sections in low-risk deliveries, age <35
# G.1.7.P: caesarean sections in low-risk deliveries, age >34
# G.1.7.P + G.1.6.P = G.1.5.P
# Therefore, G.1.4.P is the total amount of C-sections, the most important variable!
df.loc[:, ['G.1.4.P', 'G.1.5.P', 'G.1.6.P', 'G.1.7.P']]

Unnamed: 0,G.1.4.P,G.1.5.P,G.1.6.P,G.1.7.P
0,85.0,71.0,45.0,26.0
1,0.0,0.0,0.0,0.0
2,268.0,234.0,139.0,95.0
3,202.0,160.0,113.0,47.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
151,928.0,635.0,365.0,270.0
152,1222.0,727.0,386.0,341.0
153,0.0,0.0,0.0,0.0
154,0.0,0.0,0.0,0.0


In [83]:
# Which hospitals have not performed any C-section?
# (Display `test` to inspect them.)
test = df.loc[df['G.1.4.P'].eq(0)]

# Observation: selecting on zero C-sections automatically picks out the birthing centres
# ("maisons de naissance"), etc., so removing these rows removes them as well.

In [84]:
# Remove those hospitals: keep only the rows whose total C-section count is not zero.
df_2 = df.loc[df['G.1.4.P'].ne(0)]

In [85]:
# Dimensions (rows, columns) of the filtered frame.
df_2.shape

(82, 633)

In [86]:
# Write the filtered frame to an Excel workbook; the relative file name is resolved against
# the current working directory, and index=False leaves the row index out of the file.
df_2.to_excel('DF_2020_clean.xlsx', index=False) 