# Only for the year 2018

In [111]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import io
import statsmodels.api as sm
from scipy import stats
import geopandas as gpd
from google.colab import drive

In [112]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [113]:
#Import the data on the infrastructure of the hospital (2008-2021)
url = "https://spitalstatistik.bagapps.ch/data/download/kzp21_KZ_TimeSerie.xlsx?v=1678279041"
sheet = 'KZ2008-KZ2021'
hospital_data = pd.read_excel(url, sheet_name=sheet)

#Only the 2021 data
hospital_data = hospital_data[hospital_data['JAHR'] == 2018]

In [114]:
#Import the data on the 2018 cases, cleaned by Lenja and in the shared drive under data
cases = pd.read_csv('/content/drive/MyDrive/Module1_CDR/DATA/2018_clean.csv', delimiter=';')


  cases = pd.read_csv('/content/drive/MyDrive/Module1_CDR/DATA/2018_clean.csv', delimiter=';')


In [115]:
cases.columns

Index(['institution', 'indicator', 'taux observE 2013-2017',
       'taux attendu 2013-2017', 'SMR 2013-2017', 'nombre de cas 2013-2017',
       'taux observE 2018', 'taux attendu 2018', 'SMR 2018',
       'nombre de cas 2018', 'Unnamed: 10', 'Unnamed: 11'],
      dtype='object')

In [116]:
columns_to_drop = ['taux observE 2018', 'taux attendu 2018', 'SMR 2018', 'taux observE 2013-2017',
       'taux attendu 2013-2017', 'SMR 2013-2017', 'nombre de cas 2013-2017','Unnamed: 10', 'Unnamed: 11']
cases = cases.drop(columns=columns_to_drop)

In [117]:
cases.columns

Index(['institution', 'indicator', 'nombre de cas 2018'], dtype='object')

## Infrastructure of hospitals

In [118]:
hospital_data.tail()

Unnamed: 0,JAHR,KT,Status,Inst,Adr,Ort,Typ,Notfalldienst,Infrastruktur1,Infrastruktur2,...,ErlOKPAmbB,ErlStatB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,ErlLangB,PTageLang,AustLang,KostLangT
3236,2018,ZH,,Geburtshaus Delphys,Badenerstrasse 177,8003 Zürich,K232,,,,...,254272.0,,1748146.0,45.000017,14332.0,44.997209,,,,
3237,2018,ZH,,Universitäts-Kinderspital Zürich das Spital de...,Steinwiesstrasse 75,8032 Zürich,K233,,,,...,,,,,,,,,,
3238,2018,ZH,,Klinik Lengg AG,Bleulerstrasse 60,8008 Zürich,K235,,,,...,,,,,,,,,,
3239,2018,ZH,,Klinik Susenberg,Schreberweg 9,8044 Zürich,K235,,,,...,,,,,,,,,,
3240,2018,ZH,,Sune-Egge,Konradstrasse 62,8005 Zürich,K235,,,,...,,,,,,,,,,


In [119]:
# everyline is a unique hospital
duplicates = hospital_data.duplicated(['Inst'])
count_duplicates = duplicates.sum()
count_duplicates

0

## Cases

## Clean the "nombre de cas 2018"

In [120]:
cases['indicator'].unique

<bound method Series.unique of 0                                    A Maladies cardiaques
1                                A.1 Infarctus du myocarde
2        A.1.1.M DP infarctus du myocarde (‚ge >19), mo...
3        A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...
4        A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...
                               ...                        
72639    F.2.13.V Trasferimenti con resezione/sostituzi...
72640    F.3.1.V Trasferimenti con OP alle arterie del ...
72641    I.1.8.V Trasferimenti con prima applicazione d...
72642    J.1.1.V Trasferimenti con casi di ventilazione...
72643      J.2.1.V Trasferimenti con DP sepsi, percentuale
Name: indicator, Length: 72644, dtype: object>

Let's clean the "number_of_cases_2018" variable

In [121]:
## Information about the format of the data in the variable:
# From https://spitalstatistik.bagapps.ch/data/download/qip21_publikation.pdf?v=1680713881, page 4

# Verwendete Zeichen, wenn keine Zahlenangabe erfolgt
# ... Zahl unbekannt, weil (noch) nicht erhoben oder (noch) nicht berechnet
# * entfällt, weil trivial oder Begriffe nicht anwendbar
# – wird bei gerundeten Zahlen verwendet und steht hier für den Wert absolut null

# Signes utilisés en l’absence de chiffres
# ... chiffre inconnu (pas [encore] relevé ou pas [encore] calculé)
# * non indiqué car évident ou non pertinent
# – utilisé pour les chiffres arrondis ; signifie zéro

In [122]:
valeur_nb_cases=(cases['nombre de cas 2018'].unique())
valeur_nb_cases.tolist()

[' ',
 '595',
 '30',
 '207',
 '248',
 '110',
 '545',
 '27',
 '517',
 '491',
 '26',
 '493',
 '3',
 '257',
 '333',
 '205',
 "1'250",
 '9',
 '113',
 '550',
 '578',
 '334',
 "1'240",
 '487',
 '443',
 '413',
 '153',
 '142',
 '36',
 '589',
 '189',
 '39',
 '225',
 '21',
 '495',
 '108',
 '109',
 '169',
 '50',
 '18',
 '15',
 '29',
 '65',
 '0',
 '19',
 '25',
 '22',
 '46',
 '4',
 '11',
 '20',
 '10',
 '1',
 '32',
 '8',
 "1'213",
 '55',
 '270',
 '593',
 '295',
 '672',
 '13',
 '987',
 '974',
 '76',
 '128',
 '92',
 '64',
 '145',
 '81',
 '146',
 '129',
 '33',
 '278',
 '48',
 '41',
 '310',
 '291',
 '625',
 '88',
 "1'572",
 '85',
 '98',
 '662',
 '522',
 '67',
 "1'101",
 '74',
 '152',
 '480',
 '395',
 '175',
 '434',
 '136',
 '103',
 '71',
 '17',
 '107',
 '409',
 '139',
 '75',
 '2',
 '437',
 '421',
 '267',
 '70',
 '162',
 '325',
 '217',
 '87',
 '28',
 '16',
 '5',
 '91',
 '124',
 '12',
 '121',
 '62',
 '102',
 '37',
 '83',
 '59',
 '34',
 '7',
 '24',
 '106',
 '6',
 '40',
 '14',
 '69',
 '23',
 '246',
 '179',


In [123]:
cases['nombre de cas 2018']

0             
1             
2          595
3           30
4          207
         ...  
72639       21
72640      132
72641       91
72642    1'426
72643      871
Name: nombre de cas 2018, Length: 72644, dtype: object

In [124]:
print(valeur_nb_cases)

[' ' '595' '30' ... "3'060" "1'989" "1'426"]


In [125]:
# Chang thousands separator
#Creation of a new variable called "number_of_cases_2" which is the cleaned variable
cases['number_of_cases_2'] = cases['nombre de cas 2018'].apply(lambda x: int(x.replace("'", "")) if isinstance(x, str) and x.replace("'", "").isdigit() else x)

In [126]:
cases

Unnamed: 0,institution,indicator,nombre de cas 2018,number_of_cases_2
0,Les HÙpitaux Universitaires de GenËve HUG,A Maladies cardiaques,,
1,Les HÙpitaux Universitaires de GenËve HUG,A.1 Infarctus du myocarde,,
2,Les HÙpitaux Universitaires de GenËve HUG,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",595,595
3,Les HÙpitaux Universitaires de GenËve HUG,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",30,30
4,Les HÙpitaux Universitaires de GenËve HUG,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",207,207
...,...,...,...,...
72639,CH,F.2.13.V Trasferimenti con resezione/sostituzi...,21,21
72640,CH,F.3.1.V Trasferimenti con OP alle arterie del ...,132,132
72641,CH,I.1.8.V Trasferimenti con prima applicazione d...,91,91
72642,CH,J.1.1.V Trasferimenti con casi di ventilazione...,1'426,1426


In [127]:
# Identify cells which start with a name (ex: Fallzahl 2021) and delete
# And creation of a new dataset

## Create a Boolean mask to filter the rows to keep
masque = ~(cases['number_of_cases_2'].str.startswith('nomb') | cases['number_of_cases_2'].str.startswith('nume') | cases['number_of_cases_2'].str.startswith('Fall'))

## Apply the mask to delete the corresponding lines and create a new dataset
cases_2 = cases[masque]

In [128]:
cases_2

Unnamed: 0,institution,indicator,nombre de cas 2018,number_of_cases_2
0,Les HÙpitaux Universitaires de GenËve HUG,A Maladies cardiaques,,
1,Les HÙpitaux Universitaires de GenËve HUG,A.1 Infarctus du myocarde,,
2,Les HÙpitaux Universitaires de GenËve HUG,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",595,595
3,Les HÙpitaux Universitaires de GenËve HUG,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",30,30
4,Les HÙpitaux Universitaires de GenËve HUG,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",207,207
...,...,...,...,...
72639,CH,F.2.13.V Trasferimenti con resezione/sostituzi...,21,21
72640,CH,F.3.1.V Trasferimenti con OP alle arterie del ...,132,132
72641,CH,I.1.8.V Trasferimenti con prima applicazione d...,91,91
72642,CH,J.1.1.V Trasferimenti con casi di ventilazione...,1'426,1426


In [129]:
# Identify if some no digit value remain:
import re
non_digit_start = cases_2[cases_2['number_of_cases_2'].apply(lambda x: not re.match(r'^\d', str(x)))]
value_nb_cases_bizarre=(non_digit_start['number_of_cases_2'].unique())
value_nb_cases_bizarre
# there are still ' ', '-'

array([' ', nan], dtype=object)

In [130]:
# For value "-", " " and empty, I delete the rows
masque = ~(cases_2['number_of_cases_2'].str.startswith(' ') | cases_2['number_of_cases_2'].str.startswith('-'))
cases_3 = cases_2[masque]

#cases_3.dropna(subset=['number_of_cases_2'], inplace=True)

In [131]:
cases_3

Unnamed: 0,institution,indicator,nombre de cas 2018,number_of_cases_2
2,Les HÙpitaux Universitaires de GenËve HUG,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",595,595
3,Les HÙpitaux Universitaires de GenËve HUG,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",30,30
4,Les HÙpitaux Universitaires de GenËve HUG,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",207,207
5,Les HÙpitaux Universitaires de GenËve HUG,"A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...",248,248
6,Les HÙpitaux Universitaires de GenËve HUG,"A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...",110,110
...,...,...,...,...
72639,CH,F.2.13.V Trasferimenti con resezione/sostituzi...,21,21
72640,CH,F.3.1.V Trasferimenti con OP alle arterie del ...,132,132
72641,CH,I.1.8.V Trasferimenti con prima applicazione d...,91,91
72642,CH,J.1.1.V Trasferimenti con casi di ventilazione...,1'426,1426


In [132]:
# number_of_cases_2 should a float
float_value = cases_3['number_of_cases_2'].astype(float)

cases_3 = cases_3.assign(number_of_cases_2=float_value)

In [133]:
cases_3

Unnamed: 0,institution,indicator,nombre de cas 2018,number_of_cases_2
2,Les HÙpitaux Universitaires de GenËve HUG,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",595,595.0
3,Les HÙpitaux Universitaires de GenËve HUG,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",30,30.0
4,Les HÙpitaux Universitaires de GenËve HUG,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",207,207.0
5,Les HÙpitaux Universitaires de GenËve HUG,"A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...",248,248.0
6,Les HÙpitaux Universitaires de GenËve HUG,"A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...",110,110.0
...,...,...,...,...
72639,CH,F.2.13.V Trasferimenti con resezione/sostituzi...,21,21.0
72640,CH,F.3.1.V Trasferimenti con OP alle arterie del ...,132,132.0
72641,CH,I.1.8.V Trasferimenti con prima applicazione d...,91,91.0
72642,CH,J.1.1.V Trasferimenti con casi di ventilazione...,1'426,1426.0


In [134]:
print('the shape of the initial dataset:', cases.shape)
print('the shape of the dataset after the cleaning:', cases_3.shape)

the shape of the initial dataset: (72644, 4)
the shape of the dataset after the cleaning: (60287, 4)


In [135]:
cases_3['indicator']

2        A.1.1.M DP infarctus du myocarde (‚ge >19), mo...
3        A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...
4        A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...
5        A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...
6        A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...
                               ...                        
72639    F.2.13.V Trasferimenti con resezione/sostituzi...
72640    F.3.1.V Trasferimenti con OP alle arterie del ...
72641    I.1.8.V Trasferimenti con prima applicazione d...
72642    J.1.1.V Trasferimenti con casi di ventilazione...
72643      J.2.1.V Trasferimenti con DP sepsi, percentuale
Name: indicator, Length: 60287, dtype: object

In [136]:
cases_3.drop(columns=['nombre de cas 2018'])

Unnamed: 0,institution,indicator,number_of_cases_2
2,Les HÙpitaux Universitaires de GenËve HUG,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",595.0
3,Les HÙpitaux Universitaires de GenËve HUG,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",30.0
4,Les HÙpitaux Universitaires de GenËve HUG,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",207.0
5,Les HÙpitaux Universitaires de GenËve HUG,"A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...",248.0
6,Les HÙpitaux Universitaires de GenËve HUG,"A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...",110.0
...,...,...,...
72639,CH,F.2.13.V Trasferimenti con resezione/sostituzi...,21.0
72640,CH,F.3.1.V Trasferimenti con OP alle arterie del ...,132.0
72641,CH,I.1.8.V Trasferimenti con prima applicazione d...,91.0
72642,CH,J.1.1.V Trasferimenti con casi di ventilazione...,1426.0


In [137]:
duplicates = cases_3.duplicated(subset=['institution', 'indicator'])
count_duplicates = duplicates.sum()
count_duplicates

0

In [138]:
cases_3['indicator']

2        A.1.1.M DP infarctus du myocarde (‚ge >19), mo...
3        A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...
4        A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...
5        A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...
6        A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...
                               ...                        
72639    F.2.13.V Trasferimenti con resezione/sostituzi...
72640    F.3.1.V Trasferimenti con OP alle arterie del ...
72641    I.1.8.V Trasferimenti con prima applicazione d...
72642    J.1.1.V Trasferimenti con casi di ventilazione...
72643      J.2.1.V Trasferimenti con DP sepsi, percentuale
Name: indicator, Length: 60287, dtype: object

## Clean the indicators of the cases

In [139]:
valeur_nb_cases=(cases_3['indicator'].unique())
valeur_nb_cases.tolist()

['A.1.1.M DP infarctus du myocarde (‚ge >19), mortalitE',
 'A.1.2.M DP infarctus du myocarde, ‚ge 20-44, mortalitE',
 'A.1.3.M DP infarctus du myocarde, ‚ge 45-64, mortalitE',
 'A.1.4.M DP infarctus du myocarde, ‚ge 65-84, mortalitE',
 'A.1.5.M DP infarctus du myocarde, ‚ge >84, mortalitE',
 'A.1.7.M DP infarctus du myocarde (‚ge >19), admissions directes, non transfErE, mortalitE',
 "A.1.8.M DP infarctus du myocarde (‚ge >19), transfErE d'un autre hÙpital, mortalitE",
 'A.1.14.P DP infarctus du myocarde avec cathEtErisme des vaisseaux coronaires (‚ge >19), pourcentage',
 'A.1.15.P DP infarctus du myocarde avec cathEtErisme des vaisseaux coronaires (‚ge >19), admissions directes, pourcentage',
 "A.1.16.P DP infarctus du myocarde avec cathEtErisme des vaisseaux coronaires (‚ge >19), transfErE d'un autre hÙpital, pourcentage",
 'A.1.17.P DP infarctus du myocarde avec cathEtErisme cardiaque ou OP coronarienne (‚ge >19), admissions directes, pourcentage',
 'A.1.9.P Infarctus aigu du myocar

In [140]:
# Get only the codes: so keep everything before the first space
string = cases_3['indicator'].str.split(' ').str[0]
cases_3 = cases_3.assign(indicator_3=string)


In [141]:
cases_3

Unnamed: 0,institution,indicator,nombre de cas 2018,number_of_cases_2,indicator_3
2,Les HÙpitaux Universitaires de GenËve HUG,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",595,595.0,A.1.1.M
3,Les HÙpitaux Universitaires de GenËve HUG,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",30,30.0,A.1.2.M
4,Les HÙpitaux Universitaires de GenËve HUG,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",207,207.0,A.1.3.M
5,Les HÙpitaux Universitaires de GenËve HUG,"A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...",248,248.0,A.1.4.M
6,Les HÙpitaux Universitaires de GenËve HUG,"A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...",110,110.0,A.1.5.M
...,...,...,...,...,...
72639,CH,F.2.13.V Trasferimenti con resezione/sostituzi...,21,21.0,F.2.13.V
72640,CH,F.3.1.V Trasferimenti con OP alle arterie del ...,132,132.0,F.3.1.V
72641,CH,I.1.8.V Trasferimenti con prima applicazione d...,91,91.0,I.1.8.V
72642,CH,J.1.1.V Trasferimenti con casi di ventilazione...,1'426,1426.0,J.1.1.V


In [142]:
cases_3.drop(columns=["indicator", 'nombre de cas 2018'])

Unnamed: 0,institution,number_of_cases_2,indicator_3
2,Les HÙpitaux Universitaires de GenËve HUG,595.0,A.1.1.M
3,Les HÙpitaux Universitaires de GenËve HUG,30.0,A.1.2.M
4,Les HÙpitaux Universitaires de GenËve HUG,207.0,A.1.3.M
5,Les HÙpitaux Universitaires de GenËve HUG,248.0,A.1.4.M
6,Les HÙpitaux Universitaires de GenËve HUG,110.0,A.1.5.M
...,...,...,...
72639,CH,21.0,F.2.13.V
72640,CH,132.0,F.3.1.V
72641,CH,91.0,I.1.8.V
72642,CH,1426.0,J.1.1.V


In [143]:
cases_3.head()

Unnamed: 0,institution,indicator,nombre de cas 2018,number_of_cases_2,indicator_3
2,Les HÙpitaux Universitaires de GenËve HUG,"A.1.1.M DP infarctus du myocarde (‚ge >19), mo...",595,595.0,A.1.1.M
3,Les HÙpitaux Universitaires de GenËve HUG,"A.1.2.M DP infarctus du myocarde, ‚ge 20-44, m...",30,30.0,A.1.2.M
4,Les HÙpitaux Universitaires de GenËve HUG,"A.1.3.M DP infarctus du myocarde, ‚ge 45-64, m...",207,207.0,A.1.3.M
5,Les HÙpitaux Universitaires de GenËve HUG,"A.1.4.M DP infarctus du myocarde, ‚ge 65-84, m...",248,248.0,A.1.4.M
6,Les HÙpitaux Universitaires de GenËve HUG,"A.1.5.M DP infarctus du myocarde, ‚ge >84, mor...",110,110.0,A.1.5.M


In [144]:
cases_3.columns

Index(['institution', 'indicator', 'nombre de cas 2018', 'number_of_cases_2',
       'indicator_3'],
      dtype='object')

In [145]:
'''# List of column names to keep
columns_to_keep = ['institution', 'indicator', 'taux observE 2018', 'taux attendu 2018',
       'SMR 2018', 'nombre de cas 2018', 'number_of_cases_2', 'indicator_3']

# Select the specified columns by label
cases_3 = cases_3.loc[:, columns_to_keep]'''

"# List of column names to keep\ncolumns_to_keep = ['institution', 'indicator', 'taux observE 2018', 'taux attendu 2018',\n       'SMR 2018', 'nombre de cas 2018', 'number_of_cases_2', 'indicator_3']\n\n# Select the specified columns by label\ncases_3 = cases_3.loc[:, columns_to_keep]"

In [146]:
cases_3.head()
cases_3.columns

Index(['institution', 'indicator', 'nombre de cas 2018', 'number_of_cases_2',
       'indicator_3'],
      dtype='object')

## Pivot the dataset

In [147]:
cases_3.columns

Index(['institution', 'indicator', 'nombre de cas 2018', 'number_of_cases_2',
       'indicator_3'],
      dtype='object')

In [148]:
#I ran this function for all of the columns in the df, there are no duplicates
duplicates_inst = cases_3.duplicated(['indicator_3'])
count_duplicates_inst = duplicates_inst.sum()
print(count_duplicates)

0


In [149]:
columns_to_drop = ['indicator', 'nombre de cas 2018']
cases_3 = cases_3.drop(columns=columns_to_drop)

In [150]:
cases_3 = cases_3.drop_duplicates(subset=['institution', 'indicator_3'])


In [151]:
df_case = cases_3.pivot(index='institution', columns='indicator_3', values='number_of_cases_2')

In [152]:
cases_4=df_case.reset_index()
cases_4

indicator_3,institution,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,L.5.6.F,L.6.1.F,L.6.2.F,L.6.3.F,L.6.4.F,L.6.5.F,L.6.8.F,L.7.1.F,L.7.2.F,Z.1.1.X
0,Adus Medica AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,197.0
1,"Allgemeinspital, Grundversorgung (Niveau 4)",1444.0,489.0,432.0,546.0,680.0,1504.0,317.0,808.0,453.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,573.0
2,"Allgemeinspital, Zentrumsversorgung (Niveau 1,...",4105.0,1072.0,1922.0,1922.0,2139.0,4098.0,1218.0,3603.0,2413.0,...,0.0,563.0,44.0,7.0,519.0,328.0,3.0,69.0,0.0,765.0
3,Andreas Klinik,10.0,5.0,3.0,3.0,7.0,10.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
4,Asana Gruppe AG Spital Leuggern,24.0,11.0,8.0,8.0,16.0,24.0,4.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,Universit‰tsspital Basel,786.0,69.0,359.0,359.0,422.0,786.0,342.0,668.0,495.0,...,0.0,134.0,19.0,1.0,115.0,40.0,0.0,5.0,0.0,82.0
161,Universit‰tsspital Z¸rich,831.0,172.0,363.0,363.0,465.0,827.0,227.0,733.0,473.0,...,0.0,159.0,2.0,0.0,157.0,92.0,0.0,29.0,0.0,180.0
162,Uroviva Klinik AG,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163,Venenklinik Bellevue AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Include the correspondance name in the cases dataframe

In [169]:
names_hosp = pd.read_csv('/content/drive/MyDrive/Module1_CDR/2018-2019/correspondances_18.csv')

In [170]:
names_hosp

Unnamed: 0.1,Unnamed: 0,Nom_df1,Nom_df2
0,0,Les HÙpitaux Universitaires de GenËve HUG,Les Hôpitaux Universitaires de Genève HUG
1,1,Insel Gruppe AG (universit‰r),Insel Gruppe AG (universitär)
2,2,CHUV Centre Hospitalier Universitaire Vaudois,CHUV Centre Hospitalier Universitaire Vaudois
3,3,Universit‰tsspital Z¸rich,Universitätsspital Zürich
4,4,Universit‰tsspital Basel,Universitätsspital Basel
...,...,...,...
138,138,Geburtshaus Basel,Geburtshaus Basel
139,139,Ita Wegman Geburtshaus,Ita Wegman Geburtshaus
140,140,Universit‰ts-Kinderspital Z¸rich das Spital de...,Universitäts-Kinderspital Zürich das Spital de...
141,141,Universit‰ts-Kinderspital beider Basel (UKBB),Universitäts-Kinderspital beider Basel (UKBB)


In [155]:
# Merge to obtain in the dataframe cases the right names of hospitals
cases_F = cases_4.merge(names_hosp, left_on='institution', right_on='Nom_df1', how='left')

In [156]:
cases_F

Unnamed: 0.1,institution,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,L.6.3.F,L.6.4.F,L.6.5.F,L.6.8.F,L.7.1.F,L.7.2.F,Z.1.1.X,Unnamed: 0,Nom_df1,Nom_df2
0,Adus Medica AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,197.0,114.0,Adus Medica AG,Adus Medica AG
1,"Allgemeinspital, Grundversorgung (Niveau 4)",1444.0,489.0,432.0,546.0,680.0,1504.0,317.0,808.0,453.0,...,0.0,0.0,0.0,0.0,0.0,0.0,573.0,,,
2,"Allgemeinspital, Zentrumsversorgung (Niveau 1,...",4105.0,1072.0,1922.0,1922.0,2139.0,4098.0,1218.0,3603.0,2413.0,...,7.0,519.0,328.0,3.0,69.0,0.0,765.0,,,
3,Andreas Klinik,10.0,5.0,3.0,3.0,7.0,10.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7.0,65.0,Andreas Klinik,Andreas Klinik
4,Asana Gruppe AG Spital Leuggern,24.0,11.0,8.0,8.0,16.0,24.0,4.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,50.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,Universit‰tsspital Basel,786.0,69.0,359.0,359.0,422.0,786.0,342.0,668.0,495.0,...,1.0,115.0,40.0,0.0,5.0,0.0,82.0,4.0,Universit‰tsspital Basel,Universitätsspital Basel
161,Universit‰tsspital Z¸rich,831.0,172.0,363.0,363.0,465.0,827.0,227.0,733.0,473.0,...,0.0,157.0,92.0,0.0,29.0,0.0,180.0,3.0,Universit‰tsspital Z¸rich,Universitätsspital Zürich
162,Uroviva Klinik AG,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,Uroviva Klinik AG,Uroviva Klinik AG
163,Venenklinik Bellevue AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118.0,Venenklinik Bellevue AG,Venenklinik Bellevue AG


In [157]:
# Here, before we merge the cases dataset with the institutions dataset, we need to be sure that there is only one row per hospital
# we can see that there are many duplicates
duplicates = cases_F.duplicated(['Nom_df2'])
count_duplicates = duplicates.sum()
count_duplicates

21

In [158]:
cases_F_0=cases_F.drop(columns=["institution", 'Nom_df1'])

In [159]:
# We need to group_by the name of institution and sum
result = cases_F_0.groupby('Nom_df2').agg('sum').reset_index()

In [160]:
# We need to group_by the name of institution and sum
result = cases_F_0.groupby('Nom_df2').agg('sum').reset_index()

In [161]:
# No more duplicate, the dataset is now ready to be merged with the institution dataset
duplicates = result.duplicated(['Nom_df2'])
count_duplicates = duplicates.sum()
count_duplicates

0

## Merge with the infrastructure dataset

In [162]:
df = cases_F.merge(hospital_data, left_on='Nom_df2', right_on='Inst', how='left')

In [163]:
df

Unnamed: 0,institution,A.1.1.M,A.1.1.V,A.1.10.M,A.1.10.P,A.1.11.M,A.1.12.X,A.1.13.M,A.1.14.P,A.1.15.P,...,ErlOKPAmbB,ErlStatB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,ErlLangB,PTageLang,AustLang,KostLangT
0,Adus Medica AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,"Allgemeinspital, Grundversorgung (Niveau 4)",1444.0,489.0,432.0,546.0,680.0,1504.0,317.0,808.0,453.0,...,,,,,,,,,,
2,"Allgemeinspital, Zentrumsversorgung (Niveau 1,...",4105.0,1072.0,1922.0,1922.0,2139.0,4098.0,1218.0,3603.0,2413.0,...,,,,,,,,,,
3,Andreas Klinik,10.0,5.0,3.0,3.0,7.0,10.0,2.0,0.0,0.0,...,,,,,,,,,,
4,Asana Gruppe AG Spital Leuggern,24.0,11.0,8.0,8.0,16.0,24.0,4.0,1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,Universit‰tsspital Basel,786.0,69.0,359.0,359.0,422.0,786.0,342.0,668.0,495.0,...,,,,,,,,,,
161,Universit‰tsspital Z¸rich,831.0,172.0,363.0,363.0,465.0,827.0,227.0,733.0,473.0,...,,,,,,,,,,
162,Uroviva Klinik AG,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
163,Venenklinik Bellevue AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


# Explore the C-section data

In [164]:
df[['G.1.4.P', 'G.1.5.P', 'G.1.6.P', 'G.1.7.P']]

# G.1.4.P: césariennes
# G.1.5.P: Césariennes en cas d'accouchement à faible risque
# G.1.6.P: Césariennes en cas d'accouchement à faible risque, âge <35
# G.1.7.P: Césariennes en cas d'accouchement à faible risque, âge >34
#  G.1.7.P +  G.1.6.P = G.1.5.P
# Therefore, G.1.4.P is the total amont of c-section, the most important variable!

Unnamed: 0,G.1.4.P,G.1.5.P,G.1.6.P,G.1.7.P
0,0.0,0.0,0.0,0.0
1,4519.0,3575.0,2290.0,1285.0
2,5050.0,3124.0,1801.0,1323.0
3,260.0,216.0,140.0,76.0
4,227.0,180.0,128.0,52.0
...,...,...,...,...
160,943.0,602.0,334.0,268.0
161,1242.0,687.0,376.0,311.0
162,0.0,0.0,0.0,0.0
163,0.0,0.0,0.0,0.0


In [165]:
# Which hospitals haven't perform any c-section?
test=df[df['G.1.4.P']==0]
#print(test)

# We can observe that by doing this operation we are deleting automatically the maison de naissances, etc.

In [166]:
# let's delete then
df_2=df[df['G.1.4.P']!=0]

In [167]:
df_2.shape

(93, 581)

In [168]:
df_2.to_csv('/content/drive/MyDrive/Module1_CDR/2018-2019/DF_2018_clean.csv')
