In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import io
import statsmodels.api as sm
from scipy import stats
import geopandas as gpd


In [2]:
# Colab-only helper: keeps whatever files.upload() returns in `uploaded`
# (presumably the files picked in the browser dialog — TODO confirm it is still needed).
from google.colab import files
uploaded = files.upload()

In [3]:
# Colab-only: mount Google Drive.
# NOTE(review): the mount point is named like the CSV file; the Drive paths used
# elsewhere in this notebook all start with this directory, so keep them in sync.
from google.colab import drive
drive.mount('/content/hospital_cases_GMA.csv')


Mounted at /content/hospital_cases_GMA.csv


In [4]:
# Load the hospital dataset. The Drive location is tried first; if the file is
# not there (this cell used to stop with FileNotFoundError), fall back to the
# file selected in the upload cell, whose result was otherwise never used.
file_path = '/content/hospital_cases_GMA.csv/MyDrive/hospital_cases_GMA.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # `uploaded` is expected to map uploaded file names to their raw bytes.
    uploaded_names = [name for name in uploaded if name.startswith('hospital_cases_GMA')]
    if not uploaded_names:
        # Nothing usable was uploaded either: surface the original error.
        raise
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_names[-1]]))

column_names = df.columns.tolist()
column_names

FileNotFoundError: ignored

In [None]:
# Preview the first rows of the loaded dataset
df.head()

# 1. Structure of the dataset

## 1.1 Number of hospitals

In [None]:
# Distinct hospitals per region (rows) and year (columns)
result_reg = (
    df.groupby(["year", "Reg"])["hospital"]
    .nunique()
    .unstack("year")
)
#result_reg.to_csv('Hospital_evolution.csv', index=False)
result_reg

In [None]:
# Distinct hospitals per hospital type (university vs non university), one column per year
hospitals_per_year_and_type = df.groupby(["year", "Uni"])["hospital"].nunique()
result_uni = hospitals_per_year_and_type.unstack(level="year")
result_uni

## 1.2 C-section rates by region and hospital type (university vs non-university)

In [None]:
### Figures: yearly C-section rate per language region and per hospital type

# Legend settings shared by both panels: the legend goes below the plot area
legend_below = dict(loc='upper center', bbox_to_anchor=(0.5, -0.15), shadow=True, ncol=2)

# C-section rates in French-speaking cantons vs German-speaking cantons:
# summed C-sections over summed deliveries per (year, region), in percent
region = df[["year", "Reg", "delivery", "cases_c-section"]].groupby(["year", "Reg"]).sum()
region = region.reset_index()
region["percent"] = region["cases_c-section"] / region["delivery"] * 100

## set the figures
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

region_AL = region[region["Reg"] == "AL"]
region_RO = region[region["Reg"] == "RO"]

ax1.plot(region_AL["year"], region_AL["percent"], color="#c7bbc9", linestyle="solid", linewidth=4, label="German-speaking cantons")
ax1.plot(region_RO["year"], region_RO["percent"], color="#5e3c58", linestyle="solid", linewidth=4, label="French-speaking cantons")

ax1.set_xlabel("Year")
ax1.set_ylabel('C-section rate')
ax1.set_title("C-sections rates per region")
# A single legend call is enough (a bare legend() was previously created and immediately replaced)
ax1.legend(**legend_below)

# C-sections rates in university hospitals vs non-university hospitals
uni = df[["year", "Uni", "delivery", "cases_c-section"]].groupby(["year", "Uni"]).sum()
uni = uni.reset_index()
uni["percent"] = uni["cases_c-section"] / uni["delivery"] * 100

uni_uni = uni[uni["Uni"] == 1]
uni_non = uni[uni["Uni"] != 1]  # everything not flagged 1 is treated as non-university

# Legend labels fixed: they used to read "Universiy"
ax2.plot(uni_uni["year"], uni_uni["percent"], color="#bbcbb2", linestyle="solid", linewidth=4, label="University hospitals")
ax2.plot(uni_non["year"], uni_non["percent"], color="#557c3e", linestyle="solid", linewidth=4, label="Non-University hospitals")

ax2.set_xlabel("Year")
ax2.set_ylabel('C-section rate')
ax2.set_title("C-sections rates in university and non-university hospitals")
ax2.legend(**legend_below)

plt.tight_layout()  # keeps the legends below the axes from overlapping the panels

fig.savefig("C-sections_comparaison.png")

plt.show()


## 1.3 Boxplots

In [None]:
## Box plots of hospital-level C-section rates, years 2015-2021 pooled

# Keep the grouping column next to the rate, with the rate rescaled by 100
df_reg = df[['percentage_c-section', 'Reg']].reset_index()
df_reg = df_reg.assign(rate=df_reg['percentage_c-section'] * 100)

df_uni = df[['percentage_c-section', 'Uni']].reset_index()
df_uni = df_uni.assign(rate=df_uni['percentage_c-section'] * 100)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Pivot so that each region gets its own column, i.e. one box per region
df_reg.pivot(columns='Reg', values='rate').boxplot(ax=ax1)
ax1.set(
    title='C-section rates in 2015-2021 per region',
    xlabel='Region',
    ylabel='Percentage of C-sections',
)

# Same layout with one column (one box) per value of 'Uni'
df_uni.pivot(columns='Uni', values='rate').boxplot(ax=ax2)
ax2.set(
    title='C-section rates in 2015-2021 per type of hospitals',
    xlabel='Type of hospitals',
    ylabel='Percentage of C-sections',
)

# Readable tick labels for the second panel
# (assumes the pivoted 'Uni' columns come out non-university first — TODO confirm)
new_x = ['Non University hospital', 'University hospital']
ax2.set_xticklabels(new_x)

fig.savefig("/content/hospital_cases_GMA.csv/MyDrive/Module1_CDR/Figures CDR png/Boxplot_2015-2021.png")

plt.show()


In [None]:
## Box plots restricted to the year 2021

df_2021 = df.loc[df["year"] == 2021]

df_reg = df_2021[['percentage_c-section', 'Reg']].reset_index()
df_reg['rate'] = df_reg['percentage_c-section'].mul(100)

df_uni = df_2021[['percentage_c-section', 'Uni']].reset_index()
df_uni['rate'] = df_uni['percentage_c-section'].mul(100)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# (axes, data, grouping column, title, x label) for each panel
panels = [
    (ax1, df_reg, 'Reg', 'C-section rates in 2021 per region', 'Region'),
    (ax2, df_uni, 'Uni', 'C-section rates in 2021 per type of hospitals', 'Type of hospitals'),
]
for axis, frame, group_col, panel_title, x_label in panels:
    # Pivot so that each group becomes its own column, i.e. one box per group
    frame.pivot(columns=group_col, values='rate').boxplot(ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Percentage of C-sections')

# Readable tick labels for the hospital-type panel
# (assumes the pivoted 'Uni' columns come out non-university first — TODO confirm)
new_x = ['Non University hospital', 'University hospital']
ax2.set_xticklabels(new_x)

fig.savefig("Boxplot_2021.png")

plt.show()


In [None]:
years = range(2015, 2022)

# One stand-alone figure per year, each holding a single box
for year in years:
    # 'percentage_c-section' values of the rows belonging to the current year
    yearly_rates = df.loc[df['year'] == year, 'percentage_c-section']

    plt.figure()
    plt.boxplot(yearly_rates)
    plt.title(f'Boxplot for Percentage of C-Sections in {year}')
    plt.xlabel('Year')
    plt.ylabel('Percentage of C-Sections')
    plt.show()


In [None]:
import seaborn as sns

# One box per year. The figure is created explicitly and bound to `fig` so that
# savefig() below writes THIS plot; before, `fig` still referred to a figure
# created in an earlier cell and that stale figure was saved instead.
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust the figure size as needed

sns.boxplot(data=df, x='year', y='percentage_c-section', ax=ax)
ax.set_title('Percentage of C-Sections (2015 - 2021)')
ax.set_xlabel('Year')
ax.set_ylabel('Percentage of C-Sections')
# NOTE(review): the pooled 2015-2021 box plot cell writes to the same file name —
# confirm which of the two figures should keep it.
fig.savefig("/content/hospital_cases_GMA.csv/MyDrive/Module1_CDR/Figures CDR png/Boxplot_2015-2021.png")
plt.show()


In [None]:
years = range(2015, 2022)

# Variables whose yearly mean is plotted
variables = ['delivery', 'cases_c-section', 'cases_kidney', 'Doctor', 'Doctor_in_formation', 'Nurse', 'Other_Nurse', 'Total_staff', 'Operation_rooms', 'Delivery_rooms', 'Beds']

# mean_values[i][j] = mean of variables[i] over the rows of the j-th year
mean_values = [
    [df[df['year'] == year][variable].mean() for year in years]
    for variable in variables
]

# One line per variable
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed

for variable, yearly_means in zip(variables, mean_values):
    plt.plot(years, yearly_means, label=variable)

plt.title('Mean Values Over the Years (2015 - 2021)')
plt.xlabel('Year')
plt.ylabel('Mean Value')
plt.legend(loc='upper left')
plt.grid(True)
plt.show()

In [None]:
years = range(2015, 2022)

# Variables summarised by their yearly mean and standard deviation
variables = ['delivery', 'cases_c-section', 'cases_kidney', 'Doctor', 'Doctor_in_formation', 'Nurse', 'Other_Nurse', 'Total_staff', 'Operation_rooms', 'Delivery_rooms', 'Beds']

# For every variable: one list of yearly means and one of yearly standard deviations
mean_values = []
std_dev_values = []

for variable in variables:
    values_per_year = [df.loc[df['year'] == year, variable] for year in years]
    mean_values.append([values.mean() for values in values_per_year])
    std_dev_values.append([values.std() for values in values_per_year])

# Line plot of the means with the standard deviations as error bars
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed

for variable, yearly_means, yearly_std_devs in zip(variables, mean_values, std_dev_values):
    plt.errorbar(years, yearly_means, yerr=yearly_std_devs, label=variable, marker='o')

plt.title('Mean Values with Standard Deviation (2015 - 2021)')
plt.xlabel('Year')
plt.ylabel('Mean Value')
plt.legend(loc='upper left')
plt.grid(True)
plt.show()

In [None]:
years = range(2015, 2022)

# Variables for both plots; the first three go in the top panel, the rest in the bottom one
variables = ['cases_c-section', 'delivery', 'cases_kidney', 'Doctor', 'Nurse', 'Other_Nurse', 'Total_staff', 'Operation_rooms', 'Delivery_rooms', 'Beds']
n_top = 3  # 'cases_c-section', 'delivery' and 'cases_kidney'

# Two stacked panels sharing the year axis
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 8), sharex=True)

# mean_values[i][j] = mean of variables[i] over the rows of the j-th year.
# Standard deviations were computed here before but never drawn, so they are
# no longer calculated.
mean_values = [
    [df.loc[df['year'] == year, variable].mean() for year in years]
    for variable in variables
]

# Top panel: 'cases_c-section', 'delivery' and 'cases_kidney'
# (plain line plot: no error bars were ever passed to errorbar())
for variable, yearly_means in zip(variables[:n_top], mean_values[:n_top]):
    axes[0].plot(years, yearly_means, label=variable, marker='o')

# Bottom panel: the remaining variables
for variable, yearly_means in zip(variables[n_top:], mean_values[n_top:]):
    axes[1].plot(years, yearly_means, label=variable, marker='o')

# Customize the first plot
axes[0].set_title('Mean Values (2015 - 2021)')
axes[0].set_ylabel('Mean Value of number of cases')
axes[0].legend(loc='upper left')
axes[0].grid(True)

# Customize the second plot
axes[1].set_title('Mean Values (2015 - 2021)')
axes[1].set_xlabel('Year')
axes[1].set_ylabel('Mean Value')
axes[1].legend(loc='upper left')
axes[1].grid(True)

# Adjust subplot spacing
plt.tight_layout()
fig.savefig("/content/hospital_cases_GMA.csv/MyDrive/Module1_CDR/Figures CDR png/Mean_plots_2015-2021.png")
# Show the plots
plt.show()




In [None]:
# Display the 'Private_division' column
df['Private_division']

In [None]:
# Number of rows per value of 'Reg' and per value of 'Uni'
# (the 'Private_division' counts were computed here before but never plotted)
value_counts_var1 = df['Reg'].value_counts()
value_counts_var2 = df['Uni'].value_counts()

# Two side-by-side panels on explicit axes. The previous 1x3 grid used slots 1
# and 3 only, leaving an empty middle slot, and its subplots_adjust(wspace=4)
# was overridden by tight_layout() anyway.
counts_fig, (ax_reg, ax_uni) = plt.subplots(1, 2, figsize=(10, 5))

# Plot for 'Reg'
value_counts_var1.plot(kind='bar', rot=0, ax=ax_reg)
ax_reg.set_title('German (AL) and Latin speaking (RO) region')
ax_reg.set_ylabel('Number of Data points')

# Plot for 'Uni'
value_counts_var2.plot(kind='bar', rot=0, ax=ax_uni)
ax_uni.set_title('University (1) and Non-university hospital (0)')
ax_uni.set_ylabel('Number of Data points')

# Show the plots
counts_fig.tight_layout()
plt.show()


In [None]:
# Preview the first rows of the dataset again
df.head()

In [None]:
# Row counts per hospital type ('Uni') and per region ('Reg') for every year.
# The two sets used to share the names value_counts_var2015 ... var2021, so the
# 'Reg' assignments silently overwrote the 'Uni' ones; each set now lives in its
# own dict keyed by year, e.g. uni_counts_by_year[2015].
years = range(2015, 2022)

uni_counts_by_year = {
    year: df.loc[df['year'] == year, 'Uni'].value_counts() for year in years
}
reg_counts_by_year = {
    year: df.loc[df['year'] == year, 'Reg'].value_counts() for year in years
}

In [None]:
# Show the table through the notebook's rich display instead of plain-text print()
result_uni

In [None]:

# Create the figure explicitly and bind it to `fig` so that savefig() below
# writes this chart; before, `fig` still referred to a figure from an earlier cell.
fig, ax = plt.subplots(figsize=(10, 6))

# Get the unique categories in the "Uni" column
categories = result_uni.index.unique()

# One bar series per category, drawn at the same year positions
# NOTE(review): series are overlaid, not stacked or grouped — a later, taller
# series would hide an earlier one.
for category in categories:
    data = result_uni.loc[category]
    ax.bar(data.index, data.values, label=category)

# Add labels and legend
ax.set_xlabel('Year')
ax.set_ylabel('Number of datapoints')
ax.set_title('Number of Non-University (0) versus University Hospitals (1) per year in our dataset')
ax.legend(title='Non-Uni(0) vs Uni(1)')

fig.savefig("/content/hospital_cases_GMA.csv/MyDrive/Module1_CDR/Figures CDR png/Uni_vs_Nonuni.png")
# Show the chart
plt.show()


In [None]:
# Create the figure explicitly and bind it to `fig` so that savefig() below
# writes this chart; before, `fig` still referred to a figure from an earlier cell.
fig, ax = plt.subplots(figsize=(10, 6))

# Get the unique categories in the "Uni" column
categories = result_uni.index.unique()

# Running total over the years for each category
cumulative_data = {
    category: result_uni.loc[category].cumsum() for category in categories
}

# One bar series per category, drawn at the same year positions
for category, data in cumulative_data.items():
    ax.bar(data.index, data.values, label=category)

# Add labels and legend
ax.set_xlabel('Year')
ax.set_ylabel('Cumulative Number of datapoints')
ax.set_title('Cumulative Number of Non-University (0) versus University Hospitals (1) over the years')
ax.legend(title='Non-Uni(0) vs Uni(1)')

fig.savefig("/content/hospital_cases_GMA.csv/MyDrive/Module1_CDR/Figures CDR png/Cumulative_Uni_vs_Nonuni.png")

# Show the chart
plt.show()


In [None]:

# Create the figure explicitly and bind it to `fig` so that savefig() below
# writes this chart; before, `fig` still referred to a figure from an earlier cell.
fig, ax = plt.subplots(figsize=(10, 6))

# Get the unique categories in the "Reg" column
categories = result_reg.index.unique()

# One bar series per region, drawn at the same year positions
for category in categories:
    data = result_reg.loc[category]
    ax.bar(data.index, data.values, label=category)

# Add labels and legend
ax.set_xlabel('Year')
ax.set_ylabel('Number of Hospitals')
ax.set_title('Number of Hospitals per Region Over the Years')
ax.legend(title='Region')

fig.savefig("/content/hospital_cases_GMA.csv/MyDrive/Module1_CDR/Figures CDR png/AL_vs_R0.png")

# Show the chart
plt.show()

# 2. Map of C-section rates for 2021

In [None]:
# C-sections rates by canton for 2021 - set up the dataset
df_2021 = df[df["year"] == 2021]

# Total deliveries and C-sections per canton, then the rate in percent
df_2021 = df_2021[["delivery", "canton", "cases_c-section"]].groupby(["canton"]).sum()
df_2021 = df_2021.reset_index()
df_2021["percentage"] = df_2021["cases_c-section"] / df_2021["delivery"] * 100

# Get the entire name of the canton to be able to match with the shape file.
# The path and the loaded table now have separate names (`canton` used to be a
# str first and a DataFrame afterwards).
# NOTE(review): absolute path on a local Windows machine — it will not resolve
# in the Colab session used by the cells above.
canton_lookup_path = 'C:/Users/Gaëlle/Documents/_CAS applied data science/2. Module 2 Statistical Inference for Data Science/Projet/Cantons.xlsx'
canton = pd.read_excel(canton_lookup_path)

# Left merge: every canton of df_2021 is kept; its abbreviation is matched on the lookup column "Abre"
merged_df_0 = df_2021.merge(canton, how="left", left_on="canton", right_on="Abre")



In [None]:
# Display the per-canton rates merged with the canton lookup table
merged_df_0

In [None]:
# Path to the shape file
# NOTE(review): absolute path on a local Windows machine — confirm it resolves where this notebook runs.
shape_file = 'C:/Users/Gaëlle/Documents/_CAS applied data science/2. Module 2 Statistical Inference for Data Science/Projet/map/CHE_adm1.shp' # link to the shape file
map_df = gpd.read_file(shape_file)

# Merge our data with the shape file
merged_df = map_df.merge(merged_df_0, how="left", left_on="NAME_1", right_on="Canton")
# Left merge with the map on the left: every row of the map is kept and the
# matching rows of merged_df_0 are added to it.
# The key has a different name on each side: "NAME_1" in the map, "Canton" in merged_df_0.
merged_df


In [None]:
# Create the map
def plotmap(df, datacol, vmax, filename, title):
    """Draw a choropleth map of one column and save it as an image.

    Parameters
    ----------
    df : object with a geometry-aware ``plot(column=..., ax=...)`` method
        (presumably a geopandas GeoDataFrame — it is passed the merged shape file data).
    datacol : str
        Name of the column that drives the fill colour.
    vmax : number
        Upper end of the colour scale; the scale starts at 0.
    filename : str
        Path of the image file written with ``fig.savefig`` (dpi=150).
    title : str
        Title shown above the map.

    Changes compared with the previous version:
    * the map is drawn once (it used to be redrawn, identically, for every row);
    * ``vmin=0`` is passed to the map so that its colours match the colour bar,
      which was already normalised from 0 to ``vmax``;
    * ``df`` is no longer modified (an unused 'coords' column used to be added);
    * the colour-bar helper no longer reuses the name ``sm`` of the statsmodels import.
    """
    cmap = 'OrRd'
    fig, ax = plt.subplots(1, figsize=(20, 10))
    ax.axis("off")
    ax.set_title(title, fontdict={'fontsize': '25', 'fontweight' : '3'})
    ax.annotate("Sources: BAG", xy=(0.68, 0.11),
                xycoords='figure fraction', fontsize=12, color='#555555')

    # Stand-alone colour bar covering 0..vmax
    mappable = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=0, vmax=vmax))
    mappable.set_array([])
    fig.colorbar(mappable, ax=ax, extend="max")

    # Same colour map and same 0..vmax range as the colour bar
    df.plot(column=datacol, cmap=cmap, linewidth=0.8, ax=ax, edgecolor='0.8', vmin=0, vmax=vmax)
    fig.savefig(filename, dpi=150)

plotmap(merged_df, "percentage", 50, "C-section_canton.png", "C-section rates per canton in 2021")

# If you want file names with day information, use code below
#os.makedirs("dailymaps", exist_ok=True)
#stamp = datetime.datetime.now().strftime("%Y%m%d")
#hdate = datetime.datetime.now().strftime("%d.%m.%Y")
#plotmap(merged_df, "VIRUSCASESCONFIRMED", 500, f"dailymaps/map_abs_{stamp}.png", f"# of confirmed coronavirus cases per canton



In [None]:
# Color codes
# RO: #5e3c58
# AL: #c7bbc9

# Uni: "#bbcbb2"
# Non Uni: "#557c3e"