# Combine the datasets from 2018-2021

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import io
import statsmodels.api as sm
from scipy import stats
import geopandas as gpd
from google.colab import drive

In [None]:
# Disabled loader for a local Windows machine: the code below is wrapped in a
# bare string literal, so it is never executed.
'''#Import the cleaned data for 2020 and 2021
file_path = 'C:/Users/Gaëlle/Documents/_CAS applied data science/3. Module 3 Data analysis and machine learning/Project/DF_2021_clean.xlsx'
df_2021 = pd.read_excel(file_path)

file_path_2 = 'C:/Users/Gaëlle/Documents/_CAS applied data science/3. Module 3 Data analysis and machine learning/Project/DF_2020_clean.xlsx'
df_2020 = pd.read_excel(file_path_2)'''

In [None]:
# Mount Google Drive under /content/drive so the yearly files can be read by path.
# force_remount=True asks Colab to mount again even if a mount already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Locations of the cleaned yearly files on the mounted drive.
# 2018/2019 are stored as CSV, 2020/2021 as Excel workbooks.
path_2018 = '/content/drive/MyDrive/Module1_CDR/2018-2019/DF_2018_clean.csv'
path_2019 = '/content/drive/MyDrive/Module1_CDR/2018-2019/DF_2019_clean.csv'
path_2020 = '/content/drive/MyDrive/Module1_CDR/2020-2021/DF_2020_clean.xlsx'
path_2021 = '/content/drive/MyDrive/Module1_CDR/2020-2021/DF_2021_clean.xlsx'

# Load each year into its own DataFrame.
df_2018 = pd.read_csv(path_2018)
df_2019 = pd.read_csv(path_2019)
df_2020 = pd.read_excel(path_2020)
df_2021 = pd.read_excel(path_2021)

# Report (rows, columns) of every year on one line.
print(*(frame.shape for frame in (df_2018, df_2019, df_2020, df_2021)))

(93, 582) (99, 603) (82, 633) (81, 725)


In [None]:
#I included this line of code because in the 2018 and 2019 datasets we needed to get rid of some rows that don't show hospitals but larger groups
df_2018.dropna(subset=['Nom_df2'], inplace=True)
df_2019.dropna(subset=['Nom_df2'], inplace=True)
df_2020.dropna(subset=['Nom_df2'], inplace=True)
df_2020.dropna(subset=['Nom_df2'], inplace=True)
print(df_2018.shape, df_2019.shape, df_2020.shape, df_2021.shape)

(78, 582) (60, 603) (82, 633) (81, 725)


In [None]:
# Do they have the same variables?
print('the shape of the 2018 dataset:', df_2018.shape)
print('the shape of the 2019 dataset:', df_2019.shape)
print('the shape of the 2020 dataset:', df_2020.shape)
print('the shape of the 2021 dataset:', df_2021.shape)
# Not the same number of variables

the shape of the 2018 dataset: (78, 582)
the shape of the 2019 dataset: (60, 603)
the shape of the 2020 dataset: (82, 633)
the shape of the 2021 dataset: (81, 725)


In [None]:
# Determine the columns that are present in all four yearly DataFrames.
#
# The result keeps the column order of df_2018 instead of going through
# list(set(...)): the iteration order of a set of strings can change between
# interpreter sessions, which would make the column order of the combined
# dataset differ from one run to the next.
yearly_frames = [df_2018, df_2019, df_2020, df_2021]

# Names found in every frame (used for membership tests only, so order is
# irrelevant here).
shared_names = set(yearly_frames[0].columns).intersection(
    *(frame.columns for frame in yearly_frames[1:])
)

# Walk the 2018 columns in their original order. dict.fromkeys drops repeated
# labels so each shared column is listed exactly once.
common_columns = [
    name for name in dict.fromkeys(yearly_frames[0].columns) if name in shared_names
]


In [None]:
# Show the names of the columns shared by the yearly datasets.
print(common_columns)

['I.1.16.M', 'A.5.1.F', 'A.7.14.M', 'L.6.4.F', 'KostAWLFA', 'L.5.5.F', 'KostForLFP', 'A.7.1.F', 'CMInB', 'TeilsP', 'D.2.6.P', 'J.1.4.M', 'L.6.8.F', 'J.1.5.M', 'ErlAmbB', 'A.7.4.F', 'pPatLKP', 'KostKVGStatA', 'Ptage', 'AnlKVGStatR', 'ErlZvOKPStatVB', 'AnlKVGStatA', 'A.7.7.M', 'F.3.7.M', 'D.1.2.M', 'ErlZvOKPStatR', 'B.1.9.M', 'H.1.5.P', 'I.1.8.M', 'L.5.3.F', 'J.3.1.P', 'H.4.2.P', 'A.7.15.M', 'PtageStatB', 'A.7.18.P', 'B.1.14.P', 'E.4.6.M', 'E.4.16.P', 'H.1.9.P', 'D.1.1.M', 'Infrastruktur2', 'Akt', 'PersTFallMS', 'pDIA_AMB', 'KostOKPAmbA', 'G.5.3.P', 'A.7.24.F', 'Gebs', 'A.7.28.M', 'AustStatMSA', 'F.3.3.M', 'Notfalldienst', 'A.3.2.V', 'F.1.4.F', 'A.7.13.M', 'I.1.9.M', 'G.2.2.F', 'H.1.7.M', 'E.4.2.M', 'StdBelA', 'E.5.4.F', 'H.3.2.F', 'E.3.1.F', 'A.1.14.P', 'L.6.1.F', 'AnzBelA', 'E.4.15.M', 'A.7.11.V', 'I.2.7.M', 'KostAmbA', 'BettenStatR', 'F.3.9.F', 'AmbKonsP', 'A.7.8.M', 'A.7.12.V', 'J.1.1.M', 'G.3.5.P', 'H.3.3.F', 'AustLang', 'F.1.5.X', 'C.1.5.S', 'F.2.8.M', 'G.6.3.F', 'E.4.11.F', 'B.1.1

In [None]:
# Stack the four years on top of each other, restricted to the shared columns.
# ignore_index=True gives the result a fresh 0..n-1 row index.
df_combined = pd.concat(
    [frame[common_columns] for frame in (df_2018, df_2019, df_2020, df_2021)],
    ignore_index=True,
)

In [None]:
# Bare expression: the notebook shows df_combined as this cell's output.
df_combined

(301, 554)

In [None]:
# Sanity check: list any column label that occurs more than once in df_combined.
# An empty Index means every label is unique.
is_repeated_label = df_combined.columns.duplicated()
duplicate_columns = df_combined.columns[is_repeated_label]
print(duplicate_columns)

Index([], dtype='object')


In [None]:
# Move 'Nom_df2' and the four 'G.1.x.P' columns to the front of df_combined;
# all remaining columns follow in their current relative order.
leading_columns = ['Nom_df2', 'G.1.4.P', 'G.1.5.P', 'G.1.6.P', 'G.1.7.P']
remaining_columns = [col for col in df_combined.columns if col not in leading_columns]
desired_column_order = leading_columns + remaining_columns

# Apply the new column order.
df_combined = df_combined[desired_column_order]



In [None]:
# Write the combined dataset to the mounted drive as CSV.
# index=False keeps the DataFrame's row index out of the file.
df_combined.to_csv('/content/drive/MyDrive/Module1_CDR/2018-2021 Dataset/merged_dataset_18_21.csv', index=False)