## Do Imports

In [56]:
import pandas as pd

## Read in Data Files

In [57]:
# Read the WHO mortality data.
# The `Cause` column holds both all-digit codes (e.g. 1000) and alphanumeric
# ICD-10 codes (e.g. A02), so it is read as text to give the later merge on
# ICD codes a single, consistent key type.
# `low_memory=False` parses each column in one pass instead of in chunks,
# which avoids chunk-by-chunk type inference producing mixed-type columns.
df_mort = pd.read_csv(
    'Resources/source_data/Morticd10_part5_rev.csv',
    dtype={'Cause': str},
    low_memory=False,
)

# Read WHO country names with codes, to be merged with the mortality data
df_countries = pd.read_csv('Resources/source_data/country_codes.csv')

# Read the list of countries to be used for our reporting
df_filtered_countries = pd.read_csv('Resources/source_data/filtered_countries.csv')

# Read the ICD-10 code descriptions
df_icd10 = pd.read_csv('Resources/source_data/ICDCodeSet.csv')

  df_mort = pd.read_csv('Resources/source_data/Morticd10_part5_rev.csv')


## Validate the imported data
Show the first two rows of each DataFrame to confirm that every file loaded with the expected columns.

In [58]:
# Preview the first two rows of every loaded DataFrame, in load order
display(
    df_mort.head(2),
    df_countries.head(2),
    df_filtered_countries.head(2),
    df_icd10.head(2),
)

Unnamed: 0,Country,Admin1,SubDiv,Year,List,Cause,Sex,Frmat,IM_Frmat,Deaths1,...,Deaths21,Deaths22,Deaths23,Deaths24,Deaths25,Deaths26,IM_Deaths1,IM_Deaths2,IM_Deaths3,IM_Deaths4
0,4303,,,2017,101,1000,1,1,8,281784,...,43174.0,29856.0,29184.0,,,68.0,1608.0,,,
1,4303,,,2017,101,1000,2,1,8,292339,...,56037.0,52655.0,88271.0,,,19.0,1178.0,,,


Unnamed: 0,country,name
0,1010,Algeria
1,1020,Angola


Unnamed: 0,Country,Country Name,Region
0,5020,Australia,Oceania
1,2070,Brazil,South America


Unnamed: 0,ICDCode,Description
0,A000,"Cholera due to Vibrio cholerae 01, biovar c..."
1,A001,"Cholera due to Vibrio cholerae 01, biovar e..."


## Create a new DataFrame for cleaning

In [59]:
# Work on a deep copy named `df_cleaning` so that `df_mort` itself is not
# modified by the cleaning steps applied to the copy
df_cleaning = df_mort.copy(deep=True)
display(df_cleaning.iloc[:2])

Unnamed: 0,Country,Admin1,SubDiv,Year,List,Cause,Sex,Frmat,IM_Frmat,Deaths1,...,Deaths21,Deaths22,Deaths23,Deaths24,Deaths25,Deaths26,IM_Deaths1,IM_Deaths2,IM_Deaths3,IM_Deaths4
0,4303,,,2017,101,1000,1,1,8,281784,...,43174.0,29856.0,29184.0,,,68.0,1608.0,,,
1,4303,,,2017,101,1000,2,1,8,292339,...,56037.0,52655.0,88271.0,,,19.0,1178.0,,,


## Update the DataFrame to use the filtered set of countries
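The `filtered_countries.csv` file contains the list of 12 countries we are focusing on for our project. An inner merge on the `Country` code keeps only the rows for those countries and also adds each country's `Country Name` and `Region` columns to the DataFrame.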

In [60]:
# Inner join on the `Country` code: only rows whose country appears in
# `df_filtered_countries` survive, and that frame's columns are appended
df_merged = pd.merge(
    df_cleaning,
    df_filtered_countries,
    how='inner',
    on='Country',
)
df_merged

Unnamed: 0,Country,Admin1,SubDiv,Year,List,Cause,Sex,Frmat,IM_Frmat,Deaths1,...,Deaths23,Deaths24,Deaths25,Deaths26,IM_Deaths1,IM_Deaths2,IM_Deaths3,IM_Deaths4,Country Name,Region
0,4160,,,2017,103,A02,1,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe
1,4160,,,2017,103,A04,1,0,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe
2,4160,,,2017,103,A08,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe
3,4160,,,2017,103,A32,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe
4,4160,,,2017,103,A41,1,0,1,4,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227826,5020,,,2022,104,Y871,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Australia,Oceania
227827,5020,,,2022,104,Y883,1,0,1,4,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Australia,Oceania
227828,5020,,,2022,104,Y883,2,0,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Australia,Oceania
227829,5020,,,2022,104,Y890,2,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Australia,Oceania


## Merge in the ICD10 Code descriptions

In [61]:
# Give the ICD code column the same name in both DataFrames so they can be
# merged on it: `ICDCode` in the ICD code set, `Cause` in the mortality data.
# (`df_icd10` only has the `ICDCode` and `Description` columns, so no other
# entries are needed in its rename map.)
df_icd10 = df_icd10.rename(columns={'ICDCode': 'ICD Code'})
df_merged = df_merged.rename(columns={'Cause': 'ICD Code'})

# Verify renaming
print("Renamed columns in df_icd10:", df_icd10.columns)
print("Renamed columns in df_merged:", df_merged.columns)

Renamed columns in df_icd10: Index(['ICD Code', 'Description'], dtype='object')
Renamed columns in df_cleaning: Index(['Country', 'Admin1', 'SubDiv', 'Year', 'List', 'ICD Code', 'Sex',
       'Frmat', 'IM_Frmat', 'Deaths1', 'Deaths2', 'Deaths3', 'Deaths4',
       'Deaths5', 'Deaths6', 'Deaths7', 'Deaths8', 'Deaths9', 'Deaths10',
       'Deaths11', 'Deaths12', 'Deaths13', 'Deaths14', 'Deaths15', 'Deaths16',
       'Deaths17', 'Deaths18', 'Deaths19', 'Deaths20', 'Deaths21', 'Deaths22',
       'Deaths23', 'Deaths24', 'Deaths25', 'Deaths26', 'IM_Deaths1',
       'IM_Deaths2', 'IM_Deaths3', 'IM_Deaths4', 'Country Name', 'Region'],
      dtype='object')


In [62]:
# Attach the ICD code data to `df_merged`. A left join keeps every row of
# `df_merged`; codes with no match in `df_icd10` get a missing `Description`.
df_merged = pd.merge(df_merged, df_icd10, how='left', on='ICD Code')
df_merged.head(5)

Unnamed: 0,Country,Admin1,SubDiv,Year,List,ICD Code,Sex,Frmat,IM_Frmat,Deaths1,...,Deaths24,Deaths25,Deaths26,IM_Deaths1,IM_Deaths2,IM_Deaths3,IM_Deaths4,Country Name,Region,Description
0,4160,,,2017,103,A02,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe,
1,4160,,,2017,103,A04,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe,
2,4160,,,2017,103,A08,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe,
3,4160,,,2017,103,A32,1,0,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe,
4,4160,,,2017,103,A41,1,0,1,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Iceland,Northern Europe,


## Drop unused columns

In [63]:
# Drop the Admin1 and SubDiv columns since these aren't essential to our analysis
# TODO: Do we drop the IM_Deaths columns since these deaths are captured in the Deaths columns?

columns_to_drop = ['Admin1', 'SubDiv']
# errors='ignore' keeps this cell safe to re-run: `df_merged` is reassigned
# from itself, so on a second run the columns are already gone and a plain
# drop would raise KeyError.
df_merged = df_merged.drop(columns=columns_to_drop, errors='ignore')
print(df_merged.columns)

Index(['Country', 'Year', 'List', 'ICD Code', 'Sex', 'Frmat', 'IM_Frmat',
       'Deaths1', 'Deaths2', 'Deaths3', 'Deaths4', 'Deaths5', 'Deaths6',
       'Deaths7', 'Deaths8', 'Deaths9', 'Deaths10', 'Deaths11', 'Deaths12',
       'Deaths13', 'Deaths14', 'Deaths15', 'Deaths16', 'Deaths17', 'Deaths18',
       'Deaths19', 'Deaths20', 'Deaths21', 'Deaths22', 'Deaths23', 'Deaths24',
       'Deaths25', 'Deaths26', 'IM_Deaths1', 'IM_Deaths2', 'IM_Deaths3',
       'IM_Deaths4', 'Country Name', 'Region', 'Description'],
      dtype='object')
