## Do Imports

In [1]:
import pandas as pd
# Remove the cap on displayed rows (None = no limit), so any DataFrame shown
# in this notebook renders every row it contains.
pd.set_option('display.max_rows', None)

## Read in Data Files

In [5]:
# Load every source CSV used by this notebook. All paths are relative to the
# notebook's working directory.

# Read WHO mortality data
df_mort = pd.read_csv('.././resources/source_data/Morticd10_part5_rev.csv')

# Read WHO country names with codes, to be merged with mortality data
df_countries = pd.read_csv('.././resources/source_data/country_codes.csv')

# Read the list of countries to be used for our reporting
df_filtered_countries = pd.read_csv('.././resources/source_data/filtered_countries.csv')

# Read the ICD 10 codes data for versions 103 and 104
df_icd10 = pd.read_csv('.././resources/source_data/ICDCodeSet.csv')

# Read the ICD 10 codes data for version 101
df_icd10_version_101 = pd.read_csv('.././resources/source_data/ICD10_codes_v101.csv')

# Read the ICD 10 categories data
df_icd10_categories = pd.read_csv('.././resources/source_data/ICD10_categories.csv')

  df_mort = pd.read_csv('.././resources/source_data/Morticd10_part5_rev.csv')


## Validate the imported data
Show the first few rows of each DataFrame

In [None]:
# Preview the first two rows of every DataFrame that was just loaded.
frames_to_preview = (
    df_mort,
    df_countries,
    df_filtered_countries,
    df_icd10,
    df_icd10_categories,
    df_icd10_version_101,
)
for frame in frames_to_preview:
    display(frame.head(2))

## Create a new DataFrame for cleaning

In [None]:
# Copy the df_mort DataFrame into a new DataFrame named `df_cleaning`,
# so the cleaning steps work on a copy rather than on the frame read from disk.
df_cleaning = df_mort.copy()
display(df_cleaning.head(2))

## Update the DataFrame to use the filtered set of countries
The `filtered_countries.csv` file contains a list of 12 countries we are focusing on for our project. Our DataFrame should be filtered to this set.

In [None]:
# Inner join on `Country`: only rows whose country code also appears in
# df_filtered_countries survive, and that frame's other columns are attached.
df_merged = df_cleaning.merge(
    df_filtered_countries,
    how='inner',
    on='Country',
)

# Look at both ends of the joined frame
for preview in (df_merged.head(3), df_merged.tail(3)):
    display(preview)

## Update dataset to use the years 2017 - 2021

In [None]:
# Tally the 2022 rows up front so we can report how many the year filter removes
is_2022 = df_merged['Year'] == 2022
rows_2022_before = len(df_merged[is_2022])

# Keep only the 2017-2021 reporting window (both bounds inclusive)
df_filtered = df_merged.query('2017 <= Year <= 2021')

# Show the distinct years that remain, in ascending order
remaining_years = df_filtered[['Year']].drop_duplicates()
print(remaining_years.sort_values(by='Year'))

# Every 2022 row falls outside the window, so all of them are dropped
rows_dropped_2022 = rows_2022_before

# Report the count with thousands separators
print(f"Number of rows for the year 2022 that will be dropped: {rows_dropped_2022:,}")

In [None]:
# Filter the DataFrame for years between 2017 and 2021 (inclusive on both ends).
# This rebinds df_merged itself, so every later cell works on the filtered rows.
df_merged = df_merged.query('2017 <= Year <= 2021')

## Verify the filtered results for the country list and the years

In [None]:
# Verify the result of filtering out 2022: print each distinct `Year` value
# still present in df_merged, sorted ascending.
print(df_merged[['Year']].drop_duplicates().sort_values(by='Year'))

In [None]:
# Report how many rows the country and year filters removed from the full dataset
full_row_count, filtered_row_count = len(df_cleaning), len(df_merged)
length_diff = full_row_count - filtered_row_count
print(f"There are {len(df_cleaning):,} rows in the full dataset and {len(df_merged):,} rows in the filtered DataFrame. Result: The filtered DataFrame is {length_diff:,} shorter.")

## Merge in the ICD10 Code descriptions

In [None]:
# Rename the ICD lookup columns to the display names used in the rest of the notebook
icd10_column_names = {
    'ICDCode': 'ICD Code',
    'detailed_list_numbers': 'Detailed Codes',
    'cause': 'ICD Detail',
}
df_icd10 = df_icd10.rename(columns=icd10_column_names)

# The mortality data calls its ICD code column `Cause`; rename it so both
# frames share the `ICD Code` column name used as the join key.
df_merged = df_merged.rename(columns={'Cause': 'ICD Code'})

# Verify renaming (each label names the frame whose columns are printed)
print("Renamed columns in df_icd10:", df_icd10.columns)
print("Renamed columns in df_merged:", df_merged.columns)

In [None]:
# Merge the ICD Code data into the `df_merged` DataFrame.
# Left join: every mortality row is kept; rows whose `ICD Code` has no match
# in df_icd10 get NaN in the lookup columns.
# NOTE(review): if df_icd10 lists the same `ICD Code` more than once, this join
# duplicates mortality rows — confirm the lookup is unique per code.

df_merged = df_merged.merge(df_icd10, on='ICD Code', how='left')
df_merged.head()

In [None]:
# Function to determine the code description for a given ICD Code
def get_code_description(icd_code, categories=None):
    """Return the description of the first category whose code range contains ``icd_code``.

    Each row of the category table holds a ``code_set`` (either a range written
    as ``START-END`` or a single code) and a ``code_description``. Membership is
    decided with plain string comparison, ``START <= icd_code <= END``.

    Parameters
    ----------
    icd_code : str
        The code to classify. Anything that is not a string (for example NaN
        or None) cannot be compared against the ranges and yields 'Unknown'.
    categories : pandas.DataFrame, optional
        Table with ``code_set`` and ``code_description`` columns. Defaults to
        the notebook-level ``df_icd10_categories``.

    Returns
    -------
    The matching ``code_description`` value, or 'Unknown' when no range matches.
    """
    if not isinstance(icd_code, str):
        return 'Unknown'
    if categories is None:
        categories = df_icd10_categories

    # Walk the two columns directly instead of building a Series per row.
    for code_set, description in zip(categories['code_set'], categories['code_description']):
        start, separator, end = str(code_set).partition('-')
        start = start.strip()
        # A code_set without a hyphen is a single code: the range is just that code.
        end = end.strip() if separator else start
        if start <= icd_code <= end:
            return description
    return 'Unknown'

# Function to map three-letter ICD-10 codes to categories
def map_three_letter_code(icd_code):
    """Return the category description for the first three characters of ``icd_code``.

    Parameters
    ----------
    icd_code : str
        A full ICD code; only its first three characters are used for the lookup.
        Missing values (None / NaN) return 'Unknown' instead of raising, and any
        other non-string value is converted to text before slicing.

    Returns
    -------
    Whatever ``get_code_description`` returns for the three-character prefix,
    or 'Unknown' for a missing code.
    """
    # A missing code has no prefix to look up.
    if icd_code is None or (not isinstance(icd_code, str) and pd.isna(icd_code)):
        return 'Unknown'
    # Extract the first three characters of the ICD code
    three_letter_code = str(icd_code)[:3]
    return get_code_description(three_letter_code)

# Look up the ICD category for every row of df_merged from its `ICD Code`
# value and store the result in a new `code_description` column.
icd_category_per_row = df_merged['ICD Code'].apply(map_three_letter_code)
df_merged['code_description'] = icd_category_per_row

display(df_merged.head(5))

## Drop unused columns

In [None]:
# Remove the columns that aren't essential to our analysis:
# Admin1, SubDiv, IM_Frmat, and every column whose name starts with IM_Deaths.

columns_to_drop = ['Admin1', 'SubDiv', 'IM_Frmat']
im_death_columns = list(filter(lambda name: name.startswith('IM_Deaths'), df_merged))
columns_to_drop.extend(im_death_columns)

df_merged = df_merged.drop(columns=columns_to_drop)
print(df_merged.columns)

## Update death buckets
See this GitHub issue for the list of buckets: https://github.com/users/ShylaTatum/projects/3/views/1?pane=issue&itemId=73940120
See this doc in the repository for how the mapping works for the index: `Resources/documentation/Documentation_21Feb2024.doc`

In [None]:
# Identify the original age-band columns: the word `Deaths` followed by a number
# (Deaths1, Deaths2, ...). Requiring the numeric suffix keeps the grouped
# `Deaths: ...` columns created below out of this list.
deaths_columns = [
    col for col in df_merged
    if col.startswith('Deaths') and col[len('Deaths'):].isdigit()
]

# Bands that are sums of several source columns, given as the (first, last)
# Deaths<N> column numbers, both inclusive. Columns are selected by name rather
# than by list position so each band covers exactly its stated Deaths<N> range.
# NOTE(review): the age meaning of each Deaths<N> column presumably depends on the
# `Frmat` age-format code — confirm against the WHO documentation for this dataset.
summed_bands = {
    'Deaths: 1-24 Years': (3, 10),     # Deaths3 through Deaths10
    'Deaths: 25-44 Years': (11, 14),   # Deaths11 through Deaths14
    'Deaths: 45-69 Years': (15, 19),   # Deaths15 through Deaths19
    'Deaths: 70-95+ Years': (20, 25),  # Deaths20 through Deaths25
}

# Create new grouped columns

# All ages - Deaths1 column
df_merged['Deaths: All Ages'] = df_merged['Deaths1']

# Smoketest - Sum every Deaths column except the Deaths1 total, for comparison with it
smoketest_columns = [col for col in deaths_columns if col != 'Deaths1']
df_merged['Deaths: Smoketest All Ages'] = df_merged[smoketest_columns].sum(axis=1)

# Between age 0 and 1 - Deaths2 column
df_merged['Deaths: 0-1 Years'] = df_merged['Deaths2']

# Multi-column bands: 1-24, 25-44, 45-69, and 70 and older
for band_label, (first, last) in summed_bands.items():
    band_columns = [f'Deaths{n}' for n in range(first, last + 1)]
    df_merged[band_label] = df_merged[band_columns].sum(axis=1)

# Age not specified - Deaths26 column
df_merged['Deaths: Age Not Specified'] = df_merged['Deaths26']

# Drop original deaths columns
df_merged = df_merged.drop(columns=deaths_columns)

# Verify results
display(df_merged.head(5))

In [None]:
# Get the length of the deaths_columns list, i.e. how many original Deaths
# columns were collected (and dropped) by the bucketing cell.
length_of_deaths_columns = len(deaths_columns)

# Print the length
print(length_of_deaths_columns)

## Final prettier stuff
* Add inline description for sex type
* Rename columns
* Reorder columns

In [None]:
# Inline note for Sex: replace each numeric code with a "code - label" string.

# Define mapping dictionary (numeric code -> display text)
sex_mapping = {
    1: '1 - Male', 
    2: '2 - Female',
    9: '9 - Unspecified'
}

# Apply the mapping to the `Sex` column.
# Values that are not keys of sex_mapping become NaN, so running this cell a
# second time (when the column already holds the text labels) blanks the column.
df_merged['Sex'] = df_merged['Sex'].map(sex_mapping)

In [None]:
# Rename columns to their final display names.
# rename() skips any key that is not an existing column, so a misspelled or
# absent source name here goes unnoticed rather than raising.
df_merged = df_merged.rename(columns={'Country':'Country Code', 'Description': 'ICD10 Detail', 'List':'ICD10 Version', 'Frmat': 'Age Format', 'code_description':'ICD Category'})

In [None]:
# Final column layout for the output, listed in the order they should appear
ordered_columns = [
    'Year',
    'Country Code',
    'Country Name',
    'Region',
    'Sex',
    'ICD Code',
    'ICD Category',
    'ICD10 Version',
    'Deaths: All Ages',
    'Deaths: 0-1 Years',
    'Deaths: 1-24 Years',
    'Deaths: 25-44 Years',
    'Deaths: 45-69 Years',
    'Deaths: 70-95+ Years',
    'Deaths: Age Not Specified',
    'Age Format',
    'Deaths: Smoketest All Ages',
]

# reindex keeps only the listed columns, in this order; a listed name that is
# not present in df_merged comes back as an all-NaN column.
df_reordered = df_merged.reindex(columns=ordered_columns)

# Show the reorganized frame
display(df_reordered.head())


## Write out the file
This code creates the final DataFrame (`df_output`) by copying the DataFrame we've been using to clean the data, and then saves it to a CSV file.

In [None]:
# Name the final DataFrame: copy the reordered frame into `df_output`
# and preview its first two rows.
df_output = df_reordered.copy()
display(df_output.head(2))

In [None]:
# Write out the cleaned CSV (index=False leaves the row index out of the file).
# NOTE(review): this path starts with `Resources/`, while the input files are read
# from `.././resources/source_data/` — confirm the output lands in the intended folder.
df_output.to_csv('Resources/source_data/current_who_mortality_2017_2021.csv', index=False)

# Verify the file has been written
# (this message is only reached if to_csv above did not raise)
print("CSV file 'current_who_mortality_2017_2021.csv' has been written successfully.")

## Test the new CSV by creating a DataFrame and viewing it

In [None]:
# Import the CSV that was just written, to confirm it loads back into a DataFrame
df_final = pd.read_csv('Resources/source_data/current_who_mortality_2017_2021.csv')

In [None]:
# View the first 20 rows of the new DataFrame
display(df_final.head(20))
# Also save a second copy, named df_final.csv, in the current working directory
df_final.to_csv('df_final.csv', index=False)