## Do Imports

In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)

## Read in Data Files

In [2]:
# Read WHO country names with codes, to be merged with morbidity data
df_countries = pd.read_csv('.././resources/source_data/country_codes.csv')

# Read the list of countries to be used for our reporting
df_filtered_countries = pd.read_csv('.././resources/source_data/filtered_countries.csv')

# Read the WHO populaton data
df_who_pop = pd.read_csv('.././resources/source_data/who_mort_population.csv')

## Validate the imported data
Show the first few rows of each DataFrame

In [3]:
display(df_countries.head(2))
display(df_filtered_countries.head(2))
display(df_who_pop.head(2))

Unnamed: 0,country,name
0,1010,Algeria
1,1020,Angola


Unnamed: 0,Country,Country Name,Region
0,5020,Australia,Oceania
1,2070,Brazil,South America


Unnamed: 0,Country,Admin1,SubDiv,Year,Sex,Frmat,Pop1,Pop2,Pop3,Pop4,...,Pop18,Pop19,Pop20,Pop21,Pop22,Pop23,Pop24,Pop25,Pop26,Lb
0,1060,,,1980,1,7,137100.0,3400.0,15800.0,,...,,5300.0,,2900.0,,,,,6500.0,5000.0
1,1060,,,1980,2,7,159000.0,4000.0,18400.0,,...,,6200.0,,3400.0,,,,,7500.0,6000.0


## Create a new DataFrame for cleaning

In [4]:
# Copy the df_mort DataFrame into a new DataFrame named `df_cleaning`
df_pop_cleaning = df_who_pop.copy()
display(df_pop_cleaning.head(2))

Unnamed: 0,Country,Admin1,SubDiv,Year,Sex,Frmat,Pop1,Pop2,Pop3,Pop4,...,Pop18,Pop19,Pop20,Pop21,Pop22,Pop23,Pop24,Pop25,Pop26,Lb
0,1060,,,1980,1,7,137100.0,3400.0,15800.0,,...,,5300.0,,2900.0,,,,,6500.0,5000.0
1,1060,,,1980,2,7,159000.0,4000.0,18400.0,,...,,6200.0,,3400.0,,,,,7500.0,6000.0


## Update the DataFrame to use the filtered set of countries
The `filtered_countries.csv` file contains a list of 12 countries we are focusing on for our project. Our DataFrame should be filtered to this set.

In [5]:
df_pop_merged = df_pop_cleaning.merge(df_filtered_countries, on='Country', how='inner')
display(df_pop_merged.head(3))
display(df_pop_merged.tail(3))

Unnamed: 0,Country,Admin1,SubDiv,Year,Sex,Frmat,Pop1,Pop2,Pop3,Pop4,...,Pop20,Pop21,Pop22,Pop23,Pop24,Pop25,Pop26,Lb,Country Name,Region
0,1520,,,2009,1,2,5400996.0,95789.72,373042.3,,...,105701.0,87089.0,37632.0,21329.0,,,0.0,,Tunisia,Africa
1,1520,,,2009,2,2,5470013.0,88492.28,359332.7,,...,110985.0,89656.0,39330.0,22696.0,,,0.0,,Tunisia,Africa
2,2070,,,1979,1,2,59237900.0,1275200.0,6959300.0,,...,631400.0,402700.0,167300.0,105400.0,,,0.0,1377415.0,Brazil,South America


Unnamed: 0,Country,Admin1,SubDiv,Year,Sex,Frmat,Pop1,Pop2,Pop3,Pop4,...,Pop20,Pop21,Pop22,Pop23,Pop24,Pop25,Pop26,Lb,Country Name,Region
1370,3350,,,2021,2,1,2033728.0,15372.0,17761.0,17856.0,...,94989.0,49982.0,41669.0,39046.0,,,,16614.0,Singapore,Asia
1371,3350,,,2022,1,1,1990212.0,16468.0,17311.0,19016.0,...,90038.0,48203.0,31019.0,21211.0,,,,16663.0,Singapore,Asia
1372,3350,,,2022,2,1,2083027.0,15605.0,16533.0,18303.0,...,98546.0,57589.0,43918.0,39679.0,,,,15627.0,Singapore,Asia


## Update dataset to use the years 2017 - 2021

In [6]:
# Count the number of rows for the year 2022 before filtering
# rows_2022_before = df_pop_merged[df_pop_merged['Year'] == 2022].shape[0]

# Filter the DataFrame for years between 2017 and 2021
# df_pop_filtered = df_pop_merged.query('2017 <= Year <= 2021')

# Verify the result of filtering
#print(df_pop_filtered[['Year']].drop_duplicates().sort_values(by='Year'))

# Calculate the number of rows for 2022 that will be dropped
# rows_dropped_2022 = rows_2022_before

# Print the number of rows that will be dropped
# print(f"Number of rows for the year 2022 that will be dropped: {rows_dropped_2022:,}")

In [7]:
# Filter the DataFrame for years between 2017 and 2021
# df_pop_merged = df_pop_merged.query('2017 <= Year <= 2021')

In [10]:
# Load the original list of countries
df_countries = pd.read_csv('.././resources/source_data/filtered_countries.csv')

# Create a DataFrame with all combinations of countries and years
years = range(2017, 2022)
all_combinations = pd.MultiIndex.from_product([df_countries['Country Name'], years], names=['Country Name', 'Year'])
df_all_combinations = pd.DataFrame(index=all_combinations).reset_index()

# Assuming df_pop_merged is your population dataframe
# Filter the DataFrame for years between 2017 and 2021
df_pop_filtered = df_pop_merged.query('2017 <= Year <= 2021')

print(df_pop_filtered.head())

# Group by Year and Country Name
df_pop_filtered = df_pop_filtered.groupby(['Year', 'Country Name'])['Pop1'].sum().reset_index()

# Merge with all combinations to ensure all countries are included
df_pop_merged = pd.merge(df_all_combinations, df_pop_filtered, on=['Country Name', 'Year'], how='left')

# Fill missing values with 0 or appropriate values
df_pop_merged['Pop1'].fillna(0, inplace=True)

# Display the merged DataFrame
display(df_pop_merged)

     Country  Admin1 SubDiv  Year  Sex  Frmat          Pop1          Pop2  \
38      2070     NaN    NaN  2019    1      1  1.027603e+08  1.515249e+06   
39      2070     NaN    NaN  2019    2      1  1.073868e+08  1.445697e+06   
487     3150     NaN    NaN  2017    1      2  4.321810e+06  9.348549e+04   
488     3150     NaN    NaN  2017    2      2  4.391458e+06  8.842099e+04   
489     3150     NaN    NaN  2018    1      2  4.407285e+06  9.407200e+04   

             Pop3       Pop4  ...         Pop20         Pop21         Pop22  \
38   1.521777e+06  1493811.0  ...  2.297297e+06  1.484658e+06  9.190590e+05   
39   1.452251e+06  1425912.0  ...  2.877091e+06  1.995878e+06  1.364756e+06   
487  3.650553e+05        NaN  ...  1.083353e+05  7.432762e+04  5.352394e+04   
488  3.451319e+05        NaN  ...  1.262416e+05  9.557991e+04  7.464761e+04   
489  3.710580e+05        NaN  ...  1.214000e+05  7.266200e+04  5.640000e+04   

            Pop23  Pop24  Pop25  Pop26         Lb  Country Nam

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_pop_merged['Pop1'].fillna(0, inplace=True)


Unnamed: 0,Country Name,Year,Pop1
0,Australia,2017,24594200.0
1,Australia,2018,24963260.0
2,Australia,2019,25334830.0
3,Australia,2020,25649250.0
4,Australia,2021,25685410.0
5,Brazil,2017,0.0
6,Brazil,2018,0.0
7,Brazil,2019,210147100.0
8,Brazil,2020,0.0
9,Brazil,2021,0.0


## Verify the filtered results for the country list and the years

In [None]:
# Verify the result of filtering out 2022
print(df_pop_merged[['Year']].drop_duplicates().sort_values(by='Year'))

In [None]:
# Print out the countries
print(df_pop_merged[['Country Name']].drop_duplicates())

In [None]:
# Compare the length of the unfiltered DataFrame with the new filtered DataFrame
length_diff = len(df_pop_cleaning) - len(df_pop_merged)
print(f"There are {len(df_pop_cleaning):,} rows in the full dataset and {len(df_pop_merged):,} rows in the filtered DataFrame. Result: The filtered DataFrame is {length_diff:,} shorter.")

## Add country name

In [None]:
# Ensure 'Country' column in df_pop_merged and 'name' column in df_countries are of the same data type
#df_pop_merged['Country'] = df_pop_merged['Country'].astype(str)
#df_countries['name'] = df_countries['name'].astype(str)

# Merge df_countries into df_pop_merged using 'Country' from df_pop_merged and 'name' from df_countries
#df_pop_merged = df_pop_merged.merge(df_countries, left_on='Country', right_on='name', how='left')

# Display the merged dataframe
#display(df_pop_merged.head())

## Drop unused columns

In [None]:
# Drop the Admin1, SubDiv columns since these aren't essential to our analysis
columns_to_drop = ['Admin1', 'SubDiv']
df_pop_merged = df_pop_merged.drop(columns=columns_to_drop)
print(df_pop_merged.columns)

## Update buckets


In [None]:
# Add population age buckets

# Identify columns to group by looking for columns that start with the word `Pop`
population_columns = [col for col in df_pop_merged if col.startswith('Pop')]

# Ensure there are enough columns to avoid index out of range errors
if len(population_columns) < 26:
    raise ValueError("Not enough population columns found. Check the input data.")

# Create new grouped columns

# All ages - Pop1 column
df_pop_merged['Pop: All Ages'] = df_pop_merged[population_columns[0]]

# Smoketest - Sum all Deaths columns
df_pop_merged['Pop: Smoketest All Ages'] = df_pop_merged[population_columns[1:]].sum(axis=1)

# Between age 0 and 1 - Deaths2 column
df_pop_merged['Pop: 0-1 Years'] = df_pop_merged[population_columns[1]]

# Between ages 1 and 24 - Deaths3 through Deaths10 columns
df_pop_merged['Pop: 1-24 Years'] = df_pop_merged[population_columns[2:11]].sum(axis=1)

# Between ages 25 and 44 - Deaths11 through Deaths14 columns
df_pop_merged['Pop: 25-44 Years'] = df_pop_merged[population_columns[12:15]].sum(axis=1)

# Between ages 45 and 69 - Deaths15 through Deaths19
df_pop_merged['Pop: 45-69 Years'] = df_pop_merged[population_columns[16:20]].sum(axis=1)

# Ages 70 and older
df_pop_merged['Pop: 70-95+ Years'] = df_pop_merged[population_columns[21:24]].sum(axis=1)

# Age not specified
df_pop_merged['Pop: Age Not Specified'] = df_pop_merged[population_columns[25]]

# Drop original deaths columns
df_pop_merged = df_pop_merged.drop(columns=population_columns)

# Verify results
display(df_pop_merged.head(5))

## Final prettier stuff
* Add inline description for sex type
* Rename columns
* Reorder columns

In [None]:
# Inline note for Sex

# Define mapping dictionary
sex_mapping = {
    1: '1 - Male', 
    2: '2 - Female',
    9: '9 - Unspecified'
}

# Apply the mapping to the `Sex` column
df_pop_merged['Sex'] = df_pop_merged['Sex'].map(sex_mapping)

In [None]:
df_pop_merged.columns

In [None]:
# Rename columns
df_pop_merged = df_pop_merged.rename(columns={'Country':'Country Code', 'Frmat': 'Age Format', 'Lb': 'Live Births'})

In [None]:
# Define the desired order of columns
ordered_pop_columns = ['Year', 'Country Code', 'Country Name', 'Sex', 'Pop: All Ages', 'Pop: 0-1 Years', 'Pop: 1-24 Years', 'Pop: 25-44 Years', 'Pop: 45-69 Years', 'Pop: 70-95+ Years', 'Pop: Age Not Specified', 'Age Format', 'Pop: Smoketest All Ages']

# Reorder the columns using reindex
df_pop_reordered = df_pop_merged.reindex(columns=ordered_pop_columns)

# Display the new DataFrame with reorganized columns
display(df_pop_reordered.head())


## Write out the file
This code creates a `final` DataFrame from copying the DataFrame we've been using to clean the data. And then we save it out.

In [None]:
# Name the final DataFrame
df_pop_output = df_pop_reordered.copy()
display(df_pop_output.head(2))

In [None]:
# Write out the cleaned CSV
df_pop_output.to_csv('Resources/source_data/current_who_population.csv', index=False)

# Verify the file has been written
print("CSV file 'Resources/source_data/current_who_population.csv' has been written successfully.")

# Test the new CSV by creating a DataFrame and viewing it

In [None]:
# Import the CSV
df_pop_final = pd.read_csv('Resources/source_data/current_who_population.csv')

In [None]:
# View the first 5 rows of the new DataFrame
display(df_pop_final.head())