In [None]:
# Import dependancies 
import pandas as pd
from pathlib import Path


In [None]:
WHO_dae_csv = Path("/Users/mireille.walton/Desktop/da_bootcamp/ch_proj/Project_3/source_data/Death_attributable_to_Environment_data.csv")
World_Bank_pop_csv = Path("/Users/mireille.walton/Desktop/da_bootcamp/ch_proj/Project_3/source_data/World_Bank_Data.csv")

In [None]:
WHO_dae_read = pd.read_csv(WHO_dae_csv)
World_Bank_pop_read = pd.read_csv(World_Bank_pop_csv, skiprows = 4)

In [None]:
# Display the sample data 1
WHO_dae_read.head(2)

In [None]:
# Display column names and data types
WHO_dae_read.dtypes

In [None]:
# Create new dataframe 
WHO_dae_dataframe = pd.DataFrame(WHO_dae_read)

# Use copy to extract required columns
WHO_dae_data = WHO_dae_dataframe[["ParentLocationCode", "ParentLocation", "SpatialDimValueCode","Location", "Dim1","FactValueNumeric"]].copy()

#confirm columns extracted are correct
WHO_dae_data.head(2)

In [None]:
# Rename column headers
WHO_dae_data.rename(columns={"ParentLocationCode":"Continent_Code",
                            "ParentLocation":"Continent",
                            "SpatialDimValueCode":"Country_Code",
                            "Location":"Country",
                             "Dim1":"Indicator",
                            "FactValueNumeric":"No_Deaths"}, inplace=True)

In [None]:
# Change No_Deaths columns back to type "int"
WHO_dae_data['No_Deaths'] = WHO_dae_data['No_Deaths'].astype('int64')

# check dataframe column names datatypes are correct
WHO_dae_data.dtypes

In [None]:
# Display the sample data 2
World_Bank_pop_read.head(2)

In [None]:
# Display column names and data types
World_Bank_pop_read.dtypes

In [None]:
# Create new dataframe 
World_Bank_pop_dataframe = pd.DataFrame(World_Bank_pop_read)

# Use copy to extract required columns
World_Bank_pop_data = World_Bank_pop_dataframe[["Country Name", "Country Code", "2016"]].copy()

# Rename column headers
World_Bank_pop_data.rename(columns={"Country Code":"Country_Code",
                            "Country Name":"Country",
                                   "2016": "2016_Population"}, inplace=True)


#confirm columns extracted are correct
World_Bank_pop_data.head(2)

In [None]:
# merge dataframes (left merge on country code)
dae_pop_data_merge = pd.merge(WHO_dae_data, World_Bank_pop_data, how="left", on = ["Country_Code", "Country_Code"])
dae_pop_data = pd.DataFrame(dae_pop_data_merge)

# Add formating to dataframe
pd.options.display.float_format = '{:,.0f}'.format

# view new dataframe
dae_pop_data.head(2)

In [None]:
# Check datatypes of new dataframe
dae_pop_data.dtypes

In [None]:
# Drop rows with "NaN" in population column and "Total" in Inidicator category
# Ref: eumiro, edited by AMC, StackOverflow, 16 Feb 2020, 
# "How to drop rows of Pandas DataFrame whose value in a certain column is NaN"
# https://stackoverflow.com/questions/13413590/how-to-drop-rows-of-pandas-dataframe-whose-value-in-a-certain-column-is-nan, accessed 9 Sept 2023 
dae_pop_data_dropNaN_Total = dae_pop_data[
    (dae_pop_data.Country_y != "NaN") & 
    (dae_pop_data.Indicator != "Total")&
    (dae_pop_data["Country_y"].notna())]

# Check required rows have been dropped 
dae_pop_data_dropNaN_Total.head(2)


In [None]:
# Remove column "Country_y" 
dae_pop_data_dropNaN_Total.drop("Country_y", axis=1, inplace = True)

# Rename column header "Country_x" to "Country"
dae_pop_data_dropNaN_Total.rename(columns={"Country_x":"Country"}, inplace=True)

# Check required changes have been made
dae_pop_data_dropNaN_Total.head(2)

In [None]:
# Export file to csv
dae_pop_data_dropNaN_Total.to_csv("dae_MergedData_clean.csv", index = False)

In [31]:
# GLOBAL DEATHS BY INIDCATOR TYPE

# Add formating to dataframe
pd.options.display.float_format = '{:,.0f}'.format

# get list of unique Indicator Categories
Indicator = dae_pop_data_dropNaN_Total["Indicator"].unique()

# Calculate number of total deaths
No_Deaths_Global = dae_pop_data_dropNaN_Total["No_Deaths"].sum()

# Calculate number of total population
Population_Global = dae_pop_data_dropNaN_Total["2016_Population"].sum()

# Calculate number of deaths Indicator Category
No_Deaths_IC_Global = dae_pop_data_dropNaN_Total.groupby(["Indicator"])["No_Deaths"].sum()

# Calculate Indicator deaths as % of total deaths 
Perc_Deaths_IC_Global = No_Deaths_IC_Global/No_Deaths_Global*100

# Calculate Indicator deaths as % of total population
Perc_Deaths_IC_Pop_Global = No_Deaths_IC_Global/Population_Global*100

In [32]:
#create new Global Deaths dataframe to store results to be used for ploting charts
dae_Global_Data = pd.DataFrame({
    "Global_Deaths": No_Deaths_IC_Global,
    "Global_Population": Population_Global,
    "Indicator_Deaths_%_Total_Deaths": Perc_Deaths_IC_Global,
    "Indicator_Deaths_%_Total_Population": Perc_Deaths_IC_Pop_Global}, index=Indicator)

# Format columns
dae_Global_Data["Global_Deaths"] = (dae_Global_Data["Global_Deaths"]).map("{:,}".format)
dae_Global_Data["Indicator_Deaths_%_Total_Deaths"] = dae_Global_Data["Indicator_Deaths_%_Total_Deaths"].map("{:,.1f}%".format)
dae_Global_Data["Indicator_Deaths_%_Total_Population"] = dae_Global_Data["Indicator_Deaths_%_Total_Population"].map("{:,.4f}%".format)

# give name to index
# Ref Admin, 28 January 2023, Sparkby{Example}, https://sparkbyexamples.com/pandas/pandas-set-index-name-to-dataframe/, accessed 9 September 2023

dae_Global_Data.index.name = "Indicator"

# view dataframe results
dae_Global_Data.head(2)


Unnamed: 0_level_0,Global_Deaths,Global_Population,Indicator_Deaths_%_Total_Deaths,Indicator_Deaths_%_Total_Population
Indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Infectious, parasitic, neonatal and nutritional",2503682,22347706701,19.8%,0.0112%
Injuries,1950084,22347706701,15.4%,0.0087%


In [33]:
# Export file to csv
dae_Global_Data.to_csv("dae_Global_Data_clean.csv", index = True)


In [35]:
# Add formating to dataframe
pd.options.display.float_format = '{:,.0f}'.format

# Set Continent as the index column
Continent = dae_pop_data_dropNaN_Total.set_index("Continent")

# Calculate the total number of deaths by Continent
No_Deaths_Continent = dae_pop_data_dropNaN_Total.groupby(["Continent"])["No_Deaths"].sum()

# Calculate number of deaths by Continent and Indicator Category
No_Deaths_IC_Continent = dae_pop_data_dropNaN_Total.groupby(["Continent", "Indicator"])["No_Deaths"].sum()

# Calculate total population by Continent
Population_Continent1 = dae_pop_data_dropNaN_Total.groupby(["Continent"])["2016_Population"].sum()
#Population_Continent = Population_Continent1.map('{:,.0f}'.format)

# Calculate number of population by Continent and Indicator Category
Population_IC_Continent = dae_pop_data_dropNaN_Total.groupby(["Continent", "Indicator"])["2016_Population"].sum()

# Calculate the percentage of deaths for each Indicator_Category within each Continent
Perc_Deaths_IC_Continent = No_Deaths_IC_Continent / No_Deaths_Continent * 100

# Calculate Indicator deaths as % of total population
Perc_Deaths_IC_Pop_Continent = No_Deaths_IC_Continent / Population_IC_Continent*100

In [36]:
# Create a new DataFrame with the calculated values
dae_Continent_Data = pd.DataFrame({
    "Continent_Deaths": No_Deaths_IC_Continent,
    "Continent_Population": Population_IC_Continent,
    "Indicator_Deaths_%_Total_Deaths": Perc_Deaths_IC_Continent,
    "Indicator_Deaths_%_Total_Population": Perc_Deaths_IC_Pop_Continent})

# Format the "% Deaths" column as percentages with two decimal places
dae_Continent_Data["Continent_Deaths"] = dae_Continent_Data["Continent_Deaths"].map("{:,}".format)
dae_Continent_Data["Indicator_Deaths_%_Total_Deaths"] = (dae_Continent_Data["Indicator_Deaths_%_Total_Deaths"]).map("{:,.1f}%".format)
dae_Continent_Data["Indicator_Deaths_%_Total_Population"] = (dae_Continent_Data["Indicator_Deaths_%_Total_Population"]).map("{:,.4f}%".format)

# View the DataFrame results
dae_Continent_Data.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,Continent_Deaths,Continent_Population,Indicator_Deaths_%_Total_Deaths,Indicator_Deaths_%_Total_Population
Continent,Indicator,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,"Infectious, parasitic, neonatal and nutritional",1264387,1022825302,58.1%,0.1236%
Africa,Injuries,409520,1022825302,18.8%,0.0400%
Africa,Noncommunicable diseases,502448,1022825302,23.1%,0.0491%
Americas,"Infectious, parasitic, neonatal and nutritional",58204,982865966,6.9%,0.0059%
Americas,Injuries,180000,982865966,21.3%,0.0183%
Americas,Noncommunicable diseases,608683,982865966,71.9%,0.0619%
Eastern Mediterranean,"Infectious, parasitic, neonatal and nutritional",240077,697040699,28.1%,0.0344%
Eastern Mediterranean,Injuries,166038,697040699,19.4%,0.0238%
Eastern Mediterranean,Noncommunicable diseases,448284,697040699,52.5%,0.0643%
Europe,"Infectious, parasitic, neonatal and nutritional",26966,919175473,1.9%,0.0029%


In [37]:
# Export file to csv
dae_Continent_Data.to_csv("dae_Continent_Data_clean.csv", index = True)


In [38]:
# DEATHS BY COUNTRY & INIDCATOR TYPE

# Add formating to dataframe
pd.options.display.float_format = '{:,.0f}'.format

# Set Country as index
Country = dae_pop_data_dropNaN_Total.set_index("Country")

# Calculate the total number of deaths by Country
No_Deaths_Country = dae_pop_data_dropNaN_Total.groupby(["Country","Continent"])["No_Deaths"].sum()

# Calculate number of deaths by Country and Indicator Category
No_Deaths_IC_Country = dae_pop_data_dropNaN_Total.groupby(["Country","Continent", "Indicator"])["No_Deaths"].sum()

# Calculate total population by Country
Population_Country = dae_pop_data_dropNaN_Total.groupby(["Country","Continent"])["2016_Population"].sum()

# Calculate number of population by Country and Indicator Category
Population_IC_Country = dae_pop_data_dropNaN_Total.groupby(["Country","Continent", "Indicator"])["2016_Population"].sum()

# Calculate the percentage of deaths for each Indicator_Category within each Country
Perc_Deaths_IC_Country = No_Deaths_IC_Country / No_Deaths_Country * 100

# Calculate Indicator deaths as % of total population
Perc_Deaths_IC_Pop_Country = No_Deaths_IC_Country / Population_IC_Country*100

In [39]:
# Create a new DataFrame with the calculated values
dae_Country_Data = pd.DataFrame({
    "Country_Deaths": No_Deaths_IC_Country,
    "Country_Population": Population_IC_Country,
    "Indicator_Deaths_%_Total_Deaths": Perc_Deaths_IC_Country,
    "Indicator_Deaths_%_Total_Population": Perc_Deaths_IC_Pop_Country})

# Format the "% Deaths" column as percentages with two decimal places
dae_Country_Data["Country_Deaths"] = dae_Country_Data["Country_Deaths"].map("{:,}".format)
dae_Country_Data["Indicator_Deaths_%_Total_Deaths"] = (dae_Country_Data["Indicator_Deaths_%_Total_Deaths"]).map("{:,.1f}%".format)
dae_Country_Data["Indicator_Deaths_%_Total_Population"] = (dae_Country_Data["Indicator_Deaths_%_Total_Population"]).map("{:,.4f}%".format)

# View the DataFrame results
dae_Country_Data.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Country_Deaths,Country_Population,Indicator_Deaths_%_Total_Deaths,Indicator_Deaths_%_Total_Population
Country,Continent,Indicator,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,Eastern Mediterranean,"Infectious, parasitic, neonatal and nutritional",29571,34636207,45.4%,0.0854%
Afghanistan,Eastern Mediterranean,Injuries,13086,34636207,20.1%,0.0378%
Afghanistan,Eastern Mediterranean,Noncommunicable diseases,22490,34636207,34.5%,0.0649%
Albania,Europe,"Infectious, parasitic, neonatal and nutritional",78,2876101,1.2%,0.0027%
Albania,Europe,Injuries,514,2876101,7.6%,0.0179%
Albania,Europe,Noncommunicable diseases,6174,2876101,91.3%,0.2147%
Algeria,Africa,"Infectious, parasitic, neonatal and nutritional",3806,40339329,10.4%,0.0094%
Algeria,Africa,Injuries,7999,40339329,21.8%,0.0198%
Algeria,Africa,Noncommunicable diseases,24914,40339329,67.9%,0.0618%
Andorra,Europe,"Infectious, parasitic, neonatal and nutritional",0,72540,0.0%,0.0000%


In [40]:

# Export file to csv
dae_Country_Data.to_csv("dae_Country_Data_clean.csv", index = True)