In [1]:
# Import dependancies 
import pandas as pd
from pathlib import Path


In [2]:
WHO_dae_csv = Path("/Users/mireille.walton/Desktop/da_bootcamp/ch_proj/Project_3/source_data/Death_attributable_to_Environment_data.csv")
World_Bank_pop_csv = Path("/Users/mireille.walton/Desktop/da_bootcamp/ch_proj/Project_3/source_data/World_Bank_Data.csv")

In [3]:
WHO_dae_read = pd.read_csv(WHO_dae_csv)
World_Bank_pop_read = pd.read_csv(World_Bank_pop_csv, skiprows = 4)

In [4]:
# Display the sample data 1
WHO_dae_read.head(2)

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,TOTENV_1,Deaths attributable to the environment,numeric,EUR,Europe,Country,AND,Andorra,Year,2012,...,,,,,,0,,,EN,2016-03-08T13:00:00.000Z
1,TOTENV_1,Deaths attributable to the environment,numeric,EUR,Europe,Country,MCO,Monaco,Year,2012,...,,,,,,0,,,EN,2016-03-08T13:00:00.000Z


In [5]:
# Display column names and data types
WHO_dae_read.dtypes

IndicatorCode                  object
Indicator                      object
ValueType                      object
ParentLocationCode             object
ParentLocation                 object
Location type                  object
SpatialDimValueCode            object
Location                       object
Period type                    object
Period                          int64
IsLatestYear                     bool
Dim1 type                      object
Dim1                           object
Dim1ValueCode                  object
Dim2 type                     float64
Dim2                          float64
Dim2ValueCode                 float64
Dim3 type                     float64
Dim3                          float64
Dim3ValueCode                 float64
DataSourceDimValueCode        float64
DataSource                    float64
FactValueNumericPrefix        float64
FactValueNumeric                int64
FactValueUoM                  float64
FactValueNumericLowPrefix     float64
FactValueNum

In [6]:
# Create new dataframe 
WHO_dae_dataframe = pd.DataFrame(WHO_dae_read)

# Use copy to extract required columns
WHO_dae_data = WHO_dae_dataframe[["ParentLocationCode", "ParentLocation", "SpatialDimValueCode","Location", "Dim1","FactValueNumeric"]].copy()

#confirm columns extracted are correct
WHO_dae_data.head(2)

Unnamed: 0,ParentLocationCode,ParentLocation,SpatialDimValueCode,Location,Dim1,FactValueNumeric
0,EUR,Europe,AND,Andorra,"Infectious, parasitic, neonatal and nutritional",0
1,EUR,Europe,MCO,Monaco,"Infectious, parasitic, neonatal and nutritional",0


In [7]:
# Rename column headers
WHO_dae_data.rename(columns={"ParentLocationCode":"Continent_Code",
                            "ParentLocation":"Continent",
                            "SpatialDimValueCode":"Country_Code",
                            "Location":"Country",
                             "Dim1":"Indicator_Category",
                            "FactValueNumeric":"No_Deaths"}, inplace=True)

In [8]:
# Change No_Deaths columns back to type "int"
WHO_dae_data['No_Deaths'] = WHO_dae_data['No_Deaths'].astype('int64')

# check dataframe column names datatypes are correct
WHO_dae_data.dtypes

Continent_Code        object
Continent             object
Country_Code          object
Country               object
Indicator_Category    object
No_Deaths              int64
dtype: object

In [9]:
# Display the sample data 2
World_Bank_pop_read.head(2)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,...,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0,106537.0,106445.0,
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,...,583651101.0,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0,702977106.0,720839314.0,


In [10]:
# Display column names and data types
World_Bank_pop_read.dtypes

Country Name       object
Country Code       object
Indicator Name     object
Indicator Code     object
1960              float64
                   ...   
2019              float64
2020              float64
2021              float64
2022              float64
Unnamed: 67       float64
Length: 68, dtype: object

In [11]:
# Create new dataframe 
World_Bank_pop_dataframe = pd.DataFrame(World_Bank_pop_read)

# Use copy to extract required columns
World_Bank_pop_data = World_Bank_pop_dataframe[["Country Name", "Country Code", "2016"]].copy()

# Rename column headers
World_Bank_pop_data.rename(columns={"Country Code":"Country_Code",
                            "Country Name":"Country"}, inplace=True)


#confirm columns extracted are correct
World_Bank_pop_data.head(2)

Unnamed: 0,Country,Country_Code,2016
0,Aruba,ABW,104874.0
1,Africa Eastern and Southern,AFE,616377605.0


In [12]:
# merge dataframes (left merge on country code)
dae_pop_data_merge = pd.merge(WHO_dae_data, World_Bank_pop_data, how="left", on = ["Country_Code", "Country_Code"])
dae_pop_data = pd.DataFrame(dae_pop_data_merge)

# view new dataframe
dae_pop_data.head(2)

Unnamed: 0,Continent_Code,Continent,Country_Code,Country_x,Indicator_Category,No_Deaths,Country_y,2016
0,EUR,Europe,AND,Andorra,"Infectious, parasitic, neonatal and nutritional",0,Andorra,72540.0
1,EUR,Europe,MCO,Monaco,"Infectious, parasitic, neonatal and nutritional",0,Monaco,37071.0


In [13]:
# Check datatypes of new dataframe
dae_pop_data.dtypes

Continent_Code         object
Continent              object
Country_Code           object
Country_x              object
Indicator_Category     object
No_Deaths               int64
Country_y              object
2016                  float64
dtype: object

In [20]:
# Drop rows with "NaN" in population column and "Total" in Inidicator category
# Ref: eumiro, edited by AMC, StackOverflow, 16 Feb 2020, 
# "How to drop rows of Pandas DataFrame whose value in a certain column is NaN"
# https://stackoverflow.com/questions/13413590/how-to-drop-rows-of-pandas-dataframe-whose-value-in-a-certain-column-is-nan, accessed 9 Sept 2023 
dae_pop_data_dropNaN_Total = dae_pop_data[
    (dae_pop_data.Country_y != "NaN") & 
    (dae_pop_data.Indicator_Category != "Total")&
    (dae_pop_data["Country_y"].notna())]

# Check required rows have been dropped 
dae_pop_data_dropNaN_Total.head(2)


Unnamed: 0,Continent_Code,Continent,Country_Code,Country_x,Indicator_Category,No_Deaths,Country_y,2016
0,EUR,Europe,AND,Andorra,"Infectious, parasitic, neonatal and nutritional",0,Andorra,72540.0
1,EUR,Europe,MCO,Monaco,"Infectious, parasitic, neonatal and nutritional",0,Monaco,37071.0


In [21]:
# Remove column "Country_y" 
dae_pop_data_dropNaN_Total.drop("Country_y", axis=1, inplace = True)

# Rename column header "Country_x" to "Country"
dae_pop_data_dropNaN_Total.rename(columns={"Country_x":"Country"}, inplace=True)

# Check required changes have been made
dae_pop_data_dropNaN_Total.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dae_pop_data_dropNaN_Total.drop("Country_y", axis=1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dae_pop_data_dropNaN_Total.rename(columns={"Country_x":"Country"}, inplace=True)


Unnamed: 0,Continent_Code,Continent,Country_Code,Country,Indicator_Category,No_Deaths,2016
0,EUR,Europe,AND,Andorra,"Infectious, parasitic, neonatal and nutritional",0,72540.0
1,EUR,Europe,MCO,Monaco,"Infectious, parasitic, neonatal and nutritional",0,37071.0


In [22]:
# Export file to csv
dae_pop_data_dropNaN_Total.to_csv("dae_data_clean.csv", index = True)

In [23]:
# GLOBAL DEATHS BY INIDCATOR TYPE

# get list of unique Indicator Categories
Indicator = dae_pop_data_dropNaN_Total["Indicator_Category"].unique()

# Calculate number of total deaths
No_Deaths_Global = dae_pop_data_dropNaN_Total["No_Deaths"].sum()

# Calculate number of deaths Indicator Category
No_Deaths_IC_Global = dae_pop_data_dropNaN_Total.groupby(["Indicator_Category"])["No_Deaths"].sum()

# Calculate number of deaths by Country and Indicator Category
Perc_Deaths_IC_Global = No_Deaths_IC_Global/No_Deaths_Global*100

In [96]:
#create new Global Deaths dataframe to store results to be used for ploting charts
dae_Global_Data = pd.DataFrame({
    "Global Deaths": No_Deaths_IC_Global,
    "% Deaths": Perc_Deaths_IC_Global}, index=Indicator)

# Format columns
dae_Global_Data["Global Deaths"] = dae_Global_Data["Global Deaths"].map("{:,}".format)
dae_Global_Data["% Deaths"] = (dae_Global_Data["% Deaths"]).map("{:,.1f}%".format)

# view dataframe results
dae_Global_Data.head(2)


Unnamed: 0,Global Deaths,% Deaths
"Infectious, parasitic, neonatal and nutritional",2503682,19.8%
Injuries,1950084,15.4%


In [None]:
# Export file to csv
dae_Global_Data.to_csv("dae_Global_Data_clean.csv", index = True)


In [91]:
# Set Continent as the index column
Continent = dae_pop_data_dropNaN_Total.set_index("Continent")

# Calculate number of deaths by Continent and Indicator Category
No_Deaths_IC_Continent = dae_pop_data_dropNaN_Total.groupby(["Continent", "Indicator_Category"])["No_Deaths"].sum()

# Calculate the total number of deaths by Continent
No_Deaths_Continent = dae_pop_data_dropNaN_Total.groupby(["Continent"])["No_Deaths"].sum()

# Calculate the percentage of deaths for each Indicator_Category within each Continent
Perc_Deaths_IC_Continent = No_Deaths_IC_Continent / No_Deaths_Continent * 100


In [101]:
# Create a new DataFrame with the calculated values
dae_Continent_Data = pd.DataFrame({
    "Continent Deaths": No_Deaths_IC_Continent,
    "% Deaths": Perc_Deaths_IC_Continent
})

# Format the "% Deaths" column as percentages with two decimal places
dae_Continent_Data["Continent Deaths"] = dae_Continent_Data["Continent Deaths"].map("{:,}".format)
dae_Continent_Data["% Deaths"] = (dae_Continent_Data["% Deaths"]).map("{:,.1f}%".format)

# View the DataFrame results
dae_Continent_Data.head(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Continent Deaths,% Deaths
Continent,Indicator_Category,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,"Infectious, parasitic, neonatal and nutritional",1264387,58.1%
Africa,Injuries,409520,18.8%
Africa,Noncommunicable diseases,502448,23.1%


In [None]:
# Export file to csv
dae_Continent_Data.to_csv("dae_Continent_Data_clean.csv", index = True)


In [106]:
# DEATHS BY Country & INIDCATOR TYPE

# Set Country as index
Country = dae_pop_data_dropNaN_Total.set_index("Country")

# Calculate number of deaths by Country
No_Deaths_Country = dae_pop_data_dropNaN_Total.groupby(["Country","Continent"])["No_Deaths"].sum()

# Calculate number of deaths by Country and Indicator Category
No_Deaths_IC_Country = dae_pop_data_dropNaN_Total.groupby(["Country","Continent","Indicator_Category"])["No_Deaths"].sum()

# Calculate number of deaths by Country and Indicator Category
Perc_Deaths_IC_Country = No_Deaths_IC_Country/No_Deaths_Country*100


In [108]:
# Create a new DataFrame with the calculated values
dae_Country_Data = pd.DataFrame({
    "Country Deaths": No_Deaths_IC_Country,
    "% Deaths": Perc_Deaths_IC_Country
})

# Format the "% Deaths" column as percentages with two decimal places
dae_Country_Data["Country Deaths"] = dae_Country_Data["Country Deaths"].map("{:,}".format)
dae_Country_Data["% Deaths"] = (dae_Country_Data["% Deaths"]).map("{:,.1f}%".format)

# View the DataFrame results
dae_Country_Data.head(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Country Deaths,% Deaths
Country,Continent,Indicator_Category,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,Eastern Mediterranean,"Infectious, parasitic, neonatal and nutritional",29571,45.4%
Afghanistan,Eastern Mediterranean,Injuries,13086,20.1%
Afghanistan,Eastern Mediterranean,Noncommunicable diseases,22490,34.5%


In [None]:

# Export file to csv
dae_Country_Data.to_csv("dae_Country_Data.csv", index = True)