## Add Country Populations to Dataset

Add the populations of each country in each year to the dataset.

In [1]:
# Dependencies.
import pandas as pd

In [2]:
# Country population data (one row per country/region, one column per year).
pop_df = pd.read_csv('resources/populations.csv')
pop_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,"Population, total",SP.POP.TOTL,54208.0,55434.0,56234.0,56699.0,57029.0,57357.0,...,102050.0,102565.0,103165.0,103776.0,104339.0,104865.0,105361.0,105846.0,106310.0,106766.0
1,Africa Eastern and Southern,AFE,"Population, total",SP.POP.TOTL,130836765.0,134159786.0,137614644.0,141202036.0,144920186.0,148769974.0,...,532760424.0,547482863.0,562601578.0,578075373.0,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0,677243299.0
2,Afghanistan,AFG,"Population, total",SP.POP.TOTL,8996967.0,9169406.0,9351442.0,9543200.0,9744772.0,9956318.0,...,30117411.0,31161378.0,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0,38928341.0
3,Africa Western and Central,AFW,"Population, total",SP.POP.TOTL,96396419.0,98407221.0,100506960.0,102691339.0,104953470.0,107289875.0,...,360285439.0,370243017.0,380437896.0,390882979.0,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0,458803476.0
4,Angola,AGO,"Population, total",SP.POP.TOTL,5454938.0,5531451.0,5608499.0,5679409.0,5734995.0,5770573.0,...,24220660.0,25107925.0,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0,32866268.0


In [3]:
# Gather only needed columns.
# Keep the country name plus the yearly columns for 2010-2019, with the year
# labels converted from strings to integers (presumably so they line up with
# the integer Year values of the main dataset - verify against that file).
# The relabelling is done first and without `inplace=True`, so the cell does
# not mutate a derived selection in place and does not fail with a KeyError
# when it is executed again on its own output.
years = list(range(2010, 2020))
pop_df = pop_df.rename(
    columns=lambda label: int(label) if str(label).isdigit() else label
)[['Country Name', *years]]
pop_df.head()

Unnamed: 0,Country Name,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Aruba,101665.0,102050.0,102565.0,103165.0,103776.0,104339.0,104865.0,105361.0,105846.0,106310.0
1,Africa Eastern and Southern,518468229.0,532760424.0,547482863.0,562601578.0,578075373.0,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0
2,Afghanistan,29185511.0,30117411.0,31161378.0,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0
3,Africa Western and Central,350556886.0,360285439.0,370243017.0,380437896.0,390882979.0,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0
4,Angola,23356247.0,24220660.0,25107925.0,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0


In [4]:
# Set the Countries as the DataFrame index.
# Rebind the result instead of mutating the frame in place.
pop_df = pop_df.set_index('Country Name')
pop_df.head()

Unnamed: 0_level_0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Aruba,101665.0,102050.0,102565.0,103165.0,103776.0,104339.0,104865.0,105361.0,105846.0,106310.0
Africa Eastern and Southern,518468229.0,532760424.0,547482863.0,562601578.0,578075373.0,593871847.0,609978946.0,626392880.0,643090131.0,660046272.0
Afghanistan,29185511.0,30117411.0,31161378.0,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0
Africa Western and Central,350556886.0,360285439.0,370243017.0,380437896.0,390882979.0,401586651.0,412551299.0,423769930.0,435229381.0,446911598.0
Angola,23356247.0,24220660.0,25107925.0,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0


In [5]:
# Stack column to index.
# Pivot the year columns into rows, giving a single 'Population' column
# indexed by (Country, Year).
stack_df = (
    pop_df
    .stack()
    .to_frame('Population')
    .rename_axis(index=['Country', 'Year'])
)
stack_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
Country,Year,Unnamed: 2_level_1
Aruba,2010,101665.0
Aruba,2011,102050.0
Aruba,2012,102565.0
Aruba,2013,103165.0
Aruba,2014,103776.0


In [6]:
# Read in full dataset.
# The file is indexed by (Country, Year), matching the stacked population frame.
df = pd.read_csv(
    'ds13_happ_temp_water_gg.csv',
    index_col=['Country', 'Year'],
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,2010,4.758,14.629,48.28708,44910.0
Afghanistan,2011,3.832,16.487,50.82785,58650.0
Afghanistan,2012,3.783,14.373,53.40352,66750.0
Afghanistan,2013,3.572,16.156,56.01404,74800.0
Afghanistan,2014,3.131,15.647,58.65937,84620.0


In [7]:
# Compare country lists.
# Use the country labels actually present in each index rather than
# `.levels[0]`: a MultiIndex can keep level entries whose rows have all been
# removed (e.g. when `stack()` discards missing populations), which would make
# a country with no population rows look like a match.
countries = df.index.get_level_values(0).unique()
countries_pop = stack_df.index.get_level_values(0).unique()

# Print every country of the main dataset that has no population rows.
for country in countries.difference(countries_pop):
    print(country)

Egypt
Kyrgyzstan
Macedonia
Russia
Slovakia
South Korea
Taiwan
Venezuela
Yemen


In [8]:
# Find corrected names in dataset, and rename.
# Keys are the country labels used by the population data; values are the
# spellings used by the main dataset. Taiwan has no entry in this mapping.
name_fixes = {
    'Egypt, Arab Rep.': 'Egypt',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'North Macedonia': 'Macedonia',
    'Russian Federation': 'Russia',
    'Slovak Republic': 'Slovakia',
    'Korea, Rep.': 'South Korea',
    'Venezuela, RB': 'Venezuela',
    'Yemen, Rep.': 'Yemen',
}
stack_df = stack_df.rename(index=name_fixes)

In [9]:
# Compare country lists again.
# Use the country labels actually present in each index rather than
# `.levels[0]`: a MultiIndex can keep level entries whose rows have all been
# removed (e.g. when `stack()` discards missing populations), which would make
# a country with no population rows look like a match.
countries = df.index.get_level_values(0).unique()
countries_pop = stack_df.index.get_level_values(0).unique()

# Print every country of the main dataset that still has no population rows.
for country in countries.difference(countries_pop):
    print(country)

Taiwan


In [10]:
# Merge the datasets.
# Left join on the shared (Country, Year) keys so every row of the main
# dataset is kept; rows without a matching population get NaN.
merge_keys = ['Country', 'Year']
merged_df = df.merge(stack_df, how='left', on=merge_keys)
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,2010,4.758,14.629,48.28708,44910.0,29185511.0
Afghanistan,2011,3.832,16.487,50.82785,58650.0,30117411.0
Afghanistan,2012,3.783,14.373,53.40352,66750.0,31161378.0
Afghanistan,2013,3.572,16.156,56.01404,74800.0,32269592.0
Afghanistan,2014,3.131,15.647,58.65937,84620.0,33370804.0


In [11]:
# Save DataFrame.
# Writes the merged dataset (original columns plus Population) to CSV; with the
# default to_csv settings the index is written out as well.
merged_df.to_csv('ds14_country_happ_temp_cw_gg_pop.csv')