## Add PM2.5 to Dataset

Adds the cleaned PM2.5 dataset (from Jeffrey's branch) to the main dataset.

In [1]:
# Dependencies.
import pandas as pd

In [2]:
# PM2.5 cleaned data, indexed by (Country, Year).
# Built as a single chain (no inplace mutation) so re-running the cell always
# starts from the CSV and yields the same frame.
pm_df = (
    pd.read_csv('resources/pm2.5_per_cubic_metre.csv', index_col=['Country', 'Year'])
    # Drop the columns that are not carried into the merged dataset.
    .drop(columns=['Unnamed: 0', 'Code'])
    # Short column name used in the merged dataset.
    .rename(columns={'pm_2.5_per_cubic_metre': 'PM2.5'})
)
pm_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PM2.5
Country,Year,Unnamed: 2_level_1
Afghanistan,2010,52.49585
Afghanistan,2011,57.09972
Afghanistan,2012,55.46611
Afghanistan,2013,59.62277
Afghanistan,2014,62.72192


In [3]:
# Load the main dataset, keyed by (Country, Year) like the PM2.5 frame.
main_csv = 'ds14_country_happ_temp_cw_gg_pop.csv'
df = pd.read_csv(main_csv, index_col=['Country', 'Year'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Afghanistan,2010,4.758,14.629,48.28708,44910.0,29185511.0
Afghanistan,2011,3.832,16.487,50.82785,58650.0,30117411.0
Afghanistan,2012,3.783,14.373,53.40352,66750.0,31161378.0
Afghanistan,2013,3.572,16.156,56.01404,74800.0,32269592.0
Afghanistan,2014,3.131,15.647,58.65937,84620.0,33370804.0


In [4]:
# Print each main-dataset country that has no entry in the PM2.5 data.
countries = df.index.levels[0]
countries_pm = pm_df.index.levels[0]

# Boolean mask keeps the original iteration order of `countries`.
unmatched = countries[~countries.isin(countries_pm)]
for name in unmatched:
    print(name)

Kosovo
Macedonia
Slovakia
South Korea
Taiwan
Vietnam


In [5]:
# Align PM2.5 country names with the spellings used in the main dataset.
pm_df = pm_df.rename(index={
    'Chinese Taipei': 'Taiwan',
    'North Macedonia': 'Macedonia',
    'Slovak Republic': 'Slovakia',
    'Korea': 'South Korea',
    'Viet Nam': 'Vietnam'
})

# Kosovo is filled from Serbia's series. Copy those rows under the Kosovo label
# instead of renaming them, so Serbia keeps its own PM2.5 values.
pm_countries = pm_df.index.get_level_values('Country')
# Guard: skip when Kosovo is already present (e.g. the cell was re-run) or
# when there is no Serbia series to copy from.
if 'Kosovo' not in pm_countries and 'Serbia' in pm_countries:
    kosovo_pm = pm_df[pm_countries == 'Serbia'].rename(index={'Serbia': 'Kosovo'})
    pm_df = pd.concat([pm_df, kosovo_pm])
pm_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PM2.5
Country,Year,Unnamed: 2_level_1
Afghanistan,2010,52.49585
Afghanistan,2011,57.09972
Afghanistan,2012,55.46611
Afghanistan,2013,59.62277
Afghanistan,2014,62.72192


In [6]:
# Compare country lists again: after the renames, every country in the main
# dataset should be present in the PM2.5 data.
# get_level_values reads the labels actually used by rows; MultiIndex.levels
# can still hold entries that no row refers to.
countries = df.index.get_level_values('Country').unique()
countries_pm = pm_df.index.get_level_values('Country').unique()

still_missing = [country for country in countries if country not in countries_pm]
for country in still_missing:
    print(country)

# Fail loudly instead of relying on the reader noticing empty output; a missed
# country would otherwise just become NaN PM2.5 values after the left merge.
assert not still_missing, (
    'Countries without PM2.5 data: ' + ', '.join(map(str, still_missing))
)

In [7]:
# Merge the datasets: left-join PM2.5 onto the main dataset by (Country, Year)
# so every main-dataset row is kept.
# validate='many_to_one' makes pandas raise MergeError if the PM2.5 side has a
# repeated (Country, Year) key, which would otherwise silently duplicate rows.
merged_df = df.merge(pm_df, how='left', on=['Country', 'Year'], validate='many_to_one')
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population,PM2.5
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,2010,4.758,14.629,48.28708,44910.0,29185511.0,52.49585
Afghanistan,2011,3.832,16.487,50.82785,58650.0,30117411.0,57.09972
Afghanistan,2012,3.783,14.373,53.40352,66750.0,31161378.0,55.46611
Afghanistan,2013,3.572,16.156,56.01404,74800.0,32269592.0,59.62277
Afghanistan,2014,3.131,15.647,58.65937,84620.0,33370804.0,62.72192


In [8]:
# Write the merged dataset out as CSV.
output_csv = 'ds15_happ_temp_cw_gg_pop_pm.csv'
merged_df.to_csv(output_csv)