## Add Percentage of Forest to Dataset

Add the percentage of forest of each country in each year to the dataset.

In [1]:
# Dependencies.
import pandas as pd
from countryinfo import CountryInfo

In [2]:
# Forestry data.
for_df = pd.read_csv('resources/forest-area-km.csv', index_col=['Entity', 'Year'])
for_df.drop(columns=['Code'], inplace=True)
for_df.rename(columns={'Forest area': 'Forest Area'}, inplace=True)
for_df.rename_axis(index=['Country', 'Year'], inplace=True)
for_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Forest Area
Country,Year,Unnamed: 2_level_1
Afghanistan,1990,1208440.0
Afghanistan,1991,1208440.0
Afghanistan,1992,1208440.0
Afghanistan,1993,1208440.0
Afghanistan,1994,1208440.0


In [3]:
for_df['Forest Area KM'] = for_df['Forest Area'].apply(lambda x: x/100)
for_df.drop(columns='Forest Area', inplace=True)
for_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Forest Area KM
Country,Year,Unnamed: 2_level_1
Afghanistan,1990,12084.4
Afghanistan,1991,12084.4
Afghanistan,1992,12084.4
Afghanistan,1993,12084.4
Afghanistan,1994,12084.4


In [4]:
# Read in main dataset.
df = pd.read_csv('ds15_happ_temp_cw_gg_pop_pm.csv', index_col=['Country', 'Year'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population,PM2.5
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,2010,4.758,14.629,48.28708,44910.0,29185511.0,52.49585
Afghanistan,2011,3.832,16.487,50.82785,58650.0,30117411.0,57.09972
Afghanistan,2012,3.783,14.373,53.40352,66750.0,31161378.0,55.46611
Afghanistan,2013,3.572,16.156,56.01404,74800.0,32269592.0,59.62277
Afghanistan,2014,3.131,15.647,58.65937,84620.0,33370804.0,62.72192


In [5]:
# Compare country lists.
countries = df.index.levels[0]
countries_for = for_df.index.levels[0]

for country in countries:
    if country not in countries_for:
        print(country)

Kosovo
Macedonia
Taiwan


In [6]:
# Find corrected names in dataset, and rename.
for_df.rename(index={
    'Serbia': 'Kosovo',
    'North Macedonia': 'Macedonia',
}, inplace=True)
for_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Forest Area KM
Country,Year,Unnamed: 2_level_1
Afghanistan,1990,12084.4
Afghanistan,1991,12084.4
Afghanistan,1992,12084.4
Afghanistan,1993,12084.4
Afghanistan,1994,12084.4


In [7]:
# Compare country lists.
countries = df.index.levels[0]
countries_for = for_df.index.levels[0]

for country in countries:
    if country not in countries_for:
        print(country)

Taiwan


In [8]:
# Merge the datasets.
merged_df = pd.merge(df, for_df,  how='left', left_on=['Country', 'Year'], right_on = ['Country', 'Year'])
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population,PM2.5,Forest Area KM
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,2010,4.758,14.629,48.28708,44910.0,29185511.0,52.49585,12084.4
Afghanistan,2011,3.832,16.487,50.82785,58650.0,30117411.0,57.09972,12084.4
Afghanistan,2012,3.783,14.373,53.40352,66750.0,31161378.0,55.46611,12084.4
Afghanistan,2013,3.572,16.156,56.01404,74800.0,32269592.0,59.62277,12084.4
Afghanistan,2014,3.131,15.647,58.65937,84620.0,33370804.0,62.72192,12084.4


In [9]:
# Get landmasses of every country in list.
land_mass = []

for country in countries:
    try:
        land_mass.append([country, CountryInfo(country).area()])
    except:
        print('Country Error: ', country)

Country Error:  Kosovo
Country Error:  Macedonia
Country Error:  Montenegro


In [10]:
# Land mass of missing countries.
for country in land_mass:
    if country[0] == 'Kosovo':
        country[1] = CountryInfo('Serbia').area()
    if country[0] == 'Macedonia':
        country[1] = CountryInfo('North Macedonia').area()
    if country[0] == 'Montenegro':
        country[1] = 13812

In [11]:
# Create DF from land mass values.
lm_df = pd.DataFrame(land_mass, columns=['Country', 'Land Mass'])
lm_df.set_index('Country', inplace=True)
lm_df.head()

Unnamed: 0_level_0,Land Mass
Country,Unnamed: 1_level_1
Afghanistan,652230
Albania,28748
Argentina,2780400
Armenia,29743
Australia,7692024


In [12]:
# Merge the datasets.
merged_df = pd.merge(merged_df, lm_df,  how='left', left_on=['Country'], right_on = ['Country'])
merged_df.head()

Unnamed: 0_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population,PM2.5,Forest Area KM,Land Mass
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,4.758,14.629,48.28708,44910.0,29185511.0,52.49585,12084.4,652230.0
Afghanistan,3.832,16.487,50.82785,58650.0,30117411.0,57.09972,12084.4,652230.0
Afghanistan,3.783,14.373,53.40352,66750.0,31161378.0,55.46611,12084.4,652230.0
Afghanistan,3.572,16.156,56.01404,74800.0,32269592.0,59.62277,12084.4,652230.0
Afghanistan,3.131,15.647,58.65937,84620.0,33370804.0,62.72192,12084.4,652230.0


In [13]:
# Get percentage of land mass column.
merged_df['Forest Percentage'] = merged_df['Forest Area KM'] / merged_df['Land Mass']
merged_df.head()

Unnamed: 0_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population,PM2.5,Forest Area KM,Land Mass,Forest Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,4.758,14.629,48.28708,44910.0,29185511.0,52.49585,12084.4,652230.0,0.018528
Afghanistan,3.832,16.487,50.82785,58650.0,30117411.0,57.09972,12084.4,652230.0,0.018528
Afghanistan,3.783,14.373,53.40352,66750.0,31161378.0,55.46611,12084.4,652230.0,0.018528
Afghanistan,3.572,16.156,56.01404,74800.0,32269592.0,59.62277,12084.4,652230.0,0.018528
Afghanistan,3.131,15.647,58.65937,84620.0,33370804.0,62.72192,12084.4,652230.0,0.018528


In [14]:
# Drop columns.
merged_df.drop(columns=['Forest Area KM', 'Land Mass'], inplace=True)
merged_df.head()

Unnamed: 0_level_0,Life Ladder,Temperature,Clean Water,Greenhouse Gas Emissions,Population,PM2.5,Forest Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,4.758,14.629,48.28708,44910.0,29185511.0,52.49585,0.018528
Afghanistan,3.832,16.487,50.82785,58650.0,30117411.0,57.09972,0.018528
Afghanistan,3.783,14.373,53.40352,66750.0,31161378.0,55.46611,0.018528
Afghanistan,3.572,16.156,56.01404,74800.0,32269592.0,59.62277,0.018528
Afghanistan,3.131,15.647,58.65937,84620.0,33370804.0,62.72192,0.018528


In [15]:
# Create CSV.
merged_df.to_csv('ds16_happ_temp_cw_gg_pop_pm_lm.csv')