## Happiness Metrics for Global Life Expectancy Predictive Analysis
Nicole Chang, Sourish Guntipally, Aaron Park, Brandon To

The data for this project come from two Kaggle datasets: the [World Happiness Report](https://www.kaggle.com/unsdsn/world-happiness?select=2019.csv) and [Human Life Expectancy Around the World](https://www.kaggle.com/deepcontractor/human-life-expectancy-around-the-world).

The analysis covers the years 2015 through 2019.

In [503]:
import scipy as sp, numpy as np, pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneGroupOut, LeaveOneOut
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

In [504]:
# Abbreviations used throughout the notebook:
#   WHR = World Happiness Report
#   HLE = Human Life Expectancy Around the World
# To show every element of a dataframe, uncomment the line below:
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# HLE: a single file covering all years
dataHLE = pd.read_csv('Human_life_Expectancy.csv')

# WHR: one file per year, read in chronological order (2015 -> 2019)
dataWHR2015, dataWHR2016, dataWHR2017, dataWHR2018, dataWHR2019 = map(
    pd.read_csv,
    ('2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv'),
)

In [505]:
# HLE without its 'subnational' rows: index the table by the 'Level' column,
# then drop every row carrying the 'subnational' label.
dataHLE_NSN = dataHLE.set_index('Level').drop('subnational')

In [506]:
#Configuring data into dataframes that are easier to use

#Labels are the country names of every dataset: WHR 2015-2019 at positions 0-4, HLE at position 5
labels = [dataWHR2015['Country'].astype(str), dataWHR2016['Country'].astype(str), dataWHR2017['Country'].astype(str),dataWHR2018['Country'].astype(str), dataWHR2019['Country'].astype(str), dataHLE_NSN['Country'].astype(str)]

#One set of names per dataset, so each presence check is a hash lookup instead of a full scan.
#Iterating the Series values (rather than indexing with integers) also avoids relying on
#positional fallback for the HLE labels, whose index is 'Level' and not 0..n-1.
countrySets = [set(names) for names in labels]

#Dictionary mapping every 2015 country to the number of datasets (out of 6) that contain it.
#Presence is counted once per dataset, so duplicated rows in one dataset cannot make up
#for a country that is missing from another.
recurringCountries = {}
for country in labels[0]:
    recurringCountries[country] = sum(country in names for names in countrySets)

#Keep only the countries that appear in every dataset (all five WHR years and HLE)
recurringCountries = {country: count for country, count in recurringCountries.items() if count == len(labels)}


In [507]:
#creating lists of the countries to delete using the recurringCountries dictionary

def _countries_to_delete(names):
    """Return, in their original order, the entries of `names` that are not keys of recurringCountries.

    `names` is iterated by value, so the result does not depend on how the
    Series is indexed (the HLE labels are indexed by 'Level', not by 0..n-1,
    and integer lookups on such a Series rely on a deprecated positional fallback).
    """
    return [name for name in names if name not in recurringCountries]

#labels[0]..labels[4] are the WHR years 2015-2019, labels[5] is HLE
del_countries2015 = _countries_to_delete(labels[0])
del_countries2016 = _countries_to_delete(labels[1])
del_countries2017 = _countries_to_delete(labels[2])
del_countries2018 = _countries_to_delete(labels[3])
del_countries2019 = _countries_to_delete(labels[4])
del_countriesHLE = _countries_to_delete(labels[5])

In [508]:
#feel free to rename these dataframes
#new dataframes with dropped countries
#Syria is removed from every dataframe (original note: Syria does not appear in HLE)

def _drop_countries(frame, countries):
    """Return a copy of `frame` without the rows of `countries` and without Syria.

    The frame is indexed by 'Country' for the drops and then restored to a
    default integer index with 'Country' back as a regular column.

    Syria is dropped with errors='ignore': this cell used to fail intermittently
    with "['Syria'] not found in axis" whenever Syria had already been removed
    (or was never present), and required re-running earlier cells. Ignoring the
    missing label makes the cell safe to run in any state.
    """
    return (
        frame.set_index('Country')
        .drop(countries)
        .drop('Syria', errors='ignore')
        .reset_index()
    )

dfWHR2015 = _drop_countries(dataWHR2015, del_countries2015)
dfWHR2016 = _drop_countries(dataWHR2016, del_countries2016)
dfWHR2017 = _drop_countries(dataWHR2017, del_countries2017)
dfWHR2018 = _drop_countries(dataWHR2018, del_countries2018)
dfWHR2019 = _drop_countries(dataWHR2019, del_countries2019)

dfHLE = _drop_countries(dataHLE_NSN, del_countriesHLE)

#Per-year HLE columns (each one is a single column of dfHLE, not a full dataframe)
dfHLE2015 = dfHLE['2015'] #HLE values for 2015
dfHLE2016 = dfHLE['2016'] #HLE values for 2016
dfHLE2017 = dfHLE['2017'] #HLE values for 2017
dfHLE2018 = dfHLE['2018'] #HLE values for 2018
dfHLE2019 = dfHLE['2019'] #HLE values for 2019


In [509]:
# Display the filtered 2016 WHR frame as this cell's output.
# NOTE(review): the saved output below has a 'Standard Error' column and a first row
# (Switzerland, 7.587) equal to the dfWHR2015 values printed under the PLSR cell, so it
# looks stale -- re-run this cell to confirm it really shows 2016 data.
dfWHR2016

Unnamed: 0,Country,Region,Overall Rank,Happiness Score,Standard Error,Economy,Social support,Healthy life expectancy,Freedom to make life choices,Perceptions of corruption,Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.43630,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.03880,1.45900,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176
...,...,...,...,...,...,...,...,...,...,...,...,...
128,Afghanistan,Southern Asia,153,3.575,0.03084,0.31982,0.30285,0.30335,0.23414,0.09719,0.36510,1.95210
129,Rwanda,Sub-Saharan Africa,154,3.465,0.03464,0.22208,0.77370,0.42864,0.59201,0.55191,0.22628,0.67042
130,Benin,Sub-Saharan Africa,155,3.340,0.03656,0.28665,0.35386,0.31910,0.48450,0.08010,0.18260,1.63328
131,Burundi,Sub-Saharan Africa,157,2.905,0.08658,0.01530,0.41587,0.22396,0.11850,0.10062,0.19727,1.83302


In [510]:
#Second round of finding recurring countries: this time checking WHR against HLE

# recurringCountries2 = {}

# for i in range(len(dfWHR2015.index)):
#     recurringCountries2[dfWHR2015.index[i]] = 1
                                                                
# for countryindex in range(len(dfWHR2015.index)):  
#     for matchindex in range(len(dfHLE.index)):             #Iterates through country labels for 2015                                       
#         if dfWHR2015.index[countryindex] == dfHLE2015.index[matchindex]:
#             recurringCountries2[dfWHR2015.index[countryindex]] += 1

# #delete all the countries that don't appear in every year out of the dictionary
# for i in range(len(recurringCountries2)):
#     if recurringCountries2[dfWHR2015.index[i]] < 2:
#         recurringCountries2.pop(dfWHR2015.index[i])

In [511]:
# #Second round of finding list of countries to delete
# del_countries_WHR = []

# for i in range(len(dfWHR2015.index)):
#     if dfWHR2015.index[i] not in recurringCountries2:
#         del_countries_WHR.append(dfWHR2015.index[i])
        
# del_countries_WHR

In [512]:
#Second round of deleting countries

#NOTE: This cell sometimes results in an error saying ['Taiwan, Hong Kong, ...'] are not in axis-- 
#run the previous 2 cells again

# dfWHR2015 = dfWHR2015.drop(del_countries_WHR)

# dfWHR2016 = dfWHR2016.drop(del_countries_WHR)

# dfWHR2017 = dfWHR2017.drop(del_countries_WHR)

# dfWHR2018 = dfWHR2018.drop(del_countries_WHR)

# dfWHR2019 = dfWHR2019.drop(del_countries_WHR)

#I Will delete the comments below later - Sourish
#dfHLE = dfHLE.drop(del_countries_WHR)
#dfHLE2015 = dfHLE2015.drop(del_countries_WHR) #data frame HLE 2015
#dfHLE2016 = dfHLE2016.drop(del_countries_WHR) #data frame HLE 2016
#dfHLE2017 = dfHLE2017.drop(del_countries_WHR) #data frame HLE 2017
#dfHLE2018 = dfHLE2018.drop(del_countries_WHR) #data frame HLE 2018
#dfHLE2019 = dfHLE2019.drop(del_countries_WHR) #data frame HLE 2019
#print(len(dfHLE))
#print(len(dfWHR2015))

In [513]:
# if you want to drop columns, here's an example below
#data_with_index1 = data_with_index1.drop(['Region', 'Standard Error'], axis=1)

In [514]:
# To alphabetize, follow below example.
# I left it like this so that we can call sort_values when needed. 
# Should just keep the main data frame sorted by overall rank by default
#data_with_index1.sort_values('Country') #sorting A-Z
#data_with_index1.sort_values('Country', ascending=False) #sorting Z-A

In [540]:
#To exclude certain columns
#dfWHR2015.loc[:, ~dfWHR2015.columns.isin(['Country', 'Region', 'Overall Rank'])]

#To exclude one column
#dfWHR2015.loc[:, dfWHR2015.columns != 'Country']

In [539]:
#Removing columns that are not shared by all years

#Each frame is reassigned from itself, so without errors='ignore' a second run of this
#cell raises KeyError because the columns are already gone. Ignoring missing labels
#makes the cell idempotent.
#NOTE(review): errors='ignore' also hides misspelled column names -- check the
#resulting columns if a year's CSV header changes.
dfWHR2015 = dfWHR2015.drop(columns=['Region', 'Standard Error'], errors='ignore')
dfWHR2016 = dfWHR2016.drop(columns=['Region', 'Lower Confidence Interval', 'Upper Confidence Interval'], errors='ignore')
dfWHR2017 = dfWHR2017.drop(columns=['Whisker.high', 'Whisker.low'], errors='ignore')

In [561]:
# Display the filtered 2018 WHR frame as this cell's output.
# NOTE(review): in the saved output the rank column is spelled 'Overall rank'
# (lowercase r), unlike the 'Overall Rank' spelling used elsewhere -- confirm the header.
dfWHR2018

Unnamed: 0,Country,Overall rank,Happiness Score,Economy,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,Finland,1,7.632,1.305,1.592,0.874,0.681,0.202,0.393
1,Norway,2,7.594,1.456,1.582,0.861,0.686,0.286,0.340
2,Denmark,3,7.555,1.351,1.590,0.868,0.683,0.284,0.408
3,Iceland,4,7.495,1.343,1.644,0.914,0.677,0.353,0.138
4,Switzerland,5,7.487,1.420,1.549,0.927,0.660,0.256,0.357
...,...,...,...,...,...,...,...,...,...
128,Liberia,149,3.495,0.076,0.858,0.267,0.419,0.206,0.030
129,Rwanda,151,3.408,0.332,0.896,0.400,0.636,0.200,0.444
130,Yemen,152,3.355,0.442,1.073,0.343,0.244,0.083,0.064
131,Tanzania,153,3.303,0.455,0.991,0.381,0.481,0.270,0.097


In [569]:
#PLSR
#Every year is filtered to the same set of countries, so the 2015 frame is used for the country count
n_countries = len(dfWHR2015)

#Columns that identify a row rather than measure something. Compared case-insensitively
#because the header spelling differs between years ('Overall Rank' vs 'Overall rank').
NON_FEATURE_COLUMNS = ['country', 'overall rank']

def _feature_columns(frame):
    """Return `frame` without its identifier columns (country name and overall rank)."""
    is_identifier = frame.columns.str.lower().isin(NON_FEATURE_COLUMNS)
    return frame.loc[:, ~is_identifier]

#One feature table per WHR year, in chronological order (2015 -> 2019)
Y = [_feature_columns(frame) for frame in (dfWHR2015, dfWHR2016, dfWHR2017, dfWHR2018, dfWHR2019)]
X = dfWHR2015['Country']

r2y = np.zeros(n_countries)   #R^2 of Y for each number of components (filled by the loop below)
pcNum = 0                     #first number of components reaching 90% explained variance
variance90 = 0                #flag: set once the 90% threshold has been reached

#NOTE(review): the PLSR below is disabled. As written, X is the country-name column
#(strings) and Y is a list of five frames, so PLSRegression.fit(X, Y) would presumably
#fail -- X and Y need to be numeric 2-D tables before this is re-enabled.
# for i in range(1,n_countries):
#     plsrModel = PLSRegression(n_components=i)
#     plsrModel.fit(X, Y)
#     r2y[i] = plsrModel.score(X,Y)
#     if (variance90 == False and r2y[i] >= 0.90):
#         pcNum = i
#         variance90 = True
        
# print("You need " + str(pcNum) + " principal components for each to explain the 90% of the Y variance.")
# plt.figure(figsize=(15,5))
# plt.plot(range(0,n_countries), r2y)
# #plt.xticks(np.arange(0,70,5))
# #plt.yticks(np.arange(0,1.1,0.1))
# plt.xlabel('Number of Components')
# plt.ylabel('Percent Variance')
# plt.title('PLSR Dataset Percent Variance')
# plt.show()

[     Happiness Score  Economy  Social support  Healthy life expectancy  \
 0              7.587  1.39651         1.34951                  0.94143   
 1              7.561  1.30232         1.40223                  0.94784   
 2              7.527  1.32548         1.36058                  0.87464   
 3              7.522  1.45900         1.33095                  0.88521   
 4              7.427  1.32629         1.32261                  0.90563   
 ..               ...      ...             ...                      ...   
 128            3.575  0.31982         0.30285                  0.30335   
 129            3.465  0.22208         0.77370                  0.42864   
 130            3.340  0.28665         0.35386                  0.31910   
 131            2.905  0.01530         0.41587                  0.22396   
 132            2.839  0.20868         0.13995                  0.28443   
 
      Freedom to make life choices  Perceptions of corruption  Generosity  \
 0                   

In [None]:
#Status: data preparation is finished and the notebook runs now.