## Happiness Metrics for Global Life Expectancy Predictive Analysis
Nicole Chang, Sourish Guntipally, Aaron Park, Brandon To

The data used for this project come from two Kaggle datasets: the [World Happiness Report](https://www.kaggle.com/unsdsn/world-happiness?select=2019.csv) and [Human Life Expectancy Around the World](https://www.kaggle.com/deepcontractor/human-life-expectancy-around-the-world).

We analyze the years 2015 through 2019.

In [44]:
import scipy as sp, numpy as np, pandas as pd
from sklearn.cross_decomposition import PLSRegression
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

In [55]:
#World Happiness Report (WHR)
#Human Life Expectancy Around the World (HLE)
#Tip: uncomment the next line to show all elements in a dataframe
#pd.set_option("display.max_rows", None, "display.max_columns", None)

#HLE: a single csv file
dataHLE = pd.read_csv('Human_life_Expectancy.csv')

#WHR: one csv file per year, read in order from 2015 to 2019
(dataWHR2015,
 dataWHR2016,
 dataWHR2017,
 dataWHR2018,
 dataWHR2019) = [pd.read_csv(csvName)
                 for csvName in ('2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv')]

In [56]:
#Configuring HLE dataframe:
#keep only the rows whose 'Level' column equals 'National',
#then build a copy indexed by the 'Country' column
dataHLE = dataHLE[dataHLE['Level'].eq('National')]
dfHLE = dataHLE.set_index('Country')

In [57]:
#Configuring data into dataframes that are easier to use

#Labels are the names of the measurements (countries) for all 5 years,
#in the order 2015, 2016, 2017, 2018, 2019
labels = [yearly['Country'].astype(str)
          for yearly in (dataWHR2015, dataWHR2016, dataWHR2017, dataWHR2018, dataWHR2019)]

#One set of country names per year after 2015.
#Testing membership year by year (instead of counting matches) means a name that is
#repeated within one year can never make up for a year in which it is missing,
#and each lookup is a constant-time set test rather than a scan of the whole column.
laterYears = [set(yearLabels) for yearLabels in labels[1:]]

#dictionary to track countries participation in all 5 years:
#  keys   -> the 2015 countries that also appear in every later year (2015 order kept)
#  values -> the number of years the country appears in, i.e. 5 for every kept country
#Countries missing from any year are simply never added, so nothing is popped afterwards.
recurringCountries = {country: len(labels)
                      for country in labels[0]
                      if all(country in yearSet for yearSet in laterYears)}


In [58]:
#creating lists of the countries to delete using the WHR dictionary:
#for each source, collect (in source order) every country name that is
#not a key of recurringCountries

#WHR, one list per year (labels[0] is 2015 ... labels[4] is 2019)
del_countries2015 = [name for name in labels[0] if name not in recurringCountries]
del_countries2016 = [name for name in labels[1] if name not in recurringCountries]
del_countries2017 = [name for name in labels[2] if name not in recurringCountries]
del_countries2018 = [name for name in labels[3] if name not in recurringCountries]
del_countries2019 = [name for name in labels[4] if name not in recurringCountries]

#HLE, whose country names are the dataframe index
del_countriesHLE = [name for name in dfHLE.index if name not in recurringCountries]

In [59]:
#feel free to rename these dataframes
#new dataframes with dropped countries
#Syria is dropped from every dataframe (original authors' note: Syria does not appear in HLE)
#NOTE(review): dfHLE is also asked to drop 'Syria' below -- confirm against the HLE csv

#Every drop uses errors='ignore', which skips labels that are already absent.
#This makes the cell safe to re-run: dfHLE is reassigned from itself, so without it a
#second run raised KeyError ("... not found in axis") for labels removed by the first run.
dfWHR2015 = dataWHR2015.set_index('Country')
dfWHR2015 = dfWHR2015.drop(del_countries2015, errors='ignore')
dfWHR2015 = dfWHR2015.drop('Syria', errors='ignore')

dfWHR2016 = dataWHR2016.set_index('Country')
dfWHR2016 = dfWHR2016.drop(del_countries2016, errors='ignore')
dfWHR2016 = dfWHR2016.drop('Syria', errors='ignore')

dfWHR2017 = dataWHR2017.set_index('Country')
dfWHR2017 = dfWHR2017.drop(del_countries2017, errors='ignore')
dfWHR2017 = dfWHR2017.drop('Syria', errors='ignore')

dfWHR2018 = dataWHR2018.set_index('Country')
dfWHR2018 = dfWHR2018.drop(del_countries2018, errors='ignore')
dfWHR2018 = dfWHR2018.drop('Syria', errors='ignore')

dfWHR2019 = dataWHR2019.set_index('Country')
dfWHR2019 = dfWHR2019.drop(del_countries2019, errors='ignore')
dfWHR2019 = dfWHR2019.drop('Syria', errors='ignore')

dfHLE = dfHLE.drop('Syria', errors='ignore')
dfHLE = dfHLE.drop(del_countriesHLE, errors='ignore')

#Per-year HLE columns; each one shares dfHLE's country index
dfHLE2015 = dfHLE['2015'] #data frame HLE 2015
dfHLE2016 = dfHLE['2016'] #data frame HLE 2016
dfHLE2017 = dfHLE['2017'] #data frame HLE 2017
dfHLE2018 = dfHLE['2018'] #data frame HLE 2018
dfHLE2019 = dfHLE['2019'] #data frame HLE 2019

In [60]:
#Second round of finding recurring countries: this time checking WHR against HLE

#HLE country names as a set, so each WHR country is checked with one constant-time
#lookup instead of a scan over the whole HLE index
hleCountries = set(dfHLE2015.index)

#dictionary of the countries present in both sources:
#  keys   -> countries in dfWHR2015's index that also appear in the HLE index
#            (dfWHR2015 order kept)
#  values -> 2, i.e. the country was found in WHR and in HLE
#Countries missing from HLE are never added, so there is no pop-by-position pass
#(which raised KeyError if an index label was repeated).
recurringCountries2 = {country: 2
                       for country in dfWHR2015.index
                       if country in hleCountries}

In [61]:
#Second round of finding list of countries to delete:
#every country in dfWHR2015's index (in index order) that is not a key of recurringCountries2
del_countries_WHR = [name for name in dfWHR2015.index if name not in recurringCountries2]

In [62]:
#Second round of deleting countries

#Each WHR dataframe is reassigned from itself, so on a re-run the labels in
#del_countries_WHR may already be gone. errors='ignore' skips absent labels,
#which lets this cell be re-run without the KeyError
#("['Taiwan, Hong Kong, ...'] not found in axis") it used to raise.

dfWHR2015 = dfWHR2015.drop(del_countries_WHR, errors='ignore')

dfWHR2016 = dfWHR2016.drop(del_countries_WHR, errors='ignore')

dfWHR2017 = dfWHR2017.drop(del_countries_WHR, errors='ignore')

dfWHR2018 = dfWHR2018.drop(del_countries_WHR, errors='ignore')

dfWHR2019 = dfWHR2019.drop(del_countries_WHR, errors='ignore')

#I Will delete the comments below later - Sourish
#dfHLE = dfHLE.drop(del_countries_WHR)
#dfHLE2015 = dfHLE2015.drop(del_countries_WHR) #data frame HLE 2015
#dfHLE2016 = dfHLE2016.drop(del_countries_WHR) #data frame HLE 2016
#dfHLE2017 = dfHLE2017.drop(del_countries_WHR) #data frame HLE 2017
#dfHLE2018 = dfHLE2018.drop(del_countries_WHR) #data frame HLE 2018
#dfHLE2019 = dfHLE2019.drop(del_countries_WHR) #data frame HLE 2019
#print(len(dfHLE))
#print(len(dfWHR2015))

133
133


In [None]:
# if you want to drop columns, here's an example below
#data_with_index1 = data_with_index1.drop(['Region', 'Standard Error'], axis=1)

In [13]:
# To alphabetize, follow below example.
# I left it like this so that we can call sort_values when needed. 
# Should just keep the main data frame sorted by overall rank by default
#data_with_index1.sort_values('Country') #sorting A-Z
#data_with_index1.sort_values('Country', ascending=False) #sorting Z-A