In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

# 한글깨짐 현상 방지
plt.rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

In [2]:
path = './data/archive'

In [3]:
file_list = os.listdir(path)

In [4]:
file_list.sort()

In [5]:
file_list

['2015.csv',
 '2016.csv',
 '2017.csv',
 '2018.csv',
 '2019.csv',
 '2020.csv',
 '2021.csv',
 '2022.csv']

In [6]:
data_2015 = pd.read_csv(os.path.join(path, file_list[0]))
data_2015.head()

Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Standard Error,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [7]:
data_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Region                         158 non-null    object 
 2   Happiness Rank                 158 non-null    int64  
 3   Happiness Score                158 non-null    float64
 4   Standard Error                 158 non-null    float64
 5   Economy (GDP per Capita)       158 non-null    float64
 6   Family                         158 non-null    float64
 7   Health (Life Expectancy)       158 non-null    float64
 8   Freedom                        158 non-null    float64
 9   Trust (Government Corruption)  158 non-null    float64
 10  Generosity                     158 non-null    float64
 11  Dystopia Residual              158 non-null    float64
dtypes: float64(9), int64(1), object(2)
memory usage: 1

In [8]:
column_mapping_2015 = {
    "Happiness Rank": "Rank",
    "Happiness Score": "Score",
    "Economy (GDP per Capita)": "GDP",
    "Health (Life Expectancy)": "Health",
    "Family": "Social Support",
}

data_2015.rename(columns=column_mapping_2015, inplace=True)

In [9]:
data_2015['Year'] = 2015

In [10]:
data_2015 = data_2015[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [11]:
data_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         158 non-null    object 
 1   Region          158 non-null    object 
 2   Rank            158 non-null    int64  
 3   Score           158 non-null    float64
 4   GDP             158 non-null    float64
 5   Social Support  158 non-null    float64
 6   Health          158 non-null    float64
 7   Year            158 non-null    int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 10.0+ KB


In [12]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2015[cols] = scaler.fit_transform(data_2015[cols])

In [13]:
data_2016 = pd.read_csv(os.path.join(path, file_list[1]))
data_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        157 non-null    object 
 1   Region                         157 non-null    object 
 2   Happiness Rank                 157 non-null    int64  
 3   Happiness Score                157 non-null    float64
 4   Lower Confidence Interval      157 non-null    float64
 5   Upper Confidence Interval      157 non-null    float64
 6   Economy (GDP per Capita)       157 non-null    float64
 7   Family                         157 non-null    float64
 8   Health (Life Expectancy)       157 non-null    float64
 9   Freedom                        157 non-null    float64
 10  Trust (Government Corruption)  157 non-null    float64
 11  Generosity                     157 non-null    float64
 12  Dystopia Residual              157 non-null    flo

In [14]:
column_mapping_2016 = {
    "Happiness Rank": "Rank",
    "Happiness Score": "Score",
    "Economy (GDP per Capita)": "GDP",
    "Health (Life Expectancy)": "Health",
    "Family": "Social Support",
}

data_2016.rename(columns=column_mapping_2016, inplace=True)

In [15]:
data_2016['Year'] = 2016

In [16]:
data_2016 = data_2016[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [17]:
data_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         157 non-null    object 
 1   Region          157 non-null    object 
 2   Rank            157 non-null    int64  
 3   Score           157 non-null    float64
 4   GDP             157 non-null    float64
 5   Social Support  157 non-null    float64
 6   Health          157 non-null    float64
 7   Year            157 non-null    int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 9.9+ KB


In [18]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2016[cols] = scaler.fit_transform(data_2016[cols])

In [19]:
data_2017 = pd.read_csv(os.path.join(path, file_list[2]))
data_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        155 non-null    object 
 1   Happiness.Rank                 155 non-null    int64  
 2   Happiness.Score                155 non-null    float64
 3   Whisker.high                   155 non-null    float64
 4   Whisker.low                    155 non-null    float64
 5   Economy..GDP.per.Capita.       155 non-null    float64
 6   Family                         155 non-null    float64
 7   Health..Life.Expectancy.       155 non-null    float64
 8   Freedom                        155 non-null    float64
 9   Generosity                     155 non-null    float64
 10  Trust..Government.Corruption.  155 non-null    float64
 11  Dystopia.Residual              155 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 

In [20]:
column_mapping_2017 = {
    "Happiness.Rank": "Rank",
    "Happiness.Score": "Score",
    "Economy..GDP.per.Capita.": "GDP",
    "Health..Life.Expectancy.": "Health",
    "Family": "Social Support",
}

data_2017.rename(columns=column_mapping_2017, inplace=True)

In [21]:
data_2017['Year'] = 2017

In [22]:
data_2017 = data_2017[['Country','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [23]:
data_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         155 non-null    object 
 1   Rank            155 non-null    int64  
 2   Score           155 non-null    float64
 3   GDP             155 non-null    float64
 4   Social Support  155 non-null    float64
 5   Health          155 non-null    float64
 6   Year            155 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 8.6+ KB


In [24]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2017[cols] = scaler.fit_transform(data_2017[cols])

In [25]:
data_2018 = pd.read_csv(os.path.join(path, file_list[3]))
data_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     155 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [26]:
column_mapping_2018 = {
    "Overall rank": "Rank",
    "Country or region": 'Country',
    "GDP per capita": "GDP",
    "Healthy life expectancy": "Health",
    "Social support": "Social Support"
}

data_2018.rename(columns=column_mapping_2018, inplace=True)

In [27]:
data_2018['Year'] = 2018

In [28]:
data_2018 = data_2018[['Country','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [29]:
data_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         156 non-null    object 
 1   Rank            156 non-null    int64  
 2   Score           156 non-null    float64
 3   GDP             156 non-null    float64
 4   Social Support  156 non-null    float64
 5   Health          156 non-null    float64
 6   Year            156 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 8.7+ KB


In [30]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2018[cols] = scaler.fit_transform(data_2018[cols])

In [31]:
data_2018.describe()

Unnamed: 0,Rank,Score,GDP,Social Support,Health,Year
count,156.0,156.0,156.0,156.0,156.0,156.0
mean,78.5,0.522724,0.42531,0.737979,0.579948,2018.0
std,45.177428,0.236832,0.186985,0.183924,0.240368,0.0
min,1.0,0.0,0.0,0.0,0.0,2018.0
25%,39.75,0.327639,0.294012,0.648875,0.409951,2018.0
50%,78.5,0.523165,0.453006,0.763382,0.625243,2018.0
75%,117.25,0.690396,0.571446,0.889903,0.754612,2018.0
max,156.0,1.0,1.0,1.0,1.0,2018.0


In [32]:
data_2019 = pd.read_csv(os.path.join(path, file_list[4]))

In [33]:
data_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [34]:
column_mapping_2019 = {
    "Overall rank": "Rank",
    "Country or region": 'Country',
    "GDP per capita": "GDP",
    "Healthy life expectancy": "Health",
    "Social support": "Social Support"
}

data_2019.rename(columns=column_mapping_2019, inplace=True)

In [35]:
data_2019['Year'] = 2019

In [36]:
data_2019 = data_2019[['Country','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [37]:
data_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         156 non-null    object 
 1   Rank            156 non-null    int64  
 2   Score           156 non-null    float64
 3   GDP             156 non-null    float64
 4   Social Support  156 non-null    float64
 5   Health          156 non-null    float64
 6   Year            156 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 8.7+ KB


In [38]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2019[cols] = scaler.fit_transform(data_2019[cols])

In [39]:
file_list[5]

'2020.csv'

In [40]:
data_2020 = pd.read_csv(os.path.join(path, file_list[5]))
data_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                153 non-null    object 
 1   Regional indicator                          153 non-null    object 
 2   Ladder score                                153 non-null    float64
 3   Standard error of ladder score              153 non-null    float64
 4   upperwhisker                                153 non-null    float64
 5   lowerwhisker                                153 non-null    float64
 6   Logged GDP per capita                       153 non-null    float64
 7   Social support                              153 non-null    float64
 8   Healthy life expectancy                     153 non-null    float64
 9   Freedom to make life choices                153 non-null    float64
 10  Generosity    

In [41]:
column_mapping_2020 = {
    "Country name": "Country",
    "Regional indicator": "Region",
    "Ladder score": "Score",
    "Explained by: Log GDP per capita": "GDP",
    "Explained by: Healthy life expectancy": "Health",
    "Explained by: Social support": "Social Support"
}

data_2020.rename(columns=column_mapping_2020, inplace=True)

In [42]:
data_2020['Year'] = 2020

In [43]:
data_2020['Rank'] = data_2020['Score'].rank(ascending=False)

In [44]:
data_2020 = data_2020[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [45]:
data_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         153 non-null    object 
 1   Region          153 non-null    object 
 2   Rank            153 non-null    float64
 3   Score           153 non-null    float64
 4   GDP             153 non-null    float64
 5   Social Support  153 non-null    float64
 6   Health          153 non-null    float64
 7   Year            153 non-null    int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 9.7+ KB


In [46]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2020[cols] = scaler.fit_transform(data_2020[cols])

In [47]:
data_2020.describe()

Unnamed: 0,Rank,Score,GDP,Social Support,Health,Year
count,153.0,153.0,153.0,153.0,153.0,153.0
mean,77.0,0.554455,0.565357,0.746725,0.608947,2020.0
std,44.311398,0.212192,0.242351,0.185365,0.223317,0.0
min,1.0,0.0,0.0,0.0,0.0,2020.0
25%,39.0,0.411538,0.374746,0.637593,0.435434,2020.0
50%,77.0,0.562421,0.597751,0.777986,0.667787,2020.0
75%,115.0,0.698539,0.760882,0.896335,0.762206,2020.0
max,153.0,1.0,1.0,1.0,1.0,2020.0


In [48]:
data_2021 = pd.read_csv(os.path.join(path, file_list[6]))
data_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149 entries, 0 to 148
Data columns (total 20 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Country name                                149 non-null    object 
 1   Regional indicator                          149 non-null    object 
 2   Ladder score                                149 non-null    float64
 3   Standard error of ladder score              149 non-null    float64
 4   upperwhisker                                149 non-null    float64
 5   lowerwhisker                                149 non-null    float64
 6   Logged GDP per capita                       149 non-null    float64
 7   Social support                              149 non-null    float64
 8   Healthy life expectancy                     149 non-null    float64
 9   Freedom to make life choices                149 non-null    float64
 10  Generosity    

In [49]:
column_mapping_2021 = {
    "Country name": "Country",
    "Regional indicator": "Region",
    "Ladder score": "Score",
    "Explained by: Log GDP per capita": "GDP",
    "Explained by: Healthy life expectancy": "Health",
    "Explained by: Social support": "Social Support"
}

data_2021.rename(columns=column_mapping_2021, inplace=True)

In [50]:
data_2021['Year'] = 2021

In [51]:
data_2021['Rank'] = data_2021['Score'].rank(ascending=False)

In [52]:
data_2021 = data_2021[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [53]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2021[cols] = scaler.fit_transform(data_2021[cols])

In [54]:
data_2021.describe()

Unnamed: 0,Rank,Score,GDP,Social Support,Health,Year
count,149.0,149.0,149.0,149.0,149.0,149.0
mean,75.0,0.565866,0.558059,0.67689,0.57989,2021.0
std,43.156613,0.201903,0.231148,0.22088,0.237479,0.0
min,1.0,0.0,0.0,0.0,0.0,2021.0
25%,38.0,0.437864,0.380354,0.552048,0.397993,2021.0
50%,75.0,0.566084,0.58538,0.709898,0.636566,2021.0
75%,112.0,0.701636,0.755568,0.849829,0.74136,2021.0
max,149.0,1.0,1.0,1.0,1.0,2021.0


In [55]:
data_2022 = pd.read_csv(os.path.join(path, file_list[7]))
data_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 12 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   RANK                                        147 non-null    int64 
 1   Country                                     147 non-null    object
 2   Happiness score                             146 non-null    object
 3   Whisker-high                                146 non-null    object
 4   Whisker-low                                 146 non-null    object
 5   Dystopia (1.83) + residual                  146 non-null    object
 6   Explained by: GDP per capita                146 non-null    object
 7   Explained by: Social support                146 non-null    object
 8   Explained by: Healthy life expectancy       146 non-null    object
 9   Explained by: Freedom to make life choices  146 non-null    object
 10  Explained by: Generosity  

In [56]:
data_2022['Happiness score'] = data_2022['Happiness score'].str.replace(',', '.').astype(float)
data_2022['Explained by: GDP per capita'] = data_2022['Explained by: GDP per capita'].str.replace(',', '.').astype(float)
data_2022['Explained by: Social support'] = data_2022['Explained by: Social support'].str.replace(',', '.').astype(float)
data_2022['Explained by: Healthy life expectancy'] = data_2022['Explained by: Healthy life expectancy'].str.replace(',', '.').astype(float)

In [57]:
column_mapping_2022 = {
    "RANK": "Rank",
    "Happiness score": "Score",
    "Explained by: GDP per capita": "GDP",
    "Explained by: Social support": "Social Support",
    "Explained by: Healthy life expectancy": "Health",
}

data_2022.rename(columns=column_mapping_2022, inplace=True)

In [58]:
data_2022['Year'] = 2022

In [59]:
data_2022['Rank'] = data_2022['Score'].rank(ascending=False)

In [60]:
data_2022 = data_2022[['Country','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]

In [61]:
scaler = MinMaxScaler()
cols = ['Score', 'GDP', 'Social Support', 'Health']
data_2022[cols] = scaler.fit_transform(data_2022[cols])

In [62]:
data_2022.describe()

Unnamed: 0,Rank,Score,GDP,Social Support,Health,Year
count,146.0,146.0,146.0,146.0,146.0,147.0
mean,73.5,0.581424,0.638499,0.68626,0.622262,2022.0
std,42.290457,0.200636,0.190884,0.212213,0.187194,0.0
min,1.0,0.0,0.0,0.0,0.0,2022.0
25%,37.25,0.458695,0.495926,0.554545,0.491773,2022.0
50%,73.5,0.584179,0.654368,0.725379,0.659766,2022.0
75%,109.75,0.72014,0.807945,0.844129,0.764066,2022.0
max,146.0,1.0,1.0,1.0,1.0,2022.0


In [63]:
data_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         158 non-null    object 
 1   Region          158 non-null    object 
 2   Rank            158 non-null    int64  
 3   Score           158 non-null    float64
 4   GDP             158 non-null    float64
 5   Social Support  158 non-null    float64
 6   Health          158 non-null    float64
 7   Year            158 non-null    int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 10.0+ KB


In [64]:
data_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         157 non-null    object 
 1   Region          157 non-null    object 
 2   Rank            157 non-null    int64  
 3   Score           157 non-null    float64
 4   GDP             157 non-null    float64
 5   Social Support  157 non-null    float64
 6   Health          157 non-null    float64
 7   Year            157 non-null    int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 9.9+ KB


In [65]:
data_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         155 non-null    object 
 1   Rank            155 non-null    int64  
 2   Score           155 non-null    float64
 3   GDP             155 non-null    float64
 4   Social Support  155 non-null    float64
 5   Health          155 non-null    float64
 6   Year            155 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 8.6+ KB


In [66]:
data_2017 = pd.merge(data_2017, data_2015[['Country', 'Region']], on='Country', how='left')
data_2017 = data_2017[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]
data_2017.info()
# data_2018['Region'] = pd.merge(data_2018, data_2015[['Country', 'Region']], on='Country', how='left')['Region_x']
# data_2019['Region'] = pd.merge(data_2019, data_2015[['Country', 'Region']], on='Country', how='left')['Region_x']
# data_2020['Region'] = pd.merge(data_2020, data_2015[['Country', 'Region']], on='Country', how='left')['Region_x']
# data_2021['Region'] = pd.merge(data_2021, data_2015[['Country', 'Region']], on='Country', how='left')['Region_x']
# data_2022['Region'] = pd.merge(data_2022, data_2015[['Country', 'Region']], on='Country', how='left')['Region_x']
# data_2016.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 155 entries, 0 to 154
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         155 non-null    object 
 1   Region          149 non-null    object 
 2   Rank            155 non-null    int64  
 3   Score           155 non-null    float64
 4   GDP             155 non-null    float64
 5   Social Support  155 non-null    float64
 6   Health          155 non-null    float64
 7   Year            155 non-null    int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 10.9+ KB


In [67]:
data_2017.describe()

Unnamed: 0,Rank,Score,GDP,Social Support,Health,Year
count,155.0,155.0,155.0,155.0,155.0,155.0
mean,78.0,0.549343,0.526372,0.738183,0.580669,2017.0
std,44.888751,0.233532,0.224931,0.178361,0.249684,0.0
min,1.0,0.0,0.0,0.0,0.0,2017.0
25%,39.5,0.374174,0.354599,0.647369,0.389541,2017.0
50%,78.0,0.533856,0.56906,0.778553,0.63828,2017.0
75%,116.5,0.703654,0.704539,0.878144,0.761467,2017.0
max,155.0,1.0,1.0,1.0,1.0,2017.0


In [68]:
data_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         156 non-null    object 
 1   Rank            156 non-null    int64  
 2   Score           156 non-null    float64
 3   GDP             156 non-null    float64
 4   Social Support  156 non-null    float64
 5   Health          156 non-null    float64
 6   Year            156 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 8.7+ KB


In [69]:
data_2018 = pd.merge(data_2018, data_2015[['Country', 'Region']], on='Country', how='left')
data_2018 = data_2018[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]
data_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 0 to 155
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         156 non-null    object 
 1   Region          150 non-null    object 
 2   Rank            156 non-null    int64  
 3   Score           156 non-null    float64
 4   GDP             156 non-null    float64
 5   Social Support  156 non-null    float64
 6   Health          156 non-null    float64
 7   Year            156 non-null    int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 11.0+ KB


In [70]:
data_2018.describe()

Unnamed: 0,Rank,Score,GDP,Social Support,Health,Year
count,156.0,156.0,156.0,156.0,156.0,156.0
mean,78.5,0.522724,0.42531,0.737979,0.579948,2018.0
std,45.177428,0.236832,0.186985,0.183924,0.240368,0.0
min,1.0,0.0,0.0,0.0,0.0,2018.0
25%,39.75,0.327639,0.294012,0.648875,0.409951,2018.0
50%,78.5,0.523165,0.453006,0.763382,0.625243,2018.0
75%,117.25,0.690396,0.571446,0.889903,0.754612,2018.0
max,156.0,1.0,1.0,1.0,1.0,2018.0


In [71]:
data_2019 = pd.merge(data_2019, data_2015[['Country', 'Region']], on='Country', how='left')
data_2019 = data_2019[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]
data_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 0 to 155
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         156 non-null    object 
 1   Region          149 non-null    object 
 2   Rank            156 non-null    int64  
 3   Score           156 non-null    float64
 4   GDP             156 non-null    float64
 5   Social Support  156 non-null    float64
 6   Health          156 non-null    float64
 7   Year            156 non-null    int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 11.0+ KB


In [72]:
data_2022 = pd.merge(data_2022, data_2015[['Country', 'Region']], on='Country', how='left')
data_2022 = data_2022[['Country','Region','Rank', 'Score', 'GDP', "Social Support", 'Health', 'Year' ]]
data_2022.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 146
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         147 non-null    object 
 1   Region          119 non-null    object 
 2   Rank            146 non-null    float64
 3   Score           146 non-null    float64
 4   GDP             146 non-null    float64
 5   Social Support  146 non-null    float64
 6   Health          146 non-null    float64
 7   Year            147 non-null    int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 10.3+ KB


In [73]:
result = pd.concat([data_2015, data_2016], ignore_index=True)
result = pd.concat([result, data_2017], ignore_index=True)
result = pd.concat([result, data_2018], ignore_index=True)
result = pd.concat([result, data_2019], ignore_index=True)
result = pd.concat([result, data_2020], ignore_index=True)
result = pd.concat([result, data_2021], ignore_index=True)
result = pd.concat([result, data_2022], ignore_index=True)

In [74]:
result.tail()

Unnamed: 0,Country,Region,Rank,Score,GDP,Social Support,Health,Year
1226,Rwanda*,,143.0,0.159498,0.355364,0.100758,0.490446,2022
1227,Zimbabwe,Sub-Saharan Africa,144.0,0.109101,0.428701,0.522727,0.286624,2022
1228,Lebanon,Middle East and Northern Africa,145.0,0.101717,0.630149,0.377273,0.669851,2022
1229,Afghanistan,Southern Asia,146.0,0.0,0.343142,0.0,0.306794,2022
1230,xx,,,,,,,2022


In [75]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231 entries, 0 to 1230
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country         1231 non-null   object 
 1   Region          1184 non-null   object 
 2   Rank            1230 non-null   float64
 3   Score           1230 non-null   float64
 4   GDP             1230 non-null   float64
 5   Social Support  1230 non-null   float64
 6   Health          1230 non-null   float64
 7   Year            1231 non-null   int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 77.1+ KB


In [76]:
result.to_csv('df_final_2.csv', index=False)

In [77]:
# result 값이 'A' 와 'B' 가 아니라면 result 는 'F'
result.loc[(result['Region'] == 'North America and ANZ') | (result['Region'] == 'North America'),'Region'] = 'North America'
result.loc[(result['Region'] == 'Southeastern Asia') | (result['Region'] == 'Southeast Asia'), 'Region'] = 'Southeast Asia'
result.loc[(result['Region'] == 'Eastern Asia') | (result['Region'] == 'East Asia'), 'Region'] = 'East Asia'
result.loc[(result['Region'] == 'Southern Asia') | (result['Region'] == 'South Asia'), 'Region'] = 'South Asia'
result.loc[(result['Region'] == 'Middle East and Northern Africa') | (result['Region'] == 'Middle East and North Africa'), 'Region'] = 'Middle East and North Africa'
result.loc[(result['Region'] == 'Taiwan Province of China') | (result['Region'] == 'Taiwan, Hong kong S.A.R. OF China'), 'Region'] = 'Hong kong'
result.Country = result.Country.apply(lambda x: x.replace('*','') if '*' in x else x)

In [78]:
result.to_csv('./data/df_final_2.csv', index=False)

In [79]:
result['Region'].unique()

array(['Western Europe', 'North America', 'Australia and New Zealand',
       'Middle East and North Africa', 'Latin America and Caribbean',
       'Southeast Asia', 'Central and Eastern Europe', 'East Asia',
       'Sub-Saharan Africa', 'South Asia', nan,
       'Commonwealth of Independent States'], dtype=object)

In [80]:
result['Year'].unique()

array([2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])