In [1]:
# Dependencies and Setup
import pandas as pd

In [2]:
years = [2015,2016,2017,2018,2019]
df={}
# Looping through years list
for year in years:
    # File to Load 
    file = f"../Resources/{year}.csv"
   # Read each years File and store into Pandas data frame
    df[year] = pd.read_csv(file)

# Assigning names to each item in the list of data frames
df_2015,df_2016,df_2017,df_2018,df_2019 = df[2015],df[2016],df[2017],df[2018],df[2019]

In [3]:
# Checking for null values in all data frame
for year in years:
    print(f"{year} \n{df[year].isna().sum()}\n----------")

2015 
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64
----------
2016 
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Lower Confidence Interval        0
Upper Confidence Interval        0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64
----------
2017 
Country                          0
Happiness.Rank          

In [4]:
# Replacing null value found in 2018 dataframe with 0
df_2018.fillna(0,inplace = True)
df_2018.isna().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [5]:
# replacing spaces with underscore
for year in years:
    df[year].columns = [x.replace(" ","_") for x in df[year].columns.values]
    print(f"{year} \n{df[year].columns}\n----------")

2015 
Index(['Country', 'Region', 'Happiness_Rank', 'Happiness_Score',
       'Standard_Error', 'Economy_(GDP_per_Capita)', 'Family',
       'Health_(Life_Expectancy)', 'Freedom', 'Trust_(Government_Corruption)',
       'Generosity', 'Dystopia_Residual'],
      dtype='object')
----------
2016 
Index(['Country', 'Region', 'Happiness_Rank', 'Happiness_Score',
       'Lower_Confidence_Interval', 'Upper_Confidence_Interval',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Trust_(Government_Corruption)', 'Generosity',
       'Dystopia_Residual'],
      dtype='object')
----------
2017 
Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
       'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
       'Health..Life.Expectancy.', 'Freedom', 'Generosity',
       'Trust..Government.Corruption.', 'Dystopia.Residual'],
      dtype='object')
----------
2018 
Index(['Overall_rank', 'Country_or_region', 'Score', 'GDP_per_capita',
      

In [6]:
# Checking the number of rows and columns in each data frame
for year in years:
    print(f"{year} \n{df[year].shape}\n----------")

2015 
(158, 12)
----------
2016 
(157, 13)
----------
2017 
(155, 12)
----------
2018 
(156, 9)
----------
2019 
(156, 9)
----------


In [7]:
#Drop unnecessary columns in each data frame

df_2015.drop(columns=['Region','Standard_Error'], inplace= True)

df_2016.drop(columns=['Region','Lower_Confidence_Interval','Upper_Confidence_Interval'], inplace= True)

df_2017.drop(columns=['Whisker.high','Whisker.low'], inplace= True)


In [8]:
#Calculating the missing dystopian columnn values for 2018 and 2019 and Renaming the columns over that period
for year in years[-2:]:
    df[year]['Dystopia_Residual'] = df[year].loc[:,'Score']-df[year].loc[:,'GDP_per_capita':].sum(axis=1)
    
    df[year].rename(columns={
    'Score':'Happiness_Score',
    'GDP_per_capita':'Economy_(GDP_per_Capita)',
    'Social_support':'Family',
    'Healthy_life_expectancy':'Health_(Life_Expectancy)',
    'Freedom_to_make_life_choices':'Freedom',
    'Perceptions_of_corruption':'Trust_(Government_Corruption)',
    'Overall_rank':'Happiness_Rank',
    'Country_or_region':'Country'
 },inplace=True)
    
#Renaming 2017 df column names
df[2017].rename(columns={
    'Happiness.Score':'Happiness_Score',
    'Economy..GDP.per.Capita.':'Economy_(GDP_per_Capita)',
    'Health..Life.Expectancy.':'Health_(Life_Expectancy)',
    'Trust..Government.Corruption.':'Trust_(Government_Corruption)',
    'Happiness.Rank':'Happiness_Rank',
    'Dystopia.Residual':'Dystopia_Residual'
 },inplace=True)


In [9]:
# Checking information of rows and columns in each data frame & if the right data type in data frames 
for year in years:
    print(f"{year} \n{df[year].info()}\n----------")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Happiness_Rank                 158 non-null    int64  
 2   Happiness_Score                158 non-null    float64
 3   Economy_(GDP_per_Capita)       158 non-null    float64
 4   Family                         158 non-null    float64
 5   Health_(Life_Expectancy)       158 non-null    float64
 6   Freedom                        158 non-null    float64
 7   Trust_(Government_Corruption)  158 non-null    float64
 8   Generosity                     158 non-null    float64
 9   Dystopia_Residual              158 non-null    float64
dtypes: float64(8), int64(1), object(1)
memory usage: 12.5+ KB
2015 
None
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (tota

In [10]:
#Checking if any duplicates in df
for year in years:
    print(f"{year} \n{df[year]['Country'].nunique()}\n----------")

2015 
158
----------
2016 
157
----------
2017 
155
----------
2018 
156
----------
2019 
156
----------


In [11]:
#Restructurin the column names for uniformity in df
for year in years:
    df[year] = df[year][['Country', 'Happiness_Rank', 'Happiness_Score',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Trust_(Government_Corruption)', 'Generosity',
       'Dystopia_Residual']]
#     df[year].to_csv(f"../UWA-Project/Output/{year}.csv",index=False)


In [13]:
# Merging all data frames into one data frame on Country column
merge_df = df_2015.merge(df_2016, on=['Country'], how='outer', suffixes=['_2015','_2016'])
merge_df = merge_df.merge(df_2017, on=['Country'], how='outer')
merge_df = merge_df.merge(df_2018, on=['Country'], how='outer', suffixes=['_2017','_2018'])
merge_df = merge_df.merge(df_2019, on=['Country'], how='outer')

# Adding suffix to 2019 columns
merge_df = merge_df.rename(columns={"Happiness_Rank": "Happiness_Rank_2019",
                                    "Happiness_Score": "Happiness_Score_2019",
                                    "Economy_(GDP_per_Capita)": "Economy_(GDP_per_Capita)_2019",
                                    "Family": "Family_2019",
                                    "Health_(Life_Expectancy)": "Health_(Life_Expectancy)_2019",
                                    "Trust_(Government Corruption)": "Trust_(Government Corruption)_2019",
                                    "Freedom": "Freedom_2019",
                                    "Generosity": "Generosity_2019",
                                    "Dystopia_Residual": "Dystopia_Residual_2019"
                                   })

merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 0 to 169
Data columns (total 46 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Country                             170 non-null    object 
 1   Happiness_Rank_2015                 158 non-null    float64
 2   Happiness_Score_2015                158 non-null    float64
 3   Economy_(GDP_per_Capita)_2015       158 non-null    float64
 4   Family_2015                         158 non-null    float64
 5   Health_(Life_Expectancy)_2015       158 non-null    float64
 6   Freedom_2015                        158 non-null    float64
 7   Trust_(Government_Corruption)_2015  158 non-null    float64
 8   Generosity_2015                     158 non-null    float64
 9   Dystopia_Residual_2015              158 non-null    float64
 10  Happiness_Rank_2016                 157 non-null    float64
 11  Happiness_Score_2016                157 non-n

In [14]:
# Checking for countries that has records in 2015, but not in 2016
df_2015[~df_2015.Country.isin(df_2016.Country)]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_(GDP_per_Capita),Family,Health_(Life_Expectancy),Freedom,Trust_(Government_Corruption),Generosity,Dystopia_Residual
21,Oman,22,6.853,1.36011,1.08182,0.76276,0.63274,0.32524,0.21542,2.47489
90,Somaliland region,91,5.057,0.18847,0.95152,0.43873,0.46582,0.39928,0.50318,2.11032
93,Mozambique,94,4.971,0.08308,1.02626,0.09131,0.34037,0.15603,0.22269,3.05137
96,Lesotho,97,4.898,0.37545,1.04103,0.07612,0.31767,0.12504,0.16388,2.79832
100,Swaziland,101,4.867,0.71206,1.07284,0.07566,0.30658,0.0306,0.18259,2.48676
125,Djibouti,126,4.369,0.44025,0.59207,0.36291,0.46074,0.28105,0.18093,2.05125
147,Central African Republic,148,3.678,0.0785,0.0,0.06699,0.48879,0.08289,0.23835,2.7223


In [15]:
# Checking for countries that has records in 2016, but not in 2015
df_2016[~df_2016.Country.isin(df_2015.Country)]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_(GDP_per_Capita),Family,Health_(Life_Expectancy),Freedom,Trust_(Government_Corruption),Generosity,Dystopia_Residual
14,Puerto Rico,15,7.039,1.35943,1.08113,0.77758,0.46823,0.12275,0.22202,3.0076
51,Belize,52,5.956,0.87616,0.68655,0.45569,0.51231,0.10771,0.23684,3.08039
75,Somalia,76,5.44,0.0,0.33613,0.11466,0.56778,0.3118,0.27225,3.83772
96,Somaliland Region,97,5.057,0.25558,0.75862,0.33108,0.3913,0.36794,0.51479,2.43801
112,Namibia,113,4.574,0.93287,0.70362,0.34745,0.48614,0.10398,0.07795,1.92198
142,South Sudan,143,3.832,0.39394,0.18519,0.15781,0.19662,0.13015,0.25899,2.50929


In [16]:
# Checking for countries that has records in later years but not in 2015
merge_df[~merge_df.Country.isin(df_2015.Country)]

Unnamed: 0,Country,Happiness_Rank_2015,Happiness_Score_2015,Economy_(GDP_per_Capita)_2015,Family_2015,Health_(Life_Expectancy)_2015,Freedom_2015,Trust_(Government_Corruption)_2015,Generosity_2015,Dystopia_Residual_2015,...,Dystopia_Residual_2018,Happiness_Rank_2019,Happiness_Score_2019,Economy_(GDP_per_Capita)_2019,Family_2019,Health_(Life_Expectancy)_2019,Freedom_2019,Generosity_2019,Trust_(Government_Corruption),Dystopia_Residual_2019
158,Puerto Rico,,,,,,,,,,...,,,,,,,,,,
159,Belize,,,,,,,,,,...,2.709,,,,,,,,,
160,Somalia,,,,,,,,,,...,2.961,112.0,4.668,0.0,0.698,0.268,0.559,0.243,0.27,2.63
161,Somaliland Region,,,,,,,,,,...,,,,,,,,,,
162,Namibia,,,,,,,,,,...,1.287,113.0,4.639,0.879,1.313,0.477,0.401,0.07,0.056,1.443
163,South Sudan,,,,,,,,,,...,1.69,156.0,2.853,0.306,0.575,0.295,0.01,0.202,0.091,1.374
164,Taiwan Province of China,,,,,,,,,,...,,,,,,,,,,
165,"Hong Kong S.A.R., China",,,,,,,,,,...,,,,,,,,,,
166,Trinidad & Tobago,,,,,,,,,,...,2.148,39.0,6.192,1.231,1.477,0.713,0.489,0.185,0.016,2.081
167,Northern Cyprus,,,,,,,,,,...,1.658,64.0,5.718,1.263,1.252,1.042,0.417,0.191,0.162,1.391


In [17]:
# Replacing null value found in merged dataframe with 0
merge_df.fillna(0,inplace = True)
merge_df.isna().sum()

Country                               0
Happiness_Rank_2015                   0
Happiness_Score_2015                  0
Economy_(GDP_per_Capita)_2015         0
Family_2015                           0
Health_(Life_Expectancy)_2015         0
Freedom_2015                          0
Trust_(Government_Corruption)_2015    0
Generosity_2015                       0
Dystopia_Residual_2015                0
Happiness_Rank_2016                   0
Happiness_Score_2016                  0
Economy_(GDP_per_Capita)_2016         0
Family_2016                           0
Health_(Life_Expectancy)_2016         0
Freedom_2016                          0
Trust_(Government_Corruption)_2016    0
Generosity_2016                       0
Dystopia_Residual_2016                0
Happiness_Rank_2017                   0
Happiness_Score_2017                  0
Economy_(GDP_per_Capita)_2017         0
Family_2017                           0
Health_(Life_Expectancy)_2017         0
Freedom_2017                          0


In [18]:
# Checking the country names which are in Merged Data frame, but missing in any of the years data frames
merge_df["Country"].loc[(~merge_df["Country"].isin(df_2015["Country"])) | \
                        (~merge_df["Country"].isin(df_2016["Country"])) | \
                        (~merge_df["Country"].isin(df_2017["Country"])) | \
                        (~merge_df["Country"].isin(df_2018["Country"])) | \
                        (~merge_df["Country"].isin(df_2019["Country"]))
                       ].sort_values()

136                      Angola
159                      Belize
147    Central African Republic
139                     Comoros
125                    Djibouti
169                      Gambia
71                    Hong Kong
165     Hong Kong S.A.R., China
98                         Laos
96                      Lesotho
92                    Macedonia
93                   Mozambique
162                     Namibia
65                 North Cyprus
168             North Macedonia
167             Northern Cyprus
21                         Oman
158                 Puerto Rico
160                     Somalia
161           Somaliland Region
90            Somaliland region
163                 South Sudan
117                       Sudan
39                     Suriname
100                   Swaziland
37                       Taiwan
164    Taiwan Province of China
166           Trinidad & Tobago
40          Trinidad and Tobago
Name: Country, dtype: object

In [19]:
# Making the country names matching for those refering to the same country 
merge_df["Country"].loc[merge_df.Country == "Northern Cyprus"] = "North Cyprus"
merge_df["Country"].loc[merge_df.Country == "Macedonia"] = "North Macedonia"
merge_df["Country"].loc[merge_df.Country == "Hong Kong S.A.R., China"] = "Hong Kong"
merge_df["Country"].loc[merge_df.Country == "Taiwan Province of China"] = "Taiwan"
merge_df["Country"].loc[merge_df.Country == "Trinidad & Tobago"] = "Trinidad and Tobago"
merge_df["Country"].loc[merge_df.Country == "Somaliland Region"] = "Somaliland Region"
merge_df["Country"].loc[merge_df.Country == "South Sudan"] = "Sudan"

# merge_df["Country"].loc[merge_df.Country == "Somaliland Region"] = "Somalia"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [20]:
# Checking the country names which were not present in any of the years data frames
country_check_list = merge_df["Country"].loc[(~merge_df["Country"].isin(df_2015["Country"])) | \
                                             (~merge_df["Country"].isin(df_2016["Country"])) | \
                                             (~merge_df["Country"].isin(df_2017["Country"])) | \
                                             (~merge_df["Country"].isin(df_2018["Country"])) | \
                                             (~merge_df["Country"].isin(df_2019["Country"]))
                                            ].sort_values().tolist()
country_check_list = set(country_check_list)
country_check_list

{'Angola',
 'Belize',
 'Central African Republic',
 'Comoros',
 'Djibouti',
 'Gambia',
 'Hong Kong',
 'Laos',
 'Lesotho',
 'Mozambique',
 'Namibia',
 'North Cyprus',
 'North Macedonia',
 'Oman',
 'Puerto Rico',
 'Somalia',
 'Somaliland Region',
 'Somaliland region',
 'Sudan',
 'Suriname',
 'Swaziland',
 'Taiwan',
 'Trinidad and Tobago'}

In [21]:
# Displaying the list of Countries missing data in some year
merge_df.loc[merge_df["Country"].isin(country_check_list)]

Unnamed: 0,Country,Happiness_Rank_2015,Happiness_Score_2015,Economy_(GDP_per_Capita)_2015,Family_2015,Health_(Life_Expectancy)_2015,Freedom_2015,Trust_(Government_Corruption)_2015,Generosity_2015,Dystopia_Residual_2015,...,Dystopia_Residual_2018,Happiness_Rank_2019,Happiness_Score_2019,Economy_(GDP_per_Capita)_2019,Family_2019,Health_(Life_Expectancy)_2019,Freedom_2019,Generosity_2019,Trust_(Government_Corruption),Dystopia_Residual_2019
21,Oman,22.0,6.853,1.36011,1.08182,0.76276,0.63274,0.32524,0.21542,2.47489,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,Taiwan,38.0,6.298,1.29098,1.07617,0.8753,0.3974,0.08129,0.25376,2.32323,...,2.136,25.0,6.446,1.368,1.43,0.914,0.351,0.242,0.097,2.044
39,Suriname,40.0,6.269,0.99534,0.972,0.6082,0.59657,0.13633,0.16991,2.79094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,Trinidad and Tobago,41.0,6.168,1.21183,1.18354,0.61483,0.55884,0.0114,0.31844,2.26882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,North Cyprus,66.0,5.695,1.20806,1.07008,0.92356,0.49027,0.1428,0.26169,1.59888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,Hong Kong,72.0,5.474,1.38604,1.05818,1.01328,0.59608,0.37124,0.39478,0.65429,...,0.644,76.0,5.43,1.438,1.277,1.122,0.44,0.258,0.287,0.608
90,Somaliland region,91.0,5.057,0.18847,0.95152,0.43873,0.46582,0.39928,0.50318,2.11032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,North Macedonia,93.0,5.007,0.91851,1.00232,0.73545,0.33457,0.05327,0.22359,1.73933,...,1.677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,Mozambique,94.0,4.971,0.08308,1.02626,0.09131,0.34037,0.15603,0.22269,3.05137,...,2.249,123.0,4.466,0.204,0.986,0.39,0.494,0.197,0.138,2.057
96,Lesotho,97.0,4.898,0.37545,1.04103,0.07612,0.31767,0.12504,0.16388,2.79832,...,1.391,144.0,3.802,0.489,1.169,0.168,0.359,0.107,0.093,1.417


In [22]:
# Creating a list of country names duplicated
dup_countries = merge_df['Country'].loc[merge_df['Country'].duplicated()].tolist() 
dup_countries

['Sudan',
 'Taiwan',
 'Hong Kong',
 'Trinidad and Tobago',
 'North Cyprus',
 'North Macedonia']

In [23]:
# Merging duplicate country names row wise by getting sum of each column values

# Loop through each duplicated country names in the list
for country in dup_countries:
    
    # Making a new data frame having only the duplicated country names
    joined_rows = merge_df.loc[merge_df.Country == country]
    # Adding a row to the new data frame with the sum of each columns
    joined_rows.loc[country,:] = joined_rows.sum(axis=0)
    # Correcting the Country column value
    joined_rows['Country'] = country
    
    # Removing those rows from the merged data frame
    merge_df.drop(merge_df[merge_df.Country == country].index, inplace=True)
    # Concatenating the last row added(sum) to the original merged data frame
    merge_df = pd.concat([merge_df, joined_rows.tail(1)])

# Reseting the merged data frame's index
merge_df.reset_index(drop=True, inplace=True)

# Displaying the countries data which were missing in any of the year's data frame - after cleaning
merge_df.loc[merge_df["Country"].isin(country_check_list)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,Country,Happiness_Rank_2015,Happiness_Score_2015,Economy_(GDP_per_Capita)_2015,Family_2015,Health_(Life_Expectancy)_2015,Freedom_2015,Trust_(Government_Corruption)_2015,Generosity_2015,Dystopia_Residual_2015,...,Dystopia_Residual_2018,Happiness_Rank_2019,Happiness_Score_2019,Economy_(GDP_per_Capita)_2019,Family_2019,Health_(Life_Expectancy)_2019,Freedom_2019,Generosity_2019,Trust_(Government_Corruption),Dystopia_Residual_2019
21,Oman,22.0,6.853,1.36011,1.08182,0.76276,0.63274,0.32524,0.21542,2.47489,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,Suriname,40.0,6.269,0.99534,0.972,0.6082,0.59657,0.13633,0.16991,2.79094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,Somaliland region,91.0,5.057,0.18847,0.95152,0.43873,0.46582,0.39928,0.50318,2.11032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,Mozambique,94.0,4.971,0.08308,1.02626,0.09131,0.34037,0.15603,0.22269,3.05137,...,2.249,123.0,4.466,0.204,0.986,0.39,0.494,0.197,0.138,2.057
91,Lesotho,97.0,4.898,0.37545,1.04103,0.07612,0.31767,0.12504,0.16388,2.79832,...,1.391,144.0,3.802,0.489,1.169,0.168,0.359,0.107,0.093,1.417
93,Laos,99.0,4.876,0.59066,0.73803,0.54909,0.59591,0.24249,0.42192,1.73799,...,1.398,105.0,4.796,0.764,1.03,0.551,0.547,0.266,0.164,1.474
95,Swaziland,101.0,4.867,0.71206,1.07284,0.07566,0.30658,0.0306,0.18259,2.48676,...,0.0,135.0,4.212,0.811,1.149,0.0,0.313,0.074,0.135,1.73
119,Djibouti,126.0,4.369,0.44025,0.59207,0.36291,0.46074,0.28105,0.18093,2.05125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130,Angola,137.0,4.033,0.75778,0.8604,0.16683,0.10384,0.07122,0.12344,1.94939,...,1.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,Comoros,140.0,3.956,0.23906,0.79273,0.36315,0.22917,0.199,0.17441,1.95812,...,0.0,142.0,3.973,0.274,0.757,0.505,0.142,0.275,0.078,1.942


In [24]:
# Writing the cleaned merged data frame to a csv file
merge_df.to_csv("../Output/happiness_merged.csv", index=False)

In [25]:
for year in years:
    # Adding country names to each years dataframe
    df[year]["Country"] = merge_df["Country"]
    # Adding a new Year column to each dataframe
    df[year]["Year"] = year
    # Finding columns with particular year in merged data frame
    year_cols = [col for col in merge_df.columns if str(year) in col]

    # Adding columns in the year_cols list to each dataframe
    for col_year in year_cols:
        # Removing year part from column names
        col = col_year[:-5]
        # Copying column values form merged dataframe to year's data frame
        df[year][col] = merge_df[col_year]   
        
    # Rearranging column names, making Year as 2nd column
    df[year] = df[year].iloc[:,[0,10,1,2,3,4,5,6,7,8,9]]
    # Dropping Countries which have no values in that year
    df[year] = df[year].loc[df[year]["Happiness_Rank"]!= 0]

df[2015].head()

Unnamed: 0,Country,Year,Happiness_Rank,Happiness_Score,Economy_(GDP_per_Capita),Family,Health_(Life_Expectancy),Freedom,Trust_(Government_Corruption),Generosity,Dystopia_Residual
0,Switzerland,2015,1.0,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
1,Iceland,2015,2.0,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201
2,Denmark,2015,3.0,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204
3,Norway,2015,4.0,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531
4,Canada,2015,5.0,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176


In [26]:
# writing cleaned individual year files
for year in years:
    df[year].to_csv(f"../Output/{year}.csv", index=False)

In [27]:
# Concatenating each year dataframes
df_list = [df[year] for year in range(2015,2020)]
concat_df =  pd.concat(df_list)

concat_df.sort_values(["Happiness_Rank","Year"],inplace=True, ascending=[True,False])
concat_df.reset_index(drop=True,inplace=True)

concat_df.loc[concat_df["Happiness_Rank"] <= 10]

Unnamed: 0,Country,Year,Happiness_Rank,Happiness_Score,Economy_(GDP_per_Capita),Family,Health_(Life_Expectancy),Freedom,Trust_(Government_Corruption),Generosity,Dystopia_Residual
0,Finland,2019,1.0,7.769,1.34,1.587,0.986,0.596,0.343,0.153,2.714
1,Finland,2018,1.0,7.632,1.305,1.592,0.874,0.681,0.393,0.202,2.585
2,Norway,2017,1.0,7.537,1.616463,1.533524,0.796667,0.635423,0.315964,0.362012,2.277027
3,Denmark,2016,1.0,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
4,Switzerland,2015,1.0,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738
5,Denmark,2019,2.0,7.6,1.383,1.573,0.996,0.592,0.341,0.252,2.394
6,Norway,2018,2.0,7.594,1.456,1.582,0.861,0.686,0.34,0.286,2.383
7,Denmark,2017,2.0,7.522,1.482383,1.551122,0.792566,0.626007,0.40077,0.35528,2.313707
8,Switzerland,2016,2.0,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
9,Iceland,2015,2.0,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201


In [28]:
# Writing the cleaned concatenated data frame to a csv file
concat_df.to_csv("../Output/happiness_concat.csv", index=False)