### Importing necessary modules

In [1]:
import pandas as pd
import numpy as np
import os
import numpy as np
import requests
import json
#Reading the data from csv and storing the df in empty dictionary
years = [2015,2016,2017,2018,2019]
df={}
for year in years:
    url = f"../Resources/{year}.csv"
    pd.set_option("display.max_rows",800)
    df[year] = pd.read_csv(url)


# 1.1 Importing Raw Data
### Cleaning the df over the period 2016-2019

In [2]:
#Unpacking the df
df_2015,df_2016,df_2017,df_2018,df_2019 = df[2015],df[2016],df[2017],df[2018],df[2019]

In [3]:
# Checking for null values in the df
for year in years:
    print(f"""{year} \n{df[year].isna().sum()}
----------""")

2015 
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Standard Error                   0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64
----------
2016 
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Lower Confidence Interval        0
Upper Confidence Interval        0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
dtype: int64
----------
2017 
Country                          0
Happiness.Rank          

In [4]:
# Checked for null values - replaced it with with zero.
df_2018.fillna(0,inplace = True)
df_2018.isna().sum()

Overall rank                    0
Country or region               0
Score                           0
GDP per capita                  0
Social support                  0
Healthy life expectancy         0
Freedom to make life choices    0
Generosity                      0
Perceptions of corruption       0
dtype: int64

In [5]:
# replacing spaces with underscore
for year in years:
    df[year].columns = [x.replace(" ","_") for x in df[year].columns.values]
    print(f"""{year} \n{df[year].columns}
----------""")


2015 
Index(['Country', 'Region', 'Happiness_Rank', 'Happiness_Score',
       'Standard_Error', 'Economy_(GDP_per_Capita)', 'Family',
       'Health_(Life_Expectancy)', 'Freedom', 'Trust_(Government_Corruption)',
       'Generosity', 'Dystopia_Residual'],
      dtype='object')
----------
2016 
Index(['Country', 'Region', 'Happiness_Rank', 'Happiness_Score',
       'Lower_Confidence_Interval', 'Upper_Confidence_Interval',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Trust_(Government_Corruption)', 'Generosity',
       'Dystopia_Residual'],
      dtype='object')
----------
2017 
Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
       'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
       'Health..Life.Expectancy.', 'Freedom', 'Generosity',
       'Trust..Government.Corruption.', 'Dystopia.Residual'],
      dtype='object')
----------
2018 
Index(['Overall_rank', 'Country_or_region', 'Score', 'GDP_per_capita',
      

In [6]:
#Removing non-contributing columns of the df 
col = ['Lower_Confidence_Interval','Upper_Confidence_Interval','Region','Whisker.high','Whisker.low','Standard_Error']
for year in years:
    for value in df[year].columns:
          if value in col:
            del df[year][value]

df[2019]

Unnamed: 0,Overall_rank,Country_or_region,Score,GDP_per_capita,Social_support,Healthy_life_expectancy,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298
5,6,Switzerland,7.48,1.452,1.526,1.052,0.572,0.263,0.343
6,7,Sweden,7.343,1.387,1.487,1.009,0.574,0.267,0.373
7,8,New Zealand,7.307,1.303,1.557,1.026,0.585,0.33,0.38
8,9,Canada,7.278,1.365,1.505,1.039,0.584,0.285,0.308
9,10,Austria,7.246,1.376,1.475,1.016,0.532,0.244,0.226


In [7]:
#Checking the missing columns of the df
for year in years:
    print(f"""{year} \n{df[year].shape}
----------""")

2015 
(158, 10)
----------
2016 
(157, 10)
----------
2017 
(155, 10)
----------
2018 
(156, 9)
----------
2019 
(156, 9)
----------


In [8]:
#Calculating the missing dystopian columnn values for 2018 and 2019 and Renaming the columns over that period
for year in years[-2:]:
    df[year]['Dystopia_Residual'] = df[year].loc[:,'Score']-df[year].loc[:,'GDP_per_capita':].sum(axis=1)
    df[year].rename(columns={
    'Score':'Happiness_Score',
    'GDP_per_capita':'Economy_(GDP_per_Capita)',
    'Social_support':'Family',
    'Healthy_life_expectancy':'Health_(Life_Expectancy)',
    'Freedom_to_make_life_choices':'Freedom',
    'Perceptions_of_corruption':'Trust_(Government_Corruption)',
    'Overall_rank':'Happiness_Rank',
    'Country_or_region':'Country'
 },inplace=True)
    
#Renaming 2017 df column names
df[2017].rename(columns={
    'Happiness.Score':'Happiness_Score',
    'Economy..GDP.per.Capita.':'Economy_(GDP_per_Capita)',
    'Health..Life.Expectancy.':'Health_(Life_Expectancy)',
    'Trust..Government.Corruption.':'Trust_(Government_Corruption)',
    'Happiness.Rank':'Happiness_Rank',
    'Dystopia.Residual':'Dystopia_Residual'
 },inplace=True)


In [9]:
#Showing all the columns of the df are equal size
for year in years:
    print(f"""{year} \n{df[year].shape}
----------""")

2015 
(158, 10)
----------
2016 
(157, 10)
----------
2017 
(155, 10)
----------
2018 
(156, 10)
----------
2019 
(156, 10)
----------


In [10]:
#Finding any duplicates in df
for year in years:
    print(f"""{year} \n{df[year]['Country'].nunique()}
----------""")

2015 
158
----------
2016 
157
----------
2017 
155
----------
2018 
156
----------
2019 
156
----------


In [11]:
#Checking if the right data type in data frames 
for year in years:
    print(f"""\n{year} \n{df[year].info()}
----------\n""")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        158 non-null    object 
 1   Happiness_Rank                 158 non-null    int64  
 2   Happiness_Score                158 non-null    float64
 3   Economy_(GDP_per_Capita)       158 non-null    float64
 4   Family                         158 non-null    float64
 5   Health_(Life_Expectancy)       158 non-null    float64
 6   Freedom                        158 non-null    float64
 7   Trust_(Government_Corruption)  158 non-null    float64
 8   Generosity                     158 non-null    float64
 9   Dystopia_Residual              158 non-null    float64
dtypes: float64(8), int64(1), object(1)
memory usage: 12.5+ KB

2015 
None
----------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (to

In [12]:
for year in years:
    print(f"""\n{year} \n{df[year].columns}
----------\n""")


2015 
Index(['Country', 'Happiness_Rank', 'Happiness_Score',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Trust_(Government_Corruption)', 'Generosity',
       'Dystopia_Residual'],
      dtype='object')
----------


2016 
Index(['Country', 'Happiness_Rank', 'Happiness_Score',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Trust_(Government_Corruption)', 'Generosity',
       'Dystopia_Residual'],
      dtype='object')
----------


2017 
Index(['Country', 'Happiness_Rank', 'Happiness_Score',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Generosity', 'Trust_(Government_Corruption)',
       'Dystopia_Residual'],
      dtype='object')
----------


2018 
Index(['Happiness_Rank', 'Country', 'Happiness_Score',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Generosity', 'Trust_(Government_Corruption)',
       'Dys

# 1.2 Exporting the Cleaned Data

In [13]:
#Restructurin the column names for uniformity in df
#Writing the cleaned df into csv
for year in years:
    df[year] = df[year][['Country', 'Happiness_Rank', 'Happiness_Score',
       'Economy_(GDP_per_Capita)', 'Family', 'Health_(Life_Expectancy)',
       'Freedom', 'Trust_(Government_Corruption)', 'Generosity',
       'Dystopia_Residual']]
    df[year].to_csv(f"../Output/{year}.csv",index=False)


In [14]:
for year in years:
    df[year]['year'] = [year for x in range(len(df[year].Country))]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[year]['year'] = [year for x in range(len(df[year].Country))]


# 1.3 Exporting Caoncated Data
### Cancated CSV is used for plotly graphs with  to diplay year-slider

In [15]:
df_list = [df[x] for x in range(2015,2020)]
concated_df =  pd.concat(df_list)


In [16]:
#

In [17]:
concated_df.sort_values('Happiness_Rank',inplace=True)
concated_df.reset_index(drop=True,inplace=True)
concated_df.to_csv(f"../Output/concated_df.csv",index=False)
concated_df.head(50)

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_(GDP_per_Capita),Family,Health_(Life_Expectancy),Freedom,Trust_(Government_Corruption),Generosity,Dystopia_Residual,year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Finland,1,7.769,1.34,1.587,0.986,0.596,0.393,0.153,2.714,2019
2,Finland,1,7.632,1.305,1.592,0.874,0.681,0.393,0.202,2.585,2018
3,Norway,1,7.537,1.616463,1.533524,0.796667,0.635423,0.315964,0.362012,2.277027,2017
4,Denmark,1,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939,2016
5,Norway,2,7.594,1.456,1.582,0.861,0.686,0.34,0.286,2.383,2018
6,Switzerland,2,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463,2016
7,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015
8,Denmark,2,7.6,1.383,1.573,0.996,0.592,0.41,0.252,2.394,2019
9,Denmark,2,7.522,1.482383,1.551122,0.792566,0.626007,0.40077,0.35528,2.313707,2017


# 1.4 Merging the Data

In [18]:
for year in years:
    print(f"{year} Total Countries: {df[year]['Country'].count()}   Last Rank is:  {df[year]['Happiness_Rank'].max()}")

2015 Total Countries: 158   Last Rank is:  158
2016 Total Countries: 157   Last Rank is:  157
2017 Total Countries: 155   Last Rank is:  155
2018 Total Countries: 156   Last Rank is:  156
2019 Total Countries: 156   Last Rank is:  156


In [19]:
# Merging all data frames into one data frame on Country column
merge_df = df_2015.merge(df_2016, on=['Country'], how='outer', suffixes=['_2015','_2016'])
merge_df = merge_df.merge(df_2017, on=['Country'], how='outer')
merge_df = merge_df.merge(df_2018, on=['Country'], how='outer', suffixes=['_2017','_2018'])
merge_df = merge_df.merge(df_2019, on=['Country'], how='outer')

# Adding suffix to 2019 columns
merge_df = merge_df.rename(columns={"Happiness_Rank": "Happiness_Rank_2019",
                                    "Happiness_Score": "Happiness_Score_2019",
                                    "Economy_(GDP_per_Capita)": "Economy_(GDP_per_Capita)_2019",
                                    "Family": "Family_2019",
                                    "Health_(Life_Expectancy)": "Health_(Life_Expectancy)_2019",
                                    "Trust_(Government_Corruption)": "Trust_(Government Corruption)_2019",
                                    "Freedom": "Freedom_2019",
                                    "Generosity": "Generosity_2019",
                                    "Dystopia_Residual": "Dystopia_Residual_2019"
                                   })

merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 0 to 169
Data columns (total 46 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Country                             170 non-null    object 
 1   Happiness_Rank_2015                 158 non-null    float64
 2   Happiness_Score_2015                158 non-null    float64
 3   Economy_(GDP_per_Capita)_2015       158 non-null    float64
 4   Family_2015                         158 non-null    float64
 5   Health_(Life_Expectancy)_2015       158 non-null    float64
 6   Freedom_2015                        158 non-null    float64
 7   Trust_(Government_Corruption)_2015  158 non-null    float64
 8   Generosity_2015                     158 non-null    float64
 9   Dystopia_Residual_2015              158 non-null    float64
 10  Happiness_Rank_2016                 157 non-null    float64
 11  Happiness_Score_2016                157 non-n

In [20]:
# Checking for countries that has records in 2015, but not in 2016
df_2015[~df_2015.Country.isin(df_2016.Country)]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_(GDP_per_Capita),Family,Health_(Life_Expectancy),Freedom,Trust_(Government_Corruption),Generosity,Dystopia_Residual
21,Oman,22,6.853,1.36011,1.08182,0.76276,0.63274,0.32524,0.21542,2.47489
90,Somaliland region,91,5.057,0.18847,0.95152,0.43873,0.46582,0.39928,0.50318,2.11032
93,Mozambique,94,4.971,0.08308,1.02626,0.09131,0.34037,0.15603,0.22269,3.05137
96,Lesotho,97,4.898,0.37545,1.04103,0.07612,0.31767,0.12504,0.16388,2.79832
100,Swaziland,101,4.867,0.71206,1.07284,0.07566,0.30658,0.0306,0.18259,2.48676
125,Djibouti,126,4.369,0.44025,0.59207,0.36291,0.46074,0.28105,0.18093,2.05125
147,Central African Republic,148,3.678,0.0785,0.0,0.06699,0.48879,0.08289,0.23835,2.7223


In [21]:
# Checking for countries that has records in 2016, but not in 2015
df_2016[~df_2016.Country.isin(df_2015.Country)]

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_(GDP_per_Capita),Family,Health_(Life_Expectancy),Freedom,Trust_(Government_Corruption),Generosity,Dystopia_Residual
14,Puerto Rico,15,7.039,1.35943,1.08113,0.77758,0.46823,0.12275,0.22202,3.0076
51,Belize,52,5.956,0.87616,0.68655,0.45569,0.51231,0.10771,0.23684,3.08039
75,Somalia,76,5.44,0.0,0.33613,0.11466,0.56778,0.3118,0.27225,3.83772
96,Somaliland Region,97,5.057,0.25558,0.75862,0.33108,0.3913,0.36794,0.51479,2.43801
112,Namibia,113,4.574,0.93287,0.70362,0.34745,0.48614,0.10398,0.07795,1.92198
142,South Sudan,143,3.832,0.39394,0.18519,0.15781,0.19662,0.13015,0.25899,2.50929


In [22]:
# Checking for countries that has records in later years but not in 2015
merge_df[~merge_df.Country.isin(df_2015.Country)]

Unnamed: 0,Country,Happiness_Rank_2015,Happiness_Score_2015,Economy_(GDP_per_Capita)_2015,Family_2015,Health_(Life_Expectancy)_2015,Freedom_2015,Trust_(Government_Corruption)_2015,Generosity_2015,Dystopia_Residual_2015,...,Dystopia_Residual_2018,Happiness_Rank_2019,Happiness_Score_2019,Economy_(GDP_per_Capita)_2019,Family_2019,Health_(Life_Expectancy)_2019,Freedom_2019,Generosity_2019,Trust_(Government Corruption)_2019,Dystopia_Residual_2019
158,Puerto Rico,,,,,,,,,,...,,,,,,,,,,
159,Belize,,,,,,,,,,...,2.709,,,,,,,,,
160,Somalia,,,,,,,,,,...,2.961,112.0,4.668,0.0,0.698,0.268,0.559,0.243,0.27,2.63
161,Somaliland Region,,,,,,,,,,...,,,,,,,,,,
162,Namibia,,,,,,,,,,...,1.287,113.0,4.639,0.879,1.313,0.477,0.401,0.07,0.056,1.443
163,South Sudan,,,,,,,,,,...,1.69,156.0,2.853,0.306,0.575,0.295,0.01,0.202,0.091,1.374
164,Taiwan Province of China,,,,,,,,,,...,,,,,,,,,,
165,"Hong Kong S.A.R., China",,,,,,,,,,...,,,,,,,,,,
166,Trinidad & Tobago,,,,,,,,,,...,2.148,39.0,6.192,1.231,1.477,0.713,0.489,0.185,0.016,2.081
167,Northern Cyprus,,,,,,,,,,...,1.658,64.0,5.718,1.263,1.252,1.042,0.417,0.191,0.162,1.391


In [23]:
# Replacing null value found in merged dataframe with 0
merge_df.fillna(0,inplace = True)
merge_df.isna().sum()

Country                               0
Happiness_Rank_2015                   0
Happiness_Score_2015                  0
Economy_(GDP_per_Capita)_2015         0
Family_2015                           0
Health_(Life_Expectancy)_2015         0
Freedom_2015                          0
Trust_(Government_Corruption)_2015    0
Generosity_2015                       0
Dystopia_Residual_2015                0
Happiness_Rank_2016                   0
Happiness_Score_2016                  0
Economy_(GDP_per_Capita)_2016         0
Family_2016                           0
Health_(Life_Expectancy)_2016         0
Freedom_2016                          0
Trust_(Government_Corruption)_2016    0
Generosity_2016                       0
Dystopia_Residual_2016                0
Happiness_Rank_2017                   0
Happiness_Score_2017                  0
Economy_(GDP_per_Capita)_2017         0
Family_2017                           0
Health_(Life_Expectancy)_2017         0
Freedom_2017                          0


In [24]:
# Checking the country names which are in Merged Data frame, but missing in any of the years data frames
merge_df["Country"].loc[(~merge_df["Country"].isin(df_2015["Country"])) | \
                        (~merge_df["Country"].isin(df_2016["Country"])) | \
                        (~merge_df["Country"].isin(df_2017["Country"])) | \
                        (~merge_df["Country"].isin(df_2018["Country"])) | \
                        (~merge_df["Country"].isin(df_2019["Country"]))
                       ].sort_values()

136                      Angola
159                      Belize
147    Central African Republic
139                     Comoros
125                    Djibouti
169                      Gambia
71                    Hong Kong
165     Hong Kong S.A.R., China
98                         Laos
96                      Lesotho
92                    Macedonia
93                   Mozambique
162                     Namibia
65                 North Cyprus
168             North Macedonia
167             Northern Cyprus
21                         Oman
158                 Puerto Rico
160                     Somalia
161           Somaliland Region
90            Somaliland region
163                 South Sudan
117                       Sudan
39                     Suriname
100                   Swaziland
37                       Taiwan
164    Taiwan Province of China
166           Trinidad & Tobago
40          Trinidad and Tobago
Name: Country, dtype: object

In [25]:
# Making the country names matching for those refering to the same country 
merge_df["Country"].loc[merge_df.Country == "North Cyprus"] = "Northern Cyprus"
merge_df["Country"].loc[merge_df.Country == "Hong Kong S.A.R., China"] = "Hong Kong"
merge_df["Country"].loc[merge_df.Country == "Taiwan Province of China"] = "Taiwan"
merge_df["Country"].loc[merge_df.Country == "Macedonia"] = "North Macedonia"
merge_df["Country"].loc[merge_df.Country == "Trinidad & Tobago"] = "Trinidad and Tobago"
merge_df["Country"].loc[merge_df.Country == "Somaliland region"] = "Somaliland Region"

# Checking the country names which were not present in any of the years data frames
country_check_list = merge_df["Country"].loc[(~merge_df["Country"].isin(df_2015["Country"])) | \
                                             (~merge_df["Country"].isin(df_2016["Country"])) | \
                                             (~merge_df["Country"].isin(df_2017["Country"])) | \
                                             (~merge_df["Country"].isin(df_2018["Country"])) | \
                                             (~merge_df["Country"].isin(df_2019["Country"]))
                                            ].sort_values().tolist()
country_check_list = set(country_check_list)
country_check_list

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


{'Angola',
 'Belize',
 'Central African Republic',
 'Comoros',
 'Djibouti',
 'Gambia',
 'Hong Kong',
 'Laos',
 'Lesotho',
 'Mozambique',
 'Namibia',
 'North Macedonia',
 'Northern Cyprus',
 'Oman',
 'Puerto Rico',
 'Somalia',
 'Somaliland Region',
 'South Sudan',
 'Sudan',
 'Suriname',
 'Swaziland',
 'Taiwan',
 'Trinidad and Tobago'}

In [26]:
# Displaying the list of Countries missing data in some year
merge_df.loc[merge_df["Country"].isin(country_check_list)]

Unnamed: 0,Country,Happiness_Rank_2015,Happiness_Score_2015,Economy_(GDP_per_Capita)_2015,Family_2015,Health_(Life_Expectancy)_2015,Freedom_2015,Trust_(Government_Corruption)_2015,Generosity_2015,Dystopia_Residual_2015,...,Dystopia_Residual_2018,Happiness_Rank_2019,Happiness_Score_2019,Economy_(GDP_per_Capita)_2019,Family_2019,Health_(Life_Expectancy)_2019,Freedom_2019,Generosity_2019,Trust_(Government Corruption)_2019,Dystopia_Residual_2019
21,Oman,22.0,6.853,1.36011,1.08182,0.76276,0.63274,0.32524,0.21542,2.47489,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,Taiwan,38.0,6.298,1.29098,1.07617,0.8753,0.3974,0.08129,0.25376,2.32323,...,2.136,25.0,6.446,1.368,1.43,0.914,0.351,0.242,0.097,2.044
39,Suriname,40.0,6.269,0.99534,0.972,0.6082,0.59657,0.13633,0.16991,2.79094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,Trinidad and Tobago,41.0,6.168,1.21183,1.18354,0.61483,0.55884,0.0114,0.31844,2.26882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65,Northern Cyprus,66.0,5.695,1.20806,1.07008,0.92356,0.49027,0.1428,0.26169,1.59888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,Hong Kong,72.0,5.474,1.38604,1.05818,1.01328,0.59608,0.37124,0.39478,0.65429,...,0.644,76.0,5.43,1.438,1.277,1.122,0.44,0.258,0.287,0.608
90,Somaliland Region,91.0,5.057,0.18847,0.95152,0.43873,0.46582,0.39928,0.50318,2.11032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92,North Macedonia,93.0,5.007,0.91851,1.00232,0.73545,0.33457,0.05327,0.22359,1.73933,...,1.677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,Mozambique,94.0,4.971,0.08308,1.02626,0.09131,0.34037,0.15603,0.22269,3.05137,...,2.249,123.0,4.466,0.204,0.986,0.39,0.494,0.197,0.138,2.057
96,Lesotho,97.0,4.898,0.37545,1.04103,0.07612,0.31767,0.12504,0.16388,2.79832,...,1.391,144.0,3.802,0.489,1.169,0.168,0.359,0.107,0.093,1.417


In [27]:
# Creating a list of country names duplicated
dup_countries = merge_df['Country'].loc[merge_df['Country'].duplicated()].tolist() 
dup_countries

['Somaliland Region',
 'Taiwan',
 'Hong Kong',
 'Trinidad and Tobago',
 'Northern Cyprus',
 'North Macedonia']

In [28]:
# Merging duplicate country names row wise by getting sum of each column values

# Loop through each duplicated country names in the list
for country in dup_countries:
    
    # Making a new data frame having only the duplicated country names
    joined_rows = merge_df.loc[merge_df.Country == country]
    # Adding a row to the new data frame with the sum of each columns
    joined_rows.loc[country,:] = joined_rows.sum(axis=0)
    # Correcting the Country column value
    joined_rows['Country'] = country
    
    # Removing those rows from the merged data frame
    merge_df.drop(merge_df[merge_df.Country == country].index, inplace=True)
    # Concatenating the last row added(sum) to the original merged data frame
    merge_df = pd.concat([merge_df, joined_rows.tail(1)])

# Reseting the merged data frame's index
merge_df.reset_index(drop=True, inplace=True)

# Displaying the countries data which were missing in any of the year's data frame - after cleaning
merge_df.loc[merge_df["Country"].isin(country_check_list)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joined_rows.loc[country,:] = joined_rows.sum(axis=0)


Unnamed: 0,Country,Happiness_Rank_2015,Happiness_Score_2015,Economy_(GDP_per_Capita)_2015,Family_2015,Health_(Life_Expectancy)_2015,Freedom_2015,Trust_(Government_Corruption)_2015,Generosity_2015,Dystopia_Residual_2015,...,Dystopia_Residual_2018,Happiness_Rank_2019,Happiness_Score_2019,Economy_(GDP_per_Capita)_2019,Family_2019,Health_(Life_Expectancy)_2019,Freedom_2019,Generosity_2019,Trust_(Government Corruption)_2019,Dystopia_Residual_2019
21,Oman,22.0,6.853,1.36011,1.08182,0.76276,0.63274,0.32524,0.21542,2.47489,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,Suriname,40.0,6.269,0.99534,0.972,0.6082,0.59657,0.13633,0.16991,2.79094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,Mozambique,94.0,4.971,0.08308,1.02626,0.09131,0.34037,0.15603,0.22269,3.05137,...,2.249,123.0,4.466,0.204,0.986,0.39,0.494,0.197,0.138,2.057
90,Lesotho,97.0,4.898,0.37545,1.04103,0.07612,0.31767,0.12504,0.16388,2.79832,...,1.391,144.0,3.802,0.489,1.169,0.168,0.359,0.107,0.093,1.417
92,Laos,99.0,4.876,0.59066,0.73803,0.54909,0.59591,0.24249,0.42192,1.73799,...,1.398,105.0,4.796,0.764,1.03,0.551,0.547,0.266,0.164,1.474
94,Swaziland,101.0,4.867,0.71206,1.07284,0.07566,0.30658,0.0306,0.18259,2.48676,...,0.0,135.0,4.212,0.811,1.149,0.0,0.313,0.074,0.135,1.73
111,Sudan,118.0,4.55,0.52107,1.01404,0.36878,0.10081,0.1466,0.19062,2.20857,...,1.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,Djibouti,126.0,4.369,0.44025,0.59207,0.36291,0.46074,0.28105,0.18093,2.05125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130,Angola,137.0,4.033,0.75778,0.8604,0.16683,0.10384,0.07122,0.12344,1.94939,...,1.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,Comoros,140.0,3.956,0.23906,0.79273,0.36315,0.22917,0.199,0.17441,1.95812,...,0.0,142.0,3.973,0.274,0.757,0.505,0.142,0.275,0.078,1.942


In [29]:
# Replacing 0 values with NAN
cols = merge_df.columns
merge_df[cols] = merge_df[cols].replace({'0':np.nan, 0:np.nan})

merge_df.sort_values

# Writing the cleaned merged data frame to a csv file
merge_df.to_csv("../Output/happiness_merged.csv", index=False)

### Getting Region data using API

In [30]:
# Loading data from merged csv
df = pd.read_csv("../Output/happiness_merged.csv")

# URL for GET requests to retrieve Region data of Countries from Rest Countries API
base_url = "https://restcountries.eu/rest/v2/name/"

In [31]:
# Defining a function to fetch each Contries Region with API
def fetchRegion(countries):

    country_list = []
    region_list = []
    not_found = []

    for country in countries:
        try:   
            # Appending country name to the base url
            url = f"{base_url}{country}"

            # Perform a get request for each country
            response = requests.get(url)

            # Storing the JSON response within a variable
            data = response.json()
            
            # Adding the region and country name to lists
            region_list.append(data[0]['region'])
            country_list.append(country)

        except:
            not_found.append(country)
    # function return all the list of values
    return country_list, region_list, not_found

In [32]:
# Creating a list of all countries from each years data
countries = df['Country']

# Calling out the function to fetch regions of countries in countries list
country_list, region_list, not_found = fetchRegion(countries)

# Creating a data frame with the list values returned from the function
df_region = pd.DataFrame({"Country": country_list, "Region": region_list})

print(f"Countries Not Found: {not_found}")
df_region

Countries Not Found: ['South Korea', 'Palestinian Territories', 'Congo (Kinshasa)', 'Congo (Brazzaville)', 'Somaliland Region', 'Northern Cyprus', 'North Macedonia']


Unnamed: 0,Country,Region
0,Switzerland,Europe
1,Iceland,Europe
2,Denmark,Europe
3,Norway,Europe
4,Canada,Americas
5,Finland,Europe
6,Netherlands,Europe
7,Sweden,Europe
8,New Zealand,Oceania
9,Australia,Oceania


In [33]:
# Getting region from nearby location or other name
df_not_found = pd.DataFrame({'Country': not_found,
                             'Try_Name':['Korea', 'Palestine',
                                         'Congo', 'Congo',
                                         'Somalia', 'Cyprus',
                                         'Macedonia']                              
                           })
df_not_found

Unnamed: 0,Country,Try_Name
0,South Korea,Korea
1,Palestinian Territories,Palestine
2,Congo (Kinshasa),Congo
3,Congo (Brazzaville),Congo
4,Somaliland Region,Somalia
5,Northern Cyprus,Cyprus
6,North Macedonia,Macedonia


In [34]:
# Creating a new list of countries to fecth region which didn't get results yet
countries = df_not_found["Try_Name"].tolist()

# Calling out the function to fetch regions of countries in the new list
country_list, region_list, not_found = fetchRegion(countries)

# Creating a data frame with the list values returned from the function
df_found = pd.DataFrame({"Try_Name": country_list, "Region": region_list})

# Checking if any Country's Region is still missing
print(f"Number of Countries Not Found: {len(not_found)}")

df_found

Number of Countries Not Found: 0


Unnamed: 0,Try_Name,Region
0,Korea,Asia
1,Palestine,Asia
2,Congo,Africa
3,Congo,Africa
4,Somalia,Africa
5,Cyprus,Europe
6,Macedonia,Europe


In [35]:
# Merging The 2 data frames to eliminate Try_Name in between
merge_df = df_not_found.merge(df_found, on='Try_Name', how= 'left')
merge_df.drop(['Try_Name'], axis=1, inplace= True)
merge_df

Unnamed: 0,Country,Region
0,South Korea,Asia
1,Palestinian Territories,Asia
2,Congo (Kinshasa),Africa
3,Congo (Kinshasa),Africa
4,Congo (Brazzaville),Africa
5,Congo (Brazzaville),Africa
6,Somaliland Region,Africa
7,Northern Cyprus,Europe
8,North Macedonia,Europe


In [36]:
# Concatenating the data frames with Country and Region found in 2 function calls
df_region = pd.concat([df_region, merge_df]).reset_index(drop=True)
df_region

Unnamed: 0,Country,Region
0,Switzerland,Europe
1,Iceland,Europe
2,Denmark,Europe
3,Norway,Europe
4,Canada,Americas
5,Finland,Europe
6,Netherlands,Europe
7,Sweden,Europe
8,New Zealand,Oceania
9,Australia,Oceania


In [37]:
# Merging region with the happiness data frame
df = df_region.merge(df, on="Country", how="left")

# Writing the data back to the merge csv file
df.to_csv('../Output/happiness_merged.csv', index=False)

# 1.5 Suicide csv data cleaning

In [38]:
# Reading all the needed csv data into dataframes 
df = pd.read_csv("../Output/happiness_merged.csv")
df_suicide = pd.read_csv("../Resources/suicide_death_rate.csv", encoding='utf-8')

df.head()

Unnamed: 0,Country,Region,Happiness_Rank_2015,Happiness_Score_2015,Economy_(GDP_per_Capita)_2015,Family_2015,Health_(Life_Expectancy)_2015,Freedom_2015,Trust_(Government_Corruption)_2015,Generosity_2015,...,Dystopia_Residual_2018,Happiness_Rank_2019,Happiness_Score_2019,Economy_(GDP_per_Capita)_2019,Family_2019,Health_(Life_Expectancy)_2019,Freedom_2019,Generosity_2019,Trust_(Government Corruption)_2019,Dystopia_Residual_2019
0,Switzerland,Europe,1.0,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,...,2.318,6.0,7.48,1.452,1.526,1.052,0.572,0.263,0.343,2.272
1,Iceland,Europe,2.0,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,...,2.426,4.0,7.494,1.38,1.624,1.026,0.591,0.354,0.118,2.401
2,Denmark,Europe,3.0,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,...,2.371,2.0,7.6,1.383,1.573,0.996,0.592,0.252,0.41,2.394
3,Norway,Europe,4.0,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,...,2.383,3.0,7.554,1.488,1.582,1.028,0.603,0.271,0.341,2.241
4,Canada,Americas,5.0,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,...,2.305,9.0,7.278,1.365,1.505,1.039,0.584,0.285,0.308,2.192


In [39]:
# Selecting the needed Columns and dropping the unnecessary from the happiness data frame
df = df.loc[:,['Country',
               'Region',
               'Happiness_Score_2015',
               'Happiness_Score_2016',
               'Happiness_Score_2017',
               'Happiness_Score_2018',
               'Happiness_Score_2019']]

# Renaming the Yearwise Column names
df = df.rename(columns={'Happiness_Score_2015': '2015',
               'Happiness_Score_2016': '2016',
               'Happiness_Score_2017': '2017',
               'Happiness_Score_2018': '2018',
               'Happiness_Score_2019': '2019'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  166 non-null    object 
 1   Region   166 non-null    object 
 2   2015     160 non-null    float64
 3   2016     159 non-null    float64
 4   2017     157 non-null    float64
 5   2018     158 non-null    float64
 6   2019     158 non-null    float64
dtypes: float64(5), object(2)
memory usage: 9.2+ KB


In [40]:
# Selecting the needed Columns and dropping the unnecessary from the suicide data frame
df_suicide = df_suicide.loc[:,['location','year','val']]

# Renaming the coulumns to match with Happiness data frame
df_suicide = df_suicide.rename(columns={"location": "Country", "year": "Year", "val": "Suicide_Rate"})

# Changing the type of values in Year column to string to match with column names in happiness data frame
df_suicide.Year = df_suicide.Year.astype('str')
df_suicide.head()

Unnamed: 0,Country,Year,Suicide_Rate
0,Malawi,2015,6.675765
1,Sao Tome and Principe,2015,1.947396
2,Western Sub-Saharan Africa,2015,5.255762
3,Latin America and Caribbean,2015,6.433427
4,Uzbekistan,2015,11.81007


In [41]:
# Checking for Country name mismatches in both data frame
# Listing Country nemes in Happiness but not found in Suicide data frame
mismatch = df[~df.Country.isin(df_suicide.Country)].Country
mismatch

14               United States
22                   Venezuela
30              Czech Republic
47                     Bolivia
48                     Moldova
60                      Russia
64                      Kosovo
69                     Vietnam
91                        Laos
93                   Swaziland
101                       Iran
135                   Tanzania
140                Ivory Coast
145                      Syria
154                     Taiwan
155                  Hong Kong
157                South Korea
158    Palestinian Territories
159           Congo (Kinshasa)
160           Congo (Kinshasa)
161        Congo (Brazzaville)
162        Congo (Brazzaville)
163          Somaliland Region
164            Northern Cyprus
Name: Country, dtype: object

In [42]:
# Checking for Country names in Suicide dataframe which contains, Country names in Happiness dataframe as part of it

temp_df = pd.DataFrame()
for country in mismatch: 
    temp_df2 = df_suicide.loc[df_suicide['Country'].str.contains(country)]
    temp_df = pd.concat([temp_df, temp_df2])
    
temp_df

  return func(self, *args, **kwargs)


Unnamed: 0,Country,Year,Suicide_Rate
111,United States Virgin Islands,2015,8.323183
564,United States of America,2015,14.411347
656,United States of America,2017,14.401417
872,United States Virgin Islands,2017,8.367747
1565,United States of America,2016,14.736476
1641,United States Virgin Islands,2016,8.291533
1979,United States of America,2019,13.826758
2076,United States Virgin Islands,2019,8.452237
2089,United States of America,2018,14.100089
2270,United States Virgin Islands,2018,8.418641


In [43]:
# Changing the Country names in Suicide data frame to match with Happiness dataframe
df_suicide["Country"].loc[df_suicide.Country == "United States of America"] = "United States"
df_suicide["Country"].loc[df_suicide.Country == "Venezuela (Bolivarian Republic of)"] = "Venezuela"
df_suicide["Country"].loc[df_suicide.Country == "Bolivia (Plurinational State of)"] = "Bolivia"
df_suicide["Country"].loc[df_suicide.Country == "Republic of Moldova"] = "Moldova"
df_suicide["Country"].loc[df_suicide.Country == "Russian Federation"] = "Russia"
df_suicide["Country"].loc[df_suicide.Country == "Iran (Islamic Republic of)"] = "Iran"
df_suicide["Country"].loc[df_suicide.Country == "United Republic of Tanzania"] = "Tanzania"
df_suicide["Country"].loc[df_suicide.Country == "Syrian Arab Republic"] = "Syria"
df_suicide["Country"].loc[df_suicide.Country == "Taiwan (Province of China)"] = "Taiwan"

# Again checking for mismatch, Listing Country nemes in Happiness but not found in Suicide data frame
mismatch = df[~df.Country.isin(df_suicide.Country)].Country
mismatch

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


30              Czech Republic
64                      Kosovo
69                     Vietnam
91                        Laos
93                   Swaziland
140                Ivory Coast
155                  Hong Kong
157                South Korea
158    Palestinian Territories
159           Congo (Kinshasa)
160           Congo (Kinshasa)
161        Congo (Brazzaville)
162        Congo (Brazzaville)
163          Somaliland Region
164            Northern Cyprus
Name: Country, dtype: object

In [44]:
# Checking for Country names in Suicide dataframe which starts with first 4 characters of, 
# Country names in Happiness dataframe as part of it

temp_df = pd.DataFrame()
for country in mismatch: 
    temp_df2 = df_suicide.loc[df_suicide['Country'].str.startswith(country[:4])]
    temp_df = pd.concat([temp_df, temp_df2])
    
temp_df

Unnamed: 0,Country,Year,Suicide_Rate
323,Czechia,2015,14.840185
802,Czechia,2017,14.358788
1547,Czechia,2016,14.476422
1924,Czechia,2019,13.900502
3033,Czechia,2018,14.220664
79,Viet Nam,2015,8.270146
577,Viet Nam,2016,8.346326
1353,Viet Nam,2018,8.42965
2338,Viet Nam,2017,8.409816
2556,Viet Nam,2019,8.419432


In [45]:
# Changing the Country names in Suicide data frame to match with Happiness dataframe
df_suicide["Country"].loc[df_suicide.Country == "Czechia"] = "Czech Republic"
df_suicide["Country"].loc[df_suicide.Country == "Viet Nam"] = "Vietnam"
df_suicide["Country"].loc[df_suicide.Country == "Republic of Korea"] = "South Korea"
df_suicide["Country"].loc[df_suicide.Country == "Palestine"] = "Palestinian Territories"
df_suicide["Country"].loc[df_suicide.Country == "Democratic Republic of the Congo"] = "Congo (Kinshasa)"
df_suicide["Country"].loc[df_suicide.Country == "Congo"] = "Congo (Brazzaville)"
df_suicide["Country"].loc[df_suicide.Country == "Lao People's Democratic Republic"] = "Laos"
df_suicide["Country"].loc[df_suicide.Country == "Eswatini"] = "Swaziland"
df_suicide["Country"].loc[df_suicide.Country == "CÃ´te d'Ivoire"] = "Ivory Coast"

# Again checking for mismatch, Listing Country nemes in Happiness but not found in Suicide data frame
mismatch = df[~df.Country.isin(df_suicide.Country)].Country
# The mismatch is due to missing data.
mismatch

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


64                Kosovo
155            Hong Kong
163    Somaliland Region
164      Northern Cyprus
Name: Country, dtype: object

In [46]:
# Melting the happiness dataframe to make a new column 'Year' 
# Listing the values of Year column
value_vars = ['2015', '2016', '2017', '2018', '2019']

# Melting keeping Country and Region same and Creating Year column, assigning year values and corresponding happiness score
df = pd.melt(df, id_vars=['Country','Region'], value_vars=value_vars,
             var_name='Year', value_name='Happiness_Score', col_level=None)

df.head()

Unnamed: 0,Country,Region,Year,Happiness_Score
0,Switzerland,Europe,2015,7.587
1,Iceland,Europe,2015,7.561
2,Denmark,Europe,2015,7.527
3,Norway,Europe,2015,7.522
4,Canada,Americas,2015,7.427


In [47]:
# Merging Suicide rates to the happiness dataframe
df = df.merge(df_suicide, left_on=["Country","Year"], right_on=["Country","Year"], how="left")

# Dropping rows with happiness score as NAN
df = df[df['Happiness_Score'].notna()]

# Writing the data to a file to easy access in analysis
df.to_csv("../Output/suicide_concat.csv", index = False)

df.head()

Unnamed: 0,Country,Region,Year,Happiness_Score,Suicide_Rate
0,Switzerland,Europe,2015,7.587,13.844501
1,Iceland,Europe,2015,7.561,11.10399
2,Denmark,Europe,2015,7.527,11.467831
3,Norway,Europe,2015,7.522,10.965794
4,Canada,Americas,2015,7.427,13.671676
