In [1]:
import os
from sqlalchemy import create_engine
import pymysql
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
from sklearn.linear_model import LinearRegression
import math
from statsmodels.tsa.ar_model import AR

In [2]:
pd.set_option('display.max_columns', 100)
sqlpassword = os.environ['SQLpassword']
engine = create_engine(f"mysql+pymysql://root:{sqlpassword}@127.0.0.1/world_happiness")

## **Step 1.** Reading and cleaning data.

### a) Extraction of standartised country codes and names for further unification of data sets.

In [3]:
url = 'https://www.iban.com/country-codes'
html = requests.get(url).content
soup = BeautifulSoup(html, "lxml")
cdict = {'class':'table'}
table = soup.find_all('table', cdict)[0]
rows = table.find_all('td')
rows = [x.contents[0] for x in rows]
rows[:21]

['Afghanistan',
 'AF',
 'AFG',
 '004',
 'Åland Islands',
 'AX',
 'ALA',
 '248',
 'Albania',
 'AL',
 'ALB',
 '008',
 'Algeria',
 'DZ',
 'DZA',
 '012',
 'American Samoa',
 'AS',
 'ASM',
 '016',
 'Andorra']

In [4]:
# There are 4 columns in a table so we split all content of a table in lists of 4 elements each.
n_countries = int(len(rows)/4)
countries = np.array_split(rows, n_countries)

# Convert list of rows values into dataframe.
df_countries = pd.DataFrame(countries,columns=['Country', 'Alpha-2 code', 'Alpha-3 code', 'Numeric'])

# Rename some countries to make them more readable on future graphs. Code below came up as a result of review each data set
# below.

pat1 = r" \(the\)"
pat2 = r" \(the Republic of\)"
combined_pat = r'|'.join((pat1, pat2))
df_countries['Country'] = df_countries['Country'].map(lambda x: re.sub(combined_pat, '', x))
old_names = ['United Kingdom of Great Britain and Northern Ireland', 'United States of America', 'Korea',
             'Bolivia (Plurinational State of)', 'Iran (Islamic Republic of)', "Lao People's Democratic Republic",
             'Republic of North Macedonia', 'Palestine, State of', 'Eswatini', 'Syrian Arab Republic',
             'Tanzania, United Republic of', 'Venezuela (Bolivarian Republic of)', 'Viet Nam', 'Åland Islands']

new_names = ['United Kingdom', 'United States', 'South Korea', 'Bolivia', 'Iran', 'Laos', 'Macedonia', 'Palestine', 
            'Swaziland (Eswatini)', 'Syria', 'Tanzania', 'Venezuela', 'Vietnam', 'Aland Islands']

df_countries['Country'].replace(old_names, new_names, inplace=True)
czech = {'Country':'Czech Republic', 'Alpha-2 code':'CZ', 'Alpha-3 code':'CZE', 'Numeric':'420'}
df_countries = df_countries.append(czech, ignore_index=True)

# Form dictionary out of 2 suitable columns.
countries_dict = dict(zip(df_countries['Alpha-3 code'], df_countries['Country']))
countries_dict

{'AFG': 'Afghanistan',
 'ALA': 'Aland Islands',
 'ALB': 'Albania',
 'DZA': 'Algeria',
 'ASM': 'American Samoa',
 'AND': 'Andorra',
 'AGO': 'Angola',
 'AIA': 'Anguilla',
 'ATA': 'Antarctica',
 'ATG': 'Antigua and Barbuda',
 'ARG': 'Argentina',
 'ARM': 'Armenia',
 'ABW': 'Aruba',
 'AUS': 'Australia',
 'AUT': 'Austria',
 'AZE': 'Azerbaijan',
 'BHS': 'Bahamas',
 'BHR': 'Bahrain',
 'BGD': 'Bangladesh',
 'BRB': 'Barbados',
 'BLR': 'Belarus',
 'BEL': 'Belgium',
 'BLZ': 'Belize',
 'BEN': 'Benin',
 'BMU': 'Bermuda',
 'BTN': 'Bhutan',
 'BOL': 'Bolivia',
 'BES': 'Bonaire, Sint Eustatius and Saba',
 'BIH': 'Bosnia and Herzegovina',
 'BWA': 'Botswana',
 'BVT': 'Bouvet Island',
 'BRA': 'Brazil',
 'IOT': 'British Indian Ocean Territory',
 'BRN': 'Brunei Darussalam',
 'BGR': 'Bulgaria',
 'BFA': 'Burkina Faso',
 'BDI': 'Burundi',
 'CPV': 'Cabo Verde',
 'KHM': 'Cambodia',
 'CMR': 'Cameroon',
 'CAN': 'Canada',
 'CYM': 'Cayman Islands',
 'CAF': 'Central African Republic',
 'TCD': 'Chad',
 'CHL': 'Chile',


***Analysis of each unmached value from data set 'happiness rating':***

'Congo (Brazzaville)' - rename in *happiness* \
'Congo (Kinshasa)' - rename in *happiness* \
'Hong Kong S.A.R. China' - rename in *happiness* \
'Iran' - rename in *countries* \
'Ivory Coast' - keep\
'Kosovo' - keep\
'Laos' - rename in *countries* \
'Macedonia' - rename in *countries* \
'North Cyprus'  - rename in *happiness*\
'North Macedonia' - rename in *happiness*\
'Northern Cyprus' - keep \
'Palestinian Territories' - rename in *happiness*\
'Russia' - rename in *happiness* \
'Somaliland Region' - keep\
'Somaliland region' - rename in *happiness*\
'South Korea' - rename in *countries* \
'Swaziland' - rename in *countries* & *happiness*\
'Syria' - rename in *countries*\
'Taiwan' - rename in *happiness*\
'Taiwan Province of China' - rename in *happiness*\
'Tanzania' - rename in *countries*\
'Trinidad & Tobago' - rename in *happiness*\
'Venezuela' - rename in *countries*\
'Vietnam' - rename in *countries*

In [5]:
# Defining two functions which will be used to check and amend country names spelling.

def country_name(row):
    for key, value in countries_dict.items():
        if row.Country == key:
            return value

def country_code(row):
    for key, value in countries_dict.items():
        if row.Country == value:
            return key

### b) Combining data sets for various periods in one table based on data topic.

### *Suicide data.*

In [57]:
df_suicide_new = pd.read_sql_query('''SELECT * FROM world_happiness.suicide_2015_2018''', engine)
df_suicide_new = df_suicide_new.rename(columns={'LOCATION': 'Country'})
df_suicide_new.head()

Unnamed: 0,Country,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value
0,AUS,SUICIDE,TOT,100000PER,A,2015,13.1
1,AUS,SUICIDE,TOT,100000PER,A,2016,12.1
2,AUS,SUICIDE,TOT,100000PER,A,2017,12.8
3,AUT,SUICIDE,TOT,100000PER,A,2015,12.9
4,AUT,SUICIDE,TOT,100000PER,A,2016,12.2


In [58]:
df_suicide_new['Country'] = df_suicide_new.apply(lambda row: country_name(row), axis=1)
df_suicide_new.head()

Unnamed: 0,Country,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value
0,Australia,SUICIDE,TOT,100000PER,A,2015,13.1
1,Australia,SUICIDE,TOT,100000PER,A,2016,12.1
2,Australia,SUICIDE,TOT,100000PER,A,2017,12.8
3,Austria,SUICIDE,TOT,100000PER,A,2015,12.9
4,Austria,SUICIDE,TOT,100000PER,A,2016,12.2


In [59]:
df_suicide_who = pd.read_sql_query('''SELECT * FROM world_happiness.suicide_1987_2014''', engine)
df_suicide_who.head()

Unnamed: 0,country,Year,sex,age,suicides_no,population,suicides_100k_pop,country_year,HDI_for_year,gdp_for_year_$,gdp_per_capita_$,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,0.0,2,796,Generation X\r
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,0.0,2,796,Silent\r
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,0.0,2,796,Generation X\r
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,0.0,2,796,G.I. Generation\r
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,0.0,2,796,Boomers\r


In [111]:
# There is split by different gender and age groups so we will take an average per year.
df_suicide_who_cut = df_suicide_who[['country', 'Year', 'suicides_100k_pop']].groupby(['country', 'Year']).mean().reset_index()
df_suicide_who_cut

Unnamed: 0,country,Year,suicides_100k_pop
0,Albania,1987,2.654167
1,Albania,1988,2.705000
2,Albania,1989,2.783333
3,Albania,1992,1.500000
4,Albania,1993,2.713333
...,...,...,...
2316,Uzbekistan,2010,5.269167
2317,Uzbekistan,2011,5.595000
2318,Uzbekistan,2012,6.460833
2319,Uzbekistan,2013,6.571667


In [113]:
df1 = df_suicide_new[['Country', 'TIME', 'Value']]
df1.columns = ['Country', 'Year', 'Suicide_rate_per_100K']
df2 = df_suicide_who_cut[['country', 'Year', 'suicides_100k_pop']]
df2.columns=['Country', 'Year', 'Suicide_rate_per_100K']
frames = [df1, df2]
df_suicide = pd.concat(frames)

# There might be an overlap in 2015 between two data sets so let's double check these as well.
df_suicide = df_suicide[['Country', 'Year', 'Suicide_rate_per_100K']].groupby(['Country', 'Year']).mean().reset_index()

df_suicide.head()

Unnamed: 0,Country,Year,Suicide_rate_per_100K
0,Albania,1987,2.654167
1,Albania,1988,2.705
2,Albania,1989,2.783333
3,Albania,1992,1.5
4,Albania,1993,2.713333


In [114]:
# Check spelling of countries. It should be unified accross all data sets as this field will be used for merging eventually.

df_suicide['Country_Code'] = df_suicide.apply(lambda row: country_code(row), axis=1)
df_suicide.head()

Unnamed: 0,Country,Year,Suicide_rate_per_100K,Country_Code
0,Albania,1987,2.654167,ALB
1,Albania,1988,2.705,ALB
2,Albania,1989,2.783333,ALB
3,Albania,1992,1.5,ALB
4,Albania,1993,2.713333,ALB


In [115]:
# List of countries which spelling is different from international standards.

df_suicide.loc[df_suicide['Country_Code'].isnull()]['Country'].unique()

array(['Macau', 'Republic of Korea', 'Saint Vincent and Grenadines'],
      dtype=object)

In [116]:
# Manually prepared mapping for renaming several countries in order to unify them and make them easiry readable.

df_suicide['Country'].replace(['Macau', 'Saint Vincent and Grenadines'], 
                                ['Macao', 'Saint Vincent and the Grenadines'], 
                                inplace=True)

In [117]:
# Second check to see that everything is unified now.

df_suicide['Country_Code'] = df_suicide.apply(lambda row: country_code(row), axis=1)
df_suicide.loc[df_suicide['Country_Code'].isnull()]['Country'].unique()

array(['Republic of Korea'], dtype=object)

In [118]:
df_suicide.to_sql('suicide_consol', engine, 'world_happiness', if_exists='replace')

### *Global happiness data.*

In [15]:
df_happiness_2015 = pd.read_sql_query('''SELECT * FROM world_happiness.world_happiness_2015''', engine)
df_happiness_2015['Year'] = '2015'
df_happiness_2015.head()

Unnamed: 0,Country,Region,Happiness_Rank,Happiness_Score,Standard_Error,Economy_GDP_per_Capita,Family,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Dystopia_Residual,Year
0,Switzerland,Western Europe,1,7.587,0.03411,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Iceland,Western Europe,2,7.561,0.04884,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015
2,Denmark,Western Europe,3,7.527,0.03328,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015
3,Norway,Western Europe,4,7.522,0.0388,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015
4,Canada,North America,5,7.427,0.03553,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015


In [16]:
df_happiness_2016 = pd.read_sql_query('''SELECT * FROM world_happiness.world_happiness_2016''', engine)
df_happiness_2016['Year'] = '2016'
df_happiness_2016.head()

Unnamed: 0,Country,Region,Happiness_Rank,Happiness_Score,Lower_Confidence_Interval,Upper_Confidence_Interval,Economy_GDP_per_Capita,Family,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Dystopia_Residual,Year
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939,2016
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463,2016
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137,2016
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465,2016
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596,2016


In [17]:
df_happiness_2017 = pd.read_sql_query('''SELECT * FROM world_happiness.world_happiness_2017''', engine)
df_happiness_2017['Year'] = '2017'
df_happiness_2017.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Whisker_high,Whisker_low,Economy_GDP_per_Capita,Family,Health_Life_Expectancy,Freedom,Generosity,Trust_Government_Corruption,Dystopia_Residual,Year
0,Norway,1,7.537,7.59444,7.47956,1.61646,1.53352,0.796667,0.635423,0.362012,0.315964,2.27703,2017
1,Denmark,2,7.522,7.58173,7.46227,1.48238,1.55112,0.792566,0.626007,0.35528,0.40077,2.31371,2017
2,Iceland,3,7.504,7.62203,7.38597,1.48063,1.61057,0.833552,0.627163,0.47554,0.153527,2.32272,2017
3,Switzerland,4,7.494,7.56177,7.42623,1.56498,1.51691,0.858131,0.620071,0.290549,0.367007,2.27672,2017
4,Finland,5,7.469,7.52754,7.41046,1.44357,1.54025,0.809158,0.617951,0.245483,0.382612,2.43018,2017


In [18]:
df_happiness_2018 = pd.read_sql_query('''SELECT * FROM world_happiness.world_happiness_2018''', engine)
df_happiness_2018['Year'] = '2018'
df_happiness_2018.head()

Unnamed: 0,Happiness_Rank,Country_or_region,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Generosity,Trust_Government_Corruption,Year
0,1,Finland,7.632,1.305,1.592,0.874,0.681,0.202,0.393,2018
1,2,Norway,7.594,1.456,1.582,0.861,0.686,0.286,0.34,2018
2,3,Denmark,7.555,1.351,1.59,0.868,0.683,0.284,0.408,2018
3,4,Iceland,7.495,1.343,1.644,0.914,0.677,0.353,0.138,2018
4,5,Switzerland,7.487,1.42,1.549,0.927,0.66,0.256,0.357,2018


In [19]:
df_happiness_2019 = pd.read_sql_query('''SELECT * FROM world_happiness.world_happiness_2019''', engine)
df_happiness_2019['Year'] = '2019'
df_happiness_2019.head()

Unnamed: 0,Happiness_Rank,Country_or_region,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Generosity,Trust_Government_Corruption,Year
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393,2019
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41,2019
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,2019
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118,2019
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,2019


In [20]:
# Based on review of all five data sets above some of the columns should be dropped to be able to combine all tables in one.

df_happiness_2015.drop(columns=['Region', 'Standard_Error', 'Dystopia_Residual'], inplace=True)
df_happiness_2015 = df_happiness_2015.rename(columns={'Family': 'Social_support'})
df_happiness_2015.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015


In [21]:
df_happiness_2016.drop(columns=['Region', 'Lower_Confidence_Interval', 'Upper_Confidence_Interval', 'Dystopia_Residual'],
                       inplace=True)
df_happiness_2016 = df_happiness_2016.rename(columns={'Family': 'Social_support'})
df_happiness_2016.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year
0,Denmark,1,7.526,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2016
1,Switzerland,2,7.509,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2016
2,Iceland,3,7.501,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2016
3,Norway,4,7.498,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2016
4,Finland,5,7.413,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2016


In [22]:
col_list = df_happiness_2015.columns.to_list()

In [23]:
df_happiness_2017.drop(columns=['Whisker_high', 'Whisker_low', 'Dystopia_Residual'], inplace=True)
df_happiness_2017 = df_happiness_2017.rename(columns={'Family': 'Social_support'})
df_happiness_2017 = df_happiness_2017[col_list]
df_happiness_2017.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year
0,Norway,1,7.537,1.61646,1.53352,0.796667,0.635423,0.315964,0.362012,2017
1,Denmark,2,7.522,1.48238,1.55112,0.792566,0.626007,0.40077,0.35528,2017
2,Iceland,3,7.504,1.48063,1.61057,0.833552,0.627163,0.153527,0.47554,2017
3,Switzerland,4,7.494,1.56498,1.51691,0.858131,0.620071,0.367007,0.290549,2017
4,Finland,5,7.469,1.44357,1.54025,0.809158,0.617951,0.382612,0.245483,2017


In [24]:
df_happiness_2018 = df_happiness_2018.rename(columns={'Country_or_region': 'Country'})
df_happiness_2018 = df_happiness_2018[col_list]
df_happiness_2018.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year
0,Finland,1,7.632,1.305,1.592,0.874,0.681,0.393,0.202,2018
1,Norway,2,7.594,1.456,1.582,0.861,0.686,0.34,0.286,2018
2,Denmark,3,7.555,1.351,1.59,0.868,0.683,0.408,0.284,2018
3,Iceland,4,7.495,1.343,1.644,0.914,0.677,0.138,0.353,2018
4,Switzerland,5,7.487,1.42,1.549,0.927,0.66,0.357,0.256,2018


In [25]:
df_happiness_2019 = df_happiness_2019.rename(columns={'Country_or_region': 'Country'})
df_happiness_2019 = df_happiness_2019[col_list]
df_happiness_2019.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year
0,Finland,1,7.769,1.34,1.587,0.986,0.596,0.393,0.153,2019
1,Denmark,2,7.6,1.383,1.573,0.996,0.592,0.41,0.252,2019
2,Norway,3,7.554,1.488,1.582,1.028,0.603,0.341,0.271,2019
3,Iceland,4,7.494,1.38,1.624,1.026,0.591,0.118,0.354,2019
4,Netherlands,5,7.488,1.396,1.522,0.999,0.557,0.298,0.322,2019


In [26]:
frames = [df_happiness_2015, df_happiness_2016, df_happiness_2017, df_happiness_2018, df_happiness_2019]

df_happiness = pd.concat(frames)
df_happiness.head()

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year
0,Switzerland,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2015
1,Iceland,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2015
2,Denmark,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2015
3,Norway,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2015
4,Canada,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2015


In [27]:
# Check spelling of countries. It should be unified accross all data sets as this field will be used for merging eventually.

df_happiness['Country_Code'] = df_happiness.apply(lambda row: country_code(row), axis=1)
df_happiness.head()
df_happiness.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 782 entries, 0 to 155
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      782 non-null    object 
 1   Happiness_Rank               782 non-null    int64  
 2   Happiness_Score              782 non-null    float64
 3   Economy_GDP_per_Capita       782 non-null    float64
 4   Social_support               782 non-null    float64
 5   Health_Life_Expectancy       782 non-null    float64
 6   Freedom                      782 non-null    float64
 7   Trust_Government_Corruption  782 non-null    float64
 8   Generosity                   782 non-null    float64
 9   Year                         782 non-null    object 
 10  Country_Code                 734 non-null    object 
dtypes: float64(7), int64(1), object(3)
memory usage: 73.3+ KB


In [28]:
# List of countries which spelling is different from international standards.

df_happiness.loc[df_happiness['Country_Code'].isnull()]['Country'].unique()

array(['Taiwan', 'Russia', 'North Cyprus', 'Kosovo', 'Somaliland region',
       'Swaziland', 'Palestinian Territories', 'Congo (Kinshasa)',
       'Congo (Brazzaville)', 'Ivory Coast', 'Somaliland Region',
       'Taiwan Province of China', 'Hong Kong S.A.R., China',
       'Trinidad & Tobago', 'Northern Cyprus', 'North Macedonia'],
      dtype=object)

In [29]:
old_names = ['Hong Kong S.A.R., China', 'North Cyprus', 'Palestinian Territories', 'Russia', 'Congo (Kinshasa)',
             'Congo (Brazzaville)', 'Somaliland region', 'Swaziland', 'Taiwan', 'Taiwan Province of China',
             'Trinidad & Tobago']

new_names = ['Hong Kong', 'Northern Cyprus', 'Palestine', 'Russian Federation', 'Congo (the Democratic Republic of the)',
            'Congo', 'Somaliland Region', 'Swaziland (Eswatini)', 'Taiwan (Province of China)', 'Taiwan (Province of China)',
            'Trinidad and Tobago']

df_happiness['Country'].replace(old_names, new_names, inplace=True)

In [30]:
# Second check to see that everything is unified now.

df_happiness['Country_Code'] = df_happiness.apply(lambda row: country_code(row), axis=1)
df_happiness.loc[df_happiness['Country_Code'].isnull()]['Country'].unique()

# These are areas of territorial disputes and will stay blank for analytical purposes.

array(['Northern Cyprus', 'Kosovo', 'Somaliland Region', 'Ivory Coast',
       'North Macedonia'], dtype=object)

In [90]:
df_happiness['Year'] = df_happiness['Year'].astype('int')
df_happiness.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 782 entries, 0 to 155
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      782 non-null    object 
 1   Happiness_Rank               782 non-null    int64  
 2   Happiness_Score              782 non-null    float64
 3   Economy_GDP_per_Capita       782 non-null    float64
 4   Social_support               782 non-null    float64
 5   Health_Life_Expectancy       782 non-null    float64
 6   Freedom                      782 non-null    float64
 7   Trust_Government_Corruption  782 non-null    float64
 8   Generosity                   782 non-null    float64
 9   Year                         782 non-null    int32  
 10  Country_Code                 764 non-null    object 
dtypes: float64(7), int32(1), int64(1), object(2)
memory usage: 70.3+ KB


In [91]:
df_happiness.to_sql('world_happiness_consol', engine, 'world_happiness', if_exists='replace')

### *Climate data.*

In [32]:
df_climate_raw = pd.read_sql_query('''SELECT * FROM world_happiness.temperature_1900_2013''', engine)
df_climate_raw.drop(columns=['index'], inplace=True)
df_climate_raw.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,01/01/1900,-3.026,0.538,Aland Islands
1,01/02/1900,-8.063,0.657,Aland Islands
2,01/03/1900,-3.196,0.467,Aland Islands
3,01/04/1900,0.781,0.224,Aland Islands
4,01/05/1900,4.96,0.503,Aland Islands


In [33]:
df_climate_raw['dt'] = pd.to_datetime(df_climate_raw['dt'])
df_climate_raw.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1900-01-01,-3.026,0.538,Aland Islands
1,1900-01-02,-8.063,0.657,Aland Islands
2,1900-01-03,-3.196,0.467,Aland Islands
3,1900-01-04,0.781,0.224,Aland Islands
4,1900-01-05,4.96,0.503,Aland Islands


In [34]:
# Intermediary dataframe suitable for testing two regression models and further clean-up.

df_climate_raw['Year_int'] = pd.DatetimeIndex(df_climate_raw['dt']).year
df_climate_raw['Year_period'] = pd.to_datetime(df_climate_raw['dt']).dt.to_period('Y')

df_climate_grouped = df_climate_raw.drop(columns=['dt']).groupby(['Country', 'Year_period']).mean().reset_index()
df_climate_grouped['AverageTemperature'] = df_climate_grouped['AverageTemperature'].fillna(0)
df_climate_grouped.head()

Unnamed: 0,Country,Year_period,AverageTemperature,AverageTemperatureUncertainty,Year_int
0,Afghanistan,1900,13.749333,0.748833,1900
1,Afghanistan,1901,13.894,0.699333,1901
2,Afghanistan,1902,14.505833,0.726083,1902
3,Afghanistan,1903,12.986417,0.732333,1903
4,Afghanistan,1904,13.80575,0.713667,1904


In [35]:
data = df_climate_grouped.loc[df_climate_grouped['Country'] == 'Afghanistan']
data.set_index((data['Year_period']), inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
PeriodIndex: 114 entries, 1900 to 2013
Freq: A-DEC
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype        
---  ------                         --------------  -----        
 0   Country                        114 non-null    object       
 1   Year_period                    114 non-null    period[A-DEC]
 2   AverageTemperature             114 non-null    float64      
 3   AverageTemperatureUncertainty  114 non-null    float64      
 4   Year_int                       114 non-null    int64        
dtypes: float64(2), int64(1), object(1), period[A-DEC](1)
memory usage: 5.3+ KB


***Test quality of predicted values created by autoregression model to further comparison with linear regression model.***

In [36]:
train, test = data.AverageTemperature[:-10], data.AverageTemperature[-10:]
model = AR(train)
model_fit = model.fit(maxlag=3)
predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)
b = pd.DataFrame({'observed':data.AverageTemperature[-10:], 'predicted':predictions})
b['var_sq'] = (b['observed'] - b['predicted'])**2
err_b = math.sqrt(b['var_sq'].sum())
err_b

statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR api. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.





3.3155264939607796

***Test quality of predicted values created by linear model for further comparison with authoregression model.***

In [37]:
X = data['Year_int'].values.reshape(-1, 1)  # values converts it into a numpy array
Y = data['AverageTemperature'].values.reshape(-1, 1)  # -1 means that calculate the dimension of rows, but have 1 column
model = LinearRegression()
model.fit(X, Y)
Y_pred = model.predict(X)
Y_pred = [x[0] for x in Y_pred]
Y_pred

a = pd.DataFrame({'observed':data.AverageTemperature, 'predicted':Y_pred})
a['var_sq'] = (a['observed'] - a['predicted'])**2
err_a = math.sqrt(a['var_sq'][-10:].sum())
err_a

2.1633887739079056

***Linear regression has smaller error so we will use it to predict temperature for missing years.***

In [147]:
df_climate = pd.DataFrame({'Country':[], 'Year':[], 'AverageTemperature':[]})
for country in df_climate_grouped['Country'].unique():
    data = df_climate_grouped.loc[df_climate_grouped['Country'] == country]
    X = data['Year_int'].values.reshape(-1, 1)  # values converts it into a numpy array
    Y = data['AverageTemperature'].values.reshape(-1, 1)  # -1 means that calculate the dimension of rows, but have 1 column
    model = LinearRegression()
    model.fit(X, Y)
    intercept = model.intercept_[0]
    slope = model.coef_[0][0]
    for year in range(2015, 2020):
        temp = slope*year + intercept
        df_climate = df_climate.append({'Country':country, 'Year':year, 'AverageTemperature':temp}, ignore_index=True)
df_climate['Year'] = df_climate['Year'].astype('int')
df_climate.head()

Unnamed: 0,Country,Year,AverageTemperature
0,Afghanistan,2015,15.084436
1,Afghanistan,2016,15.098997
2,Afghanistan,2017,15.113559
3,Afghanistan,2018,15.12812
4,Afghanistan,2019,15.142681


In [148]:
# Check spelling of countries. It should be unified accross all data sets as this field will be used for merging eventually.

df_climate['Country_Code'] = df_climate.apply(lambda row: country_code(row), axis=1)
df_climate.head()
df_climate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Country             1215 non-null   object 
 1   Year                1215 non-null   int32  
 2   AverageTemperature  1215 non-null   float64
 3   Country_Code        970 non-null    object 
dtypes: float64(1), int32(1), object(2)
memory usage: 33.3+ KB


In [149]:
# List of countries which spelling is different from international standards.

df_climate.loc[df_climate['Country_Code'].isnull()]['Country'].unique()

array(['Africa', 'Antigua And Barbuda', 'Asia', 'Baker Island',
       'Bonaire, Saint Eustatius And Saba', 'Bosnia And Herzegovina',
       'British Virgin Islands', 'Burma', 'Cape Verde',
       'Congo (Democratic Republic Of The)', 'CuraÃ§ao', "CÃ´te D'Ivoire",
       'Denmark (Europe)', 'Europe', 'Falkland Islands (Islas Malvinas)',
       'Federated States Of Micronesia', 'France (Europe)',
       'French Southern And Antarctic Lands', 'Gaza Strip',
       'Guinea Bissau', 'Heard Island And Mcdonald Islands',
       'Isle Of Man', 'Kingman Reef', 'Macau', 'Netherlands (Europe)',
       'North America', 'North Korea', 'Oceania', 'Palestina',
       'Palmyra Atoll', 'Reunion', 'Russia', 'Saint BarthÃ©lemy',
       'Saint Kitts And Nevis', 'Saint Martin',
       'Saint Pierre And Miquelon', 'Saint Vincent And The Grenadines',
       'Sao Tome And Principe', 'Sint Maarten', 'South America',
       'South Georgia And The South Sandwich Isla',
       'Svalbard And Jan Mayen', 'Swaziland

**Delete from Climate:**

'Africa'\
'Asia'\
'Europe'\
'Oceania'\
'North America'\
'South America'

**Rename in Climate:**

'Denmark (Europe)'\
'France (Europe)'\
'Macau'\
'Netherlands (Europe)'\
'Swaziland'\
'Taiwan'\
'North Korea'\
'Palestina'\
'Russia'\
'United Kingdom (Europe)'\
'Congo (Democratic Republic Of The)'\
'Antigua And Barbuda' ---> 'Antigua and Barbuda'\
'Bonaire, Saint Eustatius And Saba' ---> 'Bonaire, Sint Eustatius and Saba'\
'Bosnia And Herzegovina' ---> 'Bosnia and Herzegovina'\
'British Virgin Islands' ---> 'Virgin Islands (British)'\
'Burma' ---> 'Myanmar'\
'Cape Verde' ---> 'Cabo Verde'\
"CÃ´te D'Ivoire" ---> "Côte d'Ivoire"\
'Falkland Islands (Islas Malvinas)' ---> 'Falkland Islands [Malvinas]'\
'Federated States Of Micronesia'---> 'Micronesia (Federated States of)'\
'Guinea Bissau' ---> 'Guinea-Bissau'\
'Isle Of Man' ---> 'Isle of Man'\
'Saint Kitts And Nevis' ---> 'Saint Kitts and Nevis'\
'Saint Pierre And Miquelon' ---> 'Saint Pierre and Miquelon'\
'Saint Vincent And The Grenadines' ---> 'Saint Vincent and the Grenadines'\
'Sao Tome And Principe' ---> 'Sao Tome and Principe'\
'South Georgia And The South Sandwich Isla' ---> 'South Georgia and the South Sandwich Islands'\
'Svalbard And Jan Mayen' ---> 'Svalbard and Jan Mayen'\
'Timor Leste' ---> 'Timor-Leste'\
'Trinidad And Tobago' ---> 'Trinidad and Tobago'\
'Turks And Caicas Islands' ---> 'Turks and Caicos Islands'\
'Virgin Islands' ----> 'Virgin Islands (U.S.)'

**Put them on map:**

'CuraÃ§ao'\
'Sint Maarten'\
'Saint Martin'\
'Baker Island'\
'French Southern And Antarctic Lands'\
'Gaza Strip'\
'Heard Island And Mcdonald Islands'\
'Kingman Reef'\
'Palmyra Atoll'\
'Reunion'\
'Saint BarthÃ©lemy'

In [157]:
# Delete rows containing consolidated info per continent.
lst_delete = ['Africa', 'Asia', 'Europe', 'Oceania', 'North America', 'South America']
df_climate = df_climate[df_climate['Country'].isin(lst_delete) == False]

# Rename countries to align with ISO.
old_names = ['Denmark (Europe)', 'France (Europe)', 'Macau', 'Netherlands (Europe)', 'Swaziland', 'Taiwan', 'North Korea',
             'Palestina', 'Russia', 'United Kingdom (Europe)', 'Congo (Democratic Republic Of The)', 'Antigua And Barbuda',
             'Bonaire, Saint Eustatius And Saba', 'Bosnia And Herzegovina', 'British Virgin Islands', 'Burma', 'Cape Verde',
             "CÃ´te D'Ivoire", 'Falkland Islands (Islas Malvinas)', 'Guinea Bissau', 'Isle Of Man', 'Saint Kitts And Nevis',
             'Saint Pierre And Miquelon', 'Saint Vincent And The Grenadines', 'Sao Tome And Principe',
             'South Georgia And The South Sandwich Isla', 'Svalbard And Jan Mayen', 'Timor Leste', 'Trinidad And Tobago',
             'Turks And Caicas Islands', 'Virgin Islands']

new_names = ['Denmark', 'France', 'Macao', 'Netherlands', 'Swaziland (Eswatini)', 'Taiwan (Province of China)',
             "Korea (the Democratic People's Republic of)", 'Palestine', 'Russian Federation', 'United Kingdom',
             'Congo (the Democratic Republic of the)', 'Antigua and Barbuda', 'Bonaire, Sint Eustatius and Saba',
             'Bosnia and Herzegovina', 'Virgin Islands (British)', 'Myanmar', 'Cabo Verde', "Côte d'Ivoire",
             'Falkland Islands [Malvinas]', 'Guinea-Bissau', 'Isle of Man', 'Saint Kitts and Nevis', 'Saint Pierre and Miquelon',
             'Saint Vincent and the Grenadines', 'Sao Tome and Principe', 'South Georgia and the South Sandwich Islands',
             'Svalbard and Jan Mayen', 'Timor-Leste', 'Trinidad and Tobago', 'Turks and Caicos Islands', 'Virgin Islands (U.S.)']

df_climate['Country'].replace(old_names, new_names, inplace=True)

In [158]:
# Second check to see that everything is unified now.

df_climate['Country_Code'] = df_climate.apply(lambda row: country_code(row), axis=1)
df_climate.loc[df_climate['Country_Code'].isnull()]['Country'].unique()

# There are various territosies which are not relevant for analysis but would be nice to be shouwn on a map.

array(['Baker Island', 'CuraÃ§ao', 'Federated States Of Micronesia',
       'French Southern And Antarctic Lands', 'Gaza Strip',
       'Heard Island And Mcdonald Islands', 'Kingman Reef',
       'Palmyra Atoll', 'Reunion', 'Saint BarthÃ©lemy', 'Saint Martin',
       'Sint Maarten'], dtype=object)

In [159]:
df_climate['Year'].unique()

array([2015, 2016, 2017, 2018, 2019])

In [160]:
df_climate.groupby(df_climate['Country']).count()['Year'].sort_values(ascending=False)

Country
France            10
Denmark           10
Netherlands       10
United Kingdom    10
French Guiana      5
                  ..
Nicaragua          5
New Zealand        5
New Caledonia      5
Nepal              5
Afghanistan        5
Name: Year, Length: 233, dtype: int64

In [161]:
# Four countries contain two data sets - for continent and islands. As variance is not big we will take average.
df_climate.loc[df_climate['Country'].str.contains('Netherlands')]

Unnamed: 0,Country,Year,AverageTemperature,Country_Code
760,Netherlands,2015,10.426569,NLD
761,Netherlands,2016,10.437039,NLD
762,Netherlands,2017,10.447509,NLD
763,Netherlands,2018,10.457979,NLD
764,Netherlands,2019,10.46845,NLD
765,Netherlands,2015,10.034423,NLD
766,Netherlands,2016,10.044901,NLD
767,Netherlands,2017,10.055378,NLD
768,Netherlands,2018,10.065856,NLD
769,Netherlands,2019,10.076334,NLD


In [167]:
df_climate = df_climate[['Country', 'Year', 'AverageTemperature']].pivot_table(index=['Country', 'Year']).reset_index()
df_climate.loc[df_climate['Country'] == 'Netherlands']

Unnamed: 0,Country,Year,AverageTemperature
735,Netherlands,2015,10.230496
736,Netherlands,2016,10.24097
737,Netherlands,2017,10.251444
738,Netherlands,2018,10.261918
739,Netherlands,2019,10.272392


In [168]:
df_climate.to_sql('temperature_consol', engine, 'world_happiness', if_exists='replace')

***Consolidate all obtained and cleaned data into one dataframe for further analysis.***

In [120]:
df_suicide['Year'].unique()

array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016, 2017,
       2018], dtype=int64)

In [121]:
df_suicide_cut = df_suicide[df_suicide['Year'].isin([2015, 2016, 2017, 2018, 2019]) == True]
df_suicide_cut['Year'].unique()

array([2015, 2016, 2017, 2018], dtype=int64)

In [122]:
df_suicide_cut.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 48 to 2360
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Country                140 non-null    object 
 1   Year                   140 non-null    int64  
 2   Suicide_rate_per_100K  140 non-null    float64
 3   Country_Code           139 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 5.5+ KB


In [169]:
df_climate_cut['Year'].unique()

array([2015, 2016, 2017, 2018])

In [170]:
df_climate_cut = df_climate[df_climate['Year'].isin([2015, 2016, 2017, 2018]) == True]
df_climate_cut['Year'].unique()

array([2015, 2016, 2017, 2018], dtype=int64)

In [87]:
df_happiness['Year'].unique()

array(['2015', '2016', '2017', '2018', '2019'], dtype=object)

In [92]:
df_happiness_cut = df_happiness[df_happiness['Year'].isin([2015, 2016, 2017, 2018]) == True]
df_happiness_cut['Year'].unique()

array([2015, 2016, 2017, 2018])

In [193]:
df0 = pd.merge(df_happiness_cut, df_climate_cut, on = ['Country', 'Year'], how = 'outer')
df = pd.merge(df0, df_suicide_cut, on = ['Country', 'Year'], how = 'outer')

# Spot check to verify data.
df.loc[df['Country'] == 'Netherlands']
df.loc[df['Country'] == 'Russian Federation']
df.loc[df['Country'] == 'China']
df.loc[df['Country'] == 'Unated States']

Unnamed: 0,Country,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year,Country_Code_x,AverageTemperature,Suicide_rate_per_100K,Country_Code_y


In [181]:
df.to_sql('total_consol_2015_2018', engine, 'world_happiness', if_exists='replace')

## **Step 2.** Review and further cleaning of consolidated dataframe.

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 950 entries, 0 to 949
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      950 non-null    object 
 1   Happiness_Rank               626 non-null    float64
 2   Happiness_Score              626 non-null    float64
 3   Economy_GDP_per_Capita       626 non-null    float64
 4   Social_support               626 non-null    float64
 5   Health_Life_Expectancy       626 non-null    float64
 6   Freedom                      626 non-null    float64
 7   Trust_Government_Corruption  626 non-null    float64
 8   Generosity                   626 non-null    float64
 9   Year                         950 non-null    int64  
 10  Country_Code_x               612 non-null    object 
 11  AverageTemperature           932 non-null    float64
 12  Suicide_rate_per_100K        140 non-null    float64
 13  Country_Code_y      

In [183]:
df.describe()

Unnamed: 0,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year,AverageTemperature,Suicide_rate_per_100K
count,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,950.0,932.0,140.0
mean,78.747604,5.372021,0.918764,1.045891,0.584299,0.415706,0.128932,0.226981,2016.497895,19.650902,11.218058
std,45.219609,1.131774,0.409808,0.328946,0.241948,0.154943,0.108238,0.126854,1.118621,8.952886,6.297985
min,1.0,2.693,0.0,0.0,0.0,0.0,0.0,0.0,2015.0,-17.541515,0.0
25%,40.0,4.49775,0.606755,0.847945,0.404143,0.3105,0.056141,0.137263,2015.25,12.160691,6.655625
50%,79.0,5.307,0.983704,1.081275,0.632553,0.434635,0.093574,0.208582,2016.0,23.691709,11.114583
75%,118.0,6.18725,1.239502,1.283383,0.772957,0.538998,0.16134,0.290915,2017.0,26.98727,14.281562
max,158.0,7.632,2.096,1.644,1.03,0.724,0.55191,0.838075,2018.0,29.559294,32.428333


In [208]:
df_full = df.loc[df['Suicide_rate_per_100K'].notnull()].loc[df['Happiness_Rank'].notnull()]
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 0 to 532
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      131 non-null    object 
 1   Happiness_Rank               131 non-null    float64
 2   Happiness_Score              131 non-null    float64
 3   Economy_GDP_per_Capita       131 non-null    float64
 4   Social_support               131 non-null    float64
 5   Health_Life_Expectancy       131 non-null    float64
 6   Freedom                      131 non-null    float64
 7   Trust_Government_Corruption  131 non-null    float64
 8   Generosity                   131 non-null    float64
 9   Year                         131 non-null    int64  
 10  Country_Code_x               131 non-null    object 
 11  AverageTemperature           131 non-null    float64
 12  Suicide_rate_per_100K        131 non-null    float64
 13  Country_Code_y      

In [209]:
df_full.describe()

Unnamed: 0,Happiness_Rank,Happiness_Score,Economy_GDP_per_Capita,Social_support,Health_Life_Expectancy,Freedom,Trust_Government_Corruption,Generosity,Year,AverageTemperature,Suicide_rate_per_100K
count,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0
mean,40.473282,6.356015,1.241461,1.165101,0.792812,0.468201,0.162617,0.243391,2015.770992,12.460608,11.397518
std,32.01546,0.856101,0.217043,0.228069,0.108979,0.142245,0.136436,0.141686,0.837256,7.953883,5.928559
min,1.0,4.297,0.47428,0.38562,0.27688,0.05822,0.006,0.0,2015.0,-4.242882,1.041667
25%,13.0,5.7585,1.1141,1.039685,0.7208,0.37749,0.044925,0.122065,2015.0,7.981444,6.78125
50%,32.0,6.481,1.27074,1.19777,0.809158,0.49615,0.12869,0.2324,2016.0,10.638226,11.15
75%,62.5,7.1115,1.389455,1.316025,0.874915,0.575515,0.268235,0.33495,2016.0,17.970604,14.354375
max,130.0,7.587,1.82427,1.644,1.02525,0.677,0.52208,0.58696,2018.0,27.800142,31.436
