In [2]:
import os
from sqlalchemy import create_engine
import pymysql
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

In [3]:
pd.set_option('display.max_columns', 100)
# sqlpassword =  'smaa2lkb1&D+Sf]6]o#F'
sqlpassword = os.environ['SQLpassword']
engine = create_engine(f"mysql+pymysql://root:{sqlpassword}@127.0.0.1/world_happiness")

**Step 0.** Most of the data sets have been imported directly to SQL data base but data sets relating to medical health have too many columns with very extensive, so these data sets will be preprocessed by pandas below.

In [47]:
# 2016 mental health data set

df = pd.read_csv('.\\data\\mental-heath-in-tech-2016.csv')
pd.set_option('display.max_columns', 100)
data = df.iloc[:, [2, 45, 46, 47, 48, 49, 50, 51, 55, 57, 58, 59, 60, 61]]
data.columns = ['tech_employer','family_history','sickness_past','sickness_current','diagnosis','supposed_diagnosis',
                'professional_help','professional_diagnosis','age','country_origin','us_state_origin','country_location',
                'us_state_location','occupation']
data.head()
data.to_sql('mental_heath_in_tech_2016_redused', engine, 'world_happiness')

In [45]:
# 2017 mental health data set

df = pd.read_csv('.\\data\\mental-heath-in-tech-2017.csv')
data = df.iloc[:, [3, 49, 89, 91, 112, 113, 114, 115]]
data.columns = ['tech_employer', 'sickness_current','sickness_past','family_history','age','gender','country_location',
                'us_state_location']
data.head()
data.to_sql('mental_heath_in_tech_2017_redused', engine, 'world_happiness')

In [50]:
# 2018 mental health data set

df = pd.read_csv('.\\data\\mental-heath-in-tech-2018.csv')
data = df.iloc[:, [3, 48, 49, 91, 112, 113, 114, 115]]
data.columns = ['tech_employer', 'sickness_current','sickness_past','family_history','age','gender','country_location',
                'us_state_location']
data.head()
data.to_sql('mental_heath_in_tech_2018_redused', engine, 'world_happiness')

In [49]:
# 2019 mental health data set

df = pd.read_csv('.\\data\\mental-heath-in-tech-2019.csv')
data = df.iloc[:, [2, 47, 52, 54, 75, 76, 77, 78]]
data.columns = ['tech_employer', 'sickness_current','sickness_past','family_history','age','gender','country_location',
                'us_state_location']
data.head()
data.to_sql('mental_heath_in_tech_2019_redused', engine, 'world_happiness')

**Step 1.** Reading and cleaning data.

a) Combining data sets for various periods in one table based on data topic.

*Suicide data.*

In [4]:
df_suicide_new = pd.read_sql_query('''SELECT * FROM world_happiness.suicide_2015_2018''', engine)
df_suicide_new.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value
0,AUS,SUICIDE,TOT,100000PER,A,2015,13.1
1,AUS,SUICIDE,TOT,100000PER,A,2016,12.1
2,AUS,SUICIDE,TOT,100000PER,A,2017,12.8
3,AUT,SUICIDE,TOT,100000PER,A,2015,12.9
4,AUT,SUICIDE,TOT,100000PER,A,2016,12.2


In [5]:
df_suicide_who = pd.read_sql_query('''SELECT * FROM world_happiness.suicide_1987_2014''', engine)
df_suicide_who.head()

Unnamed: 0,country,Year,sex,age,suicides_no,population,suicides_100k_pop,country_year,HDI_for_year,gdp_for_year_$,gdp_per_capita_$,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,0.0,2,796,Generation X\r
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,0.0,2,796,Silent\r
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,0.0,2,796,Generation X\r
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,0.0,2,796,G.I. Generation\r
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,0.0,2,796,Boomers\r


In [6]:
url = 'https://www.iban.com/country-codes'
html = requests.get(url).content
soup = BeautifulSoup(html, "lxml")
cdict = {'class':'table'}
table = soup.find_all('table', cdict)[0]
rows = table.find_all('td')
rows = [x.contents[0] for x in rows]
rows

['Afghanistan',
 'AF',
 'AFG',
 '004',
 'Åland Islands',
 'AX',
 'ALA',
 '248',
 'Albania',
 'AL',
 'ALB',
 '008',
 'Algeria',
 'DZ',
 'DZA',
 '012',
 'American Samoa',
 'AS',
 'ASM',
 '016',
 'Andorra',
 'AD',
 'AND',
 '020',
 'Angola',
 'AO',
 'AGO',
 '024',
 'Anguilla',
 'AI',
 'AIA',
 '660',
 'Antarctica',
 'AQ',
 'ATA',
 '010',
 'Antigua and Barbuda',
 'AG',
 'ATG',
 '028',
 'Argentina',
 'AR',
 'ARG',
 '032',
 'Armenia',
 'AM',
 'ARM',
 '051',
 'Aruba',
 'AW',
 'ABW',
 '533',
 'Australia',
 'AU',
 'AUS',
 '036',
 'Austria',
 'AT',
 'AUT',
 '040',
 'Azerbaijan',
 'AZ',
 'AZE',
 '031',
 'Bahamas (the)',
 'BS',
 'BHS',
 '044',
 'Bahrain',
 'BH',
 'BHR',
 '048',
 'Bangladesh',
 'BD',
 'BGD',
 '050',
 'Barbados',
 'BB',
 'BRB',
 '052',
 'Belarus',
 'BY',
 'BLR',
 '112',
 'Belgium',
 'BE',
 'BEL',
 '056',
 'Belize',
 'BZ',
 'BLZ',
 '084',
 'Benin',
 'BJ',
 'BEN',
 '204',
 'Bermuda',
 'BM',
 'BMU',
 '060',
 'Bhutan',
 'BT',
 'BTN',
 '064',
 'Bolivia (Plurinational State of)',
 'BO',
 'B

In [7]:
# There are 4 columns in a table so we split all content of a table in lists of 4 elements each.
n_countries = int(len(rows)/4)
countries = np.array_split(rows, n_countries)
# Convert list of rows values into dataframe.
df_countries = pd.DataFrame(countries,columns=['Country', 'Alpha-2 code', 'Alpha-3 code', 'Numeric'])
# Form dictionary out of 2 suitable columns.
countries_dict = dict(zip(df_countries['Alpha-3 code'], df_countries['Country']))
countries_dict

{'AFG': 'Afghanistan',
 'ALA': 'Åland Islands',
 'ALB': 'Albania',
 'DZA': 'Algeria',
 'ASM': 'American Samoa',
 'AND': 'Andorra',
 'AGO': 'Angola',
 'AIA': 'Anguilla',
 'ATA': 'Antarctica',
 'ATG': 'Antigua and Barbuda',
 'ARG': 'Argentina',
 'ARM': 'Armenia',
 'ABW': 'Aruba',
 'AUS': 'Australia',
 'AUT': 'Austria',
 'AZE': 'Azerbaijan',
 'BHS': 'Bahamas (the)',
 'BHR': 'Bahrain',
 'BGD': 'Bangladesh',
 'BRB': 'Barbados',
 'BLR': 'Belarus',
 'BEL': 'Belgium',
 'BLZ': 'Belize',
 'BEN': 'Benin',
 'BMU': 'Bermuda',
 'BTN': 'Bhutan',
 'BOL': 'Bolivia (Plurinational State of)',
 'BES': 'Bonaire, Sint Eustatius and Saba',
 'BIH': 'Bosnia and Herzegovina',
 'BWA': 'Botswana',
 'BVT': 'Bouvet Island',
 'BRA': 'Brazil',
 'IOT': 'British Indian Ocean Territory (the)',
 'BRN': 'Brunei Darussalam',
 'BGR': 'Bulgaria',
 'BFA': 'Burkina Faso',
 'BDI': 'Burundi',
 'CPV': 'Cabo Verde',
 'KHM': 'Cambodia',
 'CMR': 'Cameroon',
 'CAN': 'Canada',
 'CYM': 'Cayman Islands (the)',
 'CAF': 'Central African R

In [8]:
def country_name(row):
    for key, value in countries_dict.items():
        if row.LOCATION == key:
            return value

df_suicide_new['Country'] = df_suicide_new.apply(lambda row: country_name(row), axis=1)

In [9]:
df1 = df_suicide_new[['Country', 'TIME', 'Value']]
df1.columns = ['Country', 'Year', 'Suicide_rate_per_100K']
df2 = df_suicide_who[['country', 'Year', 'suicides_100k_pop']]
df2.columns=['Country', 'Year', 'Suicide_rate_per_100K']
frames = [df1, df2]

df_suicide = pd.concat(frames)
df_suicide.head()

Unnamed: 0,Country,Year,Suicide_rate_per_100K
0,Australia,2015,13.1
1,Australia,2016,12.1
2,Australia,2017,12.8
3,Austria,2015,12.9
4,Austria,2016,12.2


*Global happiness data.