In [2]:
import pandas as pd


## Suicide Rates dataset (2000 - 2019)
### File name: 'Suicide rates WHO.csv', 'Crude Suuicide Rates WHO.csv'
link: https://www.who.int/data/gho/data/themes/mental-health/suicide-rates

#### Age-standardized suicide rate (per 100,000 population)

In [50]:
df_suicide = pd.read_csv('Suicide rates WHO.csv')

df_suicide2 = df_suicide[["Period", "Location", "SpatialDimValueCode", "ParentLocation", "ParentLocationCode",
            "Dim1", "Dim1ValueCode", "FactValueNumeric", "FactValueNumericLow", "FactValueNumericHigh"]]

df_suicide2.columns = ["Year", "Country", "CountryCode", "ParentCountry", "ParentCountryCode",
            "Sex", "SexCode", "SuicideRate", "SuicideRateLower", "SuicideRateUpper"]

df_suicide2

Unnamed: 0,Year,Country,CountryCode,ParentCountry,ParentCountryCode,Sex,SexCode,SuicideRate,SuicideRateLower,SuicideRateUpper
0,2019,Antigua and Barbuda,ATG,Americas,AMR,Male,MLE,0.00,0.00,0.00
1,2019,Barbados,BRB,Americas,AMR,Female,FMLE,0.16,0.11,0.22
2,2019,Barbados,BRB,Americas,AMR,Both sexes,BTSX,0.31,0.22,0.42
3,2019,Antigua and Barbuda,ATG,Americas,AMR,Both sexes,BTSX,0.32,0.22,0.45
4,2019,Barbados,BRB,Americas,AMR,Male,MLE,0.49,0.34,0.65
...,...,...,...,...,...,...,...,...,...,...
10975,2000,Guinea,GIN,Africa,AFR,Both sexes,BTSX,9.73,5.57,15.96
10976,2000,Ghana,GHA,Africa,AFR,Both sexes,BTSX,9.75,6.46,14.30
10977,2000,Malta,MLT,Europe,EUR,Male,MLE,9.75,7.56,12.29
10978,2000,Seychelles,SYC,Africa,AFR,Both sexes,BTSX,9.76,6.60,13.46


### Check NA values

In [12]:
df_suicide2.isnull().sum()

Year                 0
Country              0
CountryCode          0
ParentCountry        0
ParentCountryCode    0
Sex                  0
SexCode              0
SuicideRate          0
SuicideRateLower     0
SuicideRateUpper     0
dtype: int64

In [13]:
df_suicide2.dtypes

Year                   int64
Country               object
CountryCode           object
ParentCountry         object
ParentCountryCode     object
Sex                   object
SexCode               object
SuicideRate          float64
SuicideRateLower     float64
SuicideRateUpper     float64
dtype: object

### Strip string columns

In [166]:
df_suicide2 = df_suicide2.apply(lambda x: x.str.strip() if x.dtype == "object" else x)


### Get country region

In [169]:
country_region = pd.read_csv("all_country_regions.csv")
#country_region = country_region.rename(columns={"name":"Country", "alpha-3":"CountryCode"})
#country_region['intermediate-region-code'] = country_region['intermediate-region-code'].fillna(country_region['sub-region-code'])

In [170]:
country_region.loc[country_region["Country"] == "China"]

Unnamed: 0,Country,CountryCode,country-code,iso_3166-2,region,sub-region,region-code,sub-region-code
45,China,CHN,156,ISO 3166-2:CN,Asia,Eastern Asia,142.0,30.0


In [159]:
df_suicide2 = df_suicide2.merge(country_region[["CountryCode", "sub-region"]], how="left", on="CountryCode")
df_suicide2

Unnamed: 0,Year,Country,CountryCode,ParentCountry,ParentCountryCode,Sex,SexCode,SuicideRate,SuicideRateLower,SuicideRateUpper,Regional indicator,sub-region_x,sub-region_y,sub-region_x.1,sub-region_y.1,sub-region
0,2019,Antigua and Barbuda,ATG,Americas,AMR,Male,MLE,0.00,0.00,0.00,,Caribbean,Caribbean,Caribbean,Caribbean,Caribbean
1,2019,Barbados,BRB,Americas,AMR,Female,FMLE,0.16,0.11,0.22,,Caribbean,Caribbean,Caribbean,Caribbean,Caribbean
2,2019,Barbados,BRB,Americas,AMR,Both sexes,BTSX,0.31,0.22,0.42,,Caribbean,Caribbean,Caribbean,Caribbean,Caribbean
3,2019,Antigua and Barbuda,ATG,Americas,AMR,Both sexes,BTSX,0.32,0.22,0.45,,Caribbean,Caribbean,Caribbean,Caribbean,Caribbean
4,2019,Barbados,BRB,Americas,AMR,Male,MLE,0.49,0.34,0.65,,Caribbean,Caribbean,Caribbean,Caribbean,Caribbean
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10975,2000,Guinea,GIN,Africa,AFR,Both sexes,BTSX,9.73,5.57,15.96,Sub-Saharan Africa,Western Africa,Western Africa,Western Africa,Western Africa,Western Africa
10976,2000,Ghana,GHA,Africa,AFR,Both sexes,BTSX,9.75,6.46,14.30,Sub-Saharan Africa,Western Africa,Western Africa,Western Africa,Western Africa,Western Africa
10977,2000,Malta,MLT,Europe,EUR,Male,MLE,9.75,7.56,12.29,Western Europe,Southern Europe,Southern Europe,Southern Europe,Southern Europe,Southern Europe
10978,2000,Seychelles,SYC,Africa,AFR,Both sexes,BTSX,9.76,6.60,13.46,,Eastern Africa,Eastern Africa,Eastern Africa,Eastern Africa,Eastern Africa


Countries with null in regional indicator

In [160]:
len(df_suicide2.loc[df_suicide2["sub-region"].isnull()]["Country"].unique())

0

Total Countries in the Suicide data

In [144]:
len(df_suicide2["Country"].unique())

183

Country without region names

In [146]:
df_suicide2.loc[df_suicide2["sub-region"].isnull()]["Country"].unique()

array(["Democratic People's Republic of Korea",
       'The former Yugoslav Republic of Macedonia', 'Republic of Moldova',
       'Democratic Republic of the Congo', 'Republic of Korea',
       'United Republic of Tanzania', 'Côte d’Ivoire'], dtype=object)

## Mortality data
### File name: 'MortalityDataWHR2021C2CSV.csv'
link: https://worldhappiness.report/ed/2021/#appendices-and-data

- Population data (2019-2020)
- All-cause of death (2017-2020)

In [118]:
df_morta = pd.read_csv("MortalityDataWHR2021C2CSV.csv")
df_morta["Country name"] = df_morta["Country name"].str.strip()
df_morta

Unnamed: 0,Country name,Population 2020,Population 2019,"COVID-19 deaths per 100,000 population in 2020",Median age,Island,Index of exposure to COVID-19 infections in other countries as of March 31,Log of average distance to SARS countries,WHO Western Pacific Region,Female head of government,Index of institutional trust,Gini coefficient of income,"All-cause death count, 2017","All-cause death count, 2018","All-cause death count, 2019","All-cause death count, 2020","Excess deaths in 2020 per 100,000 population, relative to 2017-2019 average"
0,United States,331002647.0,328239523.0,104.451,38.3,0,1.688,9.315,0,0,0.250,47.51,2810927.0,2839076.0,2852747.0,3424996.0,179.220
1,Egypt,102334403.0,100388073.0,7.457,25.3,0,1.627,9.007,0,0,0.446,31.56,,,,,
2,Morocco,36910558.0,36471769.0,20.016,29.6,0,2.336,9.226,0,0,0.397,39.55,,,,,
3,Lebanon,6825442.0,6855713.0,21.508,31.1,0,1.891,8.956,0,0,0.107,31.83,,,,,
4,Saudi Arabia,34813867.0,34268528.0,17.875,31.9,0,1.250,8.897,0,0,0.651,45.90,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,Yemen,29825968.0,29161922.0,2.045,20.3,0,1.005,8.966,0,0,0.267,36.71,,,,,
162,Kosovo,1932774.0,1794248.0,68.916,35.0,0,3.134,9.015,0,1,0.169,29.01,8738.0,8866.0,9552.0,11108.0,110.329
163,Somaliland region,,,,,0,,,0,0,,,,,,,
164,North Cyprus,,,,,0,,,0,0,0.305,,,,,,


In [119]:
population = df_morta[['Country name', 'Population 2020', 'Population 2019']]
population_t = pd.melt(population[:-1],
        id_vars="Country name",
        value_vars = list(population.columns[1:]),
        var_name='Year',
        value_name='Population')

population_t["Year"] = population_t["Year"].apply(lambda x: x.strip().replace("Population ", "")).astype("int64")

In [120]:
population_t

Unnamed: 0,Country name,Year,Population
0,United States,2020,331002647.0
1,Egypt,2020,102334403.0
2,Morocco,2020,36910558.0
3,Lebanon,2020,6825442.0
4,Saudi Arabia,2020,34813867.0
...,...,...,...
325,Uzbekistan,2019,33580650.0
326,Yemen,2019,29161922.0
327,Kosovo,2019,1794248.0
328,Somaliland region,2019,


In [121]:
all_cause_death = df_morta[["Country name", "All-cause death count, 2017", "All-cause death count, 2018",
                         "All-cause death count, 2019", "All-cause death count, 2020"]]
all_cause_death_t = pd.melt(all_cause_death[:-1],
                        id_vars="Country name",
                        value_vars = list(all_cause_death.columns[1:]),
                        var_name='Year',
                        value_name='count')

all_cause_death_t["Year"] = all_cause_death_t["Year"].apply(lambda x: x.strip().replace("All-cause death count, ", "")).astype("int64")

In [122]:
all_cause_death_t

Unnamed: 0,Country name,Year,count
0,United States,2017,2810927.0
1,Egypt,2017,
2,Morocco,2017,
3,Lebanon,2017,
4,Saudi Arabia,2017,
...,...,...,...
655,Uzbekistan,2020,175637.0
656,Yemen,2020,
657,Kosovo,2020,11108.0
658,Somaliland region,2020,


In [123]:
morta_other_features = df_morta[["Country name", "Median age", "Island", "Female head of government",
                        "Index of institutional trust", "Gini coefficient of income",
                        "Index of exposure to COVID-19  infections in other countries as of March 31",
                        "COVID-19 deaths per 100,000 population in 2020"]]
morta_other_features

Unnamed: 0,Country name,Median age,Island,Female head of government,Index of institutional trust,Gini coefficient of income,Index of exposure to COVID-19 infections in other countries as of March 31,"COVID-19 deaths per 100,000 population in 2020"
0,United States,38.3,0,0,0.250,47.51,1.688,104.451
1,Egypt,25.3,0,0,0.446,31.56,1.627,7.457
2,Morocco,29.6,0,0,0.397,39.55,2.336,20.016
3,Lebanon,31.1,0,0,0.107,31.83,1.891,21.508
4,Saudi Arabia,31.9,0,0,0.651,45.90,1.250,17.875
...,...,...,...,...,...,...,...,...
161,Yemen,20.3,0,0,0.267,36.71,1.005,2.045
162,Kosovo,35.0,0,1,0.169,29.01,3.134,68.916
163,Somaliland region,,0,0,,,,
164,North Cyprus,,0,0,0.305,,,
