# Global Suicide Analysis - Task 1: Data Collection
### - Patrick Mbanusi  
In this assignment we will collect data on suicide rates worldwide, along with additional related data.  

In [2]:
import json
import urllib
import urllib.request
from datetime import datetime
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import requests
%matplotlib inline

Create directory for raw data storage, if it does not already exist:

In [3]:
# Folder that receives the raw downloads; mkdir is a no-op when it already exists.
dir_raw = Path("raw")
dir_raw.mkdir(exist_ok=True, parents=True)

## Data Collection

All data is collected from the World Health Organization (WHO).

The data being collected are crude suicide rates (per 100 000 population) over two decades (2000-2019). The disaggregations are age, sex, country, region and year.   

In [5]:
# retrieve data (JSON)
# *this data does not need to be fetched several times*
url = "https://ghoapi.azureedge.net/api/SDGSUICIDE"
# The context manager closes the HTTP response once the body has been read,
# and the timeout keeps the cell from hanging indefinitely on a stalled connection.
with urllib.request.urlopen(url, timeout=60) as response:
    raw_json = response.read().decode("utf-8")


In [8]:
# Parse the downloaded JSON text and keep only its "value" entry,
# which is the list of record dicts.
data = json.loads(raw_json)["value"]
# Print the first record as a sample of the available fields.
print(data[0])

{'Id': 26130097, 'IndicatorCode': 'SDGSUICIDE', 'SpatialDimType': 'COUNTRY', 'SpatialDim': 'AUT', 'ParentLocationCode': 'EUR', 'TimeDimType': 'YEAR', 'ParentLocation': 'Europe', 'Dim1Type': 'SEX', 'Dim1': 'BTSX', 'TimeDim': 2019, 'Dim2Type': 'AGEGROUP', 'Dim2': 'YEARS15-24', 'Dim3Type': None, 'Dim3': None, 'DataSourceDimType': None, 'DataSourceDim': None, 'Value': '7.8', 'NumericValue': 7.77, 'Low': None, 'High': None, 'Comments': None, 'Date': '2021-07-06T14:13:43.367+02:00', 'TimeDimensionValue': '2019', 'TimeDimensionBegin': '2019-01-01T00:00:00+01:00', 'TimeDimensionEnd': '2019-12-31T00:00:00+01:00'}


As the names of the countries are shown only as abbreviations and the regions are not specified, we will get this information from another data set:

In [7]:
# retrieve country data
url = "https://ghoapi.azureedge.net/api/DIMENSION/COUNTRY/DimensionValues"
# The context manager closes the HTTP response once the body has been read,
# and the timeout keeps the cell from hanging indefinitely on a stalled connection.
with urllib.request.urlopen(url, timeout=60) as response:
    raw_json2 = response.read().decode("utf-8")

In [11]:
# Parse the country download and keep only its "value" entry (a list of dicts).
country_data = json.loads(raw_json2)["value"]
# Display the first entry as a sample.
country_data[0]

{'Code': 'ABW',
 'Title': 'Aruba',
 'Dimension': 'COUNTRY',
 'ParentDimension': 'REGION',
 'ParentCode': 'AMR',
 'ParentTitle': 'Americas'}

Make dict to store country/region abbreviations to country/region names:

In [12]:
# Map every country code, and every region code, to its display name.
dict_country = {}
for entry in country_data:
    dict_country[entry["Code"]] = entry["Title"]
    dict_country[entry["ParentCode"]] = entry["ParentTitle"]

# Print the first five mappings as examples.
print("\n".join(f"{code}: {name}" for code, name in list(dict_country.items())[:5]))


ABW: Aruba
AMR: Americas
AFG: Afghanistan
EMR: Eastern Mediterranean
AGO: Angola


Make a dict that maps each country to its respective region:

In [13]:
# Map every country code -- and every region code itself -- to the region's name.
dict_region = {}
for entry in country_data:
    region_name = entry["ParentTitle"]
    dict_region[entry["Code"]] = region_name
    dict_region[entry["ParentCode"]] = region_name

# Print the first five mappings as examples.
print("\n".join(f"{code}: {name}" for code, name in list(dict_region.items())[:5]))


ABW: Americas
AMR: Americas
AFG: Eastern Mediterranean
EMR: Eastern Mediterranean
AGO: Africa


In [14]:
# List of all valid codes that SpatialDim should return (the keys of dict_country).
valid_list = [*dict_country]
# Display the first ten codes as a sample.
valid_list[:10]

['ABW', 'AMR', 'AFG', 'EMR', 'AGO', 'AFR', 'AIA', 'ALB', 'EUR', 'AND']

Note: This process of adding countries and regions to our dicts could be done in Task 2 (under preprocessing) but it is logical to complete it here as we had to collect data from another dataset in order to do it and thus categorising countries in Task 1 will be more efficient instead of storing 'dict_country' and 'dict_region' as JSON files alongside the main data.  

We will split the 'Americas' region into North and South America.

*Although geographically in North America, we will include Mexico in the South American region as it is more culturally appropriate.  

In [15]:
# Country names assigned to the "South America" group when the Americas region is split.
SA_list = [
    "Argentina",
    "Bolivia (Plurinational State of)",
    "Brazil",
    "Chile",
    "Colombia",
    "Costa Rica",
    "Ecuador",
    "El Salvador",
    "Guatemala",
    "Honduras",
    "Mexico",
    "Nicaragua",
    "Panama",
    "Paraguay",
    "Peru",
    "Dominican Republic",
    "Uruguay",
    "Guyana",
]

Now we will add the country and region to every record in our list of dictionaries 'data':

In [16]:
# Note: Some values in SpatialDim are actually regions; we still save these regions as countries.
# The Americas are separated into "South America" (names in SA_list) and "North America".
# If a value is neither a country nor a region (not in valid_list), Country and Region become None.
#
# Records that already carry a "Country" key are skipped. The SpatialDim of Americas
# records is overwritten with "SA"/"NA" below, so running this cell a second time would
# no longer find those codes in valid_list and would reset their Country/Region to None.
# Reload `data` before re-running if the mapping has to be recomputed.
#
# NOTE(review): the region code "AMR" itself maps to "Americas" in both dict_country and
# dict_region, so a record whose SpatialDim is "AMR" is labelled "North America" here --
# confirm this is intended.
for record in data:
    if "Country" in record:
        # already annotated by an earlier run of this cell
        continue

    code = record["SpatialDim"]
    if code not in valid_list:
        record["Country"] = None
        record["Region"] = None
        continue

    country = dict_country[code]
    region = dict_region[code]
    if region == "Americas":
        if country in SA_list:
            region, split_code = "South America", "SA"
        else:
            region, split_code = "North America", "NA"
        # the original country code is replaced by the code of the split region
        record["SpatialDim"] = split_code

    record["Country"] = country
    record["Region"] = region

# print example output with country and region added
data[14032]

{'Id': 30308242,
 'IndicatorCode': 'SDGSUICIDE',
 'SpatialDimType': 'COUNTRY',
 'SpatialDim': 'PRK',
 'ParentLocationCode': 'SEAR',
 'TimeDimType': 'YEAR',
 'ParentLocation': 'South-East Asia',
 'Dim1Type': 'SEX',
 'Dim1': 'BTSX',
 'TimeDim': 2001,
 'Dim2Type': 'AGEGROUP',
 'Dim2': 'YEARSALL',
 'Dim3Type': None,
 'Dim3': None,
 'DataSourceDimType': None,
 'DataSourceDim': None,
 'Value': '9.5 [5.5-15.5]',
 'NumericValue': 9.54,
 'Low': 5.48,
 'High': 15.4521,
 'Comments': None,
 'Date': '2022-12-08T16:19:00.19+01:00',
 'TimeDimensionValue': '2001',
 'TimeDimensionBegin': '2001-01-01T00:00:00+01:00',
 'TimeDimensionEnd': '2001-12-31T00:00:00+01:00',
 'Country': "Democratic People's Republic of Korea",
 'Region': 'South-East Asia'}

Collected data displayed in a Pandas dataframe:

In [None]:
# Build a DataFrame from the records, drop the redundant columns,
# and use the rate per 100 000 population (NumericValue) as the index.
df = (
    pd.DataFrame(data)
    .drop(
        columns=[
            "IndicatorCode", "Id", "SpatialDimType", "TimeDimType", "Dim1Type",
            "Dim3Type", "Dim3", "DataSourceDimType",
            "DataSourceDim", "Date", "TimeDimensionBegin", "TimeDimensionEnd",
            "Comments", "TimeDimensionValue", "Low", "High",
        ]
    )
    .set_index("NumericValue")
)

# Show the rows whose Dim2Type is not "AGEGROUP", highest NumericValue first.
not_age_group = df["Dim2Type"] != "AGEGROUP"
df[not_age_group].sort_values(by=["NumericValue"], ascending=False)


Unnamed: 0_level_0,SpatialDim,TimeDim,Dim1,Dim2Type,Dim2,Value,Country,Region
NumericValue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
147.8102,LSO,2014,MLE,,,147.8 [56.7-258.0],Lesotho,Africa
147.3488,LSO,2015,MLE,,,147.3 [56.2-258.8],Lesotho,Africa
139.8896,LSO,2013,MLE,,,139.9 [53.7-243.4],Lesotho,Africa
139.5874,LSO,2016,MLE,,,139.6 [52.5-244.8],Lesotho,Africa
127.2498,LSO,2017,MLE,,,127.2 [47.6-224.2],Lesotho,Africa
...,...,...,...,...,...,...,...,...
0.0000,,2018,MLE,,,0.0 [0.0-0.0],,
0.0000,,2007,FMLE,,,0.0 [0.0-0.0],,
0.0000,,2019,MLE,,,0.0 [0.0-0.0],,
0.0000,,2005,FMLE,,,0.0 [0.0-0.0],,


Finally, store data in JSON file:

In [None]:
# Serialise the annotated records to a JSON file in the raw dataset directory.
out_path = dir_raw / "suicide_rates.json"

with out_path.open("w") as sink:
    json.dump(data, sink)

# Report the number of records so it can be verified in Task 2.
print("Length of data in Task 1 = ", len(data))

Length of data in Task 1 =  17679
