# Explore SDG Groups


Obtain the full hierarchy of `StatVarGroup` instances that are related to the SDG statistical variables in Data Commons (DC), by following the `specializationOf` relation. This seems to be the way we should model the SDG indicator framework: Goal > Target > Indicator > Series >> Variable.

In [1]:
# Import Data Commons

import pandas as pd
import numpy as np
import datacommons_pandas as dc

# Import other required libraries
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd

import json
import time

import ast

import os
from dotenv import load_dotenv

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import clear_output, display
from IPython.core.display import HTML, JSON

import requests
from bs4 import BeautifulSoup


import sys
sys.path.append('../') # add the project's root directory to the system path

from utils_excel import * # import the utils_excel module from the project's root directory


---

In [2]:
# Read a local .env file (if any) into the process environment
load_dotenv()
# Data Commons API key; stays None when DC_KEY is not defined
api_key = os.environ.get("DC_KEY")

True

### SPARQL Query

In [3]:
# set up the API endpoint URL (Data Commons v1 query endpoint, used for SPARQL queries)
url = "https://api.datacommons.org/v1/query"
# Request headers carrying the API key under "X-API-Key".
# NOTE(review): neither `url` nor `headers` is used by any request in the visible cells.
headers = {"X-API-Key": api_key}

### REST API

In [4]:
def call_api(endpoint, parameters, timeout=60):
    """Send a GET request to the Data Commons REST API and return the decoded JSON.

    Args:
        endpoint: API path without a leading slash, e.g. 'v1/property/values'.
        parameters: Path suffix appended directly to the endpoint,
            e.g. '/out/<dcid>/specializationOf'.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled request cannot block a long crawl indefinitely.

    Returns:
        The response body parsed from JSON.
    """
    # HTTPS so the API key never travels in clear text.
    url = f"https://api.datacommons.org/{endpoint}{parameters}"
    # Log the request without the key so it does not end up in saved cell output.
    print(url)
    # Passing the key via `params` lets requests URL-encode it; requests leaves
    # the parameter out entirely when api_key is None.
    response = requests.get(url, params={"key": api_key}, timeout=timeout)
    return json.loads(response.content)

---

In [5]:
# Load the table of statistical-variable properties from the output folder.
# Its 'memberOf' column is parsed further down as either a plain group id
# or a stringified list of group ids.
variable_properties = pd.read_excel('../data/output/Variable_Properties.xlsx')
# Preview the first three rows
variable_properties.head(3)

Unnamed: 0,dcid,measuredProperty,measuredProperty__name,memberOf,memberOf__name,name,populationType,populationType__name,provenance,provenance__name,...,sdg_typeOfWasteTreatment,sdg_typeOfWasteTreatment__name,sdg_frequencyOfChlorophyllAConcentration,sdg_frequencyOfChlorophyllAConcentration__name,sdg_deviationLevel,sdg_deviationLevel__name,sdg_typeOfRenewableTechnology,sdg_typeOfRenewableTechnology__name,sdg_foodWasteSector,sdg_foodWasteSector__name
0,sdg/VC_DSR_AFFCT,value,value,"['dc/g/SDG_1.5.1', 'dc/g/SDG_11.5.1', 'dc/g/SD...","['1.5.1: Number of deaths, missing persons and...",Number of people affected by disaster,SDG_VC_DSR_AFFCT,Number of people affected by disaster,dc/base/HumanReadableStatVars,HumanReadableStatVars,...,,,,,,,,,,
1,sdg/VC_DSR_DAFF,value,value,"['dc/g/SDG_1.5.1', 'dc/g/SDG_11.5.1', 'dc/g/SD...","['1.5.1: Number of deaths, missing persons and...",Number of directly affected persons attributed...,SDG_VC_DSR_DAFF,Number of directly affected persons attributed...,dc/base/HumanReadableStatVars,HumanReadableStatVars,...,,,,,,,,,,
2,sdg/VC_DSR_IJILN,value,value,"['dc/g/SDG_1.5.1', 'dc/g/SDG_11.5.1', 'dc/g/SD...","['1.5.1: Number of deaths, missing persons and...",Number of injured or ill people attributed to ...,SDG_VC_DSR_IJILN,Number of injured or ill people attributed to ...,dc/base/HumanReadableStatVars,HumanReadableStatVars,...,,,,,,,,,,


### 1. Obtain groups directly linked to SDG statistical variables through the `memberOf` relation

In [6]:
SDG_Groups = list(variable_properties['memberOf'])

# Collect ids in a set: membership is O(1) and duplicates are dropped for free,
# instead of scanning a growing list for every candidate value.
unique_group_ids = set()

for element in SDG_Groups:
    # Non-string cells are skipped (presumably NaN from empty Excel cells)
    if not isinstance(element, str):
        continue
    if not element.startswith("["):
        # Plain string: a single group id
        unique_group_ids.add(element)
    elif element.endswith("]"):
        # Stringified list of group ids: parse it safely and add every member
        unique_group_ids.update(ast.literal_eval(element))

SDG_Groups_unique = sorted(unique_group_ids)
print(f"{len(SDG_Groups_unique)=}")

len(SDG_Groups_unique)=2045


In [7]:
# Wrap the sorted group ids in a one-column frame and export it to Excel
x = pd.DataFrame({'SDG_Groups_from_variables': SDG_Groups_unique})
write_to_excel(x, '../data/output/SDG_Groups_from_variables.xlsx', 'SDG_Groups_from_variables', 90)

### 2. Obtain `specializationOf` groups

In [8]:
def get_parent_groups(group_id):
    """Fetch the groups that `group_id` points to through `specializationOf`.

    Calls the 'v1/property/values' endpoint for the outgoing
    `specializationOf` property of `group_id`.

    Args:
        group_id: dcid of the group whose parents are requested.

    Returns:
        A DataFrame built from the response's 'values' entries, with an extra
        'child_group' column holding `group_id`. An empty DataFrame is
        returned when the response has no usable 'values' payload.
    """
    endpoint = 'v1/property/values'
    parameters = f'/out/{group_id}/specializationOf'
    response_data = call_api(endpoint, parameters)
    try:
        parents = pd.DataFrame(response_data['values'])
        parents['child_group'] = group_id
    except (KeyError, TypeError, ValueError):
        # Missing 'values' key or a payload that cannot be tabulated: treat as
        # "no parents". Anything else (e.g. KeyboardInterrupt) must propagate.
        parents = pd.DataFrame()
    return parents

In [9]:
# Group_hierarchy = []
# Group_hierarchy_log = []

# for group_id in SDG_Groups_unique:
    
#     x = get_parent_groups(group_id)
#     Group_hierarchy.append(x)
#     log_entry = {'child_group': group_id, 'No_parent_groups': len(x)}
#     clear_output(wait=True)

# Group_hierarchy1 = pd.concat(Group_hierarchy)
# Group_hierarchy1_log =pd.DataFrame(Group_hierarchy_log)
    

In [10]:
def recursive_func(df, dfs):
    """Climb the group hierarchy level by level, collecting every parent link.

    Starting from the ids in ``df['dcid']``, each group's parents are fetched
    with ``get_parent_groups``; the parents found become the next level, and
    so on until a level yields nothing new.

    Each group id is fetched at most once. This keeps a cycle in the hierarchy
    from looping forever and avoids re-requesting ancestors shared by many
    groups; it only removes rows that would have been exact repeats.

    Args:
        df: DataFrame whose 'dcid' column holds the starting group ids.
        dfs: List that receives one DataFrame per processed level. It is
            appended to in place, so the caller can inspect the levels.

    Returns:
        One DataFrame concatenating all frames in ``dfs``, or an empty
        DataFrame when nothing was collected.
    """
    visited = set()
    current_level = df

    while not current_level.empty:
        # Ids at this level that have not been queried yet (missing ids dropped)
        pending = [
            group_id
            for group_id in current_level['dcid'].dropna().unique()
            if group_id not in visited
        ]
        if not pending:
            break
        visited.update(pending)

        # Parents of every pending group form the next level
        current_level = pd.concat([get_parent_groups(group_id) for group_id in pending])
        dfs.append(current_level)

        # Keep the notebook output short: only the latest request log stays visible
        clear_output(wait=True)

    # pd.concat raises on an empty list, so handle "nothing collected" explicitly
    return pd.concat(dfs) if dfs else pd.DataFrame()

In [11]:
# Seed the crawl with every group directly linked to an SDG variable
dfs = []
df0 = pd.DataFrame({'dcid': SDG_Groups_unique})

# Crawl upwards, stringify every cell, then drop repeated rows
result = recursive_func(df0, dfs).astype(str).drop_duplicates()

write_to_excel(result, '../data/output/SDG_Groups_Hierarchy.xlsx', 'SDG_Groups_Hierarchy', 90)

http://api.datacommons.org/v1/property/values/out/dc/g/Root/specializationOf
