In [None]:
# Import Data Commons

import pandas as pd
import numpy as np
import datacommons_pandas as dc

# Import other required libraries
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd

import json
import time

import os
from dotenv import load_dotenv

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import clear_output, display
from IPython.core.display import HTML, JSON

import requests
from bs4 import BeautifulSoup


import sys
sys.path.append('../') # add the project's root directory to the system path

from utils_excel import * # import the utils_excel module from the project's root directory


In [None]:
# Load variables from a local .env file (if present), then read the
# Data Commons API key that the requests in this notebook send.
load_dotenv()
api_key = os.getenv("DC_KEY")

# Fail fast: with no key, every request URL built below would carry
# `key=None`, and the bulk-download loop further down swallows failures and
# would just log zero variables for every series.
if not api_key:
    raise RuntimeError("DC_KEY is not set; define it in the environment or in a .env file.")

In [None]:
# Load the list of SDG series from the input workbook. Its `dcid` column is
# iterated further down to query Data Commons once per series.
sdg_series = pd.read_excel('../data/input/SDG_Series.xlsx')
# Preview the first rows.
sdg_series.head()

In [None]:
def call_api(endpoint, parameters):
    """Send a GET request to the Data Commons REST API and return the parsed JSON.

    Args:
        endpoint: API path without a leading slash, e.g. ``"v1/properties"``.
        parameters: Path suffix appended verbatim to ``endpoint``,
            e.g. ``"/out/SDG_SE_AGP_CPRA"``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # HTTPS (as already used for the SPARQL endpoint in this notebook) keeps the
    # API key out of clear-text traffic.
    url = f"https://api.datacommons.org/{endpoint}{parameters}"
    # Echo the request URL without the key so the key never lands in saved output.
    print(url)
    # `params` URL-encodes the key; the timeout stops a stalled connection from
    # hanging the kernel.
    response = requests.get(url, params={"key": api_key}, timeout=30)
    return response.json()

# Local graph exploration

## 1. Get all properties associated with a specific node

Get all properties associated with a specific node.

More specifically, this endpoint returns the labels of the edges connected to a specific node in the Data Commons Knowledge Graph. Edges in the graph are directed, so properties can either be labels for edges towards or away from the node. Outgoing edges correspond to properties of the node. Incoming edges denote that the node is the value of this property for some other node.

In [None]:
# Labels of the edges pointing at the SDG_SE_AGP_CPRA node ("in") ...
call_api("v1/properties", "/in/SDG_SE_AGP_CPRA")
# ... and of the edges leaving it ("out"), i.e. the node's own properties.
call_api("v1/properties", "/out/SDG_SE_AGP_CPRA")

## 2. Get the value for a property of a specific node

Get the values of a property for a specific node.

Data Commons represents properties as labels of directed edges between nodes, where the successor node is a value of the property. Thus, this endpoint returns nodes connected to the queried node via the property queried.

In [None]:

# Values of three outgoing properties of the series node: name, provenance, typeOf.
call_api("v1/property/values", "/out/SDG_SE_AGP_CPRA/name")
print('---')
call_api("v1/property/values", "/out/SDG_SE_AGP_CPRA/provenance")
print('---')
call_api("v1/property/values", "/out/SDG_SE_AGP_CPRA/typeOf")
print('---')
# Incoming direction: nodes whose `populationType` is this series. The same
# request is used further down to collect the variables of every series.
call_api("v1/property/values", "/in/SDG_SE_AGP_CPRA/populationType")


## 3. Variable info

Get basic information about a variable.

This API returns basic information on a variable, given the variable’s DCID. The information provided includes the number of entities that have data for the variable, the minimum and maximum value observed, and the name and DCID of the top 3 entities with highest observed values for that variable. The information is grouped by place type (country, state, county, etc.).

In [None]:
# Basic information for the variable with DCID sdg/SE_AGP_CPRA_LOWSEC.
call_api("v1/info/variable", "/sdg/SE_AGP_CPRA_LOWSEC")

## 4. Triples
Get the triples (subject–predicate–object statements) attached to a specific node.

Useful for finding local connections between nodes of the Data Commons knowledge graph.

In [None]:

#  Outgoing triples of one statistical variable.
#
#  The statistical variable "sdg/SE_AGP_CPRA_LOWSEC_Q2" has:
#
#   - name:
#       "Adjusted gender parity index for completion rate: Lower secondary, Quantile 2"
#
#   - typeOf:
#       StatisticalVariable (Class)
#
#   - populationType:
#       SDG_SE_AGP_CPRA (SDG_Series)
#
#   - constraintProperties:
#       [educationalAttainment (Property), sdg_quantile (Property)]
#
#   - sdg_quantile:
#       SDG_QuantileEnum_Q2 (SDG_QuantileEnum)
#
#   - educationalAttainment:
#       SDG_EducationLevelEnum_LOWSEC (SDG_EducationLevelEnum)
#
#   - statType:
#       measuredValue (Property)
#
#   - provenance:
#       dc/base/HumanReadableStatVars (Provenance)
#
#   - memberOf:
#       dc/g/SDGSEAGPCPRA_educationalAttainment-SDGEducationLevelEnumLOWSEC_sdgquantile-SDGQuantileEnumQ2 (StatVarGroup)

call_api("v1/triples", "/out/sdg/SE_AGP_CPRA_LOWSEC_Q2")

In [None]:
#  Outgoing triples of the StatVarGroup that the variable above is a member of.
#
#  The StatVarGroup "dc/g/SDGSEAGPCPRA_educationalAttainment-SDGEducationLevelEnumLOWSEC_sdgquantile-SDGQuantileEnumQ2" has:
#
#   - name:
#       "Adjusted Gender Parity Index for Completion Rate With Educational Attainment = Lower Secondary, Quantile = Quantile 2"
#
#   - typeOf:
#       StatVarGroup (Class)
#
#   - specializationOf:
#       [
#        dc/g/SDGSEAGPCPRA_educationalAttainment-SDGEducationLevelEnumLOWSEC_sdgquantile (StatVarGroup) --> "Adjusted Gender Parity Index for Completion Rate With Educational Attainment = Lower Secondary, Quantile",
#        dc/g/SDGSEAGPCPRA_educationalAttainment_sdgquantile-SDGQuantileEnumQ2 (StatVarGroup) --> "Adjusted Gender Parity Index for Completion Rate With Educational Attainment, Quantile = Quantile 2"
#       ]
#
#   - provenance:
#       dc/base/GeneratedGraphs (Provenance)


call_api("v1/triples", "/out/dc/g/SDGSEAGPCPRA_educationalAttainment-SDGEducationLevelEnumLOWSEC_sdgquantile-SDGQuantileEnumQ2")

In [None]:
# Outgoing triples of the StatVarGroup class node itself.
call_api("v1/triples", "/out/StatVarGroup")

# SPARQL Query

In [None]:
# Set up the SPARQL endpoint URL.
# NOTE: `url` is reassigned by later cells in this notebook, so re-run this
# cell before re-running the query cell below out of order.
url = "https://api.datacommons.org/v1/query"
# The API key is sent in the X-API-Key request header.
headers = {"X-API-Key": api_key}

## 1. Get all resources of type `SDG_Series`

In [None]:
# SPARQL query: dcid and name of every node typed SDG_Series, sorted by dcid
# in descending order.
query = """
        SELECT ?dcid ?name
        WHERE {
          ?x typeOf SDG_Series .
          ?x dcid ?dcid .
          ?x name ?name .
        }
        ORDER BY DESC(?dcid)
        """
request_body = {"sparql": query}

# Send the query to the endpoint configured in the previous cell; the timeout
# stops a stalled connection from hanging the kernel.
response = requests.post(url, headers=headers, data=json.dumps(request_body), timeout=60)
# Surface HTTP errors (bad key, malformed query) here rather than as a
# confusing KeyError: 'rows' below.
response.raise_for_status()

# Presumably a query without matches omits 'rows' altogether; treat that as
# an empty result instead of raising.
results = json.loads(response.content).get('rows', [])

# One row per series; the first cell is ?dcid and the second ?name, following
# the SELECT order.
pd.DataFrame(
    [('SDG_Series', r['cells'][0]['value'], r['cells'][1]['value']) for r in results],
    columns=['typeOf', 'dcid', 'name'],
)

---

In [None]:
# For every SDG series, fetch the nodes that point at it through an incoming
# `populationType` edge (its statistical variables) and log how many were
# found per series.
variable_frames = []
variable_counts = []
for sdg_series_id in sdg_series['dcid']:
    # HTTPS plus `params` keeps the API key out of clear-text traffic; the
    # timeout stops one stalled request from blocking the whole loop.
    response = requests.get(
        f"https://api.datacommons.org/v1/property/values/in/{sdg_series_id}/populationType",
        params={"key": api_key},
        timeout=30,
    )
    try:
        series_variables = pd.DataFrame(json.loads(response.content)['values'])
    except (KeyError, ValueError, TypeError):
        # No 'values' in the payload, or a body that is not valid / tabular
        # JSON: log the series with zero variables and move on. Anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed.
        variable_counts.append({'SDG_Series': sdg_series_id, 'No_variables': 0})
        continue
    series_variables['SDG_Series'] = sdg_series_id
    variable_frames.append(series_variables)
    variable_counts.append({'SDG_Series': sdg_series_id, 'No_variables': len(series_variables)})

# pd.concat raises on an empty list; fall back to an empty frame that still
# has the `dcid` column read by the cells below.
statistical_variables = (
    pd.concat(variable_frames)
    if variable_frames
    else pd.DataFrame(columns=['dcid', 'SDG_Series'])
)
statistical_variables_log = pd.DataFrame(variable_counts)


In [None]:
# Write the variables table and the per-series log to Excel workbooks.
# NOTE(review): `write_to_excel` presumably comes from the star import of
# utils_excel; the trailing arguments look like a sheet name and a numeric
# setting (90) — confirm their meaning against that helper.
write_to_excel(statistical_variables, '../data/output/statistical_variables.xlsx', 'StatisticalVariables', 90)
write_to_excel(statistical_variables_log, '../data/output/statistical_variables_log.xlsx', 'StatisticalVariables', 90)

In [None]:
# List the outgoing properties of one example statistical variable.
# HTTPS plus `params` keeps the API key out of clear-text traffic.
url = "https://api.datacommons.org/v1/properties/out/sdg/SE_AGP_CPRA_LOWSEC_Q1"
response = requests.get(url, params={"key": api_key}, timeout=30)
# Surface HTTP errors here rather than as a KeyError: 'properties' below.
response.raise_for_status()
properties = json.loads(response.content)['properties']
properties





In [None]:
# Collect, for every statistical variable, the nodes listed under its
# `constraintProperties` edge: one record per (variable, property) pair.
# The accumulator used to be initialised as `properties` while the loop
# appended to `constraint_properties`, which fails on a fresh kernel.
# NOTE(review): the following cell repeats this same download; one of the two
# cells can be removed.
constraint_records = []

for sdg_variable_id in statistical_variables['dcid']:
    # HTTPS plus `params` keeps the API key out of clear-text traffic.
    url = f"https://api.datacommons.org/v1/property/values/out/{sdg_variable_id}/constraintProperties"
    response = requests.get(url, params={"key": api_key}, timeout=30)
    payload = json.loads(response.content)
    # Payloads without a 'values' key contribute no records.
    for cp in payload.get('values', []):
        constraint_records.append(
            {'statistical_variable': sdg_variable_id,
             'constraintProperty': cp['dcid'],
             'constraintProperty_name': cp['name']
            }
        )

# Explicit columns keep the schema stable even when no records were collected.
constraint_properties = pd.DataFrame(
    constraint_records,
    columns=['statistical_variable', 'constraintProperty', 'constraintProperty_name'],
)

In [None]:
# Build a long-format table with one row per (statistical variable,
# constraint property) pair by querying each variable's outgoing
# `constraintProperties` edge.
constraint_rows = []

for sdg_variable_id in statistical_variables['dcid']:
    # HTTPS plus `params` keeps the API key out of clear-text traffic; the
    # timeout stops one stalled request from blocking the whole loop.
    response = requests.get(
        f"https://api.datacommons.org/v1/property/values/out/{sdg_variable_id}/constraintProperties",
        params={"key": api_key},
        timeout=30,
    )
    payload = json.loads(response.content)
    # Payloads without a 'values' key contribute no rows.
    constraint_rows.extend(
        {
            'statistical_variable': sdg_variable_id,
            'constraintProperty': cp['dcid'],
            'constraintProperty_name': cp['name'],
        }
        for cp in payload.get('values', [])
    )

# Explicit columns keep the schema stable even when no rows were collected, so
# the pivot in the later cell still finds the columns it refers to.
constraint_properties = pd.DataFrame(
    constraint_rows,
    columns=['statistical_variable', 'constraintProperty', 'constraintProperty_name'],
)


In [None]:
# Inspect the long-format table (one row per variable / constraint property pair).
constraint_properties

In [None]:
# Reshape the long table to wide form: one row per statistical variable and one
# column per constraint property. Each cell holds the first property name seen
# for that pair, or an empty string where the variable lacks the property. The
# variable id is then moved from the index back into an ordinary column.
constraint_properties = (
    constraint_properties
    .pivot_table(
        values='constraintProperty_name',
        index='statistical_variable',
        columns='constraintProperty',
        aggfunc='first',
    )
    .fillna('')
    .reset_index()
)

# Export the wide table to an Excel workbook.
write_to_excel(constraint_properties, '../data/output/constraint_properties.xlsx', 'ConstraintProperties', 90)

In [None]:
# Show the pivoted (wide) table produced and exported by the previous cell.
constraint_properties