In [114]:
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, XSD, BNode

In [115]:
DCT = Namespace('http://purl.org/dc/terms/')
DBPEDIA_RESOURCE = Namespace("http://dbpedia.org/resource/")
DBPEDIA_PAGE = Namespace("http://dbpedia.org/page/")
QB = Namespace("http://purl.org/linked-data/cube#")
SDMX_CONCEPT = Namespace("http://purl.org/linked-data/sdmx/2009/concept#")
SDMX_DIMENSION = Namespace("http://purl.org/linked-data/sdmx/2009/dimension#")
SDMX_MEASURE = Namespace("http://purl.org/linked-data/sdmx/2009/measure#")
MYNS = Namespace('http://dsci558.org/myfakenamespace#')
INTERVAL = Namespace("http://reference.data.gov.uk/def/intervals/")
SCHEMA = Namespace('http://schema.org/')

In [116]:
g = Graph()

In [117]:
g.bind("rdf", RDF)
g.bind("rdfs", RDFS)
g.bind("dct", DCT)
g.bind("dbpedia-resource", DBPEDIA_RESOURCE)
g.bind("dbpedia-page", DBPEDIA_PAGE)
g.bind("qb", QB)
g.bind("xsd", XSD)
g.bind("sdmx-concept", SDMX_CONCEPT)
g.bind("sdmx-dimension", SDMX_DIMENSION)
g.bind("sdmx-measure", SDMX_MEASURE)
g.bind("my_ns", MYNS)
g.bind("interval", INTERVAL)
g.bind("schema", SCHEMA)

In [118]:
import csv

In [119]:
rows = []
with open("../../hw7/Homework07/world-development-indicators.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        rows.append(row)
        

In [120]:
list(rows[1])[:10]

['\ufeffSeries Name',
 'Series Code',
 'Country Name',
 'Country Code',
 '1960 [YR1960]',
 '1961 [YR1961]',
 '1962 [YR1962]',
 '1963 [YR1963]',
 '1964 [YR1964]',
 '1965 [YR1965]']

In [121]:
node_uri = URIRef(MYNS['measure_femalePopulation'])
g.add((node_uri, RDF.type, RDF.Property))
g.add((node_uri, RDF.type, QB.MeasureProperty))
g.add((node_uri, RDFS.label, Literal("female population percentage", lang="en")))
g.add((node_uri, RDFS.subPropertyOf, SDMX_MEASURE.obsValue))
g.add((node_uri, RDFS.range, DBPEDIA_PAGE.Percentage))

measure_female = BNode()
g.add((measure_female, QB.measure, node_uri))

node_uri = URIRef(MYNS['measure_malePopulation'])
g.add((node_uri, RDF.type, RDF.Property))
g.add((node_uri, RDF.type, QB.MeasureProperty))
g.add((node_uri, RDFS.label, Literal("male population percentage", lang="en")))
g.add((node_uri, RDFS.subPropertyOf, SDMX_MEASURE.obsValue))
g.add((node_uri, RDFS.range, DBPEDIA_PAGE.Percentage))

measure_male = BNode()
g.add((measure_male, QB.measure, node_uri))

node_uri = URIRef(MYNS['measure_population'])
g.add((node_uri, RDF.type, RDF.Property))
g.add((node_uri, RDF.type, QB.MeasureProperty))
g.add((node_uri, RDFS.label, Literal("population count", lang="en")))
g.add((node_uri, RDFS.subPropertyOf, SDMX_MEASURE.obsValue))
g.add((node_uri, RDFS.range, DBPEDIA_RESOURCE.Person))

measure_population = BNode()
g.add((measure_population, QB.measure, node_uri))


node_uri = URIRef(MYNS['country'])
g.add((node_uri, RDF.type, SCHEMA.Class))
g.add((node_uri, RDFS.label, XSD.string))

node_uri = URIRef(MYNS['refArea'])
g.add((node_uri, RDF.type, RDF.Property))
g.add((node_uri, RDF.type, QB.DimensionProperty))
g.add((node_uri, RDFS.label, Literal("reference area", lang="en")))
g.add((node_uri, RDFS.range, MYNS.country))
g.add((node_uri, QB.concept, SDMX_CONCEPT.refArea))

refPeriod = BNode()
g.add((refPeriod, QB.dimension, node_uri))
g.add((refPeriod, QB.order, Literal(2)))

node_uri = URIRef(MYNS['refPeriod'])
g.add((node_uri, RDF.type, RDF.Property))
g.add((node_uri, RDF.type, QB.DimensionProperty))
g.add((node_uri, RDFS.label, Literal("reference period", lang="en")))
g.add((node_uri, RDFS.range, INTERVAL.Interval))
g.add((node_uri, QB.concept, SDMX_CONCEPT.refPeriod))

refArea = BNode()
g.add((refArea, QB.dimension, node_uri))
g.add((refArea, QB.order, Literal(1)))

node_uri = URIRef(MYNS['datastructure'])
g.add((node_uri, RDF.type, QB.DataStructureDefinition))
g.add((node_uri, QB.component, refArea))
g.add((node_uri, QB.component, refPeriod))
g.add((node_uri, QB.component, measure_female))
g.add((node_uri, QB.component, measure_male))
g.add((node_uri, QB.component, measure_population))


node_uri = URIRef(MYNS['dataset'])
g.add((node_uri, RDF.type, QB.DataSet))
g.add((node_uri, DCT.title, Literal("Country Population Statistics", lang="en")))
g.add((node_uri, RDFS.label, Literal("Country Population Statistics", lang="en")))
g.add((node_uri, QB.structure, MYNS.datastructure))

In [122]:
list(rows[1].items())[:10]

[('\ufeffSeries Name', 'Population, total'),
 ('Series Code', 'SP.POP.TOTL'),
 ('Country Name', 'Albania'),
 ('Country Code', 'ALB'),
 ('1960 [YR1960]', '1608800'),
 ('1961 [YR1961]', '1659800'),
 ('1962 [YR1962]', '1711319'),
 ('1963 [YR1963]', '1762621'),
 ('1964 [YR1964]', '1814135'),
 ('1965 [YR1965]', '1864791')]

In [123]:
# eg:obs1a  a qb:Observation;
#     qb:dataSet eg:dataset1;
#     sdmx-dimension:refTime "2010-07-30"^^xsd:date;
#     eg-measure:weight 1.3 ;
#     eg-measure:quantity 42 ;
#     . 

In [124]:
# eg:o13 a qb:Observation;
#     qb:dataSet  eg:dataset-le3 ;
#     eg:refArea                 ex-geo:monmouthshire_00pp ;                  
#     eg:lifeExpectancy          76.6 ;
#     .

In [125]:
# g.serialize('blah.ttl', format="turtle")

In [126]:
def create_key_from_name(name):
    return "-".join(name.split())

In [127]:
country_nodes = {}
country_year_nodes = {}
for i, row in enumerate(rows):
    country_name = row["Country Name"]
    if len(country_name):
        country_key = create_key_from_name(country_name)
        if country_key not in country_nodes:
            node_uri = URIRef(MYNS[country_key])
            g.add((node_uri, RDF.type, MYNS.country))
            g.add((node_uri, RDFS.label, Literal(country_name)))
            country_nodes[country_key] = node_uri

        country_node = country_nodes[country_key]

        measure = row['Series Code']
        if measure == "SP.POP.TOTL":
            measure_node = MYNS.measure_population
        elif measure == "SP.POP.TOTL.FE.ZS":
            measure_node = MYNS.measure_femalePopulation
        else:
            measure_node = MYNS.measure_malePopulation

        for j, entry in enumerate(list(row.items())[4:]):
            year = str(entry[0].split(" [")[0])
            value = entry[1]
            try:
                float_value = float(value)
            except:
                continue

            uri = country_key + "-" + year
            if uri not in country_year_nodes:
                node_uri = URIRef(MYNS[uri])
                g.add((node_uri, RDF.type, QB.Observation))
                g.add((node_uri, QB.dataSet, MYNS.dataset))
                g.add((node_uri, MYNS.refArea, country_node))
                date_string = "http://reference.data.gov.uk/id/gregorian-interval/{}-01-01T00:00:00/P3Y".format(year)
                date_node_uri = URIRef(date_string)
                g.add((node_uri, MYNS.refPeriod, date_node_uri))
                country_year_nodes[uri] = node_uri

            node_uri = country_year_nodes[uri]

            if measure_node == MYNS.measure_population:
                value = int(value)
            else:
                value = float(value)

            g.add((node_uri, measure_node, Literal(value)))


In [128]:
g.serialize('world_bank_data.ttl', format="turtle")