Based on the original code from "CSV2RDF-tutorial.ipynb".

Changes made by Roderick van der Weerdt & Nienke Speet.

# Handling the Data:

In [2]:
# Converts csv into dict.
from csv import DictReader

def _read_semicolon_csv(path):
    """Return every row of the semicolon-delimited CSV file at `path` as a dict.

    Keys are the column headers exactly as DictReader reads them from the
    first line of the file; values are the raw cell strings.
    """
    with open(path, 'r') as handle:
        reader = DictReader(handle, skipinitialspace=True, quotechar='"', delimiter=';')
        return [dict(record) for record in reader]


# Venue data (turned into RDF further down).
filename = "EtenDrinken_clean.csv"
csv_contents = _read_semicolon_csv(filename)

# Stop data (turned into RDF further down).
filename = "OVHALTES.csv"
stop_contents = _read_semicolon_csv(filename)

In [3]:
# Builds a dict keyed by route identifier; each value holds all the
# destinations of that route. A destination is only added when it has
# not been added already (similarity.py is used to decide whether an
# equivalent destination is already present).

from similarity import similarity

def checkDest(dest_dict, line_id, dest, threshold=0.6, similarity_fn=None):
    """Add `dest` to the destinations of route `line_id` unless an equivalent one exists.

    Two destinations count as equivalent when their similarity score is above
    `threshold`. Of a set of equivalent destinations only the longer spelling
    is kept: known equivalents that are shorter than `dest` are replaced by
    `dest`; if every known equivalent is at least as long, `dest` is dropped.

    Parameters:
        dest_dict: dict mapping route id -> list of destination strings; the
            list stored under `line_id` is modified in place.
        line_id: key of the route in `dest_dict` (must already be present).
        dest: destination string to merge in.
        threshold: similarity score above which two strings are treated as
            the same destination.
        similarity_fn: callable taking two strings and returning a score;
            defaults to the module-level `similarity` function.

    Raises:
        KeyError: if `line_id` is not a key of `dest_dict`.
    """
    if similarity_fn is None:
        similarity_fn = similarity

    known = dest_dict[line_id]
    if dest in known:
        return

    # Classify first, mutate afterwards: changing the list while looping over
    # it makes the loop skip the element that follows each removal.
    similar = [other for other in known if similarity_fn(other, dest) > threshold]
    if not similar:
        known.append(dest)
        return

    shorter = [other for other in similar if len(other) < len(dest)]
    if shorter:
        # Slice assignment keeps the same list object that dest_dict refers to.
        known[:] = [other for other in known if other not in shorter]
        known.append(dest)
            
def testInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

# Maps each route id to the list of destinations collected for that route.
dicto = dict()

for row in stop_contents:
    # 'Lijnen_en_bestemming' holds comma-separated entries; the first word of
    # an entry is the route id, the remaining words form the destination.
    for line in row['Lijnen_en_bestemming'].split(','):
        # Drop the empty tokens produced by repeated or leading spaces.
        line_split = [word for word in line.split(' ') if word]
        if not line_split:
            # Empty or whitespace-only entry: nothing to record.
            continue
        line_id = line_split[0]
        # Entries whose first word is longer than 3 characters are not
        # treated as routes.
        if len(line_id) > 3:
            continue
        destination = ' '.join(line_split[1:])
        if line_id in dicto:
            checkDest(dicto, line_id, destination)
        else:
            dicto[line_id] = [destination]

route_dict = dicto

### Setting up stuff for RDF

We import the things we'll need from `rdflib`:

* `Dataset` is the object in which we will store our RDF graphs
* `URIRef` is the datatype for URI-resources
* `Literal` is the datatype for literal resources (strings, dates etc.)
* `Namespace` is used to create namespaces (parts of the URI's we are going to make)
* `RDF`, `RDFS`, `OWL` and `XSD` are built in namespaces

**NB**: We'll use "group 2", which is our group name.

In [4]:
from rdflib import Dataset, URIRef, Literal, Namespace, RDF, RDFS, OWL, XSD

# Base IRI under which all of our resources are minted
data = 'http://data.krw.d2s.labs.vu.nl/group2/resource/'
DATA = Namespace(data)
# Base IRI for our vocabulary items (schema information, RDFS, OWL classes
# and properties etc.); note that it is the same string as `data`
vocab = 'http://data.krw.d2s.labs.vu.nl/group2/resource/'
VOCAB = Namespace(vocab)

# External vocabularies we reuse
GEO = Namespace('http://www.w3.org/2003/01/geo/wgs84_pos#')
DB_OWL = Namespace('http://dbpedia.org/ontology/')
SCHEMA = Namespace('http://schema.org/')
SWPO = Namespace('http://sw-portal.deri.org/ontologies/swportal#')
PLACE = Namespace('http://purl.org/ontology/places#')

# The URI for our graph
graph_uri = URIRef('http://data.krw.d2s.labs.vu.nl/resource/graph2')

# Create the dataset and register a prefix for every namespace,
# in the same order as they are declared above
dataset = Dataset()
for _prefix, _namespace in [
    ('g2data', DATA),
    ('g2vocab', VOCAB),
    ('geo', GEO),
    ('dbpedia-owl', DB_OWL),
    ('schema', SCHEMA),
    ('swpo', SWPO),
    ('place', PLACE),
]:
    dataset.bind(_prefix, _namespace)

# Ask the dataset for the graph object that lives under our URI.
graph = dataset.graph(graph_uri)

### Let's make some RDF from our CSV Dictionary

A straightforward conversion:

* Make sure you have URIRef objects for all resources you want to make
* Make sure you have Literal objects for all literal values you need. Be sure to use the proper datatype or a language tag.
* Decide on what URI will be the 'primary key' for each row.
* Decide on the terms you are going to use to create the relations (predicates, properties)
* Add the triples to the graph

In [5]:
#Make a graph from the venue data

# IRI baker is a library that reliably creates valid (parts of) IRIs from strings (spaces are turned into underscores, etc.).
from iribaker import to_iri

# The class IRIs are identical for every row, so build them once.
toko_type = URIRef(to_iri(VOCAB['Toko']))
city_type = URIRef(to_iri(DB_OWL['city']))

# Columns that become a single string literal when they are non-empty,
# paired with the predicate used for them.
optional_literals = (
    ('Calendarsummary', VOCAB['opening_hours']),
    ('Adres', SWPO['hasAddress']),
    ('Zipcode', SWPO['hasZipCode']),
)

for row in csv_contents:
    # Title + zipcode together form the 'primary key' IRI of a venue.
    toko = URIRef(to_iri(data + row['Title'] + '_' + row['Zipcode']))
    toko_name = Literal(row['Title'], datatype=XSD['string'])

    city = URIRef(to_iri(data + row['City']))
    city_name = Literal(row['City'], lang='nl')

    # 'Urls' is a comma-separated list; empty entries are skipped.
    for url in row['Urls'].split(','):
        if url:
            graph.add((toko, SCHEMA['url'], Literal(url, datatype=XSD['string'])))

    # Media are url-links to pictures of the venue
    for media in row['Media'].split(','):
        if media:
            graph.add((toko, DB_OWL['Media'], Literal(media, datatype=XSD['string'])))

    for column, predicate in optional_literals:
        value = row[column]
        if value:
            graph.add((toko, predicate, Literal(value, datatype=XSD['string'])))

    # Change dutch decimal mark (',') to international ('.')
    lat = row['Latitude'].replace(',', '.')
    lon = row['Longitude'].replace(',', '.')

    latitude = Literal(float(lat), datatype=XSD['double'])
    longitude = Literal(float(lon), datatype=XSD['double'])

    # Coordinate text cut off after two decimals. partition() leaves a value
    # without a decimal point intact; slicing on find('.') + 3 would cut such
    # a value down to its first two characters because find() returns -1.
    lat_whole, lat_dot, lat_frac = lat.partition('.')
    lon_whole, lon_dot, lon_frac = lon.partition('.')
    use_lat = Literal(lat_whole + lat_dot + lat_frac[:2], datatype=XSD['string'])
    use_long = Literal(lon_whole + lon_dot + lon_frac[:2], datatype=XSD['string'])

    # All set... we are now going to add the triples to our graph
    graph.add((toko, RDF.type, toko_type))
    graph.add((toko, RDFS.label, toko_name))
    graph.add((toko, PLACE['in'], city))
    graph.add((toko, GEO['lat'], latitude))
    graph.add((toko, GEO['long'], longitude))

    graph.add((toko, VOCAB['use_lat'], use_lat))
    graph.add((toko, VOCAB['use_long'], use_long))

    graph.add((city, RDF.type, city_type))
    graph.add((city, RDFS.label, city_name))

In [6]:
# Make a graph from the stop data.

def getRouteIDs(routes):
    """Extract the unique route ids from a comma-separated routes string.

    Each comma-separated entry is split on spaces; its first non-empty word is
    taken as the route id. Entries that are empty or contain only spaces are
    skipped, as are entries whose first word is longer than 3 characters.

    Parameters:
        routes: string of comma-separated "<route id> <destination>" entries.

    Returns:
        A list of the distinct route ids, in order of first appearance.
    """
    route_ids = []
    for route in routes.split(','):
        # Drop the empty tokens produced by repeated or leading spaces.
        tokens = [word for word in route.split(' ') if word]
        if not tokens:
            continue
        route_id = tokens[0]
        if len(route_id) <= 3 and route_id not in route_ids:
            route_ids.append(route_id)
    return route_ids

def _stop_number(row):
    """Return the OBJECTNUMMER value of a stop row.

    The column is looked up by suffix so that the header is found whether or
    not it still carries a byte-order-mark prefix (in whatever form the file
    was decoded).

    Raises:
        KeyError: if the row has no column whose name ends in OBJECTNUMMER.
    """
    if 'OBJECTNUMMER' in row:
        return row['OBJECTNUMMER']
    for key in row:
        if key is not None and key.endswith('OBJECTNUMMER'):
            return row[key]
    raise KeyError('OBJECTNUMMER')


# The class IRIs are identical for every row, so build them once.
stop_type = URIRef(to_iri(VOCAB['Stop']))
route_type = URIRef(to_iri(VOCAB['Route']))

for row in stop_contents:
    stop_number = _stop_number(row)
    stop = URIRef(to_iri(data + 'Stop' + stop_number))
    stop_id = Literal(stop_number, datatype=XSD['int'])

    route_ids = getRouteIDs(row['Lijnen_en_bestemming'])

    # Link the stop to each of its routes and attach the destinations that
    # were collected for that route in route_dict.
    for route_id in route_ids:
        route = URIRef(to_iri(data + "ROUTE_" + route_id))
        graph.add((route, RDF.type, route_type))
        graph.add((stop, VOCAB['Routes'], route))
        for dest in route_dict[route_id]:
            destination = Literal(dest, datatype=XSD['string'])
            graph.add((route, VOCAB['Destination'], destination))

    # Change dutch decimal mark (',') to international ('.')
    lat = row['LAT'].replace(',', '.')
    lon = row['LNG'].replace(',', '.')

    latitude = Literal(float(lat), datatype=XSD['double'])
    longitude = Literal(float(lon), datatype=XSD['double'])

    # Coordinate text cut off after two decimals. partition() leaves a value
    # without a decimal point intact; slicing on find('.') + 3 would cut such
    # a value down to its first two characters because find() returns -1.
    lat_whole, lat_dot, lat_frac = lat.partition('.')
    lon_whole, lon_dot, lon_frac = lon.partition('.')
    use_lat = Literal(lat_whole + lat_dot + lat_frac[:2], datatype=XSD['string'])
    use_long = Literal(lon_whole + lon_dot + lon_frac[:2], datatype=XSD['string'])

    graph.add((stop, RDF.type, stop_type))
    graph.add((stop, RDFS.label, stop_id))
    graph.add((stop, GEO['lat'], latitude))
    graph.add((stop, GEO['long'], longitude))
    graph.add((stop, VOCAB['use_lat'], use_lat))
    graph.add((stop, VOCAB['use_long'], use_long))


Let's see how this turned out:

In [7]:
# Parenthesised form works as a print statement on Python 2 and as a
# function call on Python 3.
print(dataset.serialize(format='trig'))

@prefix dbpedia-owl: <http://dbpedia.org/ontology/> .
@prefix g2data: <http://data.krw.d2s.labs.vu.nl/group2/resource/> .
@prefix g2vocab: <http://data.krw.d2s.labs.vu.nl/group2/resource/> .
@prefix geo: <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix place: <http://purl.org/ontology/places#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix swpo: <http://sw-portal.deri.org/ontologies/swportal#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<ns1:default> = {}

<ns2:graph2> = {
    <http://data.krw.d2s.labs.vu.nl/group2/resource/'t_Beemster_Spijshuis_1462_HJ> a g2vocab:Toko ;
        rdfs:label "'t Beemster Spijshuis"^^xsd:string ;
        g2vocab:opening_hours "Ma: gesloten di-vr: 17:00 - 22:00 uur za, zo: 12:00 - 22:00 uur."^^xsd:string ;
        g2vocab:use_lat "52.54"^^xsd:string ;
        g2vocab:use_l

## Graph handling

In [45]:
# Saving the RDF to a file
# The file is opened in binary mode because the rdflib serializer writes
# encoded bytes to the stream it is given (a text-mode handle is rejected on
# Python 3 and would translate newlines on Windows).
with open('extended-going-out-stops_13-4_v2.ttl','wb') as f:
    graph.serialize(f, format='turtle')

In [39]:
# Clearing the dataset

# Remove the current graph from the dataset (because we're going to start anew).
dataset.remove_graph(graph)
# Request a graph under the same URI again and rebind `graph` to it, so the
# cells above can be re-run against this name.
graph = dataset.graph(graph_uri)
