In [57]:
import sys
sys.path.append("../")
from rdflib import Graph
from rdflib import URIRef, BNode, Literal
from rdflib import Namespace
from rdflib.namespace import OWL, RDF, RDFS, FOAF, XSD
from rdflib.util import guess_format
import pandas as pd


import csv
import owlrl
import numpy as np
import Levenshtein as lev 
import re


# Look-ups


In [58]:
# Downloading dataset

path = 'C:/Users/rahem/OneDrive/Desktop/Semantic KG/cw_data.csv'
cw_data = pd.read_csv(path)

In [59]:
cw_data.head(5)

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,OR,Pizza Place,Bianca Pizza,22.5,USD,
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,OR,Pizza Place,Cheese Pizza,18.95,USD,
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes"


In [60]:
# Checking missing values

cw_data.isnull().sum()

name                  0
address               0
city                  0
country               0
postcode             10
state                 0
categories            0
menu item             0
item value           78
currency             75
item description    325
dtype: int64

# Converting tabular data to RDF triples

In [66]:
class KnowledgeGraphTabData(object):
    
    def __init__(self, input_file):
           
        self.file = input_file
    
        #Dictionary that keeps the URIs. Specially useful if accessing a remote service to get a candidate URI to avoid repeated calls
        self.stringToURI = dict()
        #1. GRAPH INITIALIZATION
        #Empty graph
        self.g = Graph()
        #Note that this is the same namespace used in the ontology "ontology_lab5.ttl"
        self.namespace_string= "http://www.semanticweb.org/in3067-inm713/restaurants#"
        #Special namspaces class to create directly URIRefs in python.           
        self.cw_onto = Namespace(self.namespace_string)
        #Prefixes for the serialization
        self.g.bind("cw", self.cw_onto)
        
        #Load data in dataframe  
        self.data_frame = pd.read_csv(self.file, sep=',', quotechar='"',escapechar="\\")    
        #KG
        self.dbpedia = DBpediaLookup()
    
    
    def Task1(self):
        self.CovertCSVToRDF(False)
        
    def Task2(self):
        self.CovertCSVToRDF(True)

    def CovertCSVToRDF(self, useExternalURI):
        
        if 'name' in self.data_frame:
            self.mappingToCreateTypeTriple('name', self.cw_onto.Location, useExternalURI)
            self.mappingToCreateLiteralTriple('name', 'name', self.cw_onto.restaurantName,XSD.string)
            
            if 'address' in self.data_frame:
                self.mappingToCreateLiteralTriple('address', 'address', self.cw_onto.firstLineAddress, XSD.string)
                
                if 'postcode' in self.data_frame:
                    self.mappingToCreateLiteralTriple('address', 'postcode', self.cw_onto.postCode, XSD.string)

            if 'city' in self.data_frame:
                self.mappingToCreateLiteralTriple('name', 'city', self.cw_onto.City, XSD.string )

            if 'country' in self.data_frame:
                self.mappingToCreateLiteralTriple('name','country', self.cw_onto.Country, XSD.string)

            if 'state' in self.data_frame:
                self.mappingToCreateLiteralTriple('name','state', self.cw_onto.State, XSD.string)

            if 'categories' in self.data_frame:
                self.mappingToCreateLiteralTriple('name','categories', self.cw_onto.Restaurant, XSD.string)
        
        if 'menu item' in self.data_frame:
            self.mappingToCreateTypeTriple('menu item', self.cw_onto.MenuItem, useExternalURI)
            self.mappingToCreateLiteralTriple('menu item', 'menu item', self.cw_onto.itemName, XSD.string)
        
            if 'item value' in self.data_frame:
                self.mappingToCreateLiteralTriple('menu item', 'item value', self.cw_onto.amount, XSD.double)

            if 'currency' in self.data_frame:
                self.mappingToCreateLiteralTriple('menu item','currency', self.cw_onto.Currency, XSD.string)

            if 'item description' in self.data_frame:
                self.mappingToCreateLiteralTriple('menu item','item description', self.cw_onto.Ingredient, XSD.string)
                
                
    def createURIForEntity(self, name, useExternalURI):
        
        #We create fresh URI (default option)
        self.stringToURI[name] = self.namespace_string + name.replace(" ", "_")
        
        if useExternalURI: #We connect to online KG
            uri = self.getExternalKGURI(name)
            if uri!="":
                self.stringToURI[name]=uri
        
        return self.stringToURI[name]

        
    def getExternalKGURI(self, name):
        entities = self.dbpedia.getKGEntities(name, 5)
        #print("Entities from DBPedia:")
        current_sim = -1
        current_uri=''
        for ent in entities:           
            isub_score = isub(name, ent.label) 
            if current_sim < isub_score:
                current_uri = ent.ident
                current_sim = isub_score
        
            #print(current_uri)
        return current_uri 
            
    
    
    #Mapping to create triples like cw_onto:London rdf:type cw_onto:City
    #A mapping may create more than one triple
    #column: columns where the entity information is stored
    #useExternalURI: if URI is fresh or from external KG
 
    def mappingToCreateTypeTriple(self, subject_column, class_type, useExternalURI):
        
        for subject in self.data_frame[subject_column]:
                
            #We use the ascii name to create the fresh URI for a city in the dataset
            if subject.lower() in self.stringToURI:
                entity_uri=self.stringToURI[subject.lower()]
            else:
                entity_uri=self.createURIForEntity(subject.lower(), useExternalURI)
            
            #TYPE TRIPLE
            #For the individuals we use URIRef to create an object "URI" out of the string URIs
            #For the concepts we use the ones in the ontology and we are using the NameSpace class
            #Alternatively one could use URIRef(self.namespace_string+"City") for example 
            self.g.add((URIRef(entity_uri), RDF.type, class_type))
    def is_nan(self, x):
        return (x != x)
            
            
   
    #Mappings to create triples of the form cw_onto:london cw_onto:name "London"
    
    
    def mappingToCreateLiteralTriple(self, subject_column, object_column, predicate, datatype):
        
        for subject, lit_value in zip(self.data_frame[subject_column], self.data_frame[object_column]):
            
            if self.is_nan(lit_value) or lit_value==None or lit_value=="":
                pass
            
            else:
                #Uri as already created
                entity_uri=self.stringToURI[subject.lower()]
                    
                #Literal
                lit = Literal(lit_value, datatype=datatype)
                
                #New triple
                self.g.add((URIRef(entity_uri), predicate, lit))
    
    def mappingToCreateObjectTriple(self, subject_column, object_column, predicate):
        
        for subject, object in zip(self.data_frame[subject_column], self.data_frame[object_column]):
            
            if self.is_nan(object):
                pass
            
            else:
                #Uri as already created
                subject_uri=self.stringToURI[subject.lower()]
                object_uri=self.stringToURI[object.lower()]
                #New triple
                self.g.add((URIRef(subject_uri), predicate, URIRef(object_uri)))
    
    def performReasoning(self, ontology_file):    
        #We expand the graph with the inferred triples
        #We use owlrl library with OWL2 RL Semantics (instead of RDFS semantic as we saw in lab 4)
        #More about OWL 2 RL Semantics in lecture/lab 7
        
        print("Data triples from CSV: '" + str(len(self.g)) + "'.")
    
        #We should load the ontology first
        #print(guess_format(ontology_file))
        self.g.load(ontology_file,  format=guess_format(ontology_file)) #e.g., format=ttl
        
        print("Triples including ontology: '" + str(len(self.g)) + "'.")
        
        #We apply reasoning and expand the graph with new triples 
        owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=False, datatype_axioms=False).expand(self.g)
        
        print("Triples after OWL 2 RL reasoning: '" + str(len(self.g)) + "'.")
    
    def performSPARQLQuery(self, file_query_out):
        
        qres = self.g.query(
            """SELECT DISTINCT ?country ?city ?pop WHERE {
              ?city rdf:type cw_onto:City .
              ?city cw_onto:isCapitalOf ?country .
              ?city cw_onto:population ?pop .
              FILTER (xsd:integer(?pop) > 5000000)
        }
        ORDER BY DESC(?pop)
        """)


        print("%s capitals satisfying the query." % (str(len(qres))))
        
        f_out = open(file_query_out,"w+")

        for row in qres:
            #Row is a list of matched RDF terms: URIs, literals or blank nodes
            line_str = '\"%s\",\"%s\",\"%s\"\n' % (row.country, row.city, row.pop)


            f_out.write(line_str)
            
     
        f_out.close()       
        
        
    def performSPARQLQueryLab7(self):
        
        qres = self.g.query(
            """SELECT DISTINCT ?country (COUNT(?city) AS ?num_cities) WHERE { 
              ?country lab5:hasCity ?city .
        }
        GROUP BY ?country
        ORDER BY DESC(?num_cities)
        """)
        for row in qres:
            #Row is a list of matched RDF terms: URIs, literals or blank nodes
            line_str = '\"%s\",\"%s\"' % (row.country, row.num_cities)
            print(line_str)
    
    def saveGraph(self, file_output):
        
        ##SAVE/SERIALIZE GRAPH
        #print(self.g.serialize(format="turtle").decode("utf-8"))
        self.g.serialize(destination=file_output, format='ttl')
        
        
    

In [68]:
 print("Data triples from CSV: '" + str(len(self.g)) + "'.")

NameError: name 'self' is not defined