# Pre-processing

In [67]:
import sys
sys.path.append("../")
from rdflib import Graph
from rdflib import URIRef, BNode, Literal
from rdflib import Namespace
from rdflib.namespace import OWL, RDF, RDFS, FOAF, XSD
from rdflib.util import guess_format
import pandas as pd
from isub import isub
from lookup import DBpediaLookup
import csv
import owlrl
import numpy as np
import Levenshtein as lev 
import re
from isub import isub

# Look-ups
from lookup import DBpediaLookup, WikidataAPI, GoogleKGLookup
from endpoints import DBpediaEndpoint, WikidataEndpoint


Dataset

In [70]:
# Absolute path of the coursework CSV; it is machine-specific and must be adjusted when run elsewhere.
dataset_path = 'C:/Users/rahem/OneDrive/Desktop/Semantic_KG/cw_data.csv'
# Load the CSV into a pandas DataFrame for the exploratory checks below.
cw_data = pd.read_csv(dataset_path)

Detecting missing values

In [72]:
# Count the null cells in each column of cw_data.
cw_data.isnull().sum()

name                  0
address               0
city                  0
country               0
postcode             10
state                 0
categories            0
menu item             0
item value           78
currency             75
item description    325
dtype: int64

In [73]:
# Preview the first 10 rows of cw_data.
cw_data.head(10)

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,OR,Pizza Place,Bianca Pizza,22.5,USD,
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701.0,OR,Pizza Place,Cheese Pizza,18.95,USD,
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes"
5,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Salami Piccante",15.0,USD,
6,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049.0,Brentwood,"American Restaurant,Bar,Bakery","Pizza, White Truffle Meat Sauce",15.0,USD,
7,Bravo Pizza Hollywood,5142 Hollywood Blvd,Los Angeles,US,90027.0,Los Feliz,Pizza Place,Cheese Pizza,10.99,USD,Choose a pizza size.
8,Bravo Pizza Hollywood,5142 Hollywood Blvd,Los Angeles,US,90027.0,Los Feliz,Pizza Place,Hawaiian Pizza,11.99,USD,"Canadian bacon, pineapple."
9,Bravo Pizza Hollywood,5142 Hollywood Blvd,Los Angeles,US,90027.0,Los Feliz,Pizza Place,Meat Lover Pizza,16.99,USD,


In [None]:
class TabularData(object):
    """Turn the restaurant/menu CSV into RDF triples over the coursework ontology.

    The instance owns an rdflib ``Graph`` (``self.g``), the pandas data frame
    read from the CSV (``self.data_frame``) and a cache (``self.stringToURI``)
    that maps lower-cased cell values to the URI minted or looked up for them.

    Two ways of populating the graph are offered:

    * ``SimpleUniqueMapping`` walks the rows once and emits every triple of a
      row in one go.
    * The ``mappingToCreate*`` methods are column-driven and are combined by
      the caller. ``mappingToCreateTypeTriple`` has to run first for a column
      because it is the method that fills ``self.stringToURI``.
    """

    def __init__(self, input_file, ontology_file='cw_onto.ttl'):
        """Load the ontology and the CSV.

        Args:
            input_file: path of the CSV file (comma separated, ``"`` quoting,
                backslash escaping).
            ontology_file: Turtle file parsed into the graph before any data
                triple is added. Defaults to the file name the original code
                hard-coded.
        """
        self.file = input_file

        # Cache of name -> URI. Especially useful when a remote service is
        # queried for a candidate URI, to avoid repeated calls.
        self.stringToURI = dict()

        # 1. GRAPH INITIALIZATION
        self.g = Graph()
        # parse() is a method of the graph; this class has no parse() of its own.
        self.g.parse(ontology_file, format="ttl")

        # Namespace under which every fresh URI is minted.
        self.namespace_string = "http://www.semanticweb.org/in3067-inm713/restaurants#"
        # Namespace helper: self.cw_onto.Food is URIRef(namespace_string + "Food").
        self.cw_onto = Namespace(self.namespace_string)
        # Prefix used when the graph is serialized.
        self.g.bind("cw", self.cw_onto)

        # Load data in dataframe
        self.data_frame = pd.read_csv(self.file, sep=',', quotechar='"', escapechar="\\")

        # External KG used by getExternalKGURI
        self.dbpedia = DBpediaLookup()

    def is_nan(self, x):
        """Return True when ``x`` is NaN (the only value that differs from itself)."""
        return (x != x)

    def _cache_key(self, value):
        """Return the key under which ``value`` is stored in ``self.stringToURI``."""
        return str(value).lower()

    def _fresh_uri(self, text):
        """Return a URI string in the local namespace: lower-cased, spaces -> ``_``."""
        return self.namespace_string + str(text).lower().replace(" ", "_")

    def SimpleUniqueMapping(self):
        """Add the triples of every CSV row in a single pass.

        Unlike the modular ``mappingToCreate*`` methods this is less flexible
        to adaptations. Column positions (``itertuples(index=False)``)::

            0 name   1 address   2 city   3 country   4 postcode   5 state
            6 categories   7 menu item   8 item value   9 currency
            10 item description

        A row is skipped only when a cell needed to mint a URI (name, address,
        city, country, menu item) is missing. Price and currency triples are
        emitted when those cells are present, so a missing description or
        postcode no longer discards the whole row.

        NOTE(review): the ontology file is not visible from this code. The
        classes ``Food``, ``Location``, ``Currency`` and the properties
        ``amount``, ``name``, ``cityIsLocatedIn`` are the ones the original
        code referred to; ``ItemValue`` replaces the original ``"Item value"``
        (a space is not valid inside a URI). Confirm all of them against
        cw_onto.ttl.
        """
        required = (0, 1, 2, 3, 7)

        for row in self.data_frame.itertuples(index=False):
            if any(self.is_nan(row[i]) for i in required):
                continue

            location_uri = URIRef(self._fresh_uri(row[1]))
            city_uri = URIRef(self._fresh_uri(row[2]))
            country_uri = URIRef(self._fresh_uri(row[3]))
            food_uri = URIRef(self._fresh_uri(row[7]))

            # Type triples
            self.g.add((food_uri, RDF.type, self.cw_onto.Food))
            self.g.add((location_uri, RDF.type, self.cw_onto.Location))

            if not self.is_nan(row[9]):
                currency_uri = URIRef(self._fresh_uri(row[9]))
                self.g.add((currency_uri, RDF.type, self.cw_onto.Currency))

            # Price of the menu item
            if not self.is_nan(row[8]):
                # The original never defined the URI of the value node. It is
                # minted here from restaurant + menu item so that the same dish
                # sold at two restaurants keeps two distinct prices.
                # NOTE(review): confirm this URI scheme.
                item_value_uri = URIRef(self._fresh_uri("%s %s value" % (row[0], row[7])))
                self.g.add((item_value_uri, RDF.type, self.cw_onto.ItemValue))
                self.g.add((item_value_uri, self.cw_onto.amount, Literal(row[8], datatype=XSD.float)))

            # City and country names
            self.g.add((city_uri, self.cw_onto.name, Literal(row[2], datatype=XSD.string)))
            self.g.add((country_uri, self.cw_onto.name, Literal(row[3], datatype=XSD.string)))

            # Connection between cities and countries. The template this was
            # adapted from chose a "capital" predicate from a column this CSV
            # does not have, so only the default predicate applies.
            self.g.add((city_uri, self.cw_onto.cityIsLocatedIn, country_uri))

            # TODO(review): still unmapped - the restaurant itself (name),
            # postcode, state, categories, item description, and the triple
            # for the menu item that the original left unfinished
            # (no predicate/object was given).

    def createURIForEntity(self, name, useExternalURI):
        """Create, cache and return the URI for ``name``.

        A fresh URI in the local namespace is the default. When
        ``useExternalURI`` is true and the external KG returns a candidate,
        that candidate replaces the fresh URI.
        """
        # We create fresh URI (default option)
        self.stringToURI[name] = self.namespace_string + name.replace(" ", "_")

        if useExternalURI:  # We connect to online KG
            uri = self.getExternalKGURI(name)
            if uri != "":
                self.stringToURI[name] = uri

        return self.stringToURI[name]

    def getExternalKGURI(self, name):
        """Return the DBpedia candidate whose label is most similar to ``name``.

        Up to 5 candidates are requested and ranked with ``isub``; the first
        candidate with the highest score wins. Returns ``''`` when there is no
        candidate scoring above -1.
        """
        entities = self.dbpedia.getKGEntities(name, 5)
        best_sim = -1
        best_uri = ''
        for ent in entities:
            isub_score = isub(name, ent.label)
            if best_sim < isub_score:
                best_uri = ent.ident
                best_sim = isub_score
        return best_uri

    def mappingToCreateTypeTriple(self, subject_column, class_type, useExternalURI):
        """Add ``<entity> rdf:type <class_type>`` for each value of a column.

        Example of the resulting triple: ``cw_onto:london rdf:type cw_onto:City``.

        Args:
            subject_column: column where the entity information is stored.
            class_type: URIRef of the class, e.g. ``self.cw_onto.City``.
            useExternalURI: whether the URI is taken from the external KG
                (when a candidate exists) instead of being fresh.
        """
        for subject in self.data_frame[subject_column]:
            # An empty cell cannot name an entity.
            if self.is_nan(subject):
                continue

            key = self._cache_key(subject)
            entity_uri = self.stringToURI.get(key)
            if entity_uri is None:
                entity_uri = self.createURIForEntity(key, useExternalURI)

            # For the individuals we use URIRef to create an object "URI" out
            # of the string URIs; the class comes from the ontology namespace.
            self.g.add((URIRef(entity_uri), RDF.type, class_type))

    def mappingToCreateLiteralTriple(self, subject_column, object_column, predicate, datatype):
        """Add ``<entity> <predicate> "literal"^^datatype`` row by row.

        Example of the resulting triple: ``cw_onto:london cw_onto:name "London"``.
        Rows whose subject or literal value is missing are skipped.

        Raises:
            KeyError: if a subject has no URI yet, i.e.
                ``mappingToCreateTypeTriple`` was not run on ``subject_column``.
        """
        pairs = zip(self.data_frame[subject_column], self.data_frame[object_column])
        for subject, lit_value in pairs:
            if self.is_nan(subject):
                continue
            if lit_value is None or self.is_nan(lit_value) or lit_value == "":
                continue

            # URI as already created
            entity_uri = self.stringToURI[self._cache_key(subject)]
            lit = Literal(lit_value, datatype=datatype)
            self.g.add((URIRef(entity_uri), predicate, lit))

    def mappingToCreateObjectTriple(self, subject_column, object_column, predicate):
        """Add ``<subject> <predicate> <object>`` row by row.

        Rows whose subject or object cell is missing are skipped.

        Raises:
            KeyError: if either entity has no URI yet, i.e.
                ``mappingToCreateTypeTriple`` was not run on its column.
        """
        pairs = zip(self.data_frame[subject_column], self.data_frame[object_column])
        for subject, obj in pairs:
            if self.is_nan(subject) or self.is_nan(obj):
                continue

            # URIs as already created
            subject_uri = self.stringToURI[self._cache_key(subject)]
            object_uri = self.stringToURI[self._cache_key(obj)]
            self.g.add((URIRef(subject_uri), predicate, URIRef(object_uri)))

    def mappingToCreateCapitalTriple(self, subject_column, object_column, capital_value_column):
        """Link subject to object with a predicate chosen from a third column.

        The value ``"admin"``, ``"primary"`` or ``"minor"`` selects the
        matching capital predicate; any other value (including an empty cell)
        falls back to ``cityIsLocatedIn``. Rows whose subject or object cell
        is missing are skipped.

        Raises:
            KeyError: if either entity has no URI yet, i.e.
                ``mappingToCreateTypeTriple`` was not run on its column.
        """
        predicate_for = {
            "admin": self.cw_onto.isFirstLevelAdminCapitalOf,
            "primary": self.cw_onto.isCapitalOf,
            "minor": self.cw_onto.isSecondLevelAdminCapitalOf,
        }
        triples = zip(
            self.data_frame[subject_column],
            self.data_frame[object_column],
            self.data_frame[capital_value_column],
        )
        for subject, obj, value in triples:
            if self.is_nan(subject) or self.is_nan(obj):
                continue

            # URIs as already created
            subject_uri = self.stringToURI[self._cache_key(subject)]
            object_uri = self.stringToURI[self._cache_key(obj)]

            # (default) if value is empty or not expected
            predicate = self.cw_onto.cityIsLocatedIn
            if isinstance(value, str):
                predicate = predicate_for.get(value, predicate)

            self.g.add((URIRef(subject_uri), predicate, URIRef(object_uri)))

    def performReasoning(self, ontology_file):
        """Load ``ontology_file`` into the graph and expand it with OWL 2 RL entailments.

        The triple count is printed before loading, after loading and after
        reasoning.
        """
        print("Data triples from CSV: '" + str(len(self.g)) + "'.")

        # We should load the ontology first. Graph.load() no longer exists in
        # rdflib 6+, parse() is the supported call.
        self.g.parse(ontology_file, format=guess_format(ontology_file))  # e.g., format=ttl

        print("Triples including ontology: '" + str(len(self.g)) + "'.")

        # We apply reasoning and expand the graph with new triples
        owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=False, datatype_axioms=False).expand(self.g)

        print("Triples after OWL 2 RL reasoning: '" + str(len(self.g)) + "'.")

    def performSPARQLQuery(self, file_query_out):
        """Write capitals with more than 5,000,000 inhabitants to a CSV file.

        Each output line is ``"country","city","population"``, ordered by
        descending population.
        """
        # The graph binds the ontology namespace as "cw", not "cw_onto", so the
        # prefixes used by the query are supplied explicitly. Passing initNs
        # replaces the graph's own bindings, hence rdf and xsd are listed too.
        prefixes = {"cw_onto": self.cw_onto, "rdf": RDF, "xsd": XSD}

        qres = self.g.query(
            """SELECT DISTINCT ?country ?city ?pop WHERE {
              ?city rdf:type cw_onto:City .
              ?city cw_onto:isCapitalOf ?country .
              ?city cw_onto:population ?pop .
              FILTER (xsd:integer(?pop) > 5000000)
        }
        ORDER BY DESC(?pop)
        """,
            initNs=prefixes)

        print("%s capitals satisfying the query." % (str(len(qres))))

        # The context manager closes the file even if a row fails to format.
        with open(file_query_out, "w+") as f_out:
            for row in qres:
                # Row is a list of matched RDF terms: URIs, literals or blank nodes
                line_str = '\"%s\",\"%s\",\"%s\"\n' % (row.country, row.city, row.pop)
                f_out.write(line_str)

    def performSPARQLQueryLab7(self):
        """Print, per country, the number of cities linked through ``hasCity``.

        Lines are printed as ``"country","num_cities"`` in descending order of
        the count.
        """
        # The original query used the prefix "lab5", which is not bound in this
        # graph; the ontology namespace of this class is used instead.
        qres = self.g.query(
            """SELECT DISTINCT ?country (COUNT(?city) AS ?num_cities) WHERE { 
              ?country cw_onto:hasCity ?city .
        }
        GROUP BY ?country
        ORDER BY DESC(?num_cities)
        """,
            initNs={"cw_onto": self.cw_onto})
        for row in qres:
            # Row is a list of matched RDF terms: URIs, literals or blank nodes
            line_str = '\"%s\",\"%s\"' % (row.country, row.num_cities)
            print(line_str)

    def saveGraph(self, file_output):
        """Serialize the graph to ``file_output`` in Turtle format."""
        self.g.serialize(destination=file_output, format='ttl')
        