In [None]:
! pip install rdflib
! pip install rdfextras
import pandas as pd
import rdflib
import rdfextras
import rdflib.plugins.sparql as sparql
from rdflib import Graph
from rdflib.namespace import RDF
rdfextras.registerplugins()

In [None]:
# Load ./graph_data/locations_events.ttl (Turtle) into its own graph.
g1 = Graph()
g1.parse(source="./graph_data/locations_events.ttl", format="turtle")

In [None]:
# Load ./graph_data/companies_features.ttl (Turtle); get_lei_city() queries this graph.
g2 = Graph()
g2.parse(source="./graph_data/companies_features.ttl", format="turtle")

In [None]:
# Load ./graph_data/export.xml (RDF/XML); this is the graph that receives the lat/long triples below.
g3 = Graph()
g3.parse(source="./graph_data/export.xml", format="xml")

In [5]:
# SCM table; later cells read its "ID", "lat", "long" and "index" columns.
scm = pd.read_csv('../data_preparation/data/SCM_prep_data_ID.csv')

Add Latitude and Longitude

In [6]:
def add_lat_long(graph, id, lat, long):
    """Attach latitude/longitude triples to an individual already in ``graph``.

    Runs a SPARQL ``INSERT ... WHERE`` that adds ``schema:hasLatitude`` and
    ``schema:hasLongitude`` (typed ``xsd:decimal``) to ``individual:<id>``.
    Because of the ``WHERE`` clause nothing is inserted unless the individual
    already has at least one triple in ``graph``.

    Parameters:
        graph: object exposing ``update(sparql_string)`` (an rdflib Graph here).
        id: local name of the individual inside the ``individual:`` namespace.
        lat, long: coordinate values; they are formatted verbatim into the
            literal. If either is missing (``pd.isna``) the call is a no-op.

    Returns:
        None. The values are always printed first, even when skipped.
    """
    print(id, lat, long)
    if pd.isna(lat) or pd.isna(long):
        return
    # `xsd:` is declared explicitly so the update does not depend on whatever
    # prefixes happen to be bound on the target graph.
    graph.update("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX schema: <neo4j://graph.schema#>
        PREFIX individual: <neo4j://graph.individuals#>

        INSERT {{
            individual:{id} schema:hasLatitude "{lat}"^^xsd:decimal .
            individual:{id} schema:hasLongitude "{long}"^^xsd:decimal
        }}
        WHERE {{
            individual:{id} ?p ?o
        }}
    """.format(id = id, lat = lat, long = long))

In [None]:
# Push every SCM row's coordinates into g3; rows with a missing lat/long are skipped inside add_lat_long.
for row in range(len(scm)):
    add_lat_long(g3, scm["ID"][row], scm["lat"][row], scm["long"][row])

Export Graph with Latitude and Longitude

In [8]:
# Serialise g3 (now carrying the lat/long triples) as Turtle and write it to
# output_scm.ttl, the file a later cell parses back in as g5.
scm_latlong = g3.serialize(format="turtle")

with open("output_scm.ttl", mode="w", encoding="utf-8") as ttl_file:
    ttl_file.write(scm_latlong)

In [9]:
# Union of g1 (locations_events.ttl) and g3 (export.xml plus the inserted lat/long triples),
# so the query below can match events and suppliers in a single graph.
g_merged = g1 + g3

Create Supplier Event Nodes

In [10]:
# Pair every schema:Supplier with every coy:Event whose latitude AND longitude
# both lie within +/- 2 (in the units of the stored coordinates) of the supplier's.
# This is a bounding-box test on the raw values, not a geodesic distance.
# `.bindings` yields one mapping per (company, event) match; the cells below
# read its 'company' and 'event' entries.
# NOTE(review): the pattern joins all events against all suppliers before
# filtering, so run time presumably grows with events x suppliers — confirm on full data.
company_events = g_merged.query("""
      PREFIX owl: <http://www.w3.org/2002/07/owl#>
      PREFIX coy: <https://schema.coypu.org/global#>
      PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      PREFIX geof: <http://www.opengis.net/def/function/geosparql/>
      PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
      PREFIX schema: <neo4j://graph.schema#>
      PREFIX individual: <neo4j://graph.individuals#>

      SELECT ?company ?event
      WHERE {
      # Events
      ?event a coy:Event ;
            coy:hasLatitude ?eventLat ;
            coy:hasLongitude ?eventLong .

      # Companies
      ?company a schema:Supplier ;
            schema:hasLatitude ?companyLat ;
            schema:hasLongitude ?companyLong .

      FILTER(
            ?eventLat  <= ?companyLat + 2 && ?eventLat >= ?companyLat - 2  &&
            ?eventLong  <= ?companyLong + 2 && ?eventLong >= ?companyLong - 2
            )

      }
""").bindings

In [11]:
def to_company_uri(id, company_uri):
    """Build the triple that ties supplier-event node ``id`` to its supplier.

    The supplier number is the text after the first ``#`` in ``company_uri``,
    converted to ``int`` (so a URI without ``#`` raises ``IndexError`` and a
    non-numeric fragment raises ``ValueError``).

    Returns:
        ``(subject, predicate, object)`` where the subject is
        ``neo4j://graph.individuals/events#<id>``, the predicate is
        ``neo4j://graph.schema#supplier_id`` and the object is an integer
        literal.
    """
    # Parse the number first so a malformed URI fails before anything is built.
    supplier_number = int(company_uri.split('#')[1])
    return (
        rdflib.URIRef('neo4j://graph.individuals/events#' + str(id)),
        rdflib.URIRef('neo4j://graph.schema#supplier_id'),
        rdflib.Literal(supplier_number),
    )


In [12]:
def to_event_uri(id, event_uri):
    """Build the triple that ties supplier-event node ``id`` to an event.

    Returns:
        ``(subject, predicate, event_uri)`` where the subject is
        ``neo4j://graph.individuals/events#<id>`` and the predicate is
        ``neo4j://graph.schema#HAS_EVENT``; ``event_uri`` is passed through
        unchanged as the object.
    """
    return (
        rdflib.URIRef('neo4j://graph.individuals/events#' + str(id)),
        rdflib.URIRef('neo4j://graph.schema#HAS_EVENT'),
        event_uri,
    )

In [13]:
def get_type_property(id):
    """Build the ``rdf:type`` triple declaring node ``id`` a SupplierEvent.

    Returns:
        ``(subject, RDF.type, neo4j://graph.schema#SupplierEvent)`` with the
        subject ``neo4j://graph.individuals/events#<id>``.
    """
    return (
        rdflib.URIRef('neo4j://graph.individuals/events#' + str(id)),
        RDF.type,
        rdflib.URIRef('neo4j://graph.schema#SupplierEvent'),
    )

In [14]:
# Build one SupplierEvent node per (company, event) match, numbered from 1.
# Each node gets three triples: its rdf:type, the supplier number and the
# link to the event. enumerate() supplies the counter so the loop no longer
# rebinds the built-in `id` for the rest of the notebook session.
g4 = rdflib.Graph()
for event_id, binding in enumerate(company_events, start=1):
    g4.add(get_type_property(event_id))
    g4.add(to_company_uri(event_id, binding['company']))
    g4.add(to_event_uri(event_id, binding['event']))


Export the graph as Turtle

In [15]:
# Serialise the SupplierEvent graph (g4) as Turtle and write it to supplier_events.ttl.
supplier_events = g4.serialize(format="turtle")

with open("supplier_events.ttl", mode="w", encoding="utf-8") as ttl_file:
    ttl_file.write(supplier_events)

Join Dedupe Results

In [None]:
def parse_matches(df):
    """Return the rows of ``df`` where both 'company' and 'index_scm' are present.

    Rows with a missing value in either column are dropped; the original row
    labels and all other columns are kept.
    """
    has_company = df['company'].notna()
    has_scm_index = df['index_scm'].notna()
    return df.loc[has_company & has_scm_index]

# Stack the matched rows of the three dedupe outputs (cn, de, us — in that order).
# Row labels are not reset here, so they repeat across the three inputs.
dedupe = pd.concat([
    parse_matches(pd.read_csv(csv_path))
    for csv_path in (
        '../dedupe/output_cn.csv',
        '../dedupe/output_de.csv',
        '../dedupe/output_us.csv',
    )
])

In [326]:
# The concatenated frame still carries each CSV's own row labels; the loop further down
# indexes dedupe by 0..len-1, so the labels are renumbered here.
# NOTE(review): without drop=True the old labels are inserted as an extra column, and because
# this mutates in place, re-running the cell on the same frame adds another column (and
# presumably errors once the fallback name is taken) — re-run the concat cell first.
dedupe.reset_index(inplace = True)

In [256]:
# Lookup table from the SCM "index" column to the SCM "ID" column.
idx = {scm["index"][row]: scm["ID"][row] for row in range(len(scm))}

# Translate every matched index_scm into its SCM ID; an index missing from the
# table raises KeyError rather than silently producing NaN.
dedupe["scm_ID"] = dedupe["index_scm"].map(lambda scm_index: idx[scm_index])

In [330]:
def get_lei_city(uri, graph=None):
    """Look up the city and LEI code recorded for company ``uri``.

    Parameters:
        uri: company URI as a string; it is placed between ``<`` and ``>`` in
            the query text as-is.
        graph: object exposing ``query(sparql_string)`` whose result has a
            ``.bindings`` list. Defaults to the notebook-level ``g2``.

    Returns:
        ``(city, lei)`` taken from the first result row. Note the order:
        city first, LEI second, despite the function name.

    Raises:
        IndexError: if the company has no (hasLeiCode, hasCity) pair in the
            graph. The message names the URI so the offending row can be
            found when this is called in a loop.
    """
    if graph is None:
        graph = g2

    bind = graph.query("""
    SELECT ?lei ?city
    WHERE {{
        <{uri}> <https://schema.coypu.org/global#hasLeiCode> ?lei .
        <{uri}> <https://schema.coypu.org/global#hasCity> ?city .
        }}""".format(uri = uri)).bindings

    if not bind:
        raise IndexError(
            "no hasLeiCode/hasCity pair found for company <{}>".format(uri)
        )

    # Only the first row is used even if several LEI/city combinations match.
    first = bind[0]
    return (first['city'], first['lei'])

In [None]:
# Reload the lat/long-enriched SCM graph written to output_scm.ttl earlier;
# update_company_fields() adds the dedupe results to this graph.
g5 = Graph()
g5.parse(source="./output_scm.ttl", format="turtle")

In [347]:
def _escape_sparql_string(value):
    """Return ``str(value)`` escaped for use inside a double-quoted SPARQL literal."""
    text = str(value)
    # Backslash must be handled first so the escapes added below are not doubled.
    for raw, escaped in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
        text = text.replace(raw, escaped)
    return text


def update_company_fields(scm_id_, coy_id_, lei_, city_, graph=None):
    """Add scm_id, lei, city and HAS_COYPU literals to an SCM individual.

    Runs a SPARQL ``INSERT ... WHERE`` against ``graph``. All four values are
    stored as plain string literals (including the company URI under
    ``schema:HAS_COYPU``). Because of the ``WHERE`` clause nothing is inserted
    unless ``individual:<scm_id_>`` already has at least one triple.

    Parameters:
        scm_id_: local name of the individual in the ``individual:`` namespace;
            also stored as the ``schema:scm_id`` value.
        coy_id_: company identifier stored under ``schema:HAS_COYPU``.
        lei_: value stored under ``schema:lei``.
        city_: value stored under ``schema:city``.
        graph: object exposing ``update(sparql_string)``. Defaults to the
            notebook-level ``g5``.

    Returns:
        None. The four values are printed before the update runs.
    """
    if graph is None:
        graph = g5

    print(scm_id_, coy_id_, lei_, city_)
    # The three free-text values are escaped: a quote or backslash in e.g. a
    # city name would otherwise end the literal early and break (or alter)
    # the update. Values without such characters produce the same query text
    # as before.
    graph.update("""
            PREFIX schema: <neo4j://graph.schema#>
            PREFIX individual: <neo4j://graph.individuals#>
            
            INSERT {{
                    individual:{scm_id} schema:scm_id "{scm_id}" .
                    individual:{scm_id} schema:lei "{lei}" .
                    individual:{scm_id} schema:city "{city}" .
                    individual:{scm_id} schema:HAS_COYPU "{coy_id}"
                }}
            WHERE {{
                    individual:{scm_id} ?p ?o 
                }}""".format(
                    scm_id = scm_id_,
                    coy_id = _escape_sparql_string(coy_id_),
                    lei = _escape_sparql_string(lei_),
                    city = _escape_sparql_string(city_),
                )
            )

Extend the SCM graph with the LEI code, city, and COYPU ID from the COYPU knowledge graph


In [None]:
# For every dedupe match: fetch the company's city/LEI from g2 and write them,
# together with the company URI, onto the matching SCM individual.
for row in range(len(dedupe)):
    print("index", row)
    company_uri = dedupe["company"][row]
    city, lei = get_lei_city(company_uri)
    update_company_fields(dedupe["scm_ID"][row], str(company_uri), lei, city)

In [351]:
# Serialise the fully extended SCM graph (g5) as Turtle and write it to scm_done.ttl.
scm_done = g5.serialize(format="turtle")

with open("scm_done.ttl", mode="w", encoding="utf-8") as ttl_file:
    ttl_file.write(scm_done)