In [None]:
pip install -q kglab;

In [None]:
import pickle
import re

import kglab
import networkx as nx
import numpy as np
import pandas as pd
import rdflib
import torch

from tqdm.notebook import tqdm

In [None]:
# CV/vacancy pairs; later cells use this frame to decide which candidates and
# requests are added to the graph.
df_cv_vac_pairs = pd.read_csv(filepath_or_buffer="./cleaner_data/cv-vacancy-pairs.csv")

In [None]:
# Distinct candidate ids that occur in at least one CV/vacancy pair.
candidates_with_data = {*df_cv_vac_pairs["candidate_id"].unique()}

In [None]:
# Prefix -> IRI table handed to the kglab KnowledgeGraph.
# Project-specific vocabularies come first, then DBpedia, then W3C/FOAF.
namespaces = dict(
    rnp="http://randstad.com/property/",
    rnd="http://randstad.com/ontology/",
    rne="http://randstad.com/entity/",
    dbp="http://dbpedia.org/property/",
    dbo="http://dbpedia.org/ontology/",
    dbr="http://dbpedia.org/resource/",
    owl="http://www.w3.org/2002/07/owl#",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
    foaf="http://xmlns.com/foaf/0.1/",
)

In [None]:
# The knowledge graph that every following cell adds triples to.
kg = kglab.KnowledgeGraph(
    name="Randstad KG",
    base_uri="https://www.example.com/entity/",
    namespaces=namespaces,
)

In [None]:
# Load the ISCO classification table and keep only the ISCO-08 rows.
iscos = pd.read_csv("isco.csv", encoding="cp850")
iscos = iscos.loc[iscos["ISCO_version"] == "ISCO-08"]

# Preview (last expression of the cell, so it is rendered).
iscos.head()

In [None]:
# Define ontology lay-out
#
# URIs are built with rdflib's item lookup (`namespace[name]`) instead of
# `eval()` on a formatted string: it yields the same URIRef without executing
# generated code, and it is much cheaper inside loops.
RDF = kg.get_ns("rdf")
RDFS = kg.get_ns("rdfs")
OWL = kg.get_ns("owl")
FOAF = kg.get_ns("foaf")
DBR = kg.get_ns("dbr")
RND = kg.get_ns("rnd")
RNP = kg.get_ns("rnp")
RNE = kg.get_ns("rne")

# Classes

# Candidate is a class, and a candidate is a person
kg.add(RND.Candidate, RDF.type, OWL.Class)
kg.add(RND.Candidate, RDFS.subClassOf, FOAF.Person)

# Function is a class, and is equivalent to employment
kg.add(RND.Function, RDF.type, OWL.Class)
kg.add(RND.Function, OWL.equivalentClass, DBR.Employment)

# Position is a class
kg.add(RND.Position, RDF.type, OWL.Class)

# Add educations and their ordering (highest level first, so each entry
# supersedes everything that follows it in the list).
#
# FIX: the education entities are created in the `dbr` namespace. The cells
# that attach educations to candidates (dbo:education) and to requests
# (rnp:requires_education) both point at dbr:<level>; declaring the levels in
# `rne` left the `supersedes` ordering disconnected from the actual data.
edus = ["University", "Vocational_university", "Vocational_school", "Secondary_education", "Primary_education"]

for i, edu in enumerate(edus):
    kg.add(DBR[edu], RDF.type, RND.Education)

    for lower_edu in edus[i + 1:]:
        kg.add(DBR[edu], RNP.supersedes, DBR[lower_edu])

# Company is a class, it offers functions, and candidates work there
kg.add(RND.Company, RDF.type, OWL.Class)
kg.add(RND.Company, RNP.offers_function, RND.Function)
kg.add(RND.Candidate, RNP.has_worked_at, RND.Company)

# An ISCO code is a class, and all sub-codes fall under that class
kg.add(RND.ISCO_code, RDF.type, OWL.Class)
kg.add(RND.ISCO_unit, RDFS.subClassOf, RND.ISCO_code)
kg.add(RND.ISCO_minor, RDFS.subClassOf, RND.ISCO_code)
kg.add(RND.ISCO_sub_major, RDFS.subClassOf, RND.ISCO_code)
kg.add(RND.ISCO_major, RDFS.subClassOf, RND.ISCO_code)

# Add skill, language, and license class
kg.add(RND.Skill, RDF.type, OWL.Class)
kg.add(RND.Language, RDF.type, OWL.Class)
kg.add(RND.License, RDF.type, OWL.Class)

# Isco levels: every table row describes one unit code together with its
# minor / sub-major / major ancestors. Columns are addressed positionally
# (row[0] is the DataFrame index). Ancestors repeat across rows; re-adding an
# identical triple does not create a duplicate in the graph.
for row in iscos.itertuples():
    unit = RNE[f"isco{row[8]}"]
    minor = RNE[f"isco{row[6]}"]
    sub_major = RNE[f"isco{int(row[4])}"]  # cast as in the original code, so the name carries no ".0"
    major = RNE[f"isco{row[2]}"]

    # Unit
    kg.add(unit, RDF.type, RND.ISCO_unit)
    kg.add(unit, RNP.falls_under, minor)
    kg.add(unit, RDFS.comment, rdflib.Literal(row[9]))

    # Minor
    kg.add(minor, RDF.type, RND.ISCO_minor)
    kg.add(minor, RNP.falls_under, sub_major)
    kg.add(minor, RDFS.comment, rdflib.Literal(row[7]))

    # Sub_major
    kg.add(sub_major, RDF.type, RND.ISCO_sub_major)
    kg.add(sub_major, RNP.falls_under, major)
    kg.add(sub_major, RDFS.comment, rdflib.Literal(row[5]))

    # Major
    kg.add(major, RDF.type, RND.ISCO_major)
    kg.add(major, RDFS.comment, rdflib.Literal(row[3]))

# Properties: inverse declarations for the schema-level predicates.
# No inverses are declared for the data-level predicates added by later cells
# (has_skill, worked_at_company, has_isco_code, speaks_language, ...).
kg.add(RNP.offers_function, OWL.inverseOf, RNP.function_is_offered_by)
kg.add(RNP.has_worked_at, OWL.inverseOf, RNP.has_employed)
kg.add(RNP.supersedes, OWL.inverseOf, RNP.subsedes)
kg.add(RNP.falls_under, OWL.inverseOf, RNP.encompasses)

# Falls under is transitive
kg.add(RNP.falls_under, RDF.type, OWL.TransitiveProperty)

In [None]:
# Size of the graph before and after materialising the OWL-RL closure.
measure = kglab.Measure()

measure.measure_graph(kg)
print(f"edges before inference {measure.get_edge_count()}")
print(f"nodes before inference {measure.get_node_count()}")

# Expand the graph with every triple entailed by the OWL-RL rules.
kg.infer_owlrl_closure()
measure.measure_graph(kg)

print()
print(f"edges after inference {measure.get_edge_count()}")
print(f"nodes after inference {measure.get_node_count()}")

## Candidates and experience

In [None]:
# Normalize candidate work experience
df_exp = pd.read_csv("./cleaner_data/work_experience.csv")

df_exp = df_exp.set_index("Unnamed: 0")

df_exp["function_name_self"] = df_exp["function_name_self"].str.lower()
df_exp["function_name_self"] = df_exp["function_name_self"].str.replace('[^a-zA-Z0-9]', '_', regex=True).str.strip()

df_exp["company_name"] = df_exp["company_name"].str.lower()
df_exp["company_name"] = df_exp["company_name"].str.replace('[^a-zA-Z0-9]', '_', regex=True).str.strip()

In [None]:
# Missing numeric codes become the sentinel -1 so the columns can be cast to
# int; integer values format without a trailing ".0" in the node names below.
for code_col in ("isco_code4", "function_id", "isco_functie_niveau"):
    df_exp[code_col] = df_exp[code_col].fillna(-1).astype(int)

# Namespace handles, looked up once instead of once per row. Nodes are built
# with `namespace[name]` rather than `eval()` on a formatted string: same URIs,
# no execution of text derived from the CSV, and far less per-row overhead.
RDF = kg.get_ns("rdf")
RND = kg.get_ns("rnd")
RNP = kg.get_ns("rnp")
RNE = kg.get_ns("rne")

# Columns are addressed positionally (row[0] is the index). From their use
# below: 1 = candidate id, 5 = function id, 6 = ISCO function level,
# 7 = 4-digit ISCO code, 8 = function name, 9 = company name.
# `total` comes from the frame itself so the progress bar stays correct when
# the input file changes.
for row in tqdm(df_exp.itertuples(), total=len(df_exp)):
    # Skip candidates with no data
    if row[1] not in candidates_with_data:
        continue

    candidate = RNE[f"c{row[1]}"]
    kg.add(candidate, RDF.type, RND.Candidate)

    # Store function level of isco code
    if row[6] != -1 and row[7] != -1:
        kg.add(RNE[f"isco{row[7]}"], RNP.code_is_level, RNE[f"level{row[6]}"])

    if not pd.isna(row[9]):
        # Process company names to fit the naming scheme. The rules date from
        # the eval-based construction and are kept unchanged so node names stay
        # stable: no leading "_", a "_" in front of a leading digit, and a
        # placeholder for empty names.
        comp_name = row[9].lstrip("_")

        if comp_name == "":
            comp_name = "blank_company"
        elif comp_name[0].isdigit():
            comp_name = "_" + comp_name

        if comp_name == "import":
            comp_name = "_import"

        company = RNE[f"comp_{comp_name}"]

        # NOTE(review): the ontology cell declares `has_worked_at` (with inverse
        # `has_employed`), but the data edge is `worked_at_company`, so that
        # inverse is never inferred for these triples - confirm which is intended.
        kg.add(candidate, RNP.worked_at_company, company)
        kg.add(company, RDF.type, RND.Company)

    if not pd.isna(row[8]):
        func_name = row[8].lstrip("_")

        if func_name == "":
            func_name = "blank_function"
        elif func_name[0].isdigit():
            func_name = "_" + func_name

        function = RNE[f"func_{func_name}"]
        kg.add(function, RDF.type, RND.Function)

        # Candidate worked a certain function
        kg.add(candidate, RNP.has_worked_function, function)

        # Add id to function name
        if row[5] != -1:
            kg.add(function, RNP.has_function_id, RNE[f"fid{row[5]}"])

        # Add isco level to function name
        if row[6] != -1:
            kg.add(function, RNP.has_isco_level, RNE[f"level{row[6]}"])

        # Add isco code to function name
        if row[7] != -1:
            isco = RNE[f"isco{row[7]}"]
            kg.add(isco, RDF.type, RND.ISCO_unit)
            kg.add(function, RNP.has_isco_code, isco)

# Release the large frame; kernel memory otherwise persists across cells.
del df_exp

## Education

In [None]:
# Education records, indexed by the CSV's unnamed first column.
df_education = (
    pd.read_csv("./cleaner_data/education.csv")
    .set_index("Unnamed: 0")
)

In [None]:
# Translate the numeric education level into the name of its education entity.
# Values outside 1-5 have no entry and therefore become NaN.
education_level_names = {
    1: "Primary_education",
    2: "Secondary_education",
    3: "Vocational_school",
    4: "Vocational_university",
    5: "University",
}

df_education["education_level"] = df_education["education_level"].map(education_level_names)

In [None]:
# Namespace handles, looked up once. Nodes are built with `namespace[name]`
# instead of `eval()` on a formatted string (same URIs, no code execution).
RDF = kg.get_ns("rdf")
RND = kg.get_ns("rnd")
RNE = kg.get_ns("rne")
DBO = kg.get_ns("dbo")
DBR = kg.get_ns("dbr")

# Columns are addressed positionally (row[0] is the index): row[1] is the
# candidate id and row[2] the education name used for the dbr entity.
# `total` is taken from the frame so the progress bar matches the data.
for row in tqdm(df_education.itertuples(), total=len(df_education)):

    if row[1] not in candidates_with_data:
        continue

    node = RNE[f"c{row[1]}"]

    kg.add(node, RDF.type, RND.Candidate)

    # NOTE(review): presumably "diploma obtained (last column == 1) or no
    # value in column 4" - confirm against the education.csv schema.
    if (row[-1] == 1) or pd.isna(row[4]):
        # FIX: a missing education name used to be formatted as "nan" and
        # linked as the bogus entity dbr:nan. Skip the edge instead.
        if pd.isna(row[2]):
            continue

        kg.add(node, DBO.education, DBR[f"{row[2]}"])

## Requests

In [None]:
# Load the requests (vacancies), indexed by the CSV's unnamed first column.
df_req = (
    pd.read_csv("./cleaner_data/requests.csv")
    .set_index("Unnamed: 0")
)

# Function and company names: lower-case, trim, then replace every
# non-alphanumeric character with an underscore.
for text_col in ("request_function_name", "request_company_name"):
    trimmed = df_req[text_col].str.lower().str.strip()
    df_req[text_col] = trimmed.str.replace('[^a-zA-Z0-9]', '_', regex=True).str.strip()

# A missing ISCO code is encoded as -1 so the column can be integer-typed.
df_req["request_isco_code4"] = df_req["request_isco_code4"].fillna(-1).astype(int)

In [None]:
# Print the column overview (names, non-null counts, dtypes) of the requests
# frame; the loop in the next cell addresses these columns by position.
df_req.info()

In [None]:
# Only requests that occur in at least one CV/vacancy pair are added.
requests_with_data = set(df_cv_vac_pairs["request_mondriaan_number"].unique())

# Dutch education labels -> education entity names.
edu_map = {"geen": "Primary_education",
           "basisschool": "Primary_education",
           "vmbo": "Secondary_education",
           "mavo": "Secondary_education",
           "havo": "Secondary_education",
           "vwo": "Secondary_education",
           "atheneum": "Secondary_education",
           "gymnasium": "Secondary_education",
           "mbo": "Vocational_school",
           "hbo": "Vocational_university",
           "wo": "University"}


def _entity_local_name(raw, blank):
    """Turn a normalised name into the local part of an entity URI.

    Applies the same rules as the work-experience loop so that a function or
    company named in a request resolves to the same node as the one named in a
    candidate's history: leading underscores are dropped, an empty result is
    replaced by `blank`, and a leading digit is prefixed with "_".
    """
    name = raw.lstrip("_")
    if name == "":
        return blank
    if name[0].isdigit():
        return "_" + name
    return name


# Namespace handles, looked up once. Nodes are built with `namespace[name]`
# instead of `eval()` on a formatted string: same URIs, but no execution of
# text that comes straight from the CSV (the workgroup column is not sanitised).
RDF = kg.get_ns("rdf")
XSD = kg.get_ns("xsd")
RND = kg.get_ns("rnd")
RNP = kg.get_ns("rnp")
RNE = kg.get_ns("rne")
DBR = kg.get_ns("dbr")

# (column position, predicate name, Python cast, XSD datatype) of the plain
# literal attributes of a request.
literal_fields = (
    (15, "number_working_hours", int, XSD.integer),
    (16, "number_working_days", int, XSD.integer),
    (17, "has_salary", float, XSD.double),
    (24, "requires_license", int, XSD.integer),
    (34, "requires_work_think_level", int, XSD.integer),
)

# Columns are addressed positionally (row[0] is the index).
for row in tqdm(df_req.itertuples(), total=len(df_req)):

    if row[3] not in requests_with_data:
        continue

    node = RNE[f"r{row[3]}"]

    kg.add(node, RDF.type, RND.Position)

    # Store the workgroup that the request was a part of (RGN, tempo team, yacht), and general info on the job.
    # Surrounding whitespace is stripped, which is what the eval-based attribute lookup effectively did.
    kg.add(node, RNP.part_of_workgroup, RNE[f"{row[1]}".strip()])

    for col, prop, cast, datatype in literal_fields:
        if not pd.isna(row[col]):
            kg.add(node, RNP[prop], rdflib.Literal(cast(row[col]), datatype=datatype))

    if not np.isnan(row[28]):
        # Store function id (as a node rather than a property, as there are relations between IDs) TODO: check
        # FIX: cast to int so a float-typed column yields "fid123" (the name the
        # work-experience loop uses) instead of "fid123.0".
        kg.add(node, RNP.has_function_id, RNE[f"fid{int(row[28])}"])

    # FIX (this and the two string checks below): the old test fell through to
    # np.isnan() for an empty string, which raises TypeError. Only non-empty
    # strings carry a value; NaN (a float) is skipped.
    if isinstance(row[29], str) and row[29]:
        # FIX: same naming rules as the work-experience loop (leading "_"
        # dropped, blank placeholder), so both sides share their Function nodes.
        func_name = _entity_local_name(row[29], "blank_function")

        # Store function name
        kg.add(node, RNP.has_function_name, RNE[f"func_{func_name}"])

    if isinstance(row[33], str) and row[33]:
        # FIX: a name made up only of underscores used to raise IndexError on
        # comp_name[0]; it now maps to the blank placeholder.
        comp_name = _entity_local_name(row[33], "blank_company")

        # Mirrors the special case in the work-experience loop so that the
        # company "import" resolves to the same node on both sides.
        if comp_name == "import":
            comp_name = "_import"

        # Store company name
        kg.add(node, RNP.has_company_name, RNE[f"comp_{comp_name}"])

    if not np.isnan(row[40]) and not row[40] == -1:
        # Store isco (int cast keeps a float-typed value from producing "isco1234.0")
        kg.add(node, RNP.has_isco, RNE[f"isco{int(row[40])}"])

    if isinstance(row[41], str) and row[41]:
        # Several educations are separated by "," or, failing that, by "/".
        educations = row[41].split(",") if "," in row[41] else row[41].split("/")

        # Store education; labels without a mapping are recorded as "no_education".
        for edu in educations:
            edu_name = edu_map.get(edu.strip().lower(), "no_education")
            kg.add(node, RNP.requires_education, DBR[edu_name])
            

## Additional candidate data

In [None]:
# Candidate skills, indexed by the CSV's unnamed first column.
df_skills = (
    pd.read_csv("./cleaner_data/skills.csv")
    .set_index("Unnamed: 0")
)

# Skill ids: lower-case, trim, then replace every non-alphanumeric character
# with an underscore.
skill_ids = df_skills["skill_id"].str.lower().str.strip()
df_skills["skill_id"] = skill_ids.str.replace('[^a-zA-Z0-9]', '_', regex=True).str.strip()

In [None]:
# Namespace handles, looked up once. Nodes are built with `namespace[name]`
# instead of `eval()` on a formatted string (same URIs, no code execution).
RDF = kg.get_ns("rdf")
RND = kg.get_ns("rnd")
RNP = kg.get_ns("rnp")
RNE = kg.get_ns("rne")

# row[1] is the candidate id, row[2] the normalised skill id (row[0] is the
# index). `total` is taken from the frame so the progress bar matches the data.
for row in tqdm(df_skills.itertuples(), total=len(df_skills)):
    if row[1] not in candidates_with_data:
        continue

    node = RNE[f"c{row[1]}"]
    skill = RNE[f"skill_{row[2]}"]

    kg.add(node, RNP.has_skill, skill)
    kg.add(skill, RDF.type, RND.Skill)

In [None]:
# Candidate languages, indexed by the CSV's unnamed first column.
df_languages = (
    pd.read_csv("./cleaner_data/languages.csv")
    .set_index("Unnamed: 0")
)

In [None]:
# Namespace handles, looked up once. Nodes are built with `namespace[name]`
# instead of `eval()` on a formatted string: same URIs, but no execution of
# the (unsanitised) language text from the CSV.
RDF = kg.get_ns("rdf")
RND = kg.get_ns("rnd")
RNP = kg.get_ns("rnp")
RNE = kg.get_ns("rne")

# row[1] is the candidate id, row[2] the language field (row[0] is the index).
for row in tqdm(df_languages.itertuples(), total=len(df_languages)):
    if row[1] not in candidates_with_data:
        continue

    # A missing language is NaN (a float); the "/" membership test used to
    # raise TypeError on it. There is nothing to add for such a row.
    if not isinstance(row[2], str):
        continue

    node = RNE[f"c{row[1]}"]

    # One field may list several languages separated by "/". Splitting a value
    # without "/" yields just that value, so no separate branch is needed.
    for lang in row[2].split("/"):
        # Trailing whitespace is dropped, as the eval-based lookup effectively did.
        language = RNE[f"lang_{lang}".strip()]

        kg.add(node, RNP.speaks_language, language)
        kg.add(language, RDF.type, RND.Language)

In [None]:
# Candidate driving licenses, indexed by the CSV's unnamed first column.
df_license = (
    pd.read_csv("./cleaner_data/driving_licenses.csv")
    .set_index("Unnamed: 0")
)

In [None]:
# Namespace handles, looked up once. Nodes are built with `namespace[name]`
# instead of `eval()` on a formatted string (same URIs, no code execution).
RDF = kg.get_ns("rdf")
RND = kg.get_ns("rnd")
RNP = kg.get_ns("rnp")
RNE = kg.get_ns("rne")

# row[1] is the candidate id, row[2] the license value (row[0] is the index).
# `total` is taken from the frame so the progress bar matches the data.
for row in tqdm(df_license.itertuples(), total=len(df_license)):
    if row[1] not in candidates_with_data:
        continue

    node = RNE[f"c{row[1]}"]
    # Trailing whitespace is dropped, as the eval-based lookup effectively did.
    license_node = RNE[f"rijbewijs_{row[2]}".strip()]

    kg.add(node, RNP.has_license, license_node)
    kg.add(license_node, RDF.type, RND.License)

## Add textual data to candidates and CVs

In [None]:
# Namespace handles, looked up once. Nodes and the predicate are built with
# `namespace[name]` instead of `eval()` on a formatted string (same URIs, no
# code execution).
RNP = kg.get_ns("rnp")
RNE = kg.get_ns("rne")

# TODO: CV and vacancy texts (as rnp:cv / rnp:vacancy literals) should be added
# later to preserve memory; only the match edges are stored here.
#
# row[4] is the candidate id, row[5] the request number and row[6] the match
# value (row[0] is the index). `total` is taken from the frame itself.
for row in tqdm(df_cv_vac_pairs.itertuples(), total=len(df_cv_vac_pairs)):
    # The match value is part of the predicate name; "-" is spelled out as
    # "neg" (e.g. -1 -> match_neg1).
    match = f"match_{row[6]}".replace("-", "neg")

    kg.add(RNE[f"c{row[4]}"], RNP[match], RNE[f"r{row[5]}"])

In [None]:
# Size of the populated graph before and after a second OWL-RL closure.
# Reuses the `measure` object created in the first measurement cell.
measure.measure_graph(kg)
print(f"edges before inference {measure.get_edge_count()}")
print(f"nodes before inference {measure.get_node_count()}")

# Expand the graph with every triple entailed by the OWL-RL rules.
kg.infer_owlrl_closure()
measure.measure_graph(kg)

print()
print(f"edges after inference {measure.get_edge_count()}")
print(f"nodes after inference {measure.get_node_count()}")

In [None]:
# Wrap the graph in a SubgraphTensor and drop the notebook-level `kg` name.
kgt = kglab.SubgraphTensor(kg)

del kg

# One line per triple, written as "['<subject>', '<predicate>', '<object>']"
# with each term rendered through kgt.n3fy().
with open("kg.edgelist", "w+") as edge_file:

    for triple in tqdm(kgt.as_tuples()):
        s_label, p_label, o_label = (kgt.n3fy(term) for term in triple)

        edge_file.write(f"['{s_label}', '{p_label}', '{o_label}']\n")

In [None]:
# FIX: the edge-list cell above already builds `kgt` and deletes `kg`, so
# unconditionally rebuilding the subgraph from `kg` raised NameError on
# Restart & Run All. Build it here only when `kg` is still defined (i.e. the
# edge-list cell was skipped); otherwise reuse the existing `kgt`.
if "kg" in globals():
    kgt = kglab.SubgraphTensor(kg)

    del kg

edge_list = kgt.as_tensor(quiet=False)

In [None]:
# Serialize `edge_list` to tensor.pt with PyTorch. This needs `torch` to be
# imported in the notebook's import cell.
torch.save(edge_list, 'tensor.pt')

In [None]:
# G = kg.build_nx_graph(nx.DiGraph())

In [None]:
# kg.save_parquet("kg.parquet")