In [None]:
import itertools
import json
import pickle
import re
import time
from collections import defaultdict

import kglab
import matplotlib
import networkx as nx
import numpy as np
import pandas as pd
import rdflib
from requests.exceptions import ChunkedEncodingError
from tqdm.notebook import tqdm
from urllib3.exceptions import ProtocolError
# from ssl.exceptions import ConnectionAbortedError

# Loading and cleaning data

In [None]:
# Load the tab-separated user table and preview its first rows
users = pd.read_csv("../source_data/table1_user.txt", sep="\t")
users.head()

In [None]:
# Number of distinct user IDs (a missing ID, if any, counts as one value)
users["user_id"].nunique(dropna=False)

In [None]:
# DataFrame.size is the total number of cells (rows x columns), not the row count
users.size

In [None]:
def splitter(x):
    """Insert a dash in the middle of a concatenated range ID.

    The value is converted to a string first. When the string has an odd
    length and its middle character is "0", that "0" is treated as the
    separator and is replaced by "-". In every other case (even length, or
    an odd length whose middle character is not "0") a "-" is inserted at
    position ``len // 2`` without removing anything.

    Returns the resulting string, e.g. ``400106000 -> "4001-6000"``.
    """
    text = str(x)
    middle = len(text) // 2
    has_odd_length = len(text) % 2 == 1

    # Only an odd-length ID can carry a dedicated separator zero
    if has_odd_length and text[middle] == "0":
        return text[:middle] + "-" + text[middle + 1:]

    # Nothing to replace: the dash is added between the two halves
    return text[:middle] + "-" + text[middle:]
    
# Rewrite both salary-range columns in place.
# Not idempotent: running this cell a second time splits the already-split values again.
for salary_column in ("desire_jd_salary_id", "cur_salary_id"):
    users[salary_column] = users[salary_column].apply(splitter)

In [None]:
# Load the tab-separated vacancy table; lines pandas cannot parse are dropped
jds = pd.read_csv("../source_data/table2_jd.txt", sep="\t", on_bad_lines="skip")
jds.head()

In [None]:
# Number of distinct vacancy IDs (a missing ID, if any, counts as one value)
jds["jd_no"].nunique(dropna=False)

In [None]:
# DataFrame.size is the total number of cells (rows x columns), not the row count
jds.size

In [None]:
# Load the tab-separated user/vacancy interaction table and preview its first rows
actions = pd.read_csv("../source_data/table3_action.txt", sep="\t")
actions.head()

In [None]:
# DataFrame.size is the total number of cells (rows x columns), not the row count
actions.size

In [None]:
# Map every raw ID to a short label: "u<row position>" for users, "j<row position>" for vacancies.
# If an ID occurs more than once, the label of its last occurrence is kept.
uid_to_num = {raw_id: f"u{position}" for position, raw_id in enumerate(users["user_id"])}
jid_to_num = {raw_id: f"j{position}" for position, raw_id in enumerate(jds["jd_no"])}

In [None]:
# Swap the raw IDs for their short labels.
# Not idempotent: after one run the columns hold labels, which are not keys of the maps.
users["user_id"] = [uid_to_num[raw_id] for raw_id in users["user_id"]]
jds["jd_no"] = [jid_to_num[raw_id] for raw_id in jds["jd_no"]]

# Interactions may reference users/vacancies absent from the tables above; those become NaN
actions["user_id"] = [uid_to_num.get(raw_id, np.nan) for raw_id in actions["user_id"]]
actions["jd_no"] = [jid_to_num.get(raw_id, np.nan) for raw_id in actions["jd_no"]]

In [None]:
# Export only the text-bearing columns (plus the ID) of each table
users.to_csv("cv_data.csv", columns=["user_id", "experience"])
jds.to_csv("jd_data.csv", columns=["jd_no", "jd_title", "job_description"])

In [None]:
# Free-text view of the candidates: one experience text per user ID
cvs = users[["user_id", "experience"]]

# Free-text view of the vacancies: title and description joined by a newline.
# `assign` builds a new frame, so nothing is written into a slice of `jds`
# (writing into the slice with .loc raises SettingWithCopyWarning).
descriptions = jds[["jd_no"]].assign(
    full_text=jds["jd_title"].str.cat(jds["job_description"], sep="\n")
)

In [None]:
# The label of an interaction is the sum of its three flag columns
actions["label"] = actions[["browsed", "delivered", "satisfied"]].sum(axis=1)
labels = actions[["user_id", "jd_no", "label"]]

# Attach the CV text and the vacancy text to every labelled pair (inner joins,
# so pairs whose user or vacancy is unknown are dropped)
textual_dataset = labels.merge(cvs, on="user_id").merge(descriptions, on="jd_no")
textual_dataset = textual_dataset[["user_id", "experience", "jd_no", "full_text", "label"]]

textual_dataset.to_csv("cv_vacancy_data.tsv", sep="\t")

In [None]:
# Persist the interaction table as it stands in the kernel at this point (index included)
actions.to_csv("actions_updated.csv")

In [None]:
def explode_multivalue(frame, id_column, value_column):
    """Return one row per individual value of a multi-valued text column.

    Parameters
    ----------
    frame : DataFrame holding both columns.
    id_column : column that becomes the index of the result.
    value_column : text column whose cells may hold several values.

    Returns
    -------
    Series indexed by `id_column`; a cell holding several values yields
    several rows that share the same index label.
    """
    values = frame[[id_column, value_column]].set_index(id_column).squeeze()
    return (
        values
        # The pattern is a character class: it removes the characters ( （ ) . *
        # themselves, not the text enclosed by the parentheses.
        .replace(r'[(\（\().*\)]', '', regex=True)
        # Values are separated by any of / | , 、
        .str.split(r"[/|,、]")
        .explode()
    )


user_desire_industry = explode_multivalue(users, "user_id", "desire_jd_industry_id")
user_desire_type = explode_multivalue(users, "user_id", "desire_jd_type_id")
user_current_industry = explode_multivalue(users, "user_id", "cur_industry_id")
user_cur_type = explode_multivalue(users, "user_id", "cur_jd_type")
user_degree = explode_multivalue(users, "user_id", "cur_degree_id")
user_experience = explode_multivalue(users, "user_id", "experience")

In [None]:
def explode_jd_column(column):
    """Return the values of a multi-valued `jds` column, one per row, indexed by jd_no."""
    values = jds[["jd_no", column]].set_index("jd_no").squeeze()
    return (
        values
        # Character class: removes the characters ( （ ) . * themselves
        .replace(r'[(\（\().*\)]', '', regex=True)
        # Values are separated by any of / | , 、
        .str.split(r"[/|,、]")
        .explode()
    )


jds_sub_type = explode_jd_column("jd_sub_type")
jds_min_degree = explode_jd_column("min_edu_level")
jds_max_degree = explode_jd_column("max_edu_level")

# Creating the Knowledge Graph

In [None]:
# Prefix -> IRI table handed to the knowledge graph
namespaces = dict(
    # Project-specific vocabularies
    znp="http://zhaopin.com/property/",
    znd="http://zhaopin.com/ontology/",
    zne="http://zhaopin.com/entity/",
    # DBpedia
    dbp="http://dbpedia.org/property/",
    dbo="http://dbpedia.org/ontology/",
    dbr="http://dbpedia.org/resource/",
    # W3C / FOAF vocabularies
    owl="http://www.w3.org/2002/07/owl#",
    rdfs="http://www.w3.org/2000/01/rdf-schema#",
    foaf="http://xmlns.com/foaf/0.1/",
)

In [None]:
# The graph that the cells below fill with the ontology, candidates and vacancies
kg = kglab.KnowledgeGraph(
    name="Zhaopin KG",
    base_uri="https://www.example.com/entity/",
    namespaces=namespaces,
)

In [None]:
# Frequency of each distinct value in the cur_degree_id column
users["cur_degree_id"].value_counts()

In [None]:
# Define ontology lay-out

# Namespace handles, looked up once and reused for every triple below
rdf_ns = kg.get_ns("rdf")
rdfs_ns = kg.get_ns("rdfs")
owl_ns = kg.get_ns("owl")
foaf_ns = kg.get_ns("foaf")
dbr_ns = kg.get_ns("dbr")
znd_ns = kg.get_ns("znd")
znp_ns = kg.get_ns("znp")
zne_ns = kg.get_ns("zne")

# Classes

# Candidate is a class, and a candidate is a person
kg.add(znd_ns.Candidate, rdf_ns.type, owl_ns.Class)
kg.add(znd_ns.Candidate, rdfs_ns.subClassOf, foaf_ns.Person)

# Function is a class, and is equivalent to employment
kg.add(znd_ns.Function, rdf_ns.type, owl_ns.Class)
kg.add(znd_ns.Function, owl_ns.equivalentClass, dbr_ns.Employment)

# Position is a class
kg.add(znd_ns.Position, rdf_ns.type, owl_ns.Class)

# Add educations and their ordering
# PhD, Master's, Bachelor's, junior college, MBA, EMBA, technical secondary school (中专),
# technical/vocational school (中技), senior high school, junior high school, other
edus = ["博士", "硕士", "本科", "大专", "MBA", "EMBA", "中专", "中技", "高中", "初中", "其他"]

# Entity URIs are built by indexing the namespace with the label; no `eval` of
# generated source code is needed, and labels need not be valid Python identifiers.
edu_entities = [zne_ns[f"edu_{edu}"] for edu in edus]

# NOTE(review): znd:Education is used as a type here but, unlike the other
# classes in this cell, is never declared an owl:Class.
for rank, higher in enumerate(edu_entities):
    kg.add(higher, rdf_ns.type, znd_ns.Education)

    # Every education supersedes all educations listed after it
    for lower in edu_entities[rank + 1:]:
        kg.add(higher, znp_ns.supersedes, lower)

# Company is a class, it offers functions, and candidates work there
kg.add(znd_ns.Company, rdf_ns.type, owl_ns.Class)
kg.add(znd_ns.Company, znp_ns.offers_function, znd_ns.Function)
kg.add(znd_ns.Candidate, znp_ns.has_worked_at, znd_ns.Company)

# Add city, job type, and industry
kg.add(znd_ns.City, rdf_ns.type, owl_ns.Class)
kg.add(znd_ns.Job_type, rdf_ns.type, owl_ns.Class)
kg.add(znd_ns.Industry, rdf_ns.type, owl_ns.Class)

# Properties and their inverses
kg.add(znp_ns.offers_function, owl_ns.inverseOf, znp_ns.function_is_offered_by)

kg.add(znp_ns.has_worked_at, owl_ns.inverseOf, znp_ns.has_employed)

kg.add(znp_ns.supersedes, owl_ns.inverseOf, znp_ns.subsedes)

kg.add(znp_ns.falls_under, owl_ns.inverseOf, znp_ns.encompasses)

# Falls under is transitive
kg.add(znp_ns.falls_under, rdf_ns.type, owl_ns.TransitiveProperty)

In [None]:
measure = kglab.Measure()

# Graph size as it stands before inference
measure.measure_graph(kg)
print(f"edges before inference {measure.get_edge_count()}")
print(f"nodes before inference {measure.get_node_count()}")

# Run kglab's OWL-RL closure inference on the graph
kg.infer_owlrl_closure()

# Graph size once the inferred triples are in
measure.measure_graph(kg)
print(f"\nedges after inference {measure.get_edge_count()}")
print(f"nodes after inference {measure.get_node_count()}")

# Add candidate data

In [None]:
# Namespace handles, looked up once instead of once per triple
zne_ns = kg.get_ns("zne")
znp_ns = kg.get_ns("znp")
znd_ns = kg.get_ns("znd")
rdf_type = kg.get_ns("rdf").type

# Predicate emitted for each interaction flag, in the positional order of the
# flag columns (row[3], row[4], row[5])
interaction_predicates = (znp_ns.browsed, znp_ns.applied, znp_ns.fulfilled)

# The progress total follows the table instead of a hard-coded row count
for row in tqdm(actions.itertuples(), total=len(actions)):
    # row[0] is the index; any columns after the three flags are not used here
    _, user_id, jd_no, browsed, delivered, satisfied, *_ = row

    # Users/positions for which we do not have any data get ignored
    # (their ID was replaced by NaN when the IDs were remapped)
    if pd.isna(user_id) or pd.isna(jd_no):
        continue

    # Entity URIs are built by indexing the namespace; no `eval` of data-derived text
    candidate = zne_ns[str(user_id)]
    position = zne_ns[str(jd_no)]

    kg.add(candidate, rdf_type, znd_ns.Candidate)
    kg.add(position, rdf_type, znd_ns.Position)

    for flag, predicate in zip((browsed, delivered, satisfied), interaction_predicates):
        if flag == 1:
            kg.add(candidate, predicate, position)

In [None]:
# Namespace handles and datatypes, looked up once instead of once per triple
zne_ns = kg.get_ns("zne")
znp_ns = kg.get_ns("znp")
znd_ns = kg.get_ns("znd")
rdf_type = kg.get_ns("rdf").type
xsd_integer = kg.get_ns("xsd").integer

# Characters removed from industry / job-type names before they become part of
# an entity URI. This is a character class: it drops the listed characters
# themselves, not the text between parentheses.
USER_LABEL_NOISE = re.compile(r'[(\（\().*\,）|、)]')


def user_entity_label(raw):
    """Clean one industry / job-type name for use as the tail of an entity URI.

    Removes the characters matched by `USER_LABEL_NOISE` and puts "_" in front
    of a name that starts with a digit. The digit rule is the one the vacancy
    loop applies to its job types; using it for every field here keeps the
    same name mapped to the same entity.

    Returns the cleaned name, which is an empty string when nothing is left.
    """
    label = USER_LABEL_NOISE.sub("", raw)
    if label[:1].isdigit():
        label = "_" + label
    return label


# The progress total follows the table instead of a hard-coded row count
for row in tqdm(users.itertuples(), total=len(users)):
    # Positional unpacking; row[0] is the index and trailing columns are unused
    (_, user_id, live_city, desired_cities, desired_industries, desired_types,
     desired_salary, current_industries, current_types, current_salary,
     degree, birthday, start_work, *_) = row

    # Entity URIs are built by indexing the namespace; no `eval` of data-derived text
    candidate = zne_ns[str(user_id)]

    # Add current city data
    home_city = zne_ns[f"city_{live_city}"]
    kg.add(candidate, znp_ns.lives_in, home_city)
    kg.add(home_city, rdf_type, znd_ns.City)

    # Add desired city data
    if isinstance(desired_cities, str):
        for desired_city in desired_cities.split(","):
            if desired_city != "-":
                wanted_city = zne_ns[f"city_{desired_city}"]
                kg.add(candidate, znp_ns.wants_city, wanted_city)
                kg.add(wanted_city, rdf_type, znd_ns.City)

    # Add desired/current industry and job type data.
    # Each entry: (raw "/"-separated text, predicate, URI prefix, class of the entity)
    multi_valued = (
        (desired_industries, znp_ns.wants_industry, "industry_", znd_ns.Industry),
        (desired_types, znp_ns.wants_job_type, "job_type_", znd_ns.Job_type),
        (current_industries, znp_ns.works_in_industry, "industry_", znd_ns.Industry),
        # The guard for the current job type used to be inverted (it required a
        # non-string value), so these edges were never added for string values.
        (current_types, znp_ns.works_in_job_type, "job_type_", znd_ns.Job_type),
    )
    for raw_values, predicate, uri_prefix, entity_class in multi_valued:
        # Missing values arrive as NaN (a float) and are skipped
        if not isinstance(raw_values, str):
            continue

        for raw_value in raw_values.split("/"):
            label = user_entity_label(raw_value)

            # An empty segment (e.g. a trailing "/") names no entity
            if not label:
                continue

            entity = zne_ns[uri_prefix + label]
            kg.add(candidate, predicate, entity)
            kg.add(entity, rdf_type, entity_class)

    # Add education data
    kg.add(candidate, znp_ns.has_degree, zne_ns[f"edu_{degree}"])

    # Add current and desired salary data
    salary_ranges = (
        (current_salary, znp_ns.has_min_salary, znp_ns.has_max_salary),
        (desired_salary, znp_ns.wants_min_salary, znp_ns.wants_max_salary),
    )
    for salary_range, low_predicate, high_predicate in salary_ranges:
        # "-" and "--" are what `splitter` returns for "0" and "-": no usable range
        if salary_range not in ("-", "--"):
            low, high = (int(bound) for bound in salary_range.split("-"))

            kg.add(candidate, low_predicate, rdflib.Literal(low, datatype=xsd_integer))
            kg.add(candidate, high_predicate, rdflib.Literal(high, datatype=xsd_integer))

    # Add birthday data
    kg.add(candidate,
           znp_ns.has_birthday,
           rdflib.Literal(int(birthday), datatype=xsd_integer))

    # Add years of experience data ("-" marks a missing value)
    if start_work != "-":
        kg.add(candidate,
               znp_ns.started_work_in,
               rdflib.Literal(int(start_work), datatype=xsd_integer))

In [None]:
# Re-measure the graph and report its current size
measure.measure_graph(kg)

print(f"\nedges: {measure.get_edge_count()}")
print(f"nodes: {measure.get_node_count()}")

# Add vacancy data

In [None]:
for row in tqdm(jds.itertuples(), total=265690):    
    kg.add(eval(f"kg.get_ns('zne').{row[1]}"), kg.get_ns('znp').is_stationed_in, eval(f"kg.get_ns('zne').city{row[4]}"))
    kg.add(eval(f"kg.get_ns('zne').city{row[4]}"), kg.get_ns('rdf').type, kg.get_ns('znd').City)

    # Add job type data
    if type(row[5]) == str and (row[5] != r"\N"):
        for job_type in row[5].split("/"):
            job_type = re.sub('[(\（\().*\,）|、)]', '', job_type)
            
            if job_type[0].isdigit():
                job_type = "_" + job_type

            kg.add(eval(f"kg.get_ns('zne').{row[1]}"), 
                   kg.get_ns('znp').is_job_type, 
                   eval(f"kg.get_ns('zne').job_type_{job_type}"))
            
            kg.add(eval(f"kg.get_ns('zne').job_type_{job_type}"), 
                   kg.get_ns('rdf').type, 
                   kg.get_ns('znd').Job_type)

    # Add education data
    if type(row[14]) == str:
        if row[14] != "\\N":
            kg.add(eval(f"kg.get_ns('zne').{row[1]}"), 
                   kg.get_ns('znp').requires_min_edu, 
                   eval(f"kg.get_ns('zne').edu_{row[14]}"))
        
    if type(row[15]) == str:
        if row[15] != "\\N":
            kg.add(eval(f"kg.get_ns('zne').{row[1]}"), 
                   kg.get_ns('znp').max_edu_level, 
                   eval(f"kg.get_ns('zne').edu_{row[15]}"))

    # Add salary data
    kg.add(eval(f"kg.get_ns('zne').{row[1]}"), 
           kg.get_ns("znp").offers_min_salary, 
           rdflib.Literal(int(row[8]), datatype=kg.get_ns("xsd").integer))
    
    kg.add(eval(f"kg.get_ns('zne').{row[1]}"), 
           kg.get_ns("znp").offers_max_salary, 
           rdflib.Literal(int(row[7]), datatype=kg.get_ns("xsd").integer))

    # Add travel data
    kg.add(eval(f"kg.get_ns('zne').{row[1]}"), 
           kg.get_ns("znp").requires_travel, 
           rdflib.Literal(bool(row[11]), datatype=kg.get_ns("xsd").bool))

    # Add required years of experience data
    kg.add(eval(f"kg.get_ns('zne').{row[1]}"),
           kg.get_ns("znp").requires_years,
           rdflib.Literal(int(row[12]), datatype=kg.get_ns("xsd").integer))

In [None]:
# Compare the graph size before and after inference

measure = kglab.Measure()

# Size with only the asserted triples
measure.measure_graph(kg)
edges_before = measure.get_edge_count()
nodes_before = measure.get_node_count()
print(f"edges before inference {edges_before}")
print(f"nodes before inference {nodes_before}")

# Run kglab's OWL-RL closure inference on the graph
kg.infer_owlrl_closure()

# Size with the inferred triples included
measure.measure_graph(kg)
edges_after = measure.get_edge_count()
nodes_after = measure.get_node_count()
print(f"\nedges after inference {edges_after}")
print(f"nodes after inference {nodes_after}")

# Store the knowledge graph 

In [None]:
kgt = kglab.SubgraphTensor(kg)

# Size of the progress bar, taken from the graph while `kg` is still bound.
# NOTE(review): assumes as_tuples() yields one tuple per triple in the graph - confirm.
triple_count = len(kg.rdf_graph())

# Removes the notebook-level name only; cells that use `kg` cannot be re-run after this
del kg

# One line per triple, written as a Python-style list of three quoted labels
with open("kg.edgelist", "w", encoding="utf-8") as f:

    for s, p, o in tqdm(kgt.as_tuples(), total=triple_count):
        s_label = kgt.n3fy(s)
        p_label = kgt.n3fy(p)
        o_label = kgt.n3fy(o)

        f.write(f"['{s_label}', '{p_label}', '{o_label}']\n")