In [None]:
import os
import torch
import json 
import re

import pandas as pd
import numpy as np
import networkx as nx

from collections import defaultdict
from torch_geometric.data import Data, HeteroData, Batch
from torch_geometric.loader import DataLoader
from torch_geometric.utils import to_dense_adj
from torch_geometric.nn import to_hetero
from transformers import AutoTokenizer, AutoModel
from pathlib import Path

In [None]:
truth_values = defaultdict(list)

# Ground truth values for each candidate-vacancy pair: read every CSV in the
# ground-truth folder (the files have no header row) into one frame.
li = []
for truth in os.listdir("../graph_data/ground_truth/"):
    if ".csv" not in truth:
        continue
    li.append(pd.read_csv(f"../graph_data/ground_truth/{truth}", header=None))

truths = pd.concat(li, axis=0, ignore_index=True)

# Group on column 0 and, per group, map column 1 -> column 2.
truth_dict = {}
for group_key, group_rows in truths.groupby(0):
    truth_dict[group_key] = dict(group_rows[[1, 2]].values)

# Drop every group whose values are all <= 0.
truth_dict = {
    group_key: pair_values
    for group_key, pair_values in truth_dict.items()
    if any(not (value <= 0) for value in pair_values.values())
}

In [None]:
def _load_user_graphs(directory, allowed_users):
    """Load the per-user sub-graph JSON files found in ``directory``.

    Every file whose name contains ".json" and whose stem (text before the
    first ".") is in ``allowed_users`` is parsed; each value of the JSON object
    is turned into a NetworkX graph with ``nx.node_link_graph``.

    Returns a ``defaultdict(dict)`` mapping file stem -> {json key -> graph}.
    """
    loaded = defaultdict(dict)

    for file_name in os.listdir(directory):
        if ".json" not in file_name:
            continue

        user = file_name.split(".")[0]
        if user not in allowed_users:
            continue

        # Context manager so the file handle is closed deterministically.
        with open(f"{directory}/{file_name}") as f:
            full = json.load(f)

        for tail, graph_data in full.items():
            loaded[user][tail] = nx.node_link_graph(graph_data)

    return loaded


# Load data (only for users that still have an entry in truth_dict)
hits = _load_user_graphs("../graph_data/hits", truth_dict)
misses = _load_user_graphs("../graph_data/misses", truth_dict)

In [None]:
emb_size = 30

# Every edge label that may occur in the graphs. Kept as an ordered list so
# that an accidental duplicate is easy to spot (the previous dict literal
# listed "rnp:has_function_id" twice, silently overwriting the first tensor).
relation_labels = [
    "rnp:supersedes", "rnp:offers_function", "rnp:has_worked_at",
    "rnp:falls_under", "rnp:has_skill", "rnp:code_is_level",
    "rnp:worked_at_company", "rnp:has_worked_function", "rnp:has_function_id",
    "rnp:has_isco_level", "rnp:has_isco_code", "rnp:part_of_workgroup",
    "rnp:number_working_hours", "rnp:number_working_days", "rnp:has_salary",
    "rnp:requires_license", "rnp:requires_work_think_level",
    "rnp:has_function_name", "rnp:has_company_name", "rnp:has_isco",
    "rnp:requires_education", "rnp:speaks_language", "rnp:has_license",
    "rnp:cv", "rnp:vacancy", "rnp:match_neg1", "rnp:match_0",
    "rnp:match_1", "rnp:match_2", "rnp:match_3", "rnp:match_4",
    "rnp:match_5", "rdf:type", "dbo:education", "owl:sameAs",
    "rnp:encompasses", "rdfs:subClassOf", "rdfs:comment",
]

# One random vector of length emb_size per edge label.
label_types = {label: torch.rand(emb_size) for label in relation_labels}

In [None]:
# Turn the (possibly defaultdict-based) containers into plain nested dicts.
hits = {user: dict(jobs.items()) for user, jobs in hits.items()}

misses = {user: dict(jobs.items()) for user, jobs in misses.items()}

# Keep only users that have both hits and misses; on a key clash the graph
# stored under misses overrides the one stored under hits.
hits_misses = {}
for user in hits:
    if user not in misses:
        continue
    combined = dict(hits[user])
    combined.update(misses[user])
    hits_misses[user] = combined

In [None]:
# Some sub-graphs do not actually include the tail node (this occurs when the
# only link to the tail is through the "match" edge, which is removed because
# it is the ground truth value of the graph). Those sub-graphs are redundant,
# so only keep tails whose "rne:r<tail>" node is present in the graph and that
# have a ground-truth entry under the key "r<tail>".
hits_misses_filtered = {
    user: {
        tail: graph
        for tail, graph in graphs_by_tail.items()
        if f"rne:r{tail}" in graph and f"r{tail}" in truth_dict[user]
    }
    for user, graphs_by_tail in hits_misses.items()
}

# Re-key the ground truth by the bare tail id (without the "r" prefix),
# restricted to the sub-graphs that survived the filter above.
filtered_truth_dict = {}
for user, graphs_by_tail in hits_misses_filtered.items():
    filtered_truth_dict[user] = {
        tail: truth_dict[user]["r" + tail]
        for tail in graphs_by_tail
        if "r" + tail in truth_dict[user]
    }

# NOTE: truth_dict is rebound here with differently shaped keys, so this cell
# is not idempotent -- re-run the ground-truth cell before re-running it.
truth_dict = {
    user: pair_values
    for user, pair_values in filtered_truth_dict.items()
    if any(not (value <= 0) for value in pair_values.values())
}

hits_misses_filtered = {
    user: graphs_by_tail
    for user, graphs_by_tail in hits_misses_filtered.items()
    if user in truth_dict
}
len(hits_misses_filtered)

In [None]:
# Single source of truth for the subset file location (parent of the working directory).
data_subset_path = Path.cwd().parent / "data_subset.json"

# NOTE(review): as written, the subset is only (re)written when the file already
# exists. If the intent was "create it when missing", the condition should be
# negated -- confirm before changing.
if data_subset_path.exists():
    # Per user, the list of retained tails with the "r" prefix restored.
    data_subset = {k : ["r" + graph for graph in v] for k, v in hits_misses_filtered.items()}

    with open(data_subset_path, "w+") as f:
        json.dump(data_subset, f)

In [None]:
def create_dataloaders(hits_misses, train_size=0.8, val_size=0.1, create_train = True, create_val = True, create_test = True, shuffle=False):
    """Build one ``HeteroData`` object per user and distribute them over three splits.

    For every user, each non-empty sub-graph is relabelled so that its node names
    carry a ``_<graph id>`` suffix, candidate/request nodes are tokenized with the
    "intfloat/multilingual-e5-small" tokenizer, all other nodes receive a random
    vector, and the sub-graphs are composed into a single heterogeneous graph.

    Relies on the module-level ``truth_dict`` (ground truth per user/tail) and
    ``label_types`` (one vector per edge label; an unknown label raises KeyError).

    Parameters
    ----------
    hits_misses : dict
        Maps user -> {tail -> networkx graph}.
    train_size, val_size : float
        Fractions of users assigned to train / validation; the rest goes to test.
    create_train, create_val, create_test : bool
        Users falling into a split whose flag is False are skipped entirely.
    shuffle : bool
        If True every user is assigned to a split by an (unseeded)
        ``np.random.random()`` draw; otherwise the split follows the iteration
        order of ``hits_misses``.

    Returns
    -------
    tuple(list, list, list)
        ``(train, val, test)`` lists of ``HeteroData`` objects. Users whose
        (clipped) ground-truth values are all zero are not included.
    """

    # Token budget of the textual nodes and size of the random vectors of all
    # other nodes. Defined up front: random_embedding_size is also needed for
    # the dummy nodes, even when no "other" node was ever visited.
    CV_embedding_size = 96
    vacancy_embedding_size = 96
    random_embedding_size = 16

    # Textual features (CVs and vacancies) should be tokenized so that they can be embedded later on
    tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")

    # Final data lists
    train_loader = []
    val_loader = []
    test_loader = []

    loaders = {"train": train_loader, "val": val_loader, "test": test_loader}
    requested = {"train": create_train, "val": create_val, "test": create_test}

    # Globally unique integer ID of every (suffixed) node
    node_to_node_id = {}

    total_graphs = 0

    # Start counting nodes at 1, since we have a single dummy node at index 0
    total_nodes = 1
    dummy_node_id = 0

    train_cutoff = int(len(hits_misses) * train_size)
    val_cutoff = int(len(hits_misses) * (train_size + val_size))

    typings = ["candidate", "request", "function_name", "isco_code", "education", "language", "license", "skill", "company_name", "function_id", "isco_level", "workgroup", "klass", "literal"]

    # Un-suffixed names that identify education and workgroup nodes
    education_levels = {"Primary_education", "Secondary_education", "Vocational_school", "Vocational_university", "University"}
    workgroups = {"rne:RNL", "rne:TTG", "rne:YHT"}

    # Define all the possible edge types (identical for every user, so built once)
    edge_typing = {("candidate", "request") : "fulfilled",
                   ("request", "candidate") : "was_fulfilled_by",
                   ("candidate", "company_name") : "worked_at",
                   ("company_name", "candidate") : "employed",
                   ("candidate", "education") : "has_education",
                   ("education", "candidate") : "is_education_of",
                   ("candidate", "isco_code") : "has_worked_code",
                   ("isco_code", "candidate") : "has_been_code_of",
                   ("candidate", "language") : "speaks",
                   ("language", "candidate") : "is_spoken_by",
                   ("candidate", "license") : "has_license",
                   ("license", "candidate") : "is_license_of",
                   ("candidate", "skill") : "has_skill",
                   ("skill", "candidate") : "is_skill_of",
                   ("candidate", "function_id") : "worked_fid",
                   ("function_id", "candidate") : "was_fid_of",
                   ("candidate", "function_name") : "worked_function",
                   ("function_name", "candidate") : "was_function_of",
                   ("candidate", "isco_level") : "has_worked_level",
                   ("isco_level", "candidate") : "was_working_level_of",

                   ("request", "education") : "requires_education",
                   ("education", "request") : "is_required_education_of",
                   ("request", "isco_code") : "falls_under_isco_code",
                   ("isco_code", "request") : "is_isco_of_request",
                   ("request", "isco_level") : "falls_under_isco_level",
                   ("isco_level", "request") : "is_level_of_request",
                   ("request", "function_id") : "request_has_fid",
                   ("function_id", "request") : "is_fid_of_request",
                   ("request", "function_name") : "request_has_function_name",
                   ("function_name", "request") : "is_function_name_of_request",
                   ("request", "company_name") : "offered_by",
                   ("company_name", "request") : "offers_position",
                   ("request", "skill") : "request_requires_skill",
                   ("skill", "request") : "skill_is_required_by_request",
                   ("request", "workgroup") : "part_of_workgroup",
                   ("workgroup", "request") : "is_workgroup_of",
                   ("request", "license") : "requires_license",
                   ("license", "request") : "is_required_license_of",

                   ("function_name", "function_id") : "is_name_of_fid",
                   ("function_id", "function_name") : "is_fid_of_function",
                   ("function_name", "isco_level") : "function_has_level",
                   ("isco_level", "function_name") : "is_level_of_function",

                   ("isco_code", "function_name") : "has_function_name",
                   ("function_name", "isco_code") : "is_function_name_of",
                   ("isco_code", "skill") : "code_requires_skill",
                   ("skill", "isco_code") : "skill_is_required_by_code",
                   ("isco_code", "function_id") : "code_has_fid",
                   ("function_id", "isco_code") : "is_fid_of",
                   ("isco_code", "isco_level") : "code_has_level",
                   ("isco_level", "isco_code") : "is_level_of_code"}

    # Add a few iteratively since they're the same for everything.
    # Iterate the list (not a set): a few keys, e.g. ("klass", "literal"), are
    # written more than once, so a hash-dependent iteration order would make
    # their relation name differ between interpreter runs.
    for typ in typings:
        edge_typing[(typ, "klass")] =  f"{typ}_is_part_of_class"
        edge_typing[("klass", typ)] =  f"{typ}_is_class_of"
        edge_typing[(typ, "literal")] = f"{typ}_has_literal_value"
        edge_typing[("literal", typ)] = f"{typ}_is_literal_value_of"
        edge_typing[(typ, typ)] = f"{typ}_same_as"

    # Create a HeteroData object for each user
    for usr_idx, (user, graphs) in enumerate(hits_misses.items()):

        # Decide the split first, then skip the user if that split was not
        # requested. The split is always bound, independent of the create_* flags.
        if shuffle:
            # Shuffle randomizes the samples in each loader
            bucket = np.random.random()
            if bucket < train_size:
                split = "train"
            elif bucket < (train_size + val_size):
                split = "val"
            else:
                split = "test"
        else:
            # Without shuffle, the samples in each loader are set
            if usr_idx < train_cutoff:
                split = "train"
            elif usr_idx < val_cutoff:
                split = "val"
            else:
                split = "test"

        # Only create the requested loaders
        if not requested[split]:
            continue

        print(f"{usr_idx}/{len(hits_misses)}", end="\r")

        user_graphs = []

        temp = []
        truth_values = []
        heads = []
        tails = []

        # Un-suffixed node name -> node type, shared by all sub-graphs of this user
        node_type_of = {}

        tail_nodes = []
        head_nodes = []

        # Suffixed node name -> index of its sub-graph within this user
        node_graphs = {}
        sg_counter = 0

        for tail, graph in graphs.items():

            if not graph:
                continue

            # Every node should be uniquely stored in each sub-graph, even if it's used in multiple graphs
            graph = nx.relabel_nodes(graph, {node: f"{node}_{total_graphs:08}" for node in graph})

            relations = nx.get_edge_attributes(graph, "edge_type")

            embs = {}
            mask = {}

            # Store node embeddings
            for node in graph:
                # Node name without the "_<graph id>" suffix added above
                base = node.rsplit("_", 1)[0]

                is_candidate = re.match("rne:c([0-9])+", node)
                is_request = re.match("rne:r([0-9])+", node)

                if (":" in node) and (is_request or is_candidate):
                    # If our current node is the head/tail of the sub-graph, store it for later use.
                    # The full local name is compared (including the leading "r"), so a
                    # candidate "c<N>" is never mistaken for the request "r<N>".
                    local_name = node.split(":")[1].split("_")[0]
                    if is_request and local_name == f"r{tail}":
                        tail_nodes.append(total_nodes)
                    elif local_name == user:
                        head_nodes.append(total_nodes)

                # If we're dealing with a candidate
                if is_candidate:

                    ### The textual features need to be locked in terms of size
                    CV_encoded = tokenizer("query: " + graph.nodes[node].get("CV", ""), padding="max_length", truncation=True, max_length=CV_embedding_size, return_tensors='pt')

                    embs[node] = CV_encoded["input_ids"]
                    mask[node] = CV_encoded["attention_mask"]

                    node_type_of[base] = "candidate"

                elif is_request:

                    vacancy_encoded = tokenizer("passage: " + graph.nodes[node].get("vacancy", ""), padding="max_length", truncation=True, max_length=vacancy_embedding_size, return_tensors='pt')

                    embs[node] = vacancy_encoded["input_ids"]
                    mask[node] = vacancy_encoded["attention_mask"]

                    node_type_of[base] = "request"

                else:
                    # Any other node type simply starts with a random embedding
                    embs[node] = torch.rand(random_embedding_size)

                    if node.startswith("rne:func_"):
                        node_type_of[base] = "function_name"
                    elif re.match("rne:isco([0-9])+", node):
                        node_type_of[base] = "isco_code"
                    # Exact-name checks must use the un-suffixed name; with the
                    # suffixed name they can never match.
                    elif ":" in base and base.split(":")[1] in education_levels:
                        node_type_of[base] = "education"
                    elif node.startswith("rne:lang_"):
                        node_type_of[base] = "language"
                    elif node.startswith("rne:rijbewijs_"):
                        node_type_of[base] = "license"
                    elif node.startswith("rne:skill_"):
                        node_type_of[base] = "skill"
                    elif node.startswith("rne:comp_"):
                        node_type_of[base] = "company_name"
                    elif re.match("rne:fid([0-9])+", node):
                        node_type_of[base] = "function_id"
                    elif re.match("rne:level([0-9])+", node):
                        node_type_of[base] = "isco_level"
                    elif base in workgroups:
                        node_type_of[base] = "workgroup"
                    elif node.startswith("rnd:") or node.startswith("owl:") or node.startswith("foaf:"):
                        node_type_of[base] = "klass"
                    else:
                        node_type_of[base] = "literal"

                # Give each node a fully unique ID
                node_to_node_id[node] = total_nodes

                # Store to which graph that ID belongs
                node_graphs[node] = sg_counter

                total_nodes += 1

            # Store edge embeddings (raises KeyError for an edge label missing from label_types)
            edge_embs = {edge: label_types[relations[edge]] for edge in graph.edges()}

            # Add embeddings to graph
            nx.set_node_attributes(graph, embs, name="embedding")
            nx.set_node_attributes(graph, mask, name="att_mask")
            nx.set_edge_attributes(graph, edge_embs, name="embedding")

            # Store which sub-graph IDs relate to what candidate
            user_graphs.append(total_graphs)
            total_graphs += 1

            temp.append(graph)

            if user in truth_dict and tail in truth_dict[user]:
                truth_values.append(truth_dict[f"{user}"][tail])
                heads.append(f"{user}")
                tails.append(f"r{tail}")
            else:
                truth_values.append(0)

            sg_counter += 1

        # Ground truths (negative values are clipped to 0)
        y = torch.Tensor([i if i >= 0 else 0 for i in truth_values])

        # If we have no sorting to do, ignore the current batch
        # (this also covers users without any non-empty sub-graph)
        if all(y == 0):
            print("skipping")
            continue

        # Combine sub-graphs
        G = nx.compose_all(temp)

        data = HeteroData()

        data_dict = defaultdict(list)
        mask_dict = defaultdict(list)
        sg_dict = defaultdict(list)

        index_dict = defaultdict(list)
        unique_index_dict = defaultdict(list)

        # Suffixed node name -> row of that node inside its node type's tensors
        local_index = {}

        # Store node data in the appropriate data lists
        for k, v in G.nodes(data=True):

            # node_id is the "_<graph id>" suffix, i.e. it is shared by all nodes of one sub-graph
            node, node_id = k.rsplit("_", 1)
            node_type = node_type_of[node]

            local_index[k] = len(data_dict[node_type])

            data_dict[node_type].append(v["embedding"])

            if "att_mask" in v:
                mask_dict[node_type].append(v["att_mask"])

            sg_dict[node_type].append(node_graphs[k])

            index_dict[node_type].append(node_id)
            unique_index_dict[node_type].append(node_to_node_id[k])

        for node_type in data_dict.keys():
            data[node_type].node_id = torch.Tensor([int(i) for i in index_dict[node_type]])
            data[node_type].unique_node_id = torch.Tensor([int(i) for i in unique_index_dict[node_type]])
            data[node_type].sub_graph = torch.Tensor([int(i) for i in sg_dict[node_type]])
            data[node_type].x = torch.stack(data_dict[node_type]).squeeze(1)
            data[node_type].num_nodes = len(index_dict[node_type])

            if node_type in ["candidate", "request"]:
                data[node_type].att_mask = torch.stack(mask_dict[node_type]).squeeze(1)

        # Node types that do not occur for this user get a single dummy node.
        # NOTE(review): the dummy x / att_mask are 1-D tensors of length
        # random_embedding_size, also for candidate/request whose real rows hold
        # 96 token ids -- confirm that downstream code expects this.
        for node_type in typings:
            if node_type not in data_dict:
                data[node_type].node_id = torch.Tensor([dummy_node_id])
                data[node_type].unique_node_id = torch.Tensor([dummy_node_id])
                data[node_type].x = torch.zeros(random_embedding_size)
                data[node_type].num_nodes = 1

                if node_type in ["candidate", "request"]:
                    data[node_type].att_mask = torch.zeros(random_embedding_size) + 1

        # Now do the same for the edges
        edge_data = defaultdict(list)
        edge_ids = defaultdict(list)

        for i, j in G.edges():

            edge1, edge1_id = i.rsplit("_", 1)
            edge2, edge2_id = j.rsplit("_", 1)

            relation = edge_typing[(node_type_of[edge1], node_type_of[edge2])]

            # Index the actual end points. Looking up the graph suffix in index_dict
            # would return the first node of that type in the sub-graph instead.
            edge_data[relation].append((local_index[i], local_index[j]))
            edge_ids[relation].append((int(edge1_id), int(edge2_id)))

        for k, v in edge_typing.items():
            if edge_data[v]:
                data[k[0], v, k[1]].edge_index = torch.LongTensor(edge_data[v]).T
                data[k[0], v, k[1]].edge_id = torch.LongTensor(edge_ids[v]).T
            else:
                # Edge types without any edge get one dummy self-edge on index 0
                dummy_edge_index = [dummy_node_id, dummy_node_id]

                data[k[0], v, k[1]].edge_index = torch.LongTensor([dummy_edge_index]).T
                data[k[0], v, k[1]].edge_id = torch.LongTensor([dummy_node_id]).T

        data.head_nodes = head_nodes
        data.tail_nodes = tail_nodes

        data.num_graphs = torch.LongTensor(user_graphs)
        data.y = y
        data.tups = list(zip(heads, tails))

        loaders[split].append(data)

    return train_loader, val_loader, test_loader

In [None]:
# Which splits to build. These flags were not defined anywhere in the notebook,
# so the call below raised a NameError on a fresh kernel.
create_train = True
create_val = True
create_test = True

# Create dataloaders
train_loader, val_loader, test_loader = create_dataloaders(hits_misses_filtered,
                                                           create_train = create_train,
                                                           create_val = create_val,
                                                           create_test = create_test)

### Batching is not possible because different graphs have different numbers of edges for each type.
### Considering the RAM usage, we did not implement this with, e.g., padding.
trainloader = DataLoader(train_loader)
valloader = DataLoader(val_loader)
testloader = DataLoader(test_loader)

# Make sure the output folder exists before saving
os.makedirs("../dataloaders", exist_ok=True)

torch.save(trainloader, '../dataloaders/graph_trainloader.pth')
torch.save(valloader, '../dataloaders/graph_valloader.pth')
torch.save(testloader, '../dataloaders/graph_testloader.pth')