In [None]:
import pandas as pd
import numpy as np
import networkx as nx

import re
import itertools
import pickle
import json
import datetime

from tqdm.notebook import tqdm
from collections import defaultdict

In [None]:
# Load the CV-vacancy pair table that the rest of the notebook filters and iterates over.
df_pairs = pd.read_csv("./cleaner_data/cv-vacancy-pairs.csv")

In [None]:
# We only consider candidates with at least MIN_POSITIVE_PAIRS matching vacancies
# (rows with label > 0) in the dataset.
# This not only saves time, but also improves results later on.
# NOTE(review): this comment used to say "at least 5" while the code filtered on >= 3;
# the threshold that actually ran (3) is kept - confirm which value was intended.
MIN_POSITIVE_PAIRS = 3

candidate_counts = df_pairs[df_pairs["label"] > 0].groupby("candidate_id")["request_mondriaan_number"].count()
candidate_list = candidate_counts[candidate_counts >= MIN_POSITIVE_PAIRS].index

candidate_list

In [None]:
# Build the directed knowledge graph from kg.edgelist: every line is evaluated as a
# Python expression that must unpack into (subject, predicate, object).
G = nx.DiGraph()

with open("kg.edgelist") as f:
    for line in tqdm(f.readlines()):

        # Some literals got assigned the wrong datatype - TODO
        # Until that is fixed, every line mentioning "err:error" is skipped entirely.
        if "err:error" in line:
            continue

        # SECURITY NOTE(review): eval() executes whatever code the file contains, so only
        # run this on an edge list you generated yourself. ast.literal_eval would be a
        # safer parser if every line is a plain literal tuple - confirm before switching.
        s, p, o = eval(line)
        # The predicate is stored as the `edge_type` attribute of the s -> o edge.
        G.add_edge(s, o, edge_type=p)

In [None]:
# Sanity check: display the first ten node identifiers of the loaded graph.
list(G.nodes)[:10]

In [None]:
# Undirected copy of the knowledge graph, so neighbours are found regardless of edge direction.
H = G.to_undirected()

# Cache every node's neighbour set once, leaving out neighbours that are NaN.
all_neighbors = {}

for n in tqdm(G.nodes):
    all_neighbors[n] = {nbr for nbr in H.neighbors(n) if not pd.isna(nbr)}

In [None]:
# Store the lower-cased CV / vacancy text as "CV" / "vacancy" node attributes on H.
# Runs of newlines are first collapsed into a single " \n " token.
# The progress-bar total is taken from the frame itself (it used to be hard-coded to
# 274407), so it stays correct when the CSV changes.
for row in tqdm(df_pairs.itertuples(), total=len(df_pairs)):
    # Fields are read by position from the itertuples() row (position 0 is the index).
    # NOTE(review): presumably 4 = candidate id, 5 = request number, 7 = vacancy text,
    # 8 = CV text - confirm against the CSV header.
    cv = re.sub("\n+", " \n ", row[8]).lower()
    vacancy = re.sub("\n+", " \n ", row[7]).lower()

    nx.set_node_attributes(H, {f"rne:c{row[4]}": cv}, "CV")
    nx.set_node_attributes(H, {f"rne:r{row[5]}": vacancy}, "vacancy")

In [None]:
def convert_to_number(value):
    """Convert ``value`` to an ``int`` or ``float`` when it is numeric.

    Args:
        value: Anything; typically a string such as ``'4110.0'``.

    Returns:
        An ``int`` when the value is a whole number, a ``float`` when it is any
        other number, and ``value`` itself, unchanged, when it cannot be
        interpreted as a number (including ``None``).
    """
    # Integer-looking strings are parsed directly, so values beyond 2**53 do not
    # lose precision through a float round-trip.
    if isinstance(value, str):
        try:
            return int(value)
        except ValueError:
            pass

    try:
        result = float(value)
    except (TypeError, ValueError):
        # Not numeric, or not convertible at all (e.g. None): hand it back untouched.
        return value

    # Check if the float is actually an integer (e.g., '4110.0')
    if result.is_integer():
        return int(result)
    return result
    

def k_walk(G, a, b, all_neighbors, k=6, walks=100):
    """Collect the edges of random walks that start at ``a`` and reach ``b``.

    ``walks`` independent walks are started at ``a``. Each walk gets at most
    ``k`` step attempts; an attempt that draws an already visited node is
    spent without moving. Only walks that arrive at ``b`` contribute edges.

    Args:
        G: Graph object providing ``edge_subgraph``.
        a: Start node; must be a key of ``all_neighbors``.
        b: Target node.
        all_neighbors: Mapping from node to the set of its neighbours.
        k: Maximum number of step attempts per walk.
        walks: Number of walks to run.

    Returns:
        ``G.edge_subgraph(edges)``, where ``edges`` is the set of ``(u, v)``
        steps taken by the successful walks, without the direct ``(a, b)`` /
        ``(b, a)`` edge.

    Raises:
        KeyError: If a node reached during a walk is not a key of
            ``all_neighbors``.
    """
    # Edges of every successful walk so far. Kept as a set so the duplicate
    # check below does not rebuild a set from a list on every step.
    found_edges = set()

    # Number of walks
    for _ in range(walks):
        path = []
        visited = set()

        current = a
        previous = None

        # Length of each walk
        for _ in range(k):
            candidates = all_neighbors[current]

            # Don't backtrack unless absolutely necessary.
            # `is not None` (rather than truthiness) so falsy nodes such as 0 or ""
            # are still recognised as the previous node.
            if previous is not None and len(candidates) > 1:
                candidates = candidates - {previous}

            # If we can reach the target from the current node,
            # do so, as long as that wouldn't create a duplicate path
            if b in candidates and (current, b) not in found_edges:
                next_node = b
            elif candidates:
                # Draw by index: np.random.choice would first turn the list into an
                # array, which coerces mixed-type nodes (e.g. 7 and "x") to strings
                # that are no longer keys of all_neighbors.
                pool = list(candidates)
                next_node = pool[np.random.randint(len(pool))]

                if next_node in visited:
                    continue
                visited.add(next_node)
            else:
                # Dead end: nowhere left to go from here.
                break

            path.append((current, next_node))

            # Update
            previous = current
            current = next_node

            # If we found the target, store
            if next_node == b:
                found_edges.update(path)
                break

    # The direct candidate-target edge itself is never part of the result.
    found_edges -= {(a, b), (b, a)}

    return G.edge_subgraph(found_edges)

In [None]:
def create_jsons(G, all_neighbors, k=10, walks=50):
    """Write walk subgraphs and ground-truth labels for every candidate.

    For each candidate in the notebook-level ``candidate_list``, up to 15
    matching vacancies (label > 0) and ``30 - n_hits`` sampled non-matching
    vacancies are connected to the candidate with ``k_walk``. Pairs whose walk
    graph is non-empty are written to:

    * ``graph_data/hits/c{candidate}.json``   - node-link data per matching vacancy
    * ``graph_data/misses/c{candidate}.json`` - node-link data per non-matching vacancy
    * ``graph_data/ground_truth/c{candidate}.csv`` - ``c<id>,r<vacancy>,<label>`` lines

    One line per candidate is appended to ``log.txt``. A failure for one
    candidate is logged and does not stop the remaining candidates.

    Args:
        G: Graph the walks are extracted from; passed on to ``k_walk``.
        all_neighbors: Mapping from node to its set of neighbours.
        k: Maximum number of step attempts per walk.
        walks: Number of walks per candidate-vacancy pair.
    """
    for i, candidate in tqdm(enumerate(candidate_list), total=len(candidate_list)):
        # None means "no failure". A separate sentinel is needed because an exception
        # with an empty message would otherwise be logged as a success.
        error = None

        try:
            # Find all hits for the current candidate
            # Store only the first 15, as we do not need, for example, 456 matching vacancies for a single candidate
            hits = df_pairs[(df_pairs["candidate_id"] == candidate) & (df_pairs["label"] > 0)][["request_mondriaan_number", "label"]][:15]

            n_hits = len(hits)

            # Select 30-n_hits misses, so we have 30 (non-)matching vacancies per candidate.
            # Misses are either explicit (this candidate, label <= 0) or implicit
            # (any pair belonging to another candidate).
            misses = df_pairs[((df_pairs["candidate_id"] == candidate) & (df_pairs["label"] <= 0)) | ((df_pairs["candidate_id"] != candidate))][["request_mondriaan_number", "label"]].sample(30-n_hits)

            record = ""

            missed_graphs = {}
            hit_graphs = {}

            for miss in misses.itertuples():
                graph = k_walk(G, f"rne:c{candidate}", f"rne:r{miss[1]}", all_neighbors, k=k, walks=walks)

                if graph:
                    # A positive label taken from another candidate's pair counts as 0 here.
                    record += f"c{candidate},r{miss[1]},{miss[2] if miss[2] <= 0 else 0}\n"
                    missed_graphs[miss[1]] = nx.node_link_data(graph)

            for hit in hits.itertuples():
                # Walk to the hit's own vacancy and record the hit's own label
                # (this loop previously reused the last `miss` row for both).
                graph = k_walk(G, f"rne:c{candidate}", f"rne:r{hit[1]}", all_neighbors, k=k, walks=walks)

                if graph:
                    record += f"c{candidate},r{hit[1]},{hit[2]}\n"
                    hit_graphs[hit[1]] = nx.node_link_data(graph)

            with open(f"graph_data/misses/c{candidate}.json", "w") as f1:
                json.dump(missed_graphs, f1)

            with open(f"graph_data/hits/c{candidate}.json", "w") as f1:
                json.dump(hit_graphs, f1)

            with open(f"graph_data/ground_truth/c{candidate}.csv", "w") as f1:
                f1.write(record)
        except Exception as e:
            # Deliberately best-effort: log the failure and move on to the next candidate.
            error = str(e)

        with open("log.txt", "a") as f:
            if error is not None:
                f.write(f"{datetime.datetime.now()} - {i}/{len(candidate_list)} - error: {error}\n")
            else:
                f.write(f"{datetime.datetime.now()} - {i}/{len(candidate_list)} - successfully written\n")

In [None]:
# Generate the per-candidate files from the undirected graph: 25 walks of at most
# 7 step attempts for every candidate-vacancy pair.
create_jsons(H, all_neighbors, k=7, walks=25)