### Data preparation

In [None]:
import os
import re
import string
import json
import optuna 

import numpy as np
import pandas as pd

In [None]:
# Retrieve ground truth values for each candidate-vacancy pair.
# os.listdir returns entries in arbitrary order, so the file names are sorted
# first: the row order of `truths` (and everything derived from it) is then
# the same on every run and every machine.
truth_files = sorted(name for name in os.listdir("./ground_truth") if ".csv" in name)

# One header-less frame per ground-truth file
li = [pd.read_csv(f"./ground_truth/{name}", header=None) for name in truth_files]

truths = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Nested lookup: column 0 value -> {column 1 value: column 2 value}
truth_dict = {}
for candidate_id, candidate_rows in truths.groupby(0):
    truth_dict[candidate_id] = dict(candidate_rows[[1, 2]].values)

In [None]:
# Drop every candidate for whom no vacancy carries a label above zero
truth_dict = {
    candidate: vacancies
    for candidate, vacancies in truth_dict.items()
    if not all(label <= 0 for label in vacancies.values())
}

# Within each remaining candidate, keep only vacancies whose id is a plain string
truth_dict = {
    candidate: {vacancy: label for vacancy, label in vacancies.items() if type(vacancy) == str}
    for candidate, vacancies in truth_dict.items()
}

In [None]:
# (number of candidates, mean number of labelled vacancies per candidate)
len(truth_dict), np.mean([len(vacancies) for vacancies in truth_dict.values()])

In [None]:
# Read the tab-separated CV/vacancy table and keep only the five columns used below
data = pd.read_csv("./source_data/cv_vacancy_data.tsv", sep="\t")
data = data[["user_id", "experience", "jd_no", "full_text", "label"]]

data.head()

In [None]:
# Ids that actually occur in the ground truth: candidates are the outer keys,
# vacancies are the union of all inner keys
relevant_candidates = set(truth_dict)
relevant_vacancies = {vacancy for vacancies in truth_dict.values() for vacancy in vacancies}

In [None]:
def _normalize_text(text):
    """Return a lower-cased, whitespace-collapsed version of `text`; non-string input is treated as ""."""
    if not type(text) == str:
        text = ""

    # Non-word characters become spaces first; since that already covers newlines,
    # the "\n+" substitution has nothing left to match before whitespace is collapsed.
    return re.sub(r"\s+", " ", re.sub("\n+", "\n ", re.sub(r"\W", " ", text))).lower()


cv_data = {}
req_data = {}

# Row layout from itertuples: (index, user_id, experience, jd_no, full_text, label)
for row in data.itertuples():
    user_id, experience, jd_no, full_text = row[1], row[2], row[3], row[4]

    # First occurrence of a ground-truth candidate: store the cleaned CV text
    if type(user_id) == str and user_id not in cv_data and user_id in relevant_candidates:
        cv_data[user_id] = _normalize_text(experience)

    # First occurrence of a ground-truth vacancy: store the cleaned job description
    if type(jd_no) == str and jd_no not in req_data and jd_no in relevant_vacancies:
        req_data[jd_no] = _normalize_text(full_text)

In [None]:
!pip install -q transformers
from transformers import AutoTokenizer, AutoModel

In [None]:
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")

# Token count (special tokens included) of every cleaned CV
avg_cv_len = [
    len(tokenizer(cv, add_special_tokens=True, return_tensors='pt')["input_ids"][0])
    for cv in cv_data.values()
]

print(f"Average tokens per CV: {np.mean(avg_cv_len)} ({np.std(avg_cv_len)})")


# Token count (special tokens included) of every cleaned vacancy text
avg_req_len = [
    len(tokenizer(req, add_special_tokens=True, return_tensors='pt')["input_ids"][0])
    for req in req_data.values()
]

print(f"Average tokens per request: {np.mean(avg_req_len)} ({np.std(avg_req_len)})")

In [None]:
# 80/10/10 split over candidates, taken in the iteration order of truth_dict (no shuffling)
truth_items = list(truth_dict.items())
train_end = int(len(truth_dict) * 0.8)
val_end = int(len(truth_dict) * 0.9)

training_slice = truth_items[:train_end]
val_slice = truth_items[train_end:val_end]
test_slice = truth_items[val_end:]

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score

from tqdm.notebook import tqdm

In [None]:
# Corpus for fitting the TF-IDF vectorizer: for every candidate in the training
# slice (first 80%), the CV text followed by the texts of its labelled vacancies
training_set = []

for candidate, vacancies in training_slice:
    training_set.extend([cv_data[candidate], *(req_data[vacancy] for vacancy in vacancies)])

In [None]:
# Validation batches, built from val_slice (the candidates between the 80% and 90% marks).
# Each batch is [cv_text, (vacancy_text, label), (vacancy_text, label), ...];
# negative labels are mapped to 0.
val_set = [
    [cv_data[candidate]]
    + [(req_data[vacancy], label if label >= 0 else 0) for vacancy, label in vacancies.items()]
    for candidate, vacancies in val_slice
]

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim; running it simply
# rebuilds the same val_set.
# Validation batches are built from val_slice (the candidates between the 80% and
# 90% marks of truth_dict), not from "the remaining 80%". Each batch is
# [cv_text, (vacancy_text, label), ...] with negative labels mapped to 0.
val_set = []

for candidate, vacancies in val_slice:
    batch = []
    
    batch.append(cv_data[candidate])
    
    for vacancy, label in vacancies.items():
        batch.append((req_data[vacancy], label if label >= 0 else 0))
        
    val_set.append(batch)

In [None]:
# Fit a default TF-IDF vectorizer on the training corpus.
# fit() returns the vectorizer itself, so both names refer to the same fitted object.
vectorizer = TfidfVectorizer()
vectorizer.fit(training_set)
tf_idf_model = vectorizer

In [None]:
ndcg_scores = []

# For every validation candidate: vectorise the CV and its vacancies with the fitted
# TF-IDF model, rank the vacancies by cosine similarity to the CV, and score the
# ranking with nDCG@10 against the ground-truth labels.
for batch in tqdm(val_set):
    cv_text, labelled_vacancies = batch[0], batch[1:]

    vacancy_texts = [vacancy for vacancy, _ in labelled_vacancies]
    ground_truth = np.array([label for _, label in labelled_vacancies])

    # One transform call for all vacancies of this candidate instead of one per vacancy
    cv_emb = tf_idf_model.transform([cv_text])
    req_embs = tf_idf_model.transform(vacancy_texts)

    # cosine_similarity yields a (1, n_vacancies) matrix; its only row holds the scores
    y_pred = cosine_similarity(cv_emb, req_embs)[0]

    ndcg_scores.append(ndcg_score([ground_truth], [y_pred], k=10))

print("nDCG of TF-IDF model:", np.mean(ndcg_scores))

# Random

In [None]:
ndcg_scores = []

# Fixed seed so the random baseline reports the same number on every run
rng = np.random.default_rng(42)

# Random baseline: every CV and every vacancy gets an independent random
# 350-dimensional vector; vacancies are ranked by cosine similarity between those
# vectors and the ranking is scored with nDCG@10. No model is involved.
for batch in tqdm(val_set):

    ground_truth = []
    y_pred = []

    cv_emb = rng.random(350)

    for vacancy, label in batch[1:]:
        req_emb = rng.random(350)

        ground_truth.append(label)
        y_pred.append(cosine_similarity([cv_emb], [req_emb])[0][0])

    ground_truth = np.array(ground_truth)
    y_pred = np.array(y_pred)

    ndcg_scores.append(ndcg_score([ground_truth], [y_pred], k=10))

print("nDCG of random model:", np.mean(ndcg_scores))

# D2V

In [None]:
!pip install -q gensim optuna

In [None]:
!pip install --upgrade scipy

In [None]:
import logging
import optuna
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from scipy.linalg import triu
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# The notebook only has `from gensim... import ...` statements, which do not bind the
# name `gensim`; import the tokeniser explicitly instead of using `gensim.utils...`.
from gensim.utils import simple_preprocess

# One tagged document per training text, tagged with its position in training_set
documents = [TaggedDocument(simple_preprocess(doc), [i]) for i, doc in enumerate(training_set)]

In [None]:
best_score = 0

def train_d2v(trial, documents):
    """
    Train one Doc2Vec model with hyperparameters sampled from `trial` and score it.

    - trial: Optuna trial used to sample min_count, window_size, vector_size and epochs
    - documents: list of TaggedDocument objects to train on

    The trained model is evaluated on the module-level `val_set`: every CV and vacancy
    text is embedded with `infer_vector`, vacancies are ranked by cosine similarity to
    the CV, and each ranking is scored with nDCG@10.

    returns: mean nDCG@10 over all batches in `val_set`
    """
    # Imported locally: the notebook never binds the top-level name `gensim`, so
    # `gensim.utils.simple_preprocess` would raise a NameError on a fresh kernel.
    from gensim.utils import simple_preprocess

    # Search space
    min_count = trial.suggest_categorical('min_count', [0, 1, 2, 5])
    window_size = trial.suggest_categorical('window_size', [2, 5, 10, 20])
    vector_size = trial.suggest_categorical('vector_size', [16, 32, 128, 256])
    epochs = trial.suggest_categorical('epochs', [5, 20, 40, 100])

    print(f"Starting training\nConfig: vector_size: {vector_size}, window_size: {window_size}, min_counts: {min_count}, epochs: {epochs}")

    model = Doc2Vec(vector_size=vector_size, window=window_size, min_count=min_count, epochs=epochs, workers=4)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

    ndcg_scores = []

    for batch in val_set:
        ground_truth = []
        y_pred = []

        # batch[0] is the CV text, batch[1:] are (vacancy_text, label) pairs
        cv_emb = model.infer_vector(simple_preprocess(batch[0]))

        for vacancy, label in batch[1:]:
            vacancy_emb = model.infer_vector(simple_preprocess(vacancy))
            ground_truth.append(label)
            y_pred.append(cosine_similarity([cv_emb], [vacancy_emb])[0][0])

        ndcg_scores.append(ndcg_score([ground_truth], [y_pred], k=10))

    score = np.mean(ndcg_scores)
    print(f"score: {score}")
    print()

    return score

In [None]:
def objective_wrapper(documents):
    """Bind `documents` into a one-argument objective that Optuna can call with a trial."""

    def objective(trial):
        # Delegate to the Doc2Vec training/evaluation routine
        score = train_d2v(trial, documents)
        return score

    return objective

In [None]:
# Optuna study that maximises the validation nDCG returned by the objective
study = optuna.create_study(direction='maximize')

# The objective only needs the tagged training documents bound into it
wrapped_objective = objective_wrapper(documents)

# Run 12 trials
study.optimize(wrapped_objective, n_trials=12)

best_params = study.best_trial.params
print("Best hyperparameters:", best_params)

# Persist the winning configuration as JSON
with open("d2v_results.txt", "w+") as f:
    json.dump(best_params, f)

# BERT (e5)

In [None]:
!pip install -q transformers wandb peft optuna

In [None]:
import torch
import gc
import wandb
import warnings
import optuna

import torch.nn.functional as F
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from peft import LoraConfig, get_peft_model
from torch.optim.lr_scheduler import StepLR

In [None]:
# Run on the first GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

device

In [None]:
class TokenDataLoader(Dataset):
    """
    Dataset that yields, per candidate, the tokenised CV together with the tokenised
    vacancies it was labelled against.

    Item `idx` is a triple `((candidate_id, vacancy_ids), tokens, labels)`: row 0 of
    `tokens` is the CV (prefixed with "query: "), the remaining rows are the vacancies
    (prefixed with "passage: ") in the same order as `labels`.
    """

    def __init__(self, truth_dict, cv_data, req_data, query_size=512, batch_size=32):
        # Maximum token length per text, and the chunk size used by __len__
        self.query_size = query_size
        self.batch_size = batch_size

        # NOTE(review): tokenizer comes from the "-base" checkpoint while the ranker in
        # this notebook loads "-small" — presumably they share a vocabulary; TODO confirm.
        self.tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")

        # Text lookups and the (candidate, {vacancy: label}) pairs to iterate over
        self.cv_texts = cv_data
        self.req_texts = req_data
        self.ground_truths = list(truth_dict.items())

    def __len__(self):
        # Ceiling of (#candidates / batch_size).
        # NOTE(review): __getitem__ indexes single candidates, so for batch_size > 1
        # this is smaller than the number of indexable items.
        full_chunks, remainder = divmod(len(self.ground_truths), self.batch_size)
        return full_chunks + (1 if remainder else 0)

    def __getitem__(self, idx):
        candidate, vacancies = self.ground_truths[idx]
        req_ids = list(vacancies.keys())
        raw_labels = list(vacancies.values())

        # CV first, then every vacancy, each with its e5 role prefix
        input_texts = ["query: " + self.cv_texts[candidate]]
        input_texts += ["passage: " + self.req_texts[v] for v in req_ids]

        # Tokenize together to ensure consistent padding
        tokens = self.tokenizer(
            input_texts,
            add_special_tokens=True,
            padding=True,
            truncation=True,
            max_length=self.query_size,
            return_tensors='pt',
        ).to(device)

        # Differentiating between -1 and 0 is practically impossible, so they are considered to be the same
        labels = [label if label >= 0 else 0 for label in raw_labels]

        return (candidate, req_ids), tokens, torch.LongTensor(labels).to(device)

In [None]:
# Number of candidates in the ground truth
len(truth_dict)

In [None]:
# One dataset per split; each item is a single candidate with all of its vacancies
train_dataloader = TokenDataLoader(dict(training_slice), cv_data, req_data, query_size=512, batch_size=1)
val_dataloader = TokenDataLoader(dict(val_slice), cv_data, req_data, query_size=512, batch_size=1)
test_dataloader = TokenDataLoader(dict(test_slice), cv_data, req_data, query_size=512, batch_size=1)

# Make sure the target directory exists; os.listdir would otherwise raise
# FileNotFoundError on a fresh checkout.
os.makedirs("./dataloaders/", exist_ok=True)
saved_loaders = os.listdir("./dataloaders/")

# (Re)write both files as soon as either one is missing
if "e5_trainloader.pth" not in saved_loaders or "e5_testloader.pth" not in saved_loaders:
    torch.save(train_dataloader, './dataloaders/e5_trainloader.pth')
    torch.save(test_dataloader, './dataloaders/e5_testloader.pth')

In [None]:
def listwise_loss(scores, labels):
    """
    Compute the LambdaRank gradients ("lambdas") for one ranked list. (assume sigma=1.)

    scores: tensor of size [N, 1] or [N] (the output of a neural network), where
            N = number of <query, document> pairs. A 1-D tensor is reshaped to a
            column so that `scores - scores.T` really is the [N, N] matrix of
            pairwise score differences (for a 1-D tensor it would be all zeros).
    labels: tensor of size [N], contains the relevance labels

    returns: a tensor of size [N, 1] holding, for every document, the sum over all
             other documents of lambda_ij * |delta nDCG_ij|. It is meant to be used
             as the gradient of the loss with respect to `scores`. All zeros when
             N < 2 or when no document has a positive gain (nDCG is undefined then).
    """
    labels = labels.reshape(-1)
    scores = scores.reshape(-1, 1)
    num_docs = scores.size(0)

    # Nothing to rank: return zeros on the same device/dtype as the scores
    if num_docs < 2:
        return scores.new_zeros((num_docs, 1))

    sigma = 1
    dev = scores.device
    dtype = scores.dtype
    labels = labels.to(dev)

    # S[i, j] = sign(label_i - label_j): +1 if i should rank above j, -1 if below, 0 if tied
    S = torch.sign(labels.unsqueeze(1) - labels.unsqueeze(0)).to(dtype)

    # lambda_{i, j} for every <i, j>; sigmoid(-x) == 1 / (1 + exp(x)) but does not overflow
    lamda = sigma * (0.5 * (1 - S) - torch.sigmoid(-sigma * (scores - scores.T)))

    # Gain of every document and the DCG of the ideal ordering.
    # Natural logarithms are used for the discounts throughout; the base cancels
    # in the delta-DCG / ideal-DCG ratio below.
    gains = torch.pow(2.0, labels.to(dtype)) - 1
    ideal_gains = torch.sort(gains, descending=True)[0]
    positions = torch.arange(num_docs, device=dev, dtype=dtype)
    DCG_ideal_labels = torch.sum(ideal_gains / torch.log(positions + 2))

    # Without any positive gain the normalisation would be 0 / 0
    if DCG_ideal_labels == 0:
        return scores.new_zeros((num_docs, 1))

    # Rank (0 = best) of every document under the current scores. sorted_ind lists
    # document ids from highest to lowest score; its argsort is the inverse permutation.
    sorted_ind = torch.flip(scores.argsort(dim=0).flatten(), dims=[0])
    doc_id_to_rank = torch.argsort(sorted_ind)
    discounts = 1.0 / torch.log(doc_id_to_rank.to(dtype) + 2)

    # |delta nDCG| obtained by swapping the positions of documents i and j
    gain_i, gain_j = gains.unsqueeze(1), gains.unsqueeze(0)
    disc_i, disc_j = discounts.unsqueeze(1), discounts.unsqueeze(0)
    DCG_current = gain_i * disc_i + gain_j * disc_j
    DCG_swapped = gain_j * disc_i + gain_i * disc_j
    delta_NDCG = ((DCG_swapped - DCG_current) / DCG_ideal_labels).abs()

    lambda_rank_loss = (lamda * delta_NDCG).sum(dim=1, keepdim=True)

    return lambda_rank_loss

In [None]:
class e5_ranker(torch.nn.Module):
    """
    Bi-encoder ranker built on multilingual-e5-small.

    The first row of a batch is the CV, every further row is a vacancy. All rows are
    encoded in one pass, pooled over the token dimension ("mean", "sum" or "max",
    padding excluded) and each vacancy is scored by its cosine similarity to the CV.
    """

    def __init__(self, pooling="mean"):
        super().__init__()
        self.model = AutoModel.from_pretrained("intfloat/multilingual-e5-small")
        self.pooling = pooling

    def forward(self, batch):
        """
        batch: mapping with 'input_ids' and 'attention_mask', CV in row 0

        returns: 1-D tensor with one cosine-similarity score per vacancy
        raises: ValueError if `self.pooling` is not "mean", "sum" or "max"
        """
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Process all embeddings in one go
        outputs = self.model(input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        # [rows, tokens, 1] mask, broadcast over the hidden dimension
        token_mask = attention_mask.unsqueeze(-1)

        if self.pooling == "mean":
            # Average over real tokens only
            weights = token_mask.to(hidden.dtype)
            summed = torch.sum(hidden * weights, 1)
            counts = torch.clamp(weights.sum(1), min=1e-9)  # Avoid division by zero
            embeddings = summed / counts
        elif self.pooling == "sum":
            # Sum over real tokens only
            embeddings = torch.sum(hidden * token_mask.to(hidden.dtype), 1)
        elif self.pooling == "max":
            # Padding positions are pushed to the smallest representable value so they
            # can never win the max. Zeroing them instead would cap every dimension
            # at >= 0 whenever a row contains padding.
            masked = hidden.masked_fill(~token_mask.bool(), torch.finfo(hidden.dtype).min)
            embeddings, _ = torch.max(masked, dim=1)
        else:
            raise ValueError(f"Unknown pooling method: {self.pooling!r}")

        # Extract CV and request embeddings
        cv_embedding = embeddings[0].unsqueeze(0)  # CV is the first in the batch
        req_embeddings = embeddings[1:]  # rest are requests

        # Use the cosine similarity as the score (based on the paper).
        # reshape(-1) keeps the result 1-D even when there is only one vacancy.
        return F.cosine_similarity(cv_embedding, req_embeddings).reshape(-1)

In [None]:
def train_loop(model, optimizer, trainloader, use_wandb):
    """
    Perform a single epoch of training.

    - Model: the model to train (should return ranking scores of CV-vacancy pairs)
    - Optimizer: the optimizer to use
    - Trainloader: the dataloader to use, which should provide tokens/embeddings/texts and labels
    - use_wandb: when true, the mean lambda of every batch is logged to wandb as "loss"

    Returns the list of per-batch nDCG@10 scores. Batches with fewer than two
    vacancies are skipped, since there is nothing to rank.
    """

    ndcg_scores = []

    for i, (_, batch_data, batch_labels) in enumerate(trainloader):
        optimizer.zero_grad()

        # reshape(-1) keeps both tensors 1-D even when a candidate has a single vacancy
        # (squeeze() would produce 0-d tensors, on which len() raises)
        y_pred = model(batch_data.to(device)).reshape(-1)
        ground_truth = batch_labels.reshape(-1)

        if len(y_pred) > len(ground_truth):
            y_pred = y_pred[:len(ground_truth)]

        if len(ground_truth) < 2:
            continue

        # listwise_loss documents its scores as a [N, 1] column; with a 1-D tensor its
        # pairwise score differences collapse to zero
        lambda_i = listwise_loss(y_pred.unsqueeze(1), ground_truth)

        # The lambdas are used directly as d(loss)/d(score), so they are treated as constants
        torch.autograd.backward(y_pred, lambda_i.reshape(-1).detach())
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)

        optimizer.step()

        if use_wandb:
            # Log loss in wandb
            wandb.log({"loss": lambda_i.detach().mean().item()})

        # Calculate nDCG score of current batch
        score = ndcg_score(ground_truth.detach().cpu().unsqueeze(0),
                           y_pred.detach().cpu().unsqueeze(0), k=10)
        ndcg_scores.append(score)

        # Blank out the previous progress line before overwriting it
        print("                                                                                                                    ", end="\r")
        print(f"Batch: {i + 1}/{len(trainloader)}, y_pred mean: {y_pred.mean()} ({y_pred.std()}), nDCG: {score}", end="\r")

    return ndcg_scores


def val_loop(model, valloader):
    """
    Evaluate given model once.

    - Model: the model to evaluate (should return ranking scores of CV-vacancy pairs)
    - Valloader: the dataloader to use, which should provide tokens/embeddings/texts and labels

    The model is switched to eval mode for the duration of the loop (so dropout is
    disabled) and restored to its previous mode afterwards. Returns the list of
    per-batch nDCG@10 scores; batches with fewer than two vacancies are skipped.
    """

    ndcg_scores = []

    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for i, (_, batch_data, batch_labels) in enumerate(valloader):

                print(f"Batch: {i + 1}/{len(valloader)}", end="\r")

                # reshape(-1) keeps both tensors 1-D even for a single vacancy
                y_pred_val = model(batch_data.to(device)).reshape(-1)
                ground_truth = batch_labels.reshape(-1)

                if len(y_pred_val) > len(ground_truth):
                    y_pred_val = y_pred_val[:len(ground_truth)]

                if len(ground_truth) < 2:
                    continue

                # Calculate nDCG score of current batch
                ndcg_scores.append(ndcg_score(ground_truth.detach().cpu().unsqueeze(0),
                                              y_pred_val.detach().cpu().unsqueeze(0), k=10))
    finally:
        # Hand the model back in the mode it arrived in
        model.train(was_training)

    return ndcg_scores


def train_model(trial, trainloader, valloader, model_type="e5", epochs=10, step_size=5, use_wandb=False):
    """
    Trains a given model for a given amount of epochs.

    - trial: Optuna trial used to sample the learning rate, the pooling method and
      (for PJFNN) the embedding size
    - trainloader / valloader: passed to train_loop / val_loop
    - model_type: "e5", "consultantbert" or "PJFNN"
    - epochs: number of training epochs (exactly `epochs` passes are run)
    - step_size: number of epochs between learning-rate decays (factor 0.1)
    - use_wandb: log configuration, loss and nDCG to Weights & Biases

    Whenever the validation nDCG improves within this call, the weights are written
    to ./trained_models/<model class name>.pt.

    returns: mean validation nDCG@10 after the final epoch
    raises: ValueError for an unknown model_type
    """

    best_score = 0

    # Search space
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-4, log=True)
    pooling_method = trial.suggest_categorical('pooling_method', ["mean", "max", "sum"])

    if model_type == "e5":
        model = e5_ranker(pooling=pooling_method)
    elif model_type == "consultantbert":
        model = conSultantBERT(pooling=pooling_method)
    elif model_type == "PJFNN":
        embedding_size = trial.suggest_categorical("embedding_size", [64, 128, 256])

        # NOTE(review): PJFNN and vocab are not defined in this notebook — they must
        # be provided by the caller's environment.
        model = PJFNN(embedding_size=embedding_size, geek_channels=1, job_channels=30, vocab=vocab)
    else:
        raise ValueError(f"Unknown model_type: {model_type!r}")

    # Train on GPU if available
    model.to(device)
    model.train()

    if use_wandb:
        # Set up WandB integration
        wandb.init(project="recsys", job_type="optimize")

        # Configuration for WandB
        config = {
            "learning_rate": learning_rate,
            "pooling": pooling_method,
            "model": model.__class__.__name__
        }
        wandb.config.update(config)

    # We use Adam as the optimizer by default
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Honour the step_size argument instead of a hard-coded interval
    scheduler = StepLR(optimizer, step_size=step_size, gamma=0.1)

    val_ndcg_scores = []

    # range(epochs), not range(epochs + 1): run exactly the requested number of epochs
    for epoch in range(epochs):
        print(f"Epoch: {epoch + 1}/{epochs}")

        # Train the model for the current epoch
        model.train()
        epoch_ndcg_scores = train_loop(model, optimizer, trainloader, use_wandb)

        print()
        print("Training nDCG:", np.mean(epoch_ndcg_scores))
        print()

        scheduler.step()

        if use_wandb:
            wandb.log({"Training nDCG": np.mean(epoch_ndcg_scores)})

        # Evaluate the model
        val_ndcg_scores = val_loop(model, valloader)

        print()
        print("Testing nDCG:", np.mean(val_ndcg_scores))
        print()

        if use_wandb:
            wandb.log({"Testing nDCG": np.mean(val_ndcg_scores)})

        # Keep the weights of the best epoch seen so far in this call
        if np.mean(val_ndcg_scores) > best_score:
            torch.save(model.state_dict(), f"./trained_models/{model.__class__.__name__}.pt")
            best_score = np.mean(val_ndcg_scores)

    return np.mean(val_ndcg_scores)

In [None]:
def objective_wrapper(trainloader, valloader, model_type="e5"):
    """
    Build an Optuna objective that trains `model_type` on the given loaders.

    Note: this redefines the `objective_wrapper` declared for Doc2Vec earlier in the notebook.
    """

    def objective(trial):
        # Three epochs per trial, without wandb logging
        final_score = train_model(trial, trainloader, valloader, model_type=model_type, epochs=3, use_wandb=False)
        return final_score

    return objective

In [None]:
# Free cached GPU memory and garbage left behind by earlier cells
torch.cuda.empty_cache()
gc.collect()

# Hide user/future warnings
warnings.filterwarnings('ignore')

# Optuna study that maximises the validation nDCG returned by the objective
study = optuna.create_study(direction='maximize')

# Bind the train and validation datasets into the objective
wrapped_objective = objective_wrapper(train_dataloader, val_dataloader)

# Run 6 trials
study.optimize(wrapped_objective, n_trials=6)

best_params = study.best_trial.params
print("Best hyperparameters:", best_params)

# Persist the winning configuration as JSON
with open("e5_results.txt", "w+") as f:
    json.dump(best_params, f)

# ConSultantBERT

In [None]:
from transformers import BertTokenizer, BertModel

from collections import defaultdict

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
# (a hard-coded "cuda:0" makes every later .to(device) fail on CPU-only machines)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
class BERTTokenDataLoader(Dataset):
    """
    Dataset that yields, per candidate, the BERT-tokenised CV together with the
    tokenised vacancies it was labelled against.

    Item `idx` is a triple `((candidate_id, vacancy_ids), tokens, labels)`: row 0 of
    `tokens` is the CV, the remaining rows are the vacancies in the same order as
    `labels`. Every row is padded/truncated to `query_size` tokens.
    """

    def __init__(self, truth_dict, cv_data, req_data, query_size=512, batch_size=32):
        # Fixed token length per text, and the chunk size used by __len__
        self.query_size = query_size
        self.batch_size = batch_size

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

        # Text lookups and the (candidate, {vacancy: label}) pairs to iterate over
        self.cv_texts = cv_data
        self.req_texts = req_data
        self.ground_truths = list(truth_dict.items())

    def __len__(self):
        # Ceiling of (#candidates / batch_size).
        # NOTE(review): __getitem__ indexes single candidates, so for batch_size > 1
        # this is smaller than the number of indexable items.
        full_chunks, remainder = divmod(len(self.ground_truths), self.batch_size)
        return full_chunks + (1 if remainder else 0)

    def __getitem__(self, idx):
        candidate, vacancies = self.ground_truths[idx]
        req_ids = list(vacancies.keys())
        raw_labels = list(vacancies.values())

        # CV first, then every vacancy
        input_texts = [self.cv_texts[candidate]]
        input_texts += [self.req_texts[v] for v in req_ids]

        tokens = self.tokenizer(
            input_texts,
            max_length=self.query_size,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        ).to(device)

        # Differentiating between -1 and 0 is practically impossible, so they are considered to be the same
        labels = [label if label >= 0 else 0 for label in raw_labels]

        return (candidate, req_ids), tokens, torch.LongTensor(labels).to(device)

In [None]:
# One dataset per split; each item is a single candidate with all of its vacancies
train_dataloader = BERTTokenDataLoader(dict(training_slice), cv_data, req_data, query_size=512, batch_size=1)
val_dataloader = BERTTokenDataLoader(dict(val_slice), cv_data, req_data, query_size=512, batch_size=1)
test_dataloader = BERTTokenDataLoader(dict(test_slice), cv_data, req_data, query_size=512, batch_size=1)

# Make sure the target directory exists; os.listdir would otherwise raise
# FileNotFoundError on a fresh checkout.
os.makedirs("./dataloaders/", exist_ok=True)
saved_loaders = os.listdir("./dataloaders/")

# (Re)write both files as soon as either one is missing
if "bert_trainloader.pth" not in saved_loaders or "bert_testloader.pth" not in saved_loaders:
    torch.save(train_dataloader, './dataloaders/bert_trainloader.pth')
    torch.save(test_dataloader, './dataloaders/bert_testloader.pth')

In [None]:
# s3://s3-nl-prd-semrb-emr-datascience/volodymyr.medentsiy/models/sbert_v2_full_dataset_folder/

In [None]:
class conSultantBERT(torch.nn.Module):
    """
    Bi-encoder ranker built on bert-base-multilingual-cased.

    The first row of a batch is the CV, every further row is a vacancy. All rows are
    encoded in one pass, pooled over the token dimension ("mean", "sum" or "max",
    padding excluded) and each vacancy is scored by its cosine similarity to the CV.
    """

    def __init__(self, pooling):
        super().__init__()

        self.model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)
        self.pooling = pooling

    def forward(self, batch):
        """
        batch: mapping with 'input_ids' and 'attention_mask', CV in row 0

        returns: 1-D tensor with one cosine-similarity score per vacancy
        raises: ValueError if `self.pooling` is not "mean", "sum" or "max"
        """
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Process all embeddings in one go
        outputs = self.model(input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        # [rows, tokens, 1] mask, broadcast over the hidden dimension
        token_mask = attention_mask.unsqueeze(-1)

        if self.pooling == "mean":
            # Average over real tokens only
            weights = token_mask.to(hidden.dtype)
            summed = torch.sum(hidden * weights, 1)
            counts = torch.clamp(weights.sum(1), min=1e-9)  # Avoid division by zero
            embeddings = summed / counts
        elif self.pooling == "sum":
            # Sum over real tokens only
            embeddings = torch.sum(hidden * token_mask.to(hidden.dtype), 1)
        elif self.pooling == "max":
            # Padding positions are pushed to the smallest representable value so they
            # can never win the max. Zeroing them instead would cap every dimension
            # at >= 0 whenever a row contains padding.
            masked = hidden.masked_fill(~token_mask.bool(), torch.finfo(hidden.dtype).min)
            embeddings, _ = torch.max(masked, dim=1)
        else:
            raise ValueError(f"Unknown pooling method: {self.pooling!r}")

        # Extract CV and request embeddings
        cv_embedding = embeddings[0].unsqueeze(0)  # CV is the first in the batch
        req_embeddings = embeddings[1:]  # rest are requests

        # Use the cosine similarity as the score (based on the paper).
        # reshape(-1) keeps the result 1-D even when there is only one vacancy.
        return F.cosine_similarity(cv_embedding, req_embeddings).reshape(-1)

In [None]:
# Free cached GPU memory and garbage left behind by earlier cells
torch.cuda.empty_cache()
gc.collect()

# Hide user/future warnings
warnings.filterwarnings('ignore')

# Optuna study that maximises the validation nDCG returned by the objective
study = optuna.create_study(direction='maximize')

# Bind the train and validation datasets into a conSultantBERT objective
wrapped_objective = objective_wrapper(train_dataloader, val_dataloader, model_type="consultantbert")

# Run 12 trials
study.optimize(wrapped_objective, n_trials=12)

best_params = study.best_trial.params
print("Best hyperparameters:", best_params)

# Persist the winning configuration as JSON
with open("consultantbert_results.txt", "w+") as f:
    json.dump(best_params, f)

# GNN

In [None]:
!pip install -q torch_geometric

In [None]:
import time 
from torch_geometric.data import Data, HeteroData, Batch
from torch_geometric.loader import DataLoader
from torch_geometric.utils import to_dense_adj
from torch_geometric.nn import to_hetero
from collections import defaultdict

import torch.nn as nn
import torch_geometric.nn as geom_nn
import torch_geometric.data as geom_data

In [None]:
# Load the pre-built graph train/validation loaders from disk.
# NOTE(review): torch.load unpickles arbitrary Python objects, so these files must
# come from a trusted source. Newer PyTorch releases are reported to default to
# weights_only=True, which would reject pickled loader objects — TODO confirm against
# the installed version.
trainloader = torch.load("./dataloaders/graph_trainloader.pth")
valloader = torch.load("./dataloaders/graph_valloader.pth")

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
# (a hard-coded "cuda:0" makes every later .to(device) fail on CPU-only machines)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# We embed the textual nodes (candidates and requests) separately at first
class text_embedding_layer(torch.nn.Module):
    """
    Encode candidate and vacancy token sequences with multilingual-e5-small,
    mean-pool and L2-normalise them, and project each side to
    `text_embedding_size` dimensions with its own linear layer.
    """

    def __init__(self, text_embedding_size=64):
        super().__init__()

        self.e5 = AutoModel.from_pretrained("intfloat/multilingual-e5-small").to(device)

        # 384 is the input width expected from the pooled e5 output
        self.candidate_out = nn.Linear(in_features=384,
                                       out_features=text_embedding_size)

        self.company_out = nn.Linear(in_features=384,
                                     out_features=text_embedding_size)

    def average_pool(self, last_hidden_states, attention_mask):
        """Mean of the hidden states over unmasked tokens; rows without any unmasked token pool to zeros."""
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        # clamp guards rows whose mask is all zero, which would otherwise yield 0 / 0 = NaN
        token_counts = attention_mask.sum(dim=1)[..., None].clamp(min=1)
        return last_hidden.sum(dim=1) / token_counts

    def forward(self, x_can, x_req, att_mask_can, att_mask_req):
        """
        x_can / x_req: token ids of the candidate and vacancy texts
        att_mask_can / att_mask_req: the matching attention masks

        returns: (candidate embeddings, vacancy embeddings), both float tensors of
                 width `text_embedding_size`
        """
        # Feed tokens into model
        x_candidate = self.e5(x_can, att_mask_can)
        x_company = self.e5(x_req, att_mask_req)

        # Create embedding tensor
        candidate_embeddings = self.average_pool(x_candidate.last_hidden_state, attention_mask=att_mask_can)
        company_embeddings = self.average_pool(x_company.last_hidden_state, attention_mask=att_mask_req)

        # normalize embeddings
        candidate_embeddings = F.normalize(candidate_embeddings, p=2, dim=1)
        company_embeddings = F.normalize(company_embeddings, p=2, dim=1)

        # Project through a linear layer to match the other embedding sizes
        x_candidate = self.candidate_out(candidate_embeddings).float()
        x_company = self.company_out(company_embeddings).float()

        return x_candidate, x_company
    
# Then, we embed all nodes initially
class embedding_layer(torch.nn.Module):
    """Single TransformerConv layer with lazily inferred input sizes and `embedding_size` outputs."""

    def __init__(self, embedding_size=32):
        super().__init__()

        # (-1, -1): input channel sizes are left for the layer to infer.
        # Alternatives tried earlier: GATv2Conv((-1, -1), out_channels), SimpleConv((-1, -1))
        self.conv = geom_nn.TransformerConv((-1, -1), embedding_size)

    def forward(self, x, edge_index):
        # One round of message passing over the given edges
        node_embeddings = self.conv(x, edge_index)
        return node_embeddings
        
    
class baselineGNNModel(torch.nn.Module):
    """
    Baseline heterogeneous GNN ranker.

    Text nodes (candidates, vacancies) are embedded with `text_embedding_layer`, the
    whole graph is passed through one heterogeneous conv layer, node embeddings are
    mean-pooled per sub-graph, and a linear layer turns every sub-graph embedding
    into one score.
    """

    def __init__(self, data, typings, text_embedding_size=16, embedding_size=32):
        super().__init__()

        # Node types whose embeddings are pooled into the sub-graph embeddings
        self.typings = typings

        self.text_embedder = text_embedding_layer(text_embedding_size=text_embedding_size)

        # Homogeneous layer, duplicated per edge type using the metadata of `data`
        self.embedder = embedding_layer(embedding_size=embedding_size)
        self.embedder = to_hetero(self.embedder, data.metadata(), aggr='sum')

        self.fc = nn.Linear(embedding_size, 1)

    def forward(self, data):
        """
        data: heterogeneous graph batch whose node stores carry `x`, `unique_node_id`
              and `sub_graph`, plus `att_mask` for "candidate" and "vacancy"

        returns: 1-D tensor with one score per sub-graph, ordered by sub-graph id
        """
        # Embed textual features
        x_candidate, x_request = self.text_embedder(data.x_dict["candidate"], data.x_dict["vacancy"], data["candidate"].att_mask, data["vacancy"].att_mask)

        # `data.x_dict` is assembled anew on every access, so writing into it does not
        # stick. Keep one local copy and put the textual embeddings in there, otherwise
        # the conv layer would receive the raw token ids instead of the embeddings.
        x_dict = dict(data.x_dict)
        x_dict["candidate"] = x_candidate
        x_dict["vacancy"] = x_request

        # Embed the graph as a whole
        embedded_data = self.embedder({k: v.float() for k, v in x_dict.items()}, data.edge_index_dict)

        # Each sub-graph gets its own embedding
        sub_graphs = defaultdict(list)

        # Find the sub-graph of each node in the embedding, and add it to the corresponding list
        for typing in self.typings:
            for i, emb in enumerate(embedded_data[typing]):
                # Some subgraphs do not have all data types (e.g., a graph might not include any education nodes)
                if data[typing]:
                    # Find the sub-graph the current node belongs to
                    current_node_id = int(data[typing].unique_node_id[i].item())

                    # We were working with a dummy node
                    if current_node_id == 0:
                        continue

                    sg = int(data[typing].sub_graph[i].item())

                    # Add its embedding to its sub-graph's list
                    sub_graphs[sg].append(emb.unsqueeze(0))

        # Finally, mean pool every graph embedding (so the final embedding is the mean of all of the nodes)
        for sg in sub_graphs.keys():
            sub_graphs[sg] = torch.mean(torch.stack(sub_graphs[sg]).squeeze(1), dim=0)

        # Stack all the sub-graph embeddings into a single matrix, ordered by sub-graph id
        sub_graphs = torch.stack([i[1] for i in sorted(sub_graphs.items())], dim=0)

        # Make predictions based on the sub-graph embeddings
        y_pred = self.fc(sub_graphs)

        # Drop only the trailing size-1 dimension so a single sub-graph still yields a 1-D tensor
        return y_pred.squeeze(-1)

In [None]:
def train_loop(model, optimizer, trainloader, valloader, epochs=10):
    """
    Train the graph model for `epochs` epochs, validating after every epoch.

    - model: maps a graph batch to one ranking score per sub-graph
    - optimizer: optimizer over the model parameters
    - trainloader / valloader: iterables of graph batches with labels in `.y`
    - epochs: number of passes over `trainloader`

    Note: this redefines the `train_loop` declared for the BERT rankers earlier in
    the notebook.

    returns: list of per-batch validation nDCG@10 scores of the final epoch
             (empty when epochs == 0)
    """
    ndcg_scores = []
    ndcg_val = []  # stays empty only when no epoch is run

    for epoch in range(epochs):
        for i, data in enumerate(trainloader):
            optimizer.zero_grad()

            # Make prediction (on a copy, so the stored batch is left untouched);
            # reshape(-1) keeps the scores 1-D even for a single sub-graph
            y_pred = model(data.detach().clone().to(device)).reshape(-1)

            print("                                                                                                                    ", end="\r")
            print(f"Epoch: {epoch + 1}/{epochs}, batch (train): {i + 1}/{len(trainloader)}, y_pred mean: {y_pred.mean()}", end="\r")

            if len(y_pred) > len(data.y):
                y_pred = y_pred[:len(data.y)]

            # listwise_loss documents its scores as a [N, 1] column; with a 1-D tensor
            # its pairwise score differences collapse to zero
            lambda_i = listwise_loss(y_pred.unsqueeze(1), data.y.to(device))

            # The lambdas are used directly as d(loss)/d(score), so they are treated as constants
            torch.autograd.backward(y_pred, lambda_i.reshape(-1).detach())
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
            optimizer.step()

            # Calculate nDCG score of current batch
            ndcg_scores.append(ndcg_score(data.y.unsqueeze(0).cpu(),
                                          y_pred.unsqueeze(0).detach().cpu(), k=10))

        # Report epoch-level training metric and start a fresh list for the next epoch
        print(f"\n\nTraining nDCG: {np.mean(ndcg_scores)}\n")
        ndcg_scores = []

        # Evaluate model
        ndcg_val = val_loop(model, valloader)
        print(f"\nValidation nDCG: {np.mean(ndcg_val)}\n")

    # Return nDCG of final trained model
    return ndcg_val

def val_loop(model, valloader):
    """
    Evaluate the graph model once on `valloader`.

    The model is switched to eval mode for the duration of the loop and restored to
    its previous mode afterwards.

    Note: this redefines the `val_loop` declared for the BERT rankers earlier in the
    notebook.

    returns: list of per-batch nDCG@10 scores
    """
    ndcg_scores = []

    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for i, data_val in enumerate(valloader):
                print(f"Batch (val): {i + 1}/{len(valloader)}", end="\r")

                # Make prediction; reshape(-1) keeps the scores 1-D even for a single sub-graph
                y_pred_val = model(data_val.detach().clone().to(device)).reshape(-1)

                # Truncate the validation predictions themselves (the original sliced
                # an undefined `y_pred` here)
                if len(y_pred_val) > len(data_val.y):
                    y_pred_val = y_pred_val[:len(data_val.y)]

                # Calculate nDCG score of current batch
                ndcg_scores.append(ndcg_score(data_val.y.unsqueeze(0).cpu(),
                                              y_pred_val.unsqueeze(0).detach().cpu(), k=10))
    finally:
        # Hand the model back in the mode it arrived in
        model.train(was_training)

    return ndcg_scores


def optimize_model(trial, trainloader, valloader, epochs=10):
    """
    Sample one hyperparameter configuration, train a baselineGNNModel with it and
    report how well it ranks the validation data.

    - trial: Optuna trial used to sample learning_rate, text_embedding_size and embedding_size
    - trainloader / valloader: graph loaders handed on to train_loop
    - epochs: number of training epochs

    returns: mean of the validation nDCG scores returned by train_loop
    """
    # Search space
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-3, log=True)
    text_embedding_size = trial.suggest_categorical('text_embedding_size', [32, 128, 256])
    embedding_size = trial.suggest_categorical('embedding_size', [32, 128, 256])

    # All the different node types
    typings = ["candidate", "vacancy", "city", "education", "job_type", "industry", "klass", "literal"]

    # The model needs one example batch: its metadata() drives the heterogeneous conversion
    example_batch = next(iter(trainloader))
    model = baselineGNNModel(
        example_batch,
        typings,
        text_embedding_size=text_embedding_size,
        embedding_size=embedding_size,
    ).to(device)

    print(f"""\nConfig:\n- learning_rate = {learning_rate}\n- text_embedding_size = {text_embedding_size}\n- embedding_size = {embedding_size}\n\n""")

    # Configure Adam
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train and evaluate model, timing the whole run
    start_time = time.time()
    ndcg_scores_val = train_loop(model, optimizer, trainloader, valloader, epochs=epochs)
    end_time = time.time()

    print(f"Training for {epochs} epochs took {end_time - start_time} seconds ({(end_time - start_time) / epochs} seconds per epoch)")

    return np.mean(ndcg_scores_val)

In [None]:
def objective_wrapper(trainloader, valloader):
    """
    Build an Optuna objective that trains the baseline GNN on the given loaders.

    Note: this redefines the `objective_wrapper` declared earlier in the notebook.
    """

    def objective(trial):
        # Six epochs per trial
        mean_val_ndcg = optimize_model(trial, trainloader, valloader, epochs=6)
        return mean_val_ndcg

    return objective

In [None]:
# Free cached GPU memory and garbage left behind by earlier cells
torch.cuda.empty_cache()
gc.collect()

# Hide user/future warnings
warnings.filterwarnings('ignore')

# Optuna study that maximises the validation nDCG returned by the objective
study = optuna.create_study(direction='maximize')

# Bind the graph train and validation loaders into the objective
wrapped_objective = objective_wrapper(trainloader, valloader)

# Run 6 trials
study.optimize(wrapped_objective, n_trials=6)

best_params = study.best_trial.params
print("Best hyperparameters:", best_params)

# Persist the winning configuration as JSON
with open("base_gnn_results.txt", "w+") as f:
    json.dump(best_params, f)