# DPGNN - Dual-Perspective Graph Neural Network
## Person-Job Fit on v5-sections Dataset

Paper: "Modeling Two-Way Selection Preference for Person-Job Fit" (RecSys 2022)

**Instructions:**
1. Upload your `train.csv` and `test.csv` to Kaggle as a dataset and attach it to this notebook
2. Enable GPU (T4 x2)
3. Run all cells

## 1. Setup & Install Dependencies

In [None]:
# Install required packages
!pip install torch_geometric -q
!pip install transformers -q

In [None]:
# Clone original DPGNN repository
!git clone https://github.com/RUCAIBox/DPGNN.git
%cd DPGNN

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

# Report the PyTorch build and the CUDA devices visible to this session.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    # Only query a device name when CUDA is usable; index 0 is the first GPU.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Upload & Convert Dataset

Your `train.csv` and `test.csv` are expected under `/kaggle/input/<dataset-name>/`; set `TRAIN_CSV` and `TEST_CSV` in the next cell to match your dataset's path.

In [None]:
# Update this path to match your uploaded dataset
# (both files are read with pandas in the following cell).
TRAIN_CSV = "/kaggle/input/workfitai-v5-sections/train.csv"  # Change this
TEST_CSV = "/kaggle/input/workfitai-v5-sections/test.csv"    # Change this

# Check if files exist
# This only prints True/False; it does not stop the notebook when a file
# is missing.
print(f"Train exists: {os.path.exists(TRAIN_CSV)}")
print(f"Test exists: {os.path.exists(TEST_CSV)}")

In [None]:
# Load data
# Missing values are replaced by empty strings so that every section can be
# handled as text further down.
train_df = pd.read_csv(TRAIN_CSV).fillna('')
test_df = pd.read_csv(TEST_CSV).fillna('')

# Quick sanity report: row counts and the column names found in train.csv.
print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nColumns: {list(train_df.columns)}")

In [None]:
# Create dataset directory
# (relative to the current working directory; no error if it already exists)
os.makedirs('dataset', exist_ok=True)

# Define columns
# Resume sections that are concatenated into one text per geek.
resume_cols = ['resume_summary', 'resume_experience', 'resume_skills', 'resume_education']
# Job-description sections that are concatenated into one text per job and
# that also form the key used to de-duplicate jobs.
job_cols = ['jd_overview', 'jd_responsibilities', 'jd_requirements', 'jd_preferred']

def combine_sections(row, cols):
    """Join the usable sections of *row* into one ' [SEP] '-delimited string.

    Args:
        row: Mapping-like object exposing ``get(key, default)`` (a pandas
            row or a plain dict).
        cols: Section names to read, in output order. Names missing from
            *row* are treated as empty.

    Returns:
        The kept sections joined by ``' [SEP] '``; an empty string when no
        section qualifies. Kept sections are not stripped.
    """
    def _is_usable(section):
        # Drop whitespace-only text and the literal string 'nan' (any case).
        return bool(section.strip()) and section.lower() != 'nan'

    sections = (str(row.get(name, '')) for name in cols)
    return ' [SEP] '.join(section for section in sections if _is_usable(section))

In [None]:
# Build unique geeks (resumes)
print("Building unique geeks...")
all_df = pd.concat([train_df, test_df], ignore_index=True)

# One combined resume text per distinct `original_index`; the first row seen
# for an index supplies the text, later rows with the same index are skipped.
geek_texts = {}
for _, record in tqdm(all_df.iterrows(), total=len(all_df)):
    token = str(record['original_index'])
    if token in geek_texts:
        continue
    geek_texts[token] = combine_sections(record, resume_cols)

# Tokens are strings, so sort by their integer value rather than lexically.
geek_tokens = sorted(geek_texts, key=int)
print(f"Found {len(geek_tokens)} unique geeks")

In [None]:
# Build unique jobs
print("Building unique jobs...")
job_hashes = {}  # hash of the '|'-joined JD sections -> job token
job_texts = {}   # job token -> combined JD text

# NOTE: hash() of a str is only stable within one interpreter process, so
# job_hashes must be built and queried in the same kernel session. Two
# different JDs with equal hashes would share a token.
for _, record in tqdm(all_df.iterrows(), total=len(all_df)):
    jd_key = hash('|'.join(str(record.get(col, '')) for col in job_cols))
    if jd_key in job_hashes:
        continue

    # Tokens are consecutive integers (as strings) in order of first appearance.
    new_token = str(len(job_hashes))
    job_hashes[jd_key] = new_token
    job_texts[new_token] = combine_sections(record, job_cols)

# Tokens are strings, so sort by their integer value rather than lexically.
job_tokens = sorted(job_texts, key=int)
print(f"Found {len(job_tokens)} unique jobs")

In [None]:
# Write token files
# Each file lists one token per line, in the sorted token order.
print("Writing token files...")
for token_path, tokens in (
    ('dataset/geek.token', geek_tokens),
    ('dataset/job.token', job_tokens),
):
    with open(token_path, 'w') as f:
        f.writelines(f"{token}\n" for token in tokens)

print("Done!")

In [None]:
# Process interactions
print("Processing interactions...")

def get_job_token(row, job_cols, job_hashes):
    """Return the job token registered for the job description in *row*.

    The lookup key is the hash of the ``job_cols`` values of *row*, each
    converted with ``str`` and joined by ``'|'`` (missing columns count as
    empty strings).

    Args:
        row: Mapping-like object exposing ``get(key, default)``.
        job_cols: Names of the job-description columns, in key order.
        job_hashes: Mapping from content hash to job token.

    Returns:
        The token stored in *job_hashes* for this job description.

    Raises:
        KeyError: If the content hash is not present in *job_hashes*.
    """
    parts = []
    for col in job_cols:
        parts.append(str(row.get(col, '')))
    return job_hashes[hash('|'.join(parts))]

# Training data
train_interactions = []     # every (geek, job, label) triple from train.csv
positive_interactions = []  # the triples whose label is 1

for _, record in tqdm(train_df.iterrows(), total=len(train_df)):
    geek = str(record['original_index'])
    job = get_job_token(record, job_cols, job_hashes)
    # Only "good fit" (ignoring case and surrounding whitespace) counts as
    # positive; every other label value becomes 0.
    label = int(str(record['label']).strip().lower() == 'good fit')
    triple = (geek, job, label)

    train_interactions.append(triple)
    if label:
        positive_interactions.append(triple)

print(f"Total train: {len(train_interactions)}")
print(f"Positive train: {len(positive_interactions)}")

In [None]:
# Create validation split
# A fixed-seed random 10% of the positive interactions is held out for
# validation; the remaining positives form the final training set.
# NOTE(review): only `positive_interactions` is split here. The label-0 rows
# kept in `train_interactions` are not written to any file in this notebook.
# Confirm that training on positives only is intended.
np.random.seed(42)
n_valid = int(len(positive_interactions) * 0.1)
indices = np.random.permutation(len(positive_interactions))
valid_indices = set(indices[:n_valid])

valid_interactions = [
    inter for i, inter in enumerate(positive_interactions) if i in valid_indices
]
train_final = [
    inter for i, inter in enumerate(positive_interactions) if i not in valid_indices
]

print(f"Train final: {len(train_final)}")
print(f"Validation: {len(valid_interactions)}")

In [None]:
# Process test data
# Every row of test.csv is kept, positives and negatives alike, using the
# same label rule as the training data ("good fit" -> 1, anything else -> 0).
test_interactions = []
for _, record in tqdm(test_df.iterrows(), total=len(test_df)):
    test_interactions.append((
        str(record['original_index']),
        get_job_token(record, job_cols, job_hashes),
        int(str(record['label']).strip().lower() == 'good fit'),
    ))

print(f"Test: {len(test_interactions)}")

In [None]:
# Write interaction files
def write_interactions(filepath, interactions):
    """Write interactions to *filepath* as tab-separated lines.

    Args:
        filepath: Destination path; an existing file is overwritten.
        interactions: Iterable of ``(geek, job, label)`` triples. Each triple
            becomes one ``geek<TAB>job<TAB>label`` line.
    """
    rows = (f"{geek}\t{job}\t{label}\n" for geek, job, label in interactions)
    with open(filepath, 'w') as f:
        f.writelines(rows)

# Output files and the interaction list each one is filled from. The
# train_all_add / user_add / job_add files reuse the training split
# (same as train for our case), and the geek-side and job-side validation
# and test files are identical copies of each other.
for target_path, rows in (
    ('dataset/data.train_all', train_final),
    ('dataset/data.train_all_add', train_final),
    ('dataset/data.user_add', train_final),
    ('dataset/data.job_add', train_final),
    ('dataset/data.valid_g', valid_interactions),
    ('dataset/data.valid_j', valid_interactions),
    ('dataset/data.test_g', test_interactions),
    ('dataset/data.test_j', test_interactions),
):
    write_interactions(target_path, rows)

print("Interaction files written!")
!ls -la dataset/

## 3. Generate BERT Embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel

# Run BERT on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load BERT
# `tokenizer`, `model` and `device` are notebook-level names that
# encode_texts_bert() reads directly.
print("Loading BERT model...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased').to(device)
# Evaluation mode: the model is only used to extract embeddings here.
model.eval()
print("BERT loaded!")

In [None]:
def encode_texts_bert(texts, batch_size=32, max_length=512):
    """Encode *texts* into one BERT embedding per text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of strings; must support ``len()`` and slicing.
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text, in input order.
        Each row is the last hidden state at position 0 (the [CLS] token).
        For empty *texts* the result has shape
        ``(0, model.config.hidden_size)``.
    """
    if len(texts) == 0:
        # np.vstack() raises ValueError on an empty list, so return an empty
        # matrix of the right width instead of crashing.
        # float32 is assumed to match the model's output dtype.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]

        # Pad to the longest text in the batch and cut off at max_length.
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Keep position 0 of every sequence and move it to host memory.
            embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        all_embeddings.append(embeddings)

        # Clear GPU memory
        del input_ids, attention_mask, outputs
        torch.cuda.empty_cache()

    return np.vstack(all_embeddings)

In [None]:
# Encode geeks (resumes)
print("Encoding geeks...")
# Texts are listed in geek_tokens order, so embedding row i belongs to
# geek_tokens[i].
geek_text_list = [geek_texts[t] for t in geek_tokens]
geek_embeddings = encode_texts_bert(geek_text_list, batch_size=32)

# Format: first column is ID, rest is embedding
# The integer IDs are stacked as an extra leading column of the same array.
geek_ids = np.array([int(t) for t in geek_tokens]).reshape(-1, 1)
geek_data = np.hstack([geek_ids, geek_embeddings])
np.save('dataset/geek.bert.npy', geek_data)
print(f"Saved geek.bert.npy: shape {geek_data.shape}")

In [None]:
# Encode jobs
print("Encoding jobs...")
# Texts are listed in job_tokens order, so embedding row i belongs to
# job_tokens[i].
job_text_list = [job_texts[t] for t in job_tokens]
job_embeddings = encode_texts_bert(job_text_list, batch_size=32)

# Format: first column is ID, rest is embedding
# The integer IDs are stacked as an extra leading column of the same array.
job_ids_arr = np.array([int(t) for t in job_tokens]).reshape(-1, 1)
job_data = np.hstack([job_ids_arr, job_embeddings])
np.save('dataset/job.bert.npy', job_data)
print(f"Saved job.bert.npy: shape {job_data.shape}")

In [None]:
# Verify dataset files
print("Dataset files:")
!ls -la dataset/

## 4. Update Configuration

In [None]:
# Update overall.yaml
# The whole file is replaced by the text below. The path is relative to the
# current working directory (the DPGNN checkout entered by the clone cell),
# which is also where the `dataset/` directory referenced by `dataset_path`
# was created.
overall_config = """# Device
use_gpu: True
gpu_id: 0

# Training
learner: Adam
epochs: 50
eval_step: 1
stopping_step: 10
clip_grad_norm: ~

# Evaluation
topk: [5]
valid_metric: r@5

# DataLoader
num_workers: 2
pin_memory: True

# General
checkpoint_dir: ./saved/
dataset_path: ./dataset/

loss_decimal_place: 4
metric_decimal_place: 4

# Reproducibility
seed: 42
reproducibility: True
"""

with open('prop/overall.yaml', 'w') as f:
    f.write(overall_config)
print("Updated prop/overall.yaml")

In [None]:
# Update DPGNN.yaml
# The whole file is replaced by the text below. BERT_embedding_size is 768,
# the hidden size of bert-base-uncased, which produced the saved
# *.bert.npy embeddings.
dpgnn_config = """# Model
embedding_size: 128
n_layers: 3
reg_weight: 1e-05
mutual_weight: 0.05
temperature: 0.2

ADD_BERT: True
BERT_embedding_size: 768
BERT_output_size: 32

# Training
learning_rate: 0.001

# Batch size - adjusted for T4 GPU
train_batch_size: 512
eval_batch_size: 512
"""

with open('prop/DPGNN.yaml', 'w') as f:
    f.write(dpgnn_config)
print("Updated prop/DPGNN.yaml")

## 5. Train DPGNN Model

In [None]:
# Clear GPU memory before training
import gc

# Drop the notebook-level references to the BERT objects. pop() with a
# default (rather than `del`) keeps this cell re-runnable: running it a
# second time no longer raises NameError.
for _name in ('model', 'tokenizer'):
    globals().pop(_name, None)

# Collect garbage first so tensors kept alive only by reference cycles are
# freed, then hand PyTorch's cached blocks back to the driver. Training runs
# in a separate process, which cannot use memory this kernel still caches.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("GPU memory cleared!")

In [None]:
# Run training
!python main.py

## 6. Evaluate & Get Results

In [None]:
# Check saved models
!ls -la saved/

In [None]:
# Print final results summary
# NOTE(review): this banner is printed unconditionally; nothing here checks
# that the training command actually succeeded.
print("="*60)
print("DPGNN Training Complete!")
print("="*60)
print(f"\nDataset: v5-sections")
print(f"Unique geeks: {len(geek_tokens)}")
print(f"Unique jobs: {len(job_tokens)}")
# train_final holds the positive training interactions left after the
# validation hold-out.
print(f"Training samples: {len(train_final)}")
print(f"Test samples: {len(test_interactions)}")

## Notes for Paper

When reporting DPGNN results:

```
We used the official DPGNN implementation from [1] with the following adaptations:
- Dataset converted from v5-sections format to DPGNN's expected format
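- BERT embeddings generated using bert-base-uncased ([CLS] token of the last hidden layer, inputs truncated to 512 tokens)
- Training: at most 50 epochs with early stopping (stopping_step 10), Adam optimizer, batch size 512, learning rate 0.001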
- 3 GCN layers, embedding size 128

[1] Yang et al. "Modeling Two-Way Selection Preference for Person-Job Fit" RecSys 2022
```