In [3]:
import json


def get_schema(data, indent=0):
    """Print an indented outline of the keys and value types inside ``data``.

    Dictionaries are listed one ``key: type`` line per entry, and each value
    is then described one level deeper. A non-empty list is summarised by the
    type of its first element only; the remaining elements are not inspected.
    Empty lists and all other values produce no output of their own.

    Args:
        data: The object to describe (typically parsed JSON).
        indent: Number of leading spaces for lines printed at this level;
            each nested level adds four more.
    """
    pad = ' ' * indent
    deeper = indent + 4

    if isinstance(data, dict):
        for name, child in data.items():
            print(f"{pad}{name}: {type(child).__name__}")
            get_schema(child, deeper)
        return

    # Anything that is not a non-empty list has nothing further to report.
    if not (isinstance(data, list) and data):
        return

    sample = data[0]
    print(f"{pad}- list of {type(sample).__name__}")
    get_schema(sample, deeper)


# Parse the cleaned O*NET IT job data from disk
with open('../OnetData/abbr_cleaned_IT_data_from_onet.json', mode='r', encoding='utf-8') as file:
    json_data = json.loads(file.read())

# Walk the parsed structure and print its key/type outline under a heading
print("JSON Schema:")
get_schema(json_data)

JSON Schema:
- list of dict
    code: str
    tasks: list
        - list of dict
            statement: str
            category: str
            score: dict
                scale: str
                important: bool
                value: int
    technology_skills: list
        - list of dict
            skill_title: str
            technologies: list
                - list of dict
                    name: str
                    demand_percentage: str
                    in_demand: str
                    hot_tech_percentage: str
                    hot_tech: str
                    hot_tech_in_demand: str
    tools_used: list
        - list of dict
            tool_title: str
    knowledge: list
        - list of dict
            name: str
            description: str
            score: dict
                scale: str
                important: bool
                value: int
    skills: list
        - list of dict
            name: str
            description: str
            scor

In [5]:
import json
import numpy as np
from tqdm import tqdm

# Load processed concepts and embeddings
model_name = "sentence-transformers/msmarco-distilbert-base-v4"
# The artifact file names embed the model id with "/" replaced by "_".
artifact_suffix = model_name.replace('/', '_')

# np.load on an .npz archive returns an object that keeps the file open;
# the context manager closes it once both arrays have been read.
with np.load(f"onet_concept_embeddings_{artifact_suffix}.npz") as data:
    main_emb = data['main']
    abbr_emb = data['abbr']

with open(f"processed_onet_concepts_{artifact_suffix}.json", "r", encoding="utf-8") as f:
    processed_concepts = json.load(f)

# Map each concept name to the position of its FIRST occurrence so that every
# term lookup below is a dict access instead of two linear scans of the list.
# Row i of main_emb / abbr_emb is used as the embedding of processed_concepts[i].
concept_index = {}
for concept_pos, known_concept in enumerate(processed_concepts):
    concept_index.setdefault(known_concept['name'], concept_pos)

# Load job data
with open("../OnetData/abbr_cleaned_IT_data_from_onet.json", "r", encoding="utf-8") as f:
    jobs = json.load(f)

# Precompute job embeddings
job_embeddings = []
job_titles = []

for job in tqdm(jobs):
    # (term name, weight) pairs: every skill title plus each of its technologies.
    terms = []

    for tech_skill in job.get("technology_skills", []):
        skill_title = tech_skill.get("skill_title", "")
        technologies = tech_skill.get("technologies", [])

        # Parse each technology's demand once; it is reused for the average
        # and for the per-technology weight.
        demands = [float(t.get("demand_percentage", 0)) for t in technologies]

        # The skill title is weighted by the mean demand of its technologies
        # (0.0 when the skill lists no technologies).
        avg_demand = np.mean(demands) if demands else 0.0
        terms.append((skill_title, avg_demand))

        # Add each technology with its own demand as weight
        for tech, tech_demand in zip(technologies, demands):
            terms.append((tech.get("name", ""), tech_demand))

    # Aggregate term embeddings into a demand-weighted mean vector
    job_vec = np.zeros_like(main_emb[0])
    total_weight = 0.0

    for term_name, weight in terms:
        idx = concept_index.get(term_name)
        if idx is None:
            # Terms without a processed concept contribute nothing.
            continue

        concept = processed_concepts[idx]
        m_emb = main_emb[idx]
        a_emb = abbr_emb[idx]

        # Use average of main and abbr if available
        if concept['abbr']:
            term_vec = (m_emb + a_emb) / 2
        else:
            term_vec = m_emb

        job_vec += term_vec * weight
        total_weight += weight

    # A job whose terms all miss or all carry zero weight keeps an all-zero vector.
    if total_weight > 0:
        job_vec /= total_weight

    job_embeddings.append(job_vec)
    # NOTE(review): jobs lacking a "title" key are stored with an empty title —
    # confirm the job records actually carry this key.
    job_titles.append(job.get("title", ""))

# Save precomputed data
np.save("job_embeddings.npy", np.array(job_embeddings))
with open("job_titles.json", "w", encoding="utf-8") as f:
    json.dump(job_titles, f)

100%|██████████| 38/38 [00:00<00:00, 116.65it/s]


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Restore the job titles and the matching embedding matrix saved to disk
with open("job_titles.json", mode="r") as titles_file:
    job_titles = json.loads(titles_file.read())
job_embeddings = np.load("job_embeddings.npy")

def recommend_jobs(filtered_candidates, top_n=5):
    """Rank jobs by cosine similarity to a score-weighted user embedding.

    Reads the module-level ``processed_concepts``, ``main_emb``, ``abbr_emb``,
    ``job_embeddings`` and ``job_titles``.

    Args:
        filtered_candidates: Iterable of indexable records; element ``[0]`` is
            the concept name and element ``[3]`` is the numeric score used as
            that concept's weight. Records whose name has no entry in
            ``processed_concepts`` are skipped.
        top_n: Maximum number of jobs to return. A value of zero or less
            yields an empty list.

    Returns:
        A list of ``(job_title, similarity)`` tuples ordered from most to
        least similar, holding at most ``top_n`` entries.

    Note:
        When no candidate matches (or the scores sum to zero or less) the
        user vector is left un-normalised; with no matches it is all zeros
        and the similarities are computed against that zero vector, so the
        returned ranking carries no real signal.
    """
    # Without this guard ``[-0:]`` would select every job instead of none,
    # and a negative value would slice an arbitrary tail of the ranking.
    if top_n <= 0:
        return []

    # Name -> position of the first concept with that name, so each candidate
    # is resolved with one dict lookup instead of two scans of the list.
    name_to_idx = {}
    for pos, known_concept in enumerate(processed_concepts):
        name_to_idx.setdefault(known_concept['name'], pos)

    # Generate user embedding
    user_vec = np.zeros_like(job_embeddings[0])
    total_score = 0.0

    for candidate in filtered_candidates:
        concept_name = candidate[0]
        score = candidate[3]

        idx = name_to_idx.get(concept_name)
        if idx is None:
            continue

        concept = processed_concepts[idx]
        m_emb = main_emb[idx]
        a_emb = abbr_emb[idx]

        # Blend the main and abbreviation embeddings when an abbreviation exists.
        if concept['abbr']:
            term_vec = (m_emb + a_emb) / 2
        else:
            term_vec = m_emb

        user_vec += term_vec * score
        total_score += score

    if total_score > 0:
        user_vec /= total_score

    # Compute similarities
    sims = cosine_similarity([user_vec], job_embeddings)[0]
    # argsort is ascending: keep the last top_n positions, then reverse to best-first.
    top_indices = np.argsort(sims)[-top_n:][::-1]

    return [(job_titles[i], sims[i]) for i in top_indices]

