In [2]:
import re
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Load and extract O*NET concepts as before
with open("../OnetData/abbr_cleaned_IT_data_from_onet.json", "r", encoding="utf-8") as f:
    onet_data = json.load(f)

onet_skill_titles = set()
onet_tech_names = set()

for job in onet_data:
    for tech_skill in job.get("technology_skills", []):
        if "skill_title" in tech_skill:
            onet_skill_titles.add(tech_skill["skill_title"])
        for tech_item in tech_skill.get("technologies", []):
            onet_tech_names.add(tech_item["name"])

# Combine into a list of dictionaries
onet_concepts = (
    [{"name": title, "type": "skill_title"} for title in onet_skill_titles] +
    [{"name": tech, "type": "technology_name"} for tech in onet_tech_names]
)

# Process each concept to separate the main text and the abbreviation
processed_concepts = []
for concept in onet_concepts:
    full_text = concept["name"]
    # Get the main part (everything before the first parenthesis)
    main_text = re.sub(r'\s*\(.*', '', full_text).strip()
    # Extract abbreviation if available
    abbr_match = re.search(r'\((.*?)\)', full_text)
    abbr_text = abbr_match.group(1).strip() if abbr_match else ""
    processed_concepts.append({
        "name": full_text,
        "type": concept["type"],
        "main": main_text,
        "abbr": abbr_text
    })

# Initialize the model
model_name = "sentence-transformers/msmarco-distilbert-base-v4"
model = SentenceTransformer(model_name)

# Create lists for the main texts and abbreviation texts
main_texts = [item["main"] for item in processed_concepts]
abbr_texts = [item["abbr"] for item in processed_concepts]

# Generate embeddings for both parts
main_embeddings = model.encode(main_texts, convert_to_numpy=True)
abbr_embeddings = model.encode(abbr_texts, convert_to_numpy=True)

# (Optional) Save dual embeddings along with the processed concepts for later use.
np.savez(f"onet_concept_embeddings_{model_name.replace('/', '_')}.npz",
         main=main_embeddings, abbr=abbr_embeddings)
with open(f"processed_onet_concepts_{model_name.replace('/', '_')}.json", "w", encoding="utf-8") as f:
    json.dump(processed_concepts, f, indent=4)

# (Optional) Build a similarity graph using the main embeddings
G = nx.Graph()
for concept in processed_concepts:
    G.add_node(concept["name"], category=concept["type"])

similarity_matrix = cosine_similarity(main_embeddings)
SIMILARITY_THRESHOLD = 0.7  # For graph creation only
for i in range(len(main_texts)):
    for j in range(i + 1, len(main_texts)):
        if similarity_matrix[i][j] >= SIMILARITY_THRESHOLD:
            G.add_edge(main_texts[i], main_texts[j],
                       weight=similarity_matrix[i][j])

print(f"Graph contains {len(G.nodes)} nodes and {len(G.edges)} edges.")

  from .autonotebook import tqdm as notebook_tqdm


Graph contains 1114 nodes and 450 edges.


In [None]:
import string
import re
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import json

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define stop words and custom filter words.
stop_words = set(stopwords.words('english'))
custom_filter_words = {'additionally', 'also', 'furthermore',
                       'moreover', 'including', 'like', 'career', 'etc'}


def clean_text(text):
    # Lowercase and remove punctuation.
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    return " ".join([token for token in tokens if token not in stop_words])


def is_meaningful(phrase):
    tokens = [t.lower() for t in word_tokenize(phrase) if t.isalpha()]
    if not tokens:
        return False
    if any(token in custom_filter_words for token in tokens):
        return False
    if len(tokens) == 1 and tokens[0] in stop_words:
        return False
    if sum(1 for t in tokens if t in stop_words)/len(tokens) > 0.5:
        return False
    return True


# ---------------------------
# Step 1: Process Resume Text
# ---------------------------
# long_text = """
# I have extensive experience in data analysis and have worked with a variety of technologies including Microsoft SQL Server,
# Python, cloud computing platforms like AWS, and I am proficient with machine learning techniques. My background also includes
# developing user interfaces with modern tools. Additionally, I have hands-on experience with business intelligence and
# graphical user interface design.
# """
# long_text = """Throughout my career, I have developed expertise in backend development using Python and Node.js. I have built robust REST APIs and worked with various databases including MySQL and MongoDB. My experience also extends to cloud services like AWS and Azure,
# enabling me to deploy scalable applications.
# """
# long_text = """In my career as a full-stack developer, I have worked extensively with JavaScript, React, and Node.js to create responsive web applications. I have also integrated MySQL and PostgreSQL databases into several projects, while leveraging cloud platforms like AWS and Google Cloud to deploy and manage applications.
# """
# long_text = """I have worked as a data engineer and have extensive experience with Python and Apache Spark to process large datasets. My work also involved using cloud-based storage solutions like Amazon S3 and Google BigQuery, as well as using Docker for containerization and Kubernetes for orchestration.
# """
# long_text = """As a DevOps engineer, I have automated CI/CD pipelines using Jenkins and GitLab CI. I am proficient in cloud services such as AWS and Azure, where I have deployed applications and managed resources like EC2 instances and databases. I also have experience with containerization tools such as Docker and Kubernetes.
# """
# long_text = """My experience in cybersecurity includes working with various firewalls, encryption algorithms, and intrusion detection systems (IDS). I am proficient with tools like Wireshark, Metasploit, and AWS Security Hub, and I have experience using Kubernetes for securing containerized applications.
# """
# long_text = """I specialize in machine learning, with extensive experience using Python libraries such as TensorFlow and scikit-learn. I have built various models for predictive analytics, and my cloud experience includes working with AWS SageMaker for training models and deploying them into production.
# """
# long_text = """As a system administrator, I have managed Linux and Windows servers, focusing on performance optimization and security. I have experience with AWS EC2 instances, configuring firewalls, and using Ansible for configuration management and automation.
# """
# long_text = """I have worked as a mobile app developer with expertise in building cross-platform apps using Flutter and React Native. I am experienced in integrating REST APIs and managing cloud databases like Firebase and MongoDB for seamless app functionality.
# """
# long_text = """In my role as a cloud architect, I have designed scalable cloud infrastructures using AWS, Azure, and Google Cloud Platform. I have also implemented Infrastructure as Code (IaC) using tools like Terraform and AWS CloudFormation to automate deployments and manage resources.
# """
# long_text = """With a background in UI/UX design, I have used tools like Figma and Adobe XD to design user-centric interfaces. I also have experience in front-end development using JavaScript, HTML, and CSS, along with frameworks like Angular and Vue.js for creating dynamic and interactive web applications.
# """
# long_text = """As a network engineer, I have designed and implemented large-scale network solutions using Cisco devices. I am proficient in configuring routers, switches, and firewalls, and have experience with network monitoring tools like SolarWinds and Wireshark.
# """
long_text = """I’m a data scientist with expertise in Python, R, and SQL (Structured Query Language) for statistical modeling and data analysis. Proficient in machine learning frameworks like TensorFlow, PyTorch, and scikit-learn, and experienced with big data tools such as Apache Spark (Spark) and Hadoop.

Skilled in data visualization using Tableau, Power BI, and Matplotlib, and familiar with cloud platforms like AWS SageMaker, Google Cloud AI Platform, and Microsoft Azure Machine Learning.

Strong background in A/B testing, natural language processing (NLP), and deep learning. Comfortable with version control tools like Git and CI/CD pipelines.

Additional experience with relational databases (e.g., PostgreSQL, MySQL), NoSQL databases (e.g., MongoDB), and BI tools like Looker. Familiar with Docker for containerization and Kubernetes for orchestration.

Soft skills include stakeholder communication, cross-functional collaboration, and agile project management.
"""

cleaned_full_text = clean_text(long_text)
tokens_clean = word_tokenize(cleaned_full_text)

# ---------------------------
# Step 2: Generate Candidate Phrases using n‑grams
# ---------------------------
candidate_phrases = []
for n in [3, 2, 1]:
    for gram in ngrams(tokens_clean, n):
        phrase = " ".join(gram)
        if phrase.strip() and is_meaningful(phrase):
            candidate_phrases.append(phrase)
candidate_phrases = list(set(candidate_phrases))  # Remove duplicates

# ---------------------------
# Step 3: Improved Matching Against Concepts
# ---------------------------
# Load the pre-generated dual embeddings and processed concepts.
model_name = "sentence-transformers/msmarco-distilbert-base-v4"
data = np.load(f"onet_concept_embeddings_{model_name.replace('/', '_')}.npz")
main_embeddings = data['main']
abbr_embeddings = data['abbr']

with open(f"processed_onet_concepts_{model_name.replace('/', '_')}.json", "r", encoding="utf-8") as f:
    processed_concepts = json.load(f)

# Initialize model (must be the same as used for generating embeddings)
model = SentenceTransformer(model_name)

THRESHOLD_NGRAM = 0.5

# Compute embeddings for candidate phrases
candidate_embeddings = model.encode(candidate_phrases, convert_to_numpy=True)

# For each candidate phrase, compute similarity with both main and abbreviation embeddings.
recognized_candidates_ngram = []
for i, cand_emb in enumerate(candidate_embeddings):
    # Compute similarity vectors for main and abbreviation parts.
    sim_main = cosine_similarity([cand_emb], main_embeddings)[0]
    sim_abbr = cosine_similarity([cand_emb], abbr_embeddings)[0]
    # Choose the higher similarity per concept.
    best_scores = np.maximum(sim_main, sim_abbr)
    best_idx = best_scores.argmax()
    best_score = best_scores[best_idx]

    if best_score >= THRESHOLD_NGRAM:
        concept = processed_concepts[best_idx]
        # Determine which part (main or abbr) produced the highest score.
        source = "main" if sim_main[best_idx] >= sim_abbr[best_idx] else "abbr"
        phrase = candidate_phrases[i]
        n_val = len(phrase.split())
        tokens_phrase = phrase.split()
        recognized_candidates_ngram.append(
            (concept["name"], concept["type"], phrase,
             best_score, n_val, tokens_phrase, source)
        )

# ---------------------------
# Step 4: Global Filtering of Overlapping N‑grams
# ---------------------------
FILTER_SIMILARITY_THRESHOLD = 0.85
recognized_candidates_ngram = sorted(
    recognized_candidates_ngram, key=lambda x: x[3], reverse=True)
global_used_words = set()
filtered_candidates = []
for candidate in recognized_candidates_ngram:
    concept_name, concept_type, phrase, score, n_val, tokens_phrase, source = candidate
    if any(token in global_used_words for token in tokens_phrase):
        continue
    filtered_candidates.append(candidate)
    if score > FILTER_SIMILARITY_THRESHOLD:
        global_used_words.update(tokens_phrase)

# ---------------------------
# Step 5: Group and Print the Results
# ---------------------------
filtered_by_concept = {}
for concept_name, concept_type, phrase, score, n_val, tokens_phrase, source in filtered_candidates:
    filtered_by_concept.setdefault(
        concept_name, {"type": concept_type, "phrases": []})
    filtered_by_concept[concept_name]["phrases"].append(
        (phrase, score, n_val, tokens_phrase, source))

print("\nGlobally Filtered Recognized Concepts using n‑gram detection:")
print("=" * 60)
for concept, info in filtered_by_concept.items():
    concept_type = info["type"]
    print(f"Concept: {concept} ({concept_type})")
    for phrase, score, n_val, tokens_phrase, source in sorted(info["phrases"], key=lambda x: x[1], reverse=True):
        print(
            f"    Detected {n_val}-gram: '{phrase}' with similarity {score:.2f} (matched with {source} text)")
    print("-" * 60)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zinou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zinou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Globally Filtered Recognized Concepts using n‑gram detection:
Concept: Microsoft Azure software (Azure) (technology_name)
    Detected 1-gram: 'azure' with similarity 1.00 (matched with abbr text)
------------------------------------------------------------
Concept: NoSQL (technology_name)
    Detected 1-gram: 'nosql' with similarity 1.00 (matched with main text)
------------------------------------------------------------
Concept: Apache (technology_name)
    Detected 1-gram: 'apache' with similarity 1.00 (matched with main text)
------------------------------------------------------------
Concept: Python (technology_name)
    Detected 1-gram: 'python' with similarity 1.00 (matched with main text)
------------------------------------------------------------
Concept: Apache Hadoop (Hadoop) (technology_name)
    Detected 1-gram: 'hadoop' with similarity 1.00 (matched with abbr text)
------------------------------------------------------------
Concept: TensorFlow (TF) (technology_name)


In [4]:
# ---------------------------
# Modified Job Embedding Precomputation (without demand_percentage)
# ---------------------------
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
from tqdm import tqdm

# Load processed concepts and embeddings
model_name = "sentence-transformers/msmarco-distilbert-base-v4"
data = np.load(f"onet_concept_embeddings_{model_name.replace('/', '_')}.npz")
main_emb = data['main']
abbr_emb = data['abbr']

with open(f"processed_onet_concepts_{model_name.replace('/', '_')}.json", "r", encoding="utf-8") as f:
    processed_concepts = json.load(f)

# Load job data
with open("../OnetData/abbr_cleaned_IT_data_from_onet.json", "r", encoding="utf-8") as f:
    jobs = json.load(f)

# Precompute job embeddings (equal weighting)
print("\n[DEBUG] Precomputing job embeddings...")
job_embeddings = []
job_titles = []

for job in tqdm(jobs):
    terms = []

    # Collect all skill titles and technology names
    for tech_skill in job.get("technology_skills", []):
        if skill_title := tech_skill.get("skill_title", ""):
            terms.append(skill_title)

        for tech in tech_skill.get("technologies", []):
            if tech_name := tech.get("name", ""):
                terms.append(tech_name)

    # Aggregate term embeddings with equal weight
    job_vec = np.zeros_like(main_emb[0])
    total_terms = 0

    for term_name in terms:
        concept = next(
            (c for c in processed_concepts if c['name'] == term_name), None)
        if not concept:
            continue

        idx = processed_concepts.index(concept)
        m_emb = main_emb[idx]
        a_emb = abbr_emb[idx]

        # Use average of main and abbr if available
        term_vec = (m_emb + a_emb)/2 if concept['abbr'] else m_emb

        job_vec += term_vec
        total_terms += 1

    if total_terms > 0:
        job_vec /= total_terms
        print(f"[DEBUG] Job '{job.get('title', '')}' has {total_terms} terms")

    job_embeddings.append(job_vec)
    job_titles.append(job.get("title", ""))

# Save precomputed data
np.save("job_embeddings.npy", np.array(job_embeddings))
with open("job_titles.json", "w") as f:
    json.dump(job_titles, f)

# ---------------------------
# Modified Recommendation Function
# ---------------------------


def recommend_jobs(filtered_candidates, top_n=5):
    print("\n[DEBUG] Starting recommendation process...")

    # Group concepts by name and keep highest score
    concept_scores = {}
    for candidate in filtered_candidates:
        name = candidate[0]
        score = candidate[3]
        c_type = candidate[1]
        if name not in concept_scores or score > concept_scores[name]['score']:
            concept_scores[name] = {'score': score, 'type': c_type}

    print("[DEBUG] Unique concepts with max scores:")
    for name, data in concept_scores.items():
        print(f"  - {name} ({data['type']}): {data['score']:.2f}")

    # Create weighted user embedding
    user_vec = np.zeros_like(job_embeddings[0])
    total_weight = 0.0
    # Weight tech higher than skills
    type_weights = {'technology_name': 1.0, 'skill_title': 0.7}

    for name, data in concept_scores.items():
        concept = next(
            (c for c in processed_concepts if c['name'] == name), None)
        if not concept:
            continue

        # Get type-based weight
        weight = data['score'] * type_weights.get(data['type'], 0.5)

        # Get embeddings
        idx = processed_concepts.index(concept)
        m_emb = main_emb[idx]
        a_emb = abbr_emb[idx]
        term_vec = (m_emb + a_emb)/2 if concept['abbr'] else m_emb

        # Apply non-linear scoring (emphasize >0.8 matches)
        boosted_weight = weight * np.tanh(weight * 3)  # Boost higher scores
        contribution = term_vec * boosted_weight

        user_vec += contribution
        total_weight += boosted_weight

        print(f"[DEBUG] {name[:30]:<30} | Type: {data['type']:16} | "
              f"Raw: {data['score']:.2f} | Boosted: {boosted_weight:.2f} | "
              f"Vec Norm: {np.linalg.norm(contribution):.2f}")

    if total_weight > 0:
        user_vec /= total_weight
        print(
            f"\n[DEBUG] Final user vector norm: {np.linalg.norm(user_vec):.2f}")
    else:
        print("[WARNING] No valid concepts found - using zero vector")

    # Calculate similarities
    sims = cosine_similarity([user_vec], job_embeddings)[0]
    top_indices = np.argsort(sims)[-top_n:][::-1]

    print("\n[DEBUG] Top matches:")
    for idx in top_indices:
        print(f"  {sims[idx]:.4f} - {job_titles[idx]}")

    return [(job_titles[i], sims[i]) for i in top_indices]


[DEBUG] Precomputing job embeddings...


  0%|          | 0/38 [00:00<?, ?it/s]

[DEBUG] Job 'Actuaries' has 66 terms
[DEBUG] Job 'Bioinformatics Technicians' has 78 terms
[DEBUG] Job 'Biostatisticians' has 71 terms
[DEBUG] Job 'Blockchain Engineers' has 77 terms
[DEBUG] Job 'Business Intelligence Analysts' has 251 terms
[DEBUG] Job 'Clinical Data Managers' has 70 terms
[DEBUG] Job 'Computer and Information Research Scientists' has 161 terms
[DEBUG] Job 'Computer Network Architects' has 322 terms
[DEBUG] Job 'Computer Network Support Specialists' has 171 terms
[DEBUG] Job 'Computer Programmers' has 265 terms
[DEBUG] Job 'Computer Systems Analysts' has 332 terms
[DEBUG] Job 'Computer Systems Engineers/Architects' has 317 terms
[DEBUG] Job 'Computer User Support Specialists' has 351 terms


 37%|███▋      | 14/38 [00:00<00:00, 138.16it/s]

[DEBUG] Job 'Data Scientists' has 110 terms
[DEBUG] Job 'Data Warehousing Specialists' has 180 terms
[DEBUG] Job 'Database Administrators' has 301 terms
[DEBUG] Job 'Database Architects' has 297 terms
[DEBUG] Job 'Digital Forensics Analysts' has 119 terms
[DEBUG] Job 'Document Management Specialists' has 134 terms
[DEBUG] Job 'Geographic Information Systems Technologists and Technicians' has 158 terms
[DEBUG] Job 'Health Informatics Specialists' has 82 terms
[DEBUG] Job 'Information Security Analysts' has 282 terms
[DEBUG] Job 'Information Security Engineers' has 148 terms
[DEBUG] Job 'Information Technology Project Managers' has 309 terms
[DEBUG] Job 'Mathematicians' has 81 terms
[DEBUG] Job 'Network and Computer Systems Administrators' has 310 terms


100%|██████████| 38/38 [00:00<00:00, 141.25it/s]

[DEBUG] Job 'Operations Research Analysts' has 156 terms
[DEBUG] Job 'Penetration Testers' has 104 terms
[DEBUG] Job 'Software Developers' has 362 terms
[DEBUG] Job 'Software Quality Assurance Analysts and Testers' has 358 terms
[DEBUG] Job 'Statisticians' has 73 terms
[DEBUG] Job 'Telecommunications Engineering Specialists' has 106 terms
[DEBUG] Job 'Video Game Designers' has 91 terms
[DEBUG] Job 'Web Administrators' has 176 terms
[DEBUG] Job 'Web and Digital Interface Designers' has 273 terms
[DEBUG] Job 'Web Developers' has 283 terms





In [5]:
# Example usage with filtered_candidates from previous processing
top_jobs = recommend_jobs(filtered_candidates)
print("Top 5 Recommended Jobs:")
for job, score in top_jobs:
    print(f"- {job} (Score: {score:.4f})")


[DEBUG] Starting recommendation process...
[DEBUG] Unique concepts with max scores:
  - Microsoft Azure software (Azure) (technology_name): 1.00
  - NoSQL (technology_name): 1.00
  - Apache (technology_name): 1.00
  - Python (technology_name): 1.00
  - Apache Hadoop (Hadoop) (technology_name): 1.00
  - TensorFlow (TF) (technology_name): 1.00
  - Amazon Web Services SageMaker (SageMaker) (technology_name): 1.00
  - MongoDB (technology_name): 1.00
  - PyTorch (technology_name): 1.00
  - Kubernetes (K8s) (technology_name): 1.00
  - MySQL (technology_name): 1.00
  - Amazon Web Services software (AWS) (technology_name): 1.00
  - Docker (technology_name): 1.00
  - PostgreSQL (Postgres) (technology_name): 1.00
  - IBM Power Systems software (Power) (technology_name): 1.00
  - Tableau (technology_name): 1.00
  - Structured query language (SQL) (technology_name): 1.00
  - R (technology_name): 1.00
  - Git (technology_name): 1.00
  - Microsoft (technology_name): 1.00
  - Version control softwar