In [4]:
import re
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Load and extract O*NET concepts as before
with open("abbr_cleaned_IT_data_from_onet.json", "r", encoding="utf-8") as f:
    onet_data = json.load(f)

onet_skill_titles = set()
onet_tech_names = set()

for job in onet_data:
    for tech_skill in job.get("technology_skills", []):
        if "skill_title" in tech_skill:
            onet_skill_titles.add(tech_skill["skill_title"])
        for tech_item in tech_skill.get("technologies", []):
            onet_tech_names.add(tech_item["name"])

# Combine into a list of dictionaries
onet_concepts = (
    [{"name": title, "type": "skill_title"} for title in onet_skill_titles] +
    [{"name": tech, "type": "technology_name"} for tech in onet_tech_names]
)

# Process each concept to separate the main text and the abbreviation
processed_concepts = []
for concept in onet_concepts:
    full_text = concept["name"]
    # Get the main part (everything before the first parenthesis)
    main_text = re.sub(r'\s*\(.*', '', full_text).strip()
    # Extract abbreviation if available
    abbr_match = re.search(r'\((.*?)\)', full_text)
    abbr_text = abbr_match.group(1).strip() if abbr_match else ""
    processed_concepts.append({
        "name": full_text,
        "type": concept["type"],
        "main": main_text,
        "abbr": abbr_text
    })

# Initialize the model
model_name = "sentence-transformers/msmarco-distilbert-base-v4"
model = SentenceTransformer(model_name)

# Create lists for the main texts and abbreviation texts
main_texts = [item["main"] for item in processed_concepts]
abbr_texts = [item["abbr"] for item in processed_concepts]

# Generate embeddings for both parts
main_embeddings = model.encode(main_texts, convert_to_numpy=True)
abbr_embeddings = model.encode(abbr_texts, convert_to_numpy=True)

# (Optional) Save dual embeddings along with the processed concepts for later use.
np.savez(f"onet_concept_embeddings_{model_name.replace('/', '_')}.npz",
         main=main_embeddings, abbr=abbr_embeddings)
with open(f"processed_onet_concepts_{model_name.replace('/', '_')}.json", "w", encoding="utf-8") as f:
    json.dump(processed_concepts, f, indent=4)

# (Optional) Build a similarity graph using the main embeddings
G = nx.Graph()
for concept in processed_concepts:
    G.add_node(concept["name"], category=concept["type"])

similarity_matrix = cosine_similarity(main_embeddings)
SIMILARITY_THRESHOLD = 0.7  # For graph creation only
for i in range(len(main_texts)):
    for j in range(i + 1, len(main_texts)):
        if similarity_matrix[i][j] >= SIMILARITY_THRESHOLD:
            G.add_edge(main_texts[i], main_texts[j],
                       weight=similarity_matrix[i][j])

print(f"Graph contains {len(G.nodes)} nodes and {len(G.edges)} edges.")

Graph contains 1114 nodes and 450 edges.


In [7]:
import string
import re
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import json

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define stop words and custom filter words.
stop_words = set(stopwords.words('english'))
custom_filter_words = {'additionally', 'also', 'furthermore',
                       'moreover', 'including', 'like', 'career', 'etc'}


def clean_text(text):
    # Lowercase and remove punctuation.
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    return " ".join([token for token in tokens if token not in stop_words])


def is_meaningful(phrase):
    tokens = [t.lower() for t in word_tokenize(phrase) if t.isalpha()]
    if not tokens:
        return False
    if any(token in custom_filter_words for token in tokens):
        return False
    if len(tokens) == 1 and tokens[0] in stop_words:
        return False
    if sum(1 for t in tokens if t in stop_words)/len(tokens) > 0.5:
        return False
    return True


# ---------------------------
# Step 1: Process Resume Text
# ---------------------------
# long_text = """
# I have extensive experience in data analysis and have worked with a variety of technologies including Microsoft SQL Server,
# Python, cloud computing platforms like AWS, and I am proficient with machine learning techniques. My background also includes
# developing user interfaces with modern tools. Additionally, I have hands-on experience with business intelligence and
# graphical user interface design.
# """
long_text = """Throughout my career, I have developed expertise in backend development using Python and Node.js. I have built robust REST APIs and worked with various databases including MySQL and MongoDB. My experience also extends to cloud services like AWS and Azure,
enabling me to deploy scalable applications.
"""
cleaned_full_text = clean_text(long_text)
tokens_clean = word_tokenize(cleaned_full_text)

# ---------------------------
# Step 2: Generate Candidate Phrases using n‑grams
# ---------------------------
candidate_phrases = []
for n in [3, 2, 1]:
    for gram in ngrams(tokens_clean, n):
        phrase = " ".join(gram)
        if phrase.strip() and is_meaningful(phrase):
            candidate_phrases.append(phrase)
candidate_phrases = list(set(candidate_phrases))  # Remove duplicates

# ---------------------------
# Step 3: Improved Matching Against Concepts
# ---------------------------
# Load the pre-generated dual embeddings and processed concepts.
model_name = "sentence-transformers/msmarco-distilbert-base-v4"
data = np.load(f"onet_concept_embeddings_{model_name.replace('/', '_')}.npz")
main_embeddings = data['main']
abbr_embeddings = data['abbr']

with open(f"processed_onet_concepts_{model_name.replace('/', '_')}.json", "r", encoding="utf-8") as f:
    processed_concepts = json.load(f)

# Initialize model (must be the same as used for generating embeddings)
model = SentenceTransformer(model_name)

THRESHOLD_NGRAM = 0.5

# Compute embeddings for candidate phrases
candidate_embeddings = model.encode(candidate_phrases, convert_to_numpy=True)

# For each candidate phrase, compute similarity with both main and abbreviation embeddings.
recognized_candidates_ngram = []
for i, cand_emb in enumerate(candidate_embeddings):
    # Compute similarity vectors for main and abbreviation parts.
    sim_main = cosine_similarity([cand_emb], main_embeddings)[0]
    sim_abbr = cosine_similarity([cand_emb], abbr_embeddings)[0]
    # Choose the higher similarity per concept.
    best_scores = np.maximum(sim_main, sim_abbr)
    best_idx = best_scores.argmax()
    best_score = best_scores[best_idx]

    if best_score >= THRESHOLD_NGRAM:
        concept = processed_concepts[best_idx]
        # Determine which part (main or abbr) produced the highest score.
        source = "main" if sim_main[best_idx] >= sim_abbr[best_idx] else "abbr"
        phrase = candidate_phrases[i]
        n_val = len(phrase.split())
        tokens_phrase = phrase.split()
        recognized_candidates_ngram.append(
            (concept["name"], concept["type"], phrase,
             best_score, n_val, tokens_phrase, source)
        )

# ---------------------------
# Step 4: Global Filtering of Overlapping N‑grams
# ---------------------------
FILTER_SIMILARITY_THRESHOLD = 0.85
recognized_candidates_ngram = sorted(
    recognized_candidates_ngram, key=lambda x: x[3], reverse=True)
global_used_words = set()
filtered_candidates = []
for candidate in recognized_candidates_ngram:
    concept_name, concept_type, phrase, score, n_val, tokens_phrase, source = candidate
    if any(token in global_used_words for token in tokens_phrase):
        continue
    filtered_candidates.append(candidate)
    if score > FILTER_SIMILARITY_THRESHOLD:
        global_used_words.update(tokens_phrase)

# ---------------------------
# Step 5: Group and Print the Results
# ---------------------------
filtered_by_concept = {}
for concept_name, concept_type, phrase, score, n_val, tokens_phrase, source in filtered_candidates:
    filtered_by_concept.setdefault(
        concept_name, {"type": concept_type, "phrases": []})
    filtered_by_concept[concept_name]["phrases"].append(
        (phrase, score, n_val, tokens_phrase, source))

print("\nGlobally Filtered Recognized Concepts using n‑gram detection:")
print("=" * 60)
for concept, info in filtered_by_concept.items():
    concept_type = info["type"]
    print(f"Concept: {concept} ({concept_type})")
    for phrase, score, n_val, tokens_phrase, source in sorted(info["phrases"], key=lambda x: x[1], reverse=True):
        print(
            f"    Detected {n_val}-gram: '{phrase}' with similarity {score:.2f} (matched with {source} text)")
    print("-" * 60)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zinou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zinou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Globally Filtered Recognized Concepts using n‑gram detection:
Concept: Microsoft Azure software (Azure) (technology_name)
    Detected 1-gram: 'azure' with similarity 1.00 (matched with abbr text)
------------------------------------------------------------
Concept: MySQL (technology_name)
    Detected 1-gram: 'mysql' with similarity 1.00 (matched with main text)
------------------------------------------------------------
Concept: MongoDB (technology_name)
    Detected 1-gram: 'mongodb' with similarity 1.00 (matched with main text)
------------------------------------------------------------
Concept: Python (technology_name)
    Detected 1-gram: 'python' with similarity 1.00 (matched with main text)
------------------------------------------------------------
Concept: Amazon Web Services software (AWS) (technology_name)
    Detected 1-gram: 'aws' with similarity 1.00 (matched with abbr text)
------------------------------------------------------------
Concept: RESTful (REST API) (tec