In [39]:
from tqdm import tqdm


In [1]:
import spacy


In [2]:
from sentence_transformers import SentenceTransformer, util


In [4]:
nlp = spacy.load("model-best")

In [5]:
model = SentenceTransformer("all-mpnet-base-v2")

In [6]:
category_labels = [
    "Backend Development",
    "Frontend Development",
    "Machine Learning",
    "Soft Skills",
    "Tools & Platforms",
    "Certifications"
]

category_descriptions = [
    "Technologies for server-side programming like Node.js, MongoDB, Express.js, SQL.",
    "Technologies for client-side web development like React, HTML, CSS, and JavaScript.",
    "AI libraries and frameworks such as TensorFlow, PyTorch, Scikit-Learn.",
    "Non-technical skills including communication, leadership, and teamwork.",
    "Tools for development and deployment like Docker, Kubernetes, and JIRA.",
    "Professional certifications like AWS Certified Solutions Architect, PMP, Scrum Master."
]

In [7]:
category_embeddings = model.encode(category_descriptions, convert_to_tensor=True)


In [8]:
def hybrid_categorize_skill(text, threshold=0.20):
    """Use NER first, then Sentence Transformers for skill categorization."""
    # Step 1: Apply the NER model
    doc = nlp(text)
    ner_extracted_skills = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]

    # If NER detects skills, use them for classification
    if ner_extracted_skills:
        print(f"NER Identified Skills: {ner_extracted_skills}")

        # Step 2: Use Sentence Transformer for classification
        skill_embeddings = model.encode(ner_extracted_skills, convert_to_tensor=True)
        similarity_scores = util.pytorch_cos_sim(skill_embeddings, category_embeddings)

        # Generate category scores for each skill
        categorized_skills = {}
        for i, skill in enumerate(ner_extracted_skills):
            scores = similarity_scores[i]
            categorized_skills[skill] = [
                (category_labels[idx], score.item())
                for idx, score in enumerate(scores) if score.item() >= threshold
            ]

        return categorized_skills
    else:
        return {"No Skill Identified": []}




In [9]:
text = "We need a developer skilled in Node.js, React, and TensorFlow."
results = hybrid_categorize_skill(text)
print(results)

NER Identified Skills: ['Node.js', 'React', 'TensorFlow']
{'Node.js': [('Backend Development', 0.5805199146270752), ('Frontend Development', 0.40711545944213867), ('Machine Learning', 0.22635707259178162), ('Tools & Platforms', 0.29266947507858276)], 'React': [('Backend Development', 0.2110331505537033), ('Frontend Development', 0.3913450539112091), ('Machine Learning', 0.20248757302761078)], 'TensorFlow': [('Machine Learning', 0.5825697779655457)]}


In [10]:
from sentence_transformers import InputExample


In [11]:
train_examples = [
    InputExample(texts=["Node.js", "Backend Development"], label=1.0),
    InputExample(texts=["React", "Frontend Development"], label=1.0),
    InputExample(texts=["TensorFlow", "Machine Learning"], label=1.0),
    InputExample(texts=["Node.js", "Machine Learning"], label=0.2),
    InputExample(texts=["PyTorch", "Machine Learning"], label=1.0),
    InputExample(texts=["JavaScript", "Frontend Development"], label=1.0),
    InputExample(texts=["Python", "Backend Development"], label=1.0),
    InputExample(texts=["Django", "Backend Development"], label=1.0),
    InputExample(texts=["Keras", "Machine Learning"], label=1.0),
]


In [12]:
from torch.utils.data import DataLoader
from sentence_transformers import losses

# Prepare the dataloader and loss function
train_dataloader = DataLoader(train_examples, batch_size=8, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model)


# Fine-tune with more epochs and reduced warmup steps
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,                    # Increased epochs for better training
    warmup_steps=10,             # Reduced warmup steps for smaller datasets
    show_progress_bar=True
)


# Save the fine-tuned model
model.save("fine_tuned_sentence_transformer")


Step,Training Loss


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [13]:
fine_tuned_model = SentenceTransformer("fine_tuned_sentence_transformer")
test_pairs = [
    ("Node.js", "Backend Development"),  # Strong match
    ("React", "Frontend Development"),   # Strong match
    ("TensorFlow", "Machine Learning"),  # Strong match
    ("Node.js", "Machine Learning")      # Weaker match
]

In [14]:
def compare_models(test_pairs):
    print("\nOriginal Model vs Fine-Tuned Model Comparison:\n")
    for pair in test_pairs:
        # Generate embeddings for both terms in the pair
        original_embeddings = model.encode(pair)
        fine_tuned_embeddings = fine_tuned_model.encode(pair)

        # Calculate cosine similarity
        original_score = util.cos_sim(original_embeddings[0], original_embeddings[1]).item()
        fine_tuned_score = util.cos_sim(fine_tuned_embeddings[0], fine_tuned_embeddings[1]).item()
        
        # Display results
        print(f"Skill Pair: {pair}")
        print(f"Original Model Similarity: {original_score:.4f}")
        print(f"Fine-Tuned Model Similarity: {fine_tuned_score:.4f}\n")

# Call the corrected comparison function
compare_models(test_pairs)


Original Model vs Fine-Tuned Model Comparison:

Skill Pair: ('Node.js', 'Backend Development')
Original Model Similarity: 0.7845
Fine-Tuned Model Similarity: 0.7845

Skill Pair: ('React', 'Frontend Development')
Original Model Similarity: 0.7154
Fine-Tuned Model Similarity: 0.7154

Skill Pair: ('TensorFlow', 'Machine Learning')
Original Model Similarity: 0.8874
Fine-Tuned Model Similarity: 0.8874

Skill Pair: ('Node.js', 'Machine Learning')
Original Model Similarity: 0.5171
Fine-Tuned Model Similarity: 0.5171



In [16]:
text = "We need a developer skilled in Node.js, React, and TensorFlow."
results = hybrid_categorize_skill(text)
print(results)

NER Identified Skills: ['Node.js', 'React', 'TensorFlow']
{'Node.js': [('Backend Development', 0.7007930278778076), ('Frontend Development', 0.5783318281173706), ('Machine Learning', 0.4337933361530304), ('Tools & Platforms', 0.4844152629375458), ('Certifications', 0.2694885730743408)], 'React': [('Backend Development', 0.4450684189796448), ('Frontend Development', 0.525682270526886), ('Machine Learning', 0.38867056369781494), ('Tools & Platforms', 0.35801786184310913)], 'TensorFlow': [('Backend Development', 0.24374143779277802), ('Frontend Development', 0.23437459766864777), ('Machine Learning', 0.6306157112121582), ('Tools & Platforms', 0.2568182945251465)]}


In [51]:
def categorize_skill_multi(skill_name, threshold=0.20):
    """Categorize a skill into multiple categories with scores."""
    skill_embedding = model.encode(skill_name, convert_to_tensor=True)
    similarity_scores = util.pytorch_cos_sim(skill_embedding, category_embeddings)[0]

    # Return all categories with scores above the threshold
    category_scores = [
        (category_labels[idx], score.item())
        for idx, score in enumerate(similarity_scores)  # Corrected to use indexing instead of keys()
        if score.item() >= threshold
    ]

    # Sort categories by score in descending order
    category_scores.sort(key=lambda x: x[1], reverse=True)
    return category_scores

In [53]:
# Test with the same skill list
skills_to_test = ["Node.js", "React", "TensorFlow", "Leadership", "Docker"]

# Categorize each skill into multiple categories
for skill in skills_to_test:
    categories = categorize_skill_multi(skill)
    print(f"{skill} -> {categories}")


Node.js -> [('Backend Development', 0.5805199146270752), ('Frontend Development', 0.40711551904678345), ('Tools & Platforms', 0.29266950488090515), ('Machine Learning', 0.22635704278945923)]
React -> [('Frontend Development', 0.39134514331817627), ('Backend Development', 0.21103322505950928), ('Machine Learning', 0.20248764753341675)]
TensorFlow -> [('Machine Learning', 0.5825698971748352)]
Leadership -> [('Soft Skills', 0.34387636184692383)]
Docker -> [('Tools & Platforms', 0.5509099364280701), ('Machine Learning', 0.22351060807704926), ('Backend Development', 0.2058756947517395)]
