**(Job description texts) x (ESCO skill dictionary) similarity matrix using S-BERT embeddings & cosine similarity**

By: PodiPeti

In [11]:
from pathlib import Path

import pandas as pd
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer, util

INPUT

In [12]:
# Sentence-BERT encoder shared by the cells below
model = SentenceTransformer('all-MiniLM-L6-v2')

# Raw inputs: preprocessed job postings and the ESCO skill dictionary
dict_df = pd.read_csv('input/prep_esco_skill_dictionary.csv')
job_df = pd.read_csv('input/preprocessed_jobs_mini.csv')

# Text columns to compare; missing values become '' and every entry is a str
skill_column = dict_df['skills']
description_column = job_df['description']
skills = skill_column.fillna('').astype(str)
descriptions = description_column.fillna('').astype(str)

# Small hand-written inputs for a quick smoke test (uncomment to use)
#skills = ["Software Development", "Public Speaking", "Critical Thinking", "Project Management", "Creative Design"]
#descriptions = ["Description of software projects", "Techniques in public speaking", "Approaches to critical thinking", "Fundamentals of project management", "Elements of creative design"]


N-GRAMS

In [13]:
def create_ngrams(sentence: str, n: int) -> list:
    """Return every run of ``n`` consecutive words in ``sentence``.

    Words are the whitespace-separated tokens of ``sentence``; each n-gram is
    returned as a single string with its words joined by one space.

    Args:
        sentence: Text to split into words.
        n: Number of words per n-gram.

    Returns:
        The n-grams in order of appearance. The list is empty when ``n`` is
        smaller than 1 or when ``sentence`` has fewer than ``n`` words.
    """
    # A window of zero or negative width has no meaningful n-grams; answer
    # explicitly instead of depending on how a library treats that input.
    if n < 1:
        return []

    words = sentence.split()
    # Slide a window of width n over the words; the range is empty when the
    # sentence is shorter than n, so no partial n-grams are produced.
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

SIMILARITY MATRIX

In [14]:
# Build the (skills x job descriptions) similarity matrix.
# Each cell holds the highest cosine similarity between a skill and any n-gram
# of the description, where n is the number of words in that skill. A cell
# stays NaN when the description yields no n-grams of that size.

# Plain lists give positional access whether `skills` / `descriptions` are
# pandas Series (CSV input) or ordinary Python lists.
skill_texts = list(skills)
description_texts = list(descriptions)

# Compute embeddings for all skills at once
skills_embeddings = model.encode(skill_texts, convert_to_tensor=True)

# Float frame pre-filled with NaN; cells are written by position below, so the
# result does not depend on skill or description labels being unique.
cosine_similarity_df = pd.DataFrame(index=skills, columns=descriptions, dtype=float)

# The description n-grams depend only on the skill's word count, so group the
# skill rows by word count and encode each n-gram set once per group instead
# of once per skill.
skill_rows_by_word_count = {}
for skill_row, skill_text in enumerate(skill_texts):
    skill_rows_by_word_count.setdefault(len(skill_text.split()), []).append(skill_row)

for n, skill_rows in skill_rows_by_word_count.items():
    all_description_ngrams = []
    # (column position, start, end) of each description's n-grams inside
    # all_description_ngrams
    description_spans = []

    for description_col, description in enumerate(description_texts):
        description_ngrams = create_ngrams(description, n)

        # Descriptions without any n-gram of this size keep NaN in their column
        if not description_ngrams:
            continue

        start = len(all_description_ngrams)
        all_description_ngrams.extend(description_ngrams)
        description_spans.append((description_col, start, len(all_description_ngrams)))

    # Nothing to compare against for this word count; do not encode an empty list
    if not all_description_ngrams:
        continue

    # Encode all n-grams of this size at once
    all_description_ngrams_embeddings = model.encode(all_description_ngrams, convert_to_tensor=True)
    group_skill_embeddings = skills_embeddings[skill_rows]

    for description_col, start, end in description_spans:
        description_embeddings = all_description_ngrams_embeddings[start:end]

        # Rows: skills of this word count; columns: n-grams of this description
        cosine_scores = util.pytorch_cos_sim(group_skill_embeddings, description_embeddings)
        best_scores = cosine_scores.max(dim=1).values.cpu().tolist()

        cosine_similarity_df.iloc[skill_rows, description_col] = best_scores



OUTPUT

In [15]:
# Save the similarity matrix as an Excel workbook.
output_path = Path("figures/data/similarity_matrix_cosine_sbert.xlsx")
# Create the target folder if it is missing so a finished similarity run is
# not lost to a missing output directory.
output_path.parent.mkdir(parents=True, exist_ok=True)
cosine_similarity_df.to_excel(str(output_path))