In [1]:
import pandas as pd
import numpy as np

# Load the dataset

df = pd.read_csv("Data\cleaned_df.csv")

# Normalize and tokenize the skills
df['skills'] = df['skills'].apply(lambda x: [skill.lower().strip() for skill in x.split()])

def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe embeddings
glove_embeddings = load_glove_embeddings('Data/glove.6B.100d.txt')

def get_skill_embedding(skill, embeddings_index, embedding_dim=100):
    return embeddings_index.get(skill, np.zeros(embedding_dim))

# Create embeddings for each skill
df['skill_embeddings'] = df['skills'].apply(lambda skills: 
    [get_skill_embedding(skill, glove_embeddings) for skill in skills])

def aggregate_embeddings(skill_embeddings):
    if len(skill_embeddings) == 0:
        return np.zeros(len(skill_embeddings[0]))
    return np.mean(skill_embeddings, axis=0)

# Aggregate embeddings for each job posting
df['aggregated_skill_embeddings'] = df['skill_embeddings'].apply(aggregate_embeddings)

# Convert the aggregated skill embeddings into a matrix
embeddings_matrix = np.vstack(df['aggregated_skill_embeddings'].values)

# Display the shape of the embeddings matrix
print(embeddings_matrix.shape)

# Display the dataset with the new embeddings column
print(df[['Qualifications', 'skills', 'aggregated_skill_embeddings']])


(696012, 100)
       Qualifications                                             skills  \
0              B.Tech  [construction, project, management, building, ...   
1                  BA  [procurement, processes, purchase, order, mana...   
2              B.Tech  [proficiency, in, html,, css,, and, javascript...   
3              B.Tech  [marketing, analytics, data, analysis, data, v...   
4                 MBA  [accounting, principles, financial, reporting,...   
...               ...                                                ...   
696007          M.Com  [pediatric, specialization, advanced, pediatri...   
696008            MBA  [data, analysis, database, querying, and, repo...   
696009            BCA  [user-centered, design, principles, ux/ui, des...   
696010            BCA  [content, creation, (e.g.,, writing,, editing,...   
696011          B.Com  [content, creation, (e.g.,, writing,, editing,...   

                              aggregated_skill_embeddings  
0       [0.10

In [9]:
df["aggregated_skill_embeddings"].shape

(696012,)

In [10]:
df["aggregated_skill_embeddings"].head()

0    [0.10629287, -0.038679436, -0.05077727, -0.049...
1    [-0.058340356, -0.019800913, 0.16269246, 0.023...
2    [-0.16762114868119912, 0.1901041136847602, 0.1...
3    [-0.17018999, -0.13516001, -0.026077209, -0.06...
4    [-0.015080009, -0.110652536, 0.260391, -0.3110...
Name: aggregated_skill_embeddings, dtype: object

In [11]:
def aggregate_embeddings(skill_embeddings):
    if len(skill_embeddings) == 0:
        return np.zeros(len(skill_embeddings[0]))
    return np.mean(skill_embeddings, axis=0)

# Aggregate embeddings for each job posting
df['aggregated_skill_embeddings'] = df['aggregated_skill_embeddings'].apply(aggregate_embeddings)

In [12]:
df['aggregated_skill_embeddings'].head()

0   -0.011961
1   -0.015266
2   -0.004422
3   -0.026369
4   -0.078670
Name: aggregated_skill_embeddings, dtype: float64