# Create Training Dataset for Embedding Fine-tuning

Code authored by: Shaw Talebi

Video link (coming soon!) <br>
Blog link (coming soon!) <br>
Dataset (coming soon!)

### imports

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import json
from datasets import DatasetDict, Dataset
import re
import numpy as np
from sentence_transformers import SentenceTransformer

from functions import clean_html, remove_irrelevant_sections, extract_qualifications_from_html, remove_eoe_notes

### load data

In [2]:
# read the raw job postings and discard exact duplicate rows
df_jobs = pd.read_csv("data/job_data.csv").drop_duplicates()

# cleaning pipeline, applied in this order; each helper comes from the local
# `functions` module and receives the output of the previous one
cleaning_steps = (
    clean_html,                        # remove HTML tags
    remove_irrelevant_sections,        # keep only text relevant to job qualifications
    extract_qualifications_from_html,
    remove_eoe_notes,
)

cleaned = df_jobs['job_description']
for step in cleaning_steps:
    cleaned = cleaned.apply(step)
df_jobs['job_description_cleaned'] = cleaned

# plain Python list of the cleaned job descriptions (same order as df_jobs)
job_description_list = df_jobs['job_description_cleaned'].to_list()

In [3]:
# extract synthetic queries and store in list
# Each non-empty line of the JSONL file is one JSON record; the query text is
# read from response -> body -> choices[0] -> message -> content.
file_path = 'data/output.jsonl'
query_list = []

# Decode explicitly as UTF-8 so non-ASCII characters in the queries are read
# the same way on every platform (the default encoding is locale-dependent).
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        record = json.loads(line)
        # drop every double-quote character from the generated query
        query = record['response']['body']['choices'][0]['message']['content'].replace('"', '')
        query_list.append(query)

### create positive pairs dataset

In [4]:
# build a DataFrame pairing each query with a job description by position
# (first query with first JD, and so on)
pair_columns = {
    "query": query_list,
    "job_description_pos": job_description_list,
}
df = pd.DataFrame(pair_columns)

In [5]:
# remove repeated rows in two passes, reporting the shape after each one:
# first rows whose job description repeats, then rows whose query repeats
print("Original shape:", df.shape)
for label, column in (("Unique JDs:", 'job_description_pos'), ("Unique queries:", 'query')):
    df = df.drop_duplicates(subset=[column])
    print(label, df.shape)

Original shape: (1087, 2)
Unique JDs: (828, 2)
Unique queries: (826, 2)


### create negative pairs

In [6]:
# Load the sentence-embedding model used to compare job descriptions
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [7]:
%%time
# Encode all job descriptions
job_embeddings = model.encode(df['job_description_pos'].to_list())
print(job_embeddings.shape)

(826, 768)
CPU times: user 30.4 s, sys: 10.1 s, total: 40.4 s
Wall time: 54.7 s


In [8]:
# compute similarities between every pair of job-description embeddings
# (the same embeddings are passed as both arguments, so the result is a
# square matrix with one row and one column per job description)
similarities = model.similarity(job_embeddings, job_embeddings)
print(similarities.shape)

torch.Size([826, 826])


In [9]:
# For each JD, use the least similar *other* JD as its negative match,
# trying not to use the same JD as a negative more than once.
# negative_pair_index_list[i] is the positional index of the negative for row i.
similarities_argsorted = np.argsort(similarities.numpy(), axis=1)  # ascending similarity per row
negative_pair_index_list = []
used_negative_indices = set()  # same contents as the list, but O(1) membership tests

for i in range(len(similarities)):

    # Candidates for row i, from least to most similar, never row i itself:
    # a JD must not be its own negative.
    row_order = similarities_argsorted[i]
    candidates = row_order[row_order != i]
    if len(candidates) == 0:
        raise ValueError("need at least two job descriptions to form a negative pair")

    # Least similar candidate that has not been used as a negative yet
    index = next((int(c) for c in candidates if int(c) not in used_negative_indices), None)

    # Every other JD is already taken (can only happen on the last row):
    # reuse the least similar one rather than pairing the JD with itself.
    if index is None:
        index = int(candidates[0])

    negative_pair_index_list.append(index)
    used_negative_indices.add(index)

In [10]:
# look up each row's negative JD by position, then attach it as a new column;
# `.values` drops the index so the assignment is positional, not label-aligned
negative_descriptions = df['job_description_pos'].iloc[negative_pair_index_list].values
df['job_description_neg'] = negative_descriptions

In [11]:
# preview the first rows: query, positive JD, and negative JD columns
df.head()

Unnamed: 0,query,job_description_pos,job_description_neg
0,Staff Data Scientist specialized in generative...,"experience) in Operations Research, Statistics...","At Broadridge, we've built a culture where the..."
1,"Compliance Testing, Generative AI, Prompt Engi...",skills to translate the complexity of your wor...,"experience, and/or performance. Base pay is ju..."
2,"federal AI strategy consulting, natural langua...",skills and strategic ideas to improve mission ...,qualifications of the individual and do not di...
3,"generative AI techniques, data visualization t...",experienced in applying advanced statistical m...,skills and deserves to experience an epic win....
4,"Senior Data Scientist, statistical analysis, b...",ExperienceProduct Allowance so you can kick ba...,"requirements, specifications, and constraints,..."


### train-validation-test split

In [12]:
# Shuffle all rows with a fixed seed, then renumber the index from 0
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Target proportions (e.g., 80% train, 10% validation, 10% test)
train_frac = 0.8
valid_frac = 0.1
test_frac = 0.1

# Row counts for train and validation (fractions are truncated to whole rows);
# the test set is not sized explicitly, it takes every remaining row
n_rows = len(df)
train_size = int(train_frac * n_rows)
valid_size = int(valid_frac * n_rows)
valid_end = train_size + valid_size

# Cut the shuffled frame into three consecutive positional slices
df_train = df.iloc[:train_size]
df_valid = df.iloc[train_size:valid_end]
df_test = df.iloc[valid_end:]

### upload to hugging face hub

In [14]:
# Convert each pandas split into a Hugging Face Dataset
train_ds, valid_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (df_train, df_valid, df_test)
)

# Bundle the three splits under their split names
splits = {
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds,
}
dataset_dict = DatasetDict(splits)

In [15]:
# display the splits with their column names and row counts
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 660
    })
    validation: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 82
    })
    test: Dataset({
        features: ['query', 'job_description_pos', 'job_description_neg'],
        num_rows: 84
    })
})

In [16]:
# push data to hub: uploads every split in dataset_dict to the
# Hugging Face Hub dataset repo named below
dataset_dict.push_to_hub("shawhin/ai-job-embedding-finetuning")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/shawhin/ai-job-embedding-finetuning/commit/e5013221c56a6cf5bf8a65a46dbed8520dbcfb7d', commit_message='Upload dataset', commit_description='', oid='e5013221c56a6cf5bf8a65a46dbed8520dbcfb7d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/shawhin/ai-job-embedding-finetuning', endpoint='https://huggingface.co', repo_type='dataset', repo_id='shawhin/ai-job-embedding-finetuning'), pr_revision=None, pr_num=None)