In [31]:
!pip install sentence-transformers




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
import os

import pandas as pd
import sentence_transformers
from sentence_transformers import SentenceTransformer, util

In [33]:
# # Code to use if on colab

# from google.colab import drive
# drive.mount('/content/drive')

# # Define your input path. Change as per your exact path
# input_path = '/content/drive/My Drive/HelpMate AI Codes/Your Path Here/'

In [34]:
# Code for a local run

# Location of the input files; './' is the current working directory.
# Change it to match where your data lives.

input_path = './'

In [35]:
# Read the job postings csv

# os.path.join inserts the separator itself, so input_path works with or without a trailing slash
job_postings = pd.read_csv(os.path.join(input_path, 'job_postings.csv'))

In [36]:
# Show the loaded job postings as the cell output
job_postings

Unnamed: 0,title,description,location,work_type
0,Licensed Insurance Agent,While many industries were hurt by the last fe...,"Chico, CA",FULL_TIME
1,Sales Manager,Are you a dynamic and creative marketing profe...,"Santa Clarita, CA",FULL_TIME
2,Model Risk Auditor,Join Us as a Model Risk Auditor – Showcase You...,"New York, NY",CONTRACT
3,Business Manager,Business ManagerFirst Baptist Church ForneyFor...,"Forney, TX",FULL_TIME
4,NY Studio Assistant,YOU COULD BE ONE OF THE MAGIC MAKERS\nKen Fulk...,"New York, NY",FULL_TIME
...,...,...,...,...
495,Electrical Engineer - P&C Design,Job Description:Role: Electrical Engineer - P&...,United States,FULL_TIME
496,Editor,Role : Editor Remote Description: The Spotify ...,"New York, NY",CONTRACT
497,Spiritual Care Coordinator,"Coordinates spiritual, emotional and psycholog...","Fremont, OH",FULL_TIME
498,Supply Chain Intern,Learn and assist with all activities associate...,"Fort Wayne, IN",INTERNSHIP


In [37]:
# Load the MiniLM embedding model

# Model identifier handed to SentenceTransformer on the next line
MiniLM_model_name = "all-MiniLM-L6-v2"
# Embedder object that later cells pass to generate_embeddings_on_df and semantic_search
MiniLM_model = SentenceTransformer(MiniLM_model_name)

In [38]:
# Load the QA embedding model

# Model identifier handed to SentenceTransformer on the next line
QA_model_name = "multi-qa-MiniLM-L6-cos-v1"
# Embedder object that later cells pass to generate_embeddings_on_df and semantic_search
QA_model = SentenceTransformer(QA_model_name)

### Generate the embeddings using both models

In [39]:
# Function to generate embeddings for text
def generate_embeddings(embedder, texts):
    """Encode ``texts`` with ``embedder``, requesting tensor output.

    Args:
        embedder: Object exposing ``encode(texts, convert_to_tensor=...)``.
        texts: Text input forwarded unchanged to ``embedder.encode``.

    Returns:
        Whatever ``embedder.encode`` produces when called with
        ``convert_to_tensor=True``.
    """
    return embedder.encode(texts, convert_to_tensor=True)

def generate_embeddings_on_df(embedder, embedder_name, df):
    """Embed every value of ``df['description']`` and store the vectors in ``df``.

    All descriptions are passed to the embedder in one call instead of one
    call per row, so the model can batch its forward passes.

    Args:
        embedder: Model object accepted by ``generate_embeddings``.
        embedder_name: Label used to build the target column name
            ``Embeddings_<embedder_name>``.
        df: DataFrame with a ``description`` column. It is modified in place.

    Returns:
        None. The embeddings are written to ``df[f'Embeddings_{embedder_name}']``,
        one embedding per row.
    """
    column = f'Embeddings_{embedder_name}'

    # Nothing to encode: still create the (empty) column, without calling the model.
    if df.empty:
        df[column] = pd.Series(index=df.index, dtype=object)
        return

    embeddings = generate_embeddings(embedder, df['description'].tolist())

    # Go through Series.apply so every cell holds one row vector as a single
    # object, the same way the per-row version stored its results.
    row_positions = pd.Series(range(len(df)), index=df.index)
    df[column] = row_positions.apply(lambda position: embeddings[position])

In [40]:
# Generate the embeddings for all rows in the description column using the MiniLM embedding model.
# The call adds an 'Embeddings_MiniLM_model' column to job_postings in place and returns nothing.
generate_embeddings_on_df(MiniLM_model, "MiniLM_model",  job_postings)

In [41]:
# Generate the embeddings for all rows in the description column using the QA embedding model.
# The call adds an 'Embeddings_QA_model' column to job_postings in place and returns nothing.
generate_embeddings_on_df(QA_model, "QA_model",  job_postings)

### Read the two queries

* query_1: I am looking for a data engineer job that offers competitive pay
* query_2: I want to work in UI design in the gaming industry

In [42]:
# First search query, given as fixed text
query1 = "I am looking for a data engineer job that offers competitive pay"
# Disabled alternative (presumably for typing the query interactively): input()

In [43]:
# Second search query, given as fixed text
query2 = "I want to work in UI design in the gaming industry"
# Disabled alternative (presumably for typing the query interactively): input()

### Generate Query Embeddings and Calculate Similarities - for both queries against both models

In [44]:
# Define the function for calculating cosine similarity

def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    Args:
        embedding1: First embedding.
        embedding2: Second embedding.

    Returns:
        The cosine score computed by ``util.pytorch_cos_sim``, unwrapped
        from its tensor with ``.item()``.
    """
    # .item() turns the score tensor into a plain Python number
    return util.pytorch_cos_sim(embedding1, embedding2).item()


In [45]:


# Function to perform semantic search and return ranked chunks
def semantic_search(user_query, df, embedder, embedder_name, top_k=3, drop_embeddings=False):
    """Rank the rows of ``df`` by cosine similarity to ``user_query``.

    The scores are computed on a copy: ``df`` itself is left untouched, so
    repeated searches with different queries or models do not overwrite each
    other's ``Similarity`` values in the shared frame.

    Args:
        user_query: Query text to embed with ``embedder``.
        df: DataFrame that contains the column ``Embeddings_<embedder_name>``.
        embedder: Object whose ``encode(text, convert_to_tensor=True)`` embeds
            the query. It should be the model that produced the stored
            embeddings.
        embedder_name: Label identifying which embeddings column to search.
        top_k: Number of best matches to return. Defaults to 3.
        drop_embeddings: When True, every column whose name starts with
            ``Embeddings_`` is removed from the result for a cleaner view.
            Defaults to False, which keeps all columns.

    Returns:
        A new DataFrame with a ``Similarity`` column, sorted by it in
        descending order, re-indexed from 0 and cut to the first ``top_k`` rows.

    Raises:
        KeyError: If ``df`` has no ``Embeddings_<embedder_name>`` column.
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    embedding_column = f'Embeddings_{embedder_name}'
    if embedding_column not in df.columns:
        raise KeyError(
            f"Column '{embedding_column}' not found; generate the embeddings "
            f"for '{embedder_name}' before searching"
        )

    # Calculate the query embedding
    query_embedding = embedder.encode(user_query, convert_to_tensor=True)

    # Score every stored embedding against the query
    similarity = df[embedding_column].apply(
        lambda stored: calculate_similarity(query_embedding, stored)
    )

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    ranked = (
        df.assign(Similarity=similarity)
        .sort_values(by='Similarity', ascending=False)
        .reset_index(drop=True)
    )

    if drop_embeddings:
        embedding_columns = [
            name for name in ranked.columns if str(name).startswith('Embeddings_')
        ]
        ranked = ranked.drop(columns=embedding_columns)

    return ranked.head(top_k)

In [46]:
# Calculate the query embedding for both queries using both models
# Calculate similarity scores between the query embedding and all job description embeddings for both queries using both models
# Extract the top 3 results for both queries against both the embedding models
#top_3_MiniLM_query1
#top_3_QA_query1
#top_3_MiniLM_query2
#top_3_QA_query2

In [47]:
# Keep the result under the name reserved for it in the placeholder comments, then display it
top_3_MiniLM_query1 = semantic_search(query1, job_postings, MiniLM_model, "MiniLM_model")
top_3_MiniLM_query1


Unnamed: 0,title,description,location,work_type,Embeddings_MiniLM_model,Embeddings_QA_model,Similarity
0,Data Engineer,"Job Description:\n• Design, develop, and launc...",United States,CONTRACT,"[tensor(-0.0304, device='cuda:0'), tensor(0.03...","[tensor(-0.0350, device='cuda:0'), tensor(0.04...",0.52974
1,Azure Data Engineer,Need Genuine candidates who has solid experien...,United States,CONTRACT,"[tensor(-0.0148, device='cuda:0'), tensor(-0.0...","[tensor(0.0092, device='cuda:0'), tensor(-0.05...",0.478422
2,Senior Data Engineer,Combine your technical expertise and problem-s...,"Virginia, United States",FULL_TIME,"[tensor(-0.0201, device='cuda:0'), tensor(0.00...","[tensor(-0.0178, device='cuda:0'), tensor(0.00...",0.449332


In [48]:
# Keep the result under the name reserved for it in the placeholder comments, then display it
top_3_QA_query1 = semantic_search(query1, job_postings, QA_model, "QA_model")
top_3_QA_query1

Unnamed: 0,title,description,location,work_type,Embeddings_MiniLM_model,Embeddings_QA_model,Similarity
0,Data Engineer,"Job Description:\n• Design, develop, and launc...",United States,CONTRACT,"[tensor(-0.0304, device='cuda:0'), tensor(0.03...","[tensor(-0.0350, device='cuda:0'), tensor(0.04...",0.543093
1,Data Scientist/ Product Analyst,Looking for candidates with 4+ years’ experien...,"San Francisco, CA",CONTRACT,"[tensor(0.0009, device='cuda:0'), tensor(-0.05...","[tensor(0.0243, device='cuda:0'), tensor(0.012...",0.462681
2,Assistant to Sales Director,Please send resume with salary requirements an...,"Chicago, IL",FULL_TIME,"[tensor(-0.1007, device='cuda:0'), tensor(-0.0...","[tensor(-0.0475, device='cuda:0'), tensor(-0.0...",0.416805


In [49]:
# Keep the result under the name reserved for it in the placeholder comments, then display it
top_3_MiniLM_query2 = semantic_search(query2, job_postings, MiniLM_model, "MiniLM_model")
top_3_MiniLM_query2

Unnamed: 0,title,description,location,work_type,Embeddings_MiniLM_model,Embeddings_QA_model,Similarity
0,UX Designer/Architect-Plano TX and Columbus OH...,Our client is looking for UX Designer.\nLocati...,"Plano, TX",CONTRACT,"[tensor(0.0440, device='cuda:0'), tensor(0.014...","[tensor(0.0026, device='cuda:0'), tensor(0.020...",0.472885
1,Software Engineer- Graphics Designer,Job Description\nExperience with C/C++ program...,United States,FULL_TIME,"[tensor(-0.1053, device='cuda:0'), tensor(0.01...","[tensor(-0.0668, device='cuda:0'), tensor(-0.0...",0.41272
2,NY Studio Assistant,YOU COULD BE ONE OF THE MAGIC MAKERS\nKen Fulk...,"New York, NY",FULL_TIME,"[tensor(0.0196, device='cuda:0'), tensor(-0.13...","[tensor(0.0156, device='cuda:0'), tensor(-0.06...",0.398065


In [50]:
# Keep the result under the name reserved for it in the placeholder comments, then display it
top_3_QA_query2 = semantic_search(query2, job_postings, QA_model, "QA_model")
top_3_QA_query2

Unnamed: 0,title,description,location,work_type,Embeddings_MiniLM_model,Embeddings_QA_model,Similarity
0,Lead Game Designer,Position: Lead Game DesignerLocation: Remote W...,United States,FULL_TIME,"[tensor(0.0055, device='cuda:0'), tensor(-0.00...","[tensor(-0.0034, device='cuda:0'), tensor(0.02...",0.532436
1,Head of Visual Design Operating Systems,STEALTH GAMING OPERATING SYSTEM. \nWe are a ve...,United States,FULL_TIME,"[tensor(-0.0058, device='cuda:0'), tensor(-0.0...","[tensor(-0.0337, device='cuda:0'), tensor(-0.0...",0.519795
2,Sr. AV Designer,"ABOUT THE POSITION\nAs a Senior AV Designer, y...","West Valley City, UT",FULL_TIME,"[tensor(-0.0371, device='cuda:0'), tensor(0.06...","[tensor(0.0251, device='cuda:0'), tensor(0.061...",0.432891
