## Import

In [73]:
import numpy as np
import pandas as pd
import re
import string # for text cleaning
import contractions # for expanding short form words
from tqdm import tqdm
tqdm.pandas(desc="Progress Bar")
import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
import openai

## Loading JD Data

In [4]:
jd_data = load_dataset('jacob-hugging-face/job-descriptions', split="train")
jd_data

Dataset({
    features: ['company_name', 'job_description', 'position_title', 'description_length', 'model_response'],
    num_rows: 853
})

In [88]:
jd_df = pd.DataFrame(jd_data)
jd_df.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."


In [89]:
print(jd_df['job_description'][0])

minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organizations more product

## Loading Extracted CV Data

In [93]:
# Load the cleaned data
df = pd.read_csv('skills_education.csv')
df.head()

Unnamed: 0,skills,education,ID,Category
0,"Cold, Excel","WFX, State\n \nManage, Staffing Coordinator\n ...",10228751,BUSINESS-DEVELOPMENT
1,Associate of Arts\n \n: \nGolf Management\n \n...,"LLC, Professional Golf Management\n \nCity\n \...",10235211,BUSINESS-DEVELOPMENT
2,"The Over Achiever of the Year, Fortune 500, Ou...","State\n \nFinancial Operations\nExecutive, Sal...",10289113,BUSINESS-DEVELOPMENT
3,Ph.D.,"Birst, Business Management or Sales, Mailchimp...",10501991,BUSINESS-DEVELOPMENT
4,"Fortune 500, Excel","Human Resources, Darton State College, Communi...",10541358,BUSINESS-DEVELOPMENT


In [94]:
# Handle missing or invalid values
df['education'] = df['education'].fillna('')  # Replace NaN with empty string
df['education'] = df['education'].astype(str)
df['skills'] = df['skills'].fillna('')  # Replace NaN with empty string
df['skills'] = df['skills'].astype(str)

In [95]:
def text_cleaning(text:str) -> str:
    if pd.isnull(text):
        return
    
    # lower-case everything
    text = text.lower().strip()
    
    # For removing puctuations
    translator = str.maketrans('', '', string.punctuation)
    
    # expand all the short-form words
    text = contractions.fix(text)
    
    # remove any special chars
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'\S+@\S+', '', text) # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text) # Remove phone numbers
    text = text.translate(translator) # Remove puctuations
    text = re.sub(r'[^a-zA-Z]', ' ', text) # Remove other non-alphanumeric characters
    
    return text.strip()

In [96]:
# Combine columns to create resume text
df['resume_text'] = df['skills'] + " " + df['education']

# Display the result
print(df[['ID', 'resume_text']])

            ID                                        resume_text
0     10228751  Cold, Excel WFX, State\n \nManage, Staffing Co...
1     10235211  Associate of Arts\n \n: \nGolf Management\n \n...
2     10289113  The Over Achiever of the Year, Fortune 500, Ou...
3     10501991  Ph.D. Birst, Business Management or Sales, Mai...
4     10541358  Fortune 500, Excel Human Resources, Darton Sta...
...        ...                                                ...
1010  85973397   the Regional Hospital Administrator, 04/2013\...
1011  88651471  MBA, Excel PM-Group, Federal Acquisition Regul...
1012  91197243  Leadership, Marketing, Excel Public Relations,...
1013  93828034  Excel, Renae's Story" Writer, Aegis MS Office ...
1014  98086373  Merchandising, Games, The Color Run, The Color...

[1015 rows x 2 columns]


In [97]:
# Sample job descriptions
job_descriptions = jd_df['job_description'].apply(text_cleaning)[:15].to_list() 

# Sample resumes (replace with your extracted resume data)
resumes = df['resume_text'].to_list()

## Creating Embedding using `DistilBertTokenizer`, `DistilBertModel`

In [86]:
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


In [87]:
# Tokenize and embed job descriptions and resumes
job_description_embeddings = []
for description in job_descriptions:
    tokens = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    job_description_embeddings.append(embeddings[0])

resume_embeddings = []
for resume in resumes:
    tokens = tokenizer(resume, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    resume_embeddings.append(embeddings[0])

In [54]:
job_description_embeddings[0].shape, resume_embeddings[0].shape

((768,), (768,))

In [98]:
len(job_description_embeddings), len(resume_embeddings)

(15, 1015)

## Calculating Similarity Score & Getting Top 5 Candidates

In [99]:
# Calculate cosine similarity between job descriptions and resumes
similarity_scores = cosine_similarity(job_description_embeddings, resume_embeddings)
similarity_scores

array([[0.78613627, 0.77106947, 0.82659954, ..., 0.8323555 , 0.82050574,
        0.842985  ],
       [0.7720886 , 0.7642378 , 0.81697   , ..., 0.8034971 , 0.8105255 ,
        0.829932  ],
       [0.77575755, 0.7825777 , 0.8229194 , ..., 0.824862  , 0.8066385 ,
        0.83687353],
       ...,
       [0.7785107 , 0.7789704 , 0.8168187 , ..., 0.82830167, 0.81880105,
        0.8517502 ],
       [0.8679829 , 0.7852365 , 0.812458  , ..., 0.8164421 , 0.83618987,
        0.83839583],
       [0.7818338 , 0.79047847, 0.8251783 , ..., 0.83040214, 0.8203684 ,
        0.85156286]], dtype=float32)

In [102]:
# Rank candidates for each job description based on similarity scores
num_top_candidates = 5
top_candidates = []

for i, job_description in enumerate(job_descriptions):
    candidates_with_scores = list(enumerate(similarity_scores[i]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    top_candidates.append(top_candidates_for_job)

# Print the top candidates for each job description
for i, job_description in enumerate(job_descriptions):
    print(f"Top candidates for JD {i+1} - Postition: {jd_df['position_title'][i]}")
    for candidate_index, score in top_candidates[i]:
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f} - {df['Category'][candidate_index]}/{df['ID'][candidate_index]}.pdf")
    print()

Top candidates for JD 1 - Postition: Sales Specialist
  Candidate 327 - Similarity Score: 0.8856 - DESIGNER/44185767.pdf
  Candidate 58 - Similarity Score: 0.8766 - BUSINESS-DEVELOPMENT/22765255.pdf
  Candidate 212 - Similarity Score: 0.8761 - CONSULTANT/51724595.pdf
  Candidate 861 - Similarity Score: 0.8754 - INFORMATION-TECHNOLOGY/28035460.pdf
  Candidate 177 - Similarity Score: 0.8745 - CONSULTANT/26919036.pdf

Top candidates for JD 2 - Postition: Apple Solutions Consultant
  Candidate 76 - Similarity Score: 0.8492 - BUSINESS-DEVELOPMENT/27850777.pdf
  Candidate 972 - Similarity Score: 0.8491 - PUBLIC-RELATIONS/26127853.pdf
  Candidate 775 - Similarity Score: 0.8450 - HR/57667857.pdf
  Candidate 177 - Similarity Score: 0.8442 - CONSULTANT/26919036.pdf
  Candidate 939 - Similarity Score: 0.8418 - PUBLIC-RELATIONS/19497420.pdf

Top candidates for JD 3 - Postition: Licensing Coordinator - Consumer Products
  Candidate 163 - Similarity Score: 0.8740 - CONSULTANT/22259768.pdf
  Candidat