In [41]:
import numpy as np
import pandas as pd

import re
import string # for text cleaning
import contractions # for expanding short form words
from tqdm import tqdm
tqdm.pandas(desc="Progress Bar")

import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

In [42]:
df = pd.read_csv("/Users/samyukthaganesh/Documents/Samyuktha/University/MachineLearning/SEM7Project/training_data.csv")

In [43]:
df.head

<bound method NDFrame.head of              company_name                                    job_description  \
0                  Google  minimum qualifications\nbachelors degree or eq...   
1                   Apple  description\nas an asc you will be highly infl...   
2                 Netflix  its an amazing time to be joining netflix as w...   
3             Robert Half  description\n\nweb designers looking to expand...   
4               TrackFive  at trackfive weve got big goals were on a miss...   
..                    ...                                                ...   
848               Menards  job description\n\nparttime\n\nmake big money ...   
849                Parker  responsibilities\nparkers internship program w...   
850        Borgen Project   the borgen project is an innovative national ...   
851  Wyndham Destinations  put the world on vacation\n\nat wyndham destin...   
852               Aerotek  this job handles customer inquiries by telepho...   

         

In [44]:
jd_df = pd.DataFrame(df)
jd_df.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."


In [45]:
jd_df.columns

Index(['company_name', 'job_description', 'position_title',
       'description_length', 'model_response'],
      dtype='object')

In [46]:
def text_cleaning(text:str) -> str:
    if pd.isnull(text):
        return
    
    # lower-case everything
    text = text.lower().strip()
    
    # For removing puctuations
    translator = str.maketrans('', '', string.punctuation)
    
    # expand all the short-form words
    text = contractions.fix(text)
    
    # remove any special chars
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'\S+@\S+', '', text) # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text) # Remove phone numbers
    text = text.translate(translator) # Remove puctuations
    text = re.sub(r'[^a-zA-Z]', ' ', text) # Remove other non-alphanumeric characters
    
    return text.strip()

In [47]:
df = pd.read_csv("/Users/samyukthaganesh/Documents/Samyuktha/University/MachineLearning/SEM7Project/pdf_extracted_skills_education.csv")
cv_df = df[~(df['Skills'].isna() & df['Education'].isna())].reset_index(drop=True)

# Filling the null values in Skills & Education with Empty String before concatinating them
cv_df = cv_df.fillna(value='')

# Let's stitch together Skills & Education, similar to given in job description.
cv_df['CV'] = cv_df['Skills'] + ' ' + cv_df['Education']

# Doing text cleaning
cv_df['CV'] = cv_df['CV'].progress_apply(text_cleaning)

Progress Bar: 100%|██████████| 2469/2469 [00:00<00:00, 8896.97it/s]


In [48]:
cv_df.shape

(2469, 5)

In [49]:
cv_df.columns

Index(['Skills', 'Education', 'ID', 'Category', 'CV'], dtype='object')

In [50]:
job_descriptions = jd_df['job_description'].apply(text_cleaning)[:15].to_list() # jd_df['job_description'][:15]

# Sample resumes (replace with your extracted resume data)
resumes = cv_df['CV'].to_list()

In [51]:
# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# Tokenize and embed job descriptions
job_description_embeddings = []
for description in job_descriptions:
    tokens = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    job_description_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D

# Tokenize and embed resumes
resume_embeddings = []
for resume in resumes:
    tokens = tokenizer(resume, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    resume_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D

In [61]:
job_description_embeddings[0].shape, resume_embeddings[0].shape


((768,), (768,))

In [62]:
len(job_description_embeddings), len(resume_embeddings)


(15, 2469)

In [63]:
# Calculate cosine similarity between job descriptions and resumes
similarity_scores = cosine_similarity(job_description_embeddings, resume_embeddings)
similarity_scores

array([[0.8208875 , 0.7767728 , 0.78489166, ..., 0.85897934, 0.77054554,
        0.60554796],
       [0.78442526, 0.71713597, 0.74813074, ..., 0.83248675, 0.7884389 ,
        0.67844903],
       [0.8139858 , 0.78959244, 0.783637  , ..., 0.84983313, 0.75802976,
        0.6202631 ],
       ...,
       [0.8167765 , 0.7708845 , 0.7748319 , ..., 0.88432616, 0.7708576 ,
        0.6345936 ],
       [0.8439074 , 0.7782758 , 0.797146  , ..., 0.8783191 , 0.82519805,
        0.6813816 ],
       [0.8330029 , 0.7873484 , 0.7752358 , ..., 0.8890798 , 0.7842255 ,
        0.6448568 ]], dtype=float32)

In [64]:
# Rank candidates for each job description based on similarity scores
num_top_candidates = 5
top_candidates = []

for i, job_description in enumerate(job_descriptions):
    candidates_with_scores = list(enumerate(similarity_scores[i]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    top_candidates.append(top_candidates_for_job)

# Print the top candidates for each job description
for i, job_description in enumerate(job_descriptions):
    print(f"Top candidates for JD {i+1} - Postition: {jd_df['position_title'][i]}")
    for candidate_index, score in top_candidates[i]:
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f} - {cv_df['Category'][candidate_index]}/{cv_df['ID'][candidate_index]}.pdf")
        # print(f"  Resume: {resumes[candidate_index]}")
    print()

Top candidates for JD 1 - Postition: Sales Specialist
  Candidate 1949 - Similarity Score: 0.9415 - HR/18827609.pdf
  Candidate 291 - Similarity Score: 0.9388 - AGRICULTURE/62994611.pdf
  Candidate 478 - Similarity Score: 0.9387 - ARTS/43622023.pdf
  Candidate 28 - Similarity Score: 0.9377 - ACCOUNTANT/16237710.pdf
  Candidate 1803 - Similarity Score: 0.9314 - HEALTHCARE/10466208.pdf

Top candidates for JD 2 - Postition: Apple Solutions Consultant
  Candidate 168 - Similarity Score: 0.9236 - ADVOCATE/22391901.pdf
  Candidate 904 - Similarity Score: 0.9165 - BUSINESS-DEVELOPMENT/95382114.pdf
  Candidate 1730 - Similarity Score: 0.9159 - FITNESS/21238396.pdf
  Candidate 952 - Similarity Score: 0.9155 - CHEF/21869994.pdf
  Candidate 482 - Similarity Score: 0.9146 - ARTS/54100393.pdf

Top candidates for JD 3 - Postition: Licensing Coordinator - Consumer Products
  Candidate 478 - Similarity Score: 0.9598 - ARTS/43622023.pdf
  Candidate 2153 - Similarity Score: 0.9496 - PUBLIC-RELATIONS/122

In [56]:
# Function to rank top candidates
def get_top_candidates(jd_index, num_top_candidates=5):
    candidates_with_scores = list(enumerate(similarity_scores[jd_index]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    
    results = []
    for candidate_index, score in top_candidates_for_job:
        candidate_info = (f"Candidate {candidate_index + 1} - "
                          f"Similarity Score: {score:.4f} - "
                          f"{cv_df['Category'][candidate_index]}/{cv_df['ID'][candidate_index]}.pdf")
        results.append(candidate_info)
    
    return "\n".join(results)

In [57]:
!pip install gradio



In [70]:
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Define your function to get top candidates and visualize them
def get_top_candidates(jd_index):
    # Fetch top candidates
    candidates_with_scores = list(enumerate(similarity_scores[jd_index]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates = candidates_with_scores[:5]

    # Prepare data for visualization
    visualization_data = []
    for candidate_index, score in top_candidates:
        visualization_data.append({
            'Candidate Index': candidate_index + 1,
            'Similarity Score': score,
            'Category': cv_df['Category'][candidate_index],
            'Resume ID': cv_df['ID'][candidate_index]
        })
    
    # Create a DataFrame for visualization
    visualization_df = pd.DataFrame(visualization_data)
    
    return top_candidates, visualization_df

def show_top_candidates(jd_selected, visualize):
    jd_index = jd_df['position_title'].to_list().index(jd_selected)
    top_candidates, visualization_df = get_top_candidates(jd_index)
    
    # Create a result string for the top candidates
    result = f"Top candidates for {jd_selected}:\n"
    for candidate_index, score in top_candidates:
        result += f"Candidate {candidate_index + 1} - Similarity Score: {score:.4f} \n"
    
    # Check if visualization is needed
    if visualize:
        # Create bar plot for visualization
        plt.figure(figsize=(10, 6))
        sns.barplot(data=visualization_df, x='Candidate Index', y='Similarity Score', hue='Category', dodge=True)
        plt.title(f'Top Candidates for {jd_selected}')
        plt.xlabel('Candidate Index')
        plt.ylabel('Similarity Score')
        plt.ylim(0, 1)  # Adjust based on your similarity score range
        plt.xticks(rotation=0)
        plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
        
        # Annotate bars with actual scores
        for index, row in visualization_df.iterrows():
            plt.text(row['Candidate Index'] - 1, row['Similarity Score'] + 0.02, f"{row['Similarity Score']:.4f}", ha='center')
        
        # Save the plot
        plot_filename = "top_candidates_plot.png"
        plt.savefig(plot_filename)
        plt.close()  # Close the plot to avoid displaying it here
        
        return result, plot_filename
    
    return result, None

# Gradio interface
gr_interface = gr.Interface(
    fn=show_top_candidates,
    inputs=[
        gr.Dropdown(label="Select Job Description", choices=jd_df['position_title'].to_list(), value=jd_df['position_title'][0]),
        gr.Checkbox(label="Visualize Top Candidates", value=False)
    ],
    outputs=["text", "image"],
    title="Top 5 Candidates for Job Description",
    description="Select a job description to see the top 5 candidates based on similarity scores."
)

# Launch Gradio interface
gr_interface.launch()


* Running on local URL:  http://127.0.0.1:7871

To create a public link, set `share=True` in `launch()`.




In [71]:
import pandas as pd
import re
import string
import contractions
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load job description data
df = pd.read_csv("/Users/samyukthaganesh/Documents/Samyuktha/University/MachineLearning/SEM7Project/training_data.csv")
jd_df = pd.DataFrame(df)

# Text cleaning function (same as before)
def text_cleaning(text: str) -> str:
    if pd.isnull(text):
        return ''
    
    text = text.lower().strip()
    translator = str.maketrans('', '', string.punctuation)
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text)  # Remove phone numbers
    text = text.translate(translator)  # Remove punctuation
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-alphabetic characters
    return text.strip()

# Load resume data and preprocess
df = pd.read_csv("/Users/samyukthaganesh/Documents/Samyuktha/University/MachineLearning/SEM7Project/pdf_extracted_skills_education.csv")
cv_df = df[~(df['Skills'].isna() & df['Education'].isna())].reset_index(drop=True)
cv_df = cv_df.fillna(value='')  # Fill missing data
cv_df['CV'] = cv_df['Skills'] + ' ' + cv_df['Education']  # Combine Skills and Education

# Clean the CV text
cv_df['CV'] = cv_df['CV'].progress_apply(text_cleaning)

# Clean the job descriptions
job_descriptions = jd_df['job_description'].apply(text_cleaning)[:15].to_list()

# Sample resumes (replace with your extracted resume data)
resumes = cv_df['CV'].to_list()

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()

# Combine job descriptions and resumes into one list for vectorization
combined_text = job_descriptions + resumes

# Convert the text data into TF-IDF features (vectors)
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_text)

# Split the TF-IDF matrix into job descriptions and resumes
job_description_vectors = tfidf_matrix[:len(job_descriptions)]
resume_vectors = tfidf_matrix[len(job_descriptions):]

# Calculate cosine similarity between job descriptions and resumes
similarity_scores = cosine_similarity(job_description_vectors, resume_vectors)

# Rank candidates for each job description based on similarity scores
num_top_candidates = 5
top_candidates = []

for i, job_description in enumerate(job_descriptions):
    candidates_with_scores = list(enumerate(similarity_scores[i]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    top_candidates.append(top_candidates_for_job)

# Print the top candidates for each job description
for i, job_description in enumerate(job_descriptions):
    print(f"Top candidates for JD {i+1} - Position: {jd_df['position_title'][i]}")
    for candidate_index, score in top_candidates[i]:
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f} ")
    print()

# Function to rank top candidates
def get_top_candidates(jd_index, num_top_candidates=5):
    candidates_with_scores = list(enumerate(similarity_scores[jd_index]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    
    results = []
    for candidate_index, score in top_candidates_for_job:
        candidate_info = (f"Candidate {candidate_index + 1} - "
                          f"Similarity Score: {score:.4f} - "
                          f"{cv_df['Category'][candidate_index]}/{cv_df['ID'][candidate_index]}.pdf")
        results.append(candidate_info)
    
    return "\n".join(results)

# If you want to implement the Gradio interface, you can do the same as before.


Progress Bar: 100%|██████████| 2469/2469 [00:00<00:00, 8648.32it/s]


Top candidates for JD 1 - Position: Sales Specialist
  Candidate 2271 - Similarity Score: 0.4060 
  Candidate 402 - Similarity Score: 0.3835 
  Candidate 700 - Similarity Score: 0.3691 
  Candidate 1979 - Similarity Score: 0.3656 
  Candidate 1306 - Similarity Score: 0.3564 

Top candidates for JD 2 - Position: Apple Solutions Consultant
  Candidate 1881 - Similarity Score: 0.2295 
  Candidate 1279 - Similarity Score: 0.2106 
  Candidate 1011 - Similarity Score: 0.1930 
  Candidate 1979 - Similarity Score: 0.1887 
  Candidate 478 - Similarity Score: 0.1885 

Top candidates for JD 3 - Position: Licensing Coordinator - Consumer Products
  Candidate 2271 - Similarity Score: 0.4095 
  Candidate 1979 - Similarity Score: 0.3871 
  Candidate 478 - Similarity Score: 0.3853 
  Candidate 2224 - Similarity Score: 0.3820 
  Candidate 1960 - Similarity Score: 0.3770 

Top candidates for JD 4 - Position: Web Designer
  Candidate 1306 - Similarity Score: 0.3129 
  Candidate 951 - Similarity Score: 0.