In [25]:
import pandas as pd  
import numpy as np 

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2

import nltk
import re 
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus; the log output reports when the package is
# already up to date.
nltk.download('stopwords')
# English stopwords; later cells drop every token found in this collection.
stop = stopwords.words('english') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
import re
def basic_preprocessing(text):
  """Normalise raw text before stopword removal and vectorisation.

  Steps, in order:
    1. lower-case the text;
    2. drop ``@handle``-style tokens (``@`` followed by word characters);
    3. delete every remaining character that is neither a word character
       nor whitespace (punctuation is removed, not replaced by a space);
    4. turn each newline into a single space.

  Args:
    text: The raw string to clean.

  Returns:
    The cleaned string. Runs of spaces left behind by removed tokens are
    not collapsed.
  """
  text = text.lower()
  # Handles must be stripped *before* punctuation: once '@' has been deleted
  # the handle pattern can no longer match and the name would leak through.
  text = re.sub(r'@\w+', '', text)
  text = re.sub(r'[^\w\s]', '', text)
  text = re.sub(r'\n', ' ', text)
  return text

### Job Description Processing

In [28]:
# Read the job description inside a context manager so the file handle is
# closed as soon as the text has been read.
with open("job_desc_test.txt", "r") as job_description:
    desc = job_description.read()

# Drop English stopwords token by token; kept as a named helper for reuse.
remove_stopwords = lambda x: ' '.join([word for word in x.split() if word.lower() not in stop])

# Normalise first (lower-case, strip punctuation), then filter stopwords.
cleaned_desc = remove_stopwords(basic_preprocessing(desc))

In [29]:
# Inspect the job description after preprocessing and stopword removal.
cleaned_desc

'seeking hr manager small companies role department 1 serving support employees throughout employment life cycle 100 remote responsibilities develop implement leadership development program managers team leads develop implement hr strategies initiatives aligned overall business strategy oversee manage recruitment selection process including job postings screening resumes conducting interviews manage employee onboarding offboarding processes ensuring smooth transition new hires departing employees administer employee benefits programs including health insurance retirement plans paid time'

In [30]:
# Length of the cleaned description in characters (it is a string), not in words.
len(cleaned_desc)

592

### Resume Processing

In [31]:
import PyPDF2
import os

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of all pages, with no
        separator inserted between pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        # Pages are extracted inside the `with` block, while the file is open.
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    return ''.join(page_texts)

# Directory containing PDF files
pdf_directory = './Sample-PDFs'

# Extract text from each PDF file, keyed by file name.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
#   the order of the collected resumes stable between runs and machines.
# - lower(): also pick up files whose extension is written '.PDF' or '.Pdf'.
pdf_texts = {}
for filename in sorted(os.listdir(pdf_directory)):
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(pdf_directory, filename)
        text = extract_text_from_pdf(pdf_path)
        pdf_texts[filename] = text

In [32]:
# One row per resume: the PDF file name and its raw extracted text.
df = pd.DataFrame(
    {'File': list(pdf_texts.keys()), 'Text': list(pdf_texts.values())}
)
df

Unnamed: 0,File,Text
0,ACC1.pdf,MORTGAGE BANKING FORECLOSURE SPECIALIST\nSumma...
1,AGG1.pdf,PATIENT ACCESS REP\nSummary\nDependable Comput...
2,DES1.pdf,LEAD SENIOR GRAPHIC DESIGNER\nSummary\nManage ...
3,DES2.pdf,SOLUTION DESIGNER\nCareer Overview\nSolutions-...
4,HR1.pdf,HR PERSONNEL ASSISTANT\nSummary\nI am a U.S. c...
5,HR2.pdf,HR MANAGER\nSummary\nHuman Resources Manager w...
6,HR3.pdf,HR BENEFITS/LEAVE COORDINATOR\nSummary\n13 yea...


In [33]:
# Normalise each resume, then drop English stopwords token by token.
df['resume_cleaned'] = (
    df['Text']
    .apply(basic_preprocessing)
    .apply(lambda doc: ' '.join(token for token in doc.split() if token not in (stop)))
)

In [34]:
# Show the frame again to check the new 'resume_cleaned' column next to the raw text.
df

Unnamed: 0,File,Text,resume_cleaned
0,ACC1.pdf,MORTGAGE BANKING FORECLOSURE SPECIALIST\nSumma...,mortgage banking foreclosure specialist summar...
1,AGG1.pdf,PATIENT ACCESS REP\nSummary\nDependable Comput...,patient access rep summary dependable computer...
2,DES1.pdf,LEAD SENIOR GRAPHIC DESIGNER\nSummary\nManage ...,lead senior graphic designer summary manage mu...
3,DES2.pdf,SOLUTION DESIGNER\nCareer Overview\nSolutions-...,solution designer career overview solutionsori...
4,HR1.pdf,HR PERSONNEL ASSISTANT\nSummary\nI am a U.S. c...,hr personnel assistant summary us citizen auth...
5,HR2.pdf,HR MANAGER\nSummary\nHuman Resources Manager w...,hr manager summary human resources manager pra...
6,HR3.pdf,HR BENEFITS/LEAVE COORDINATOR\nSummary\n13 yea...,hr benefitsleave coordinator summary 13 years ...


### Using TF-IDF for Cosine Similarity

In [35]:
# Obtaining the vocabulary based on the job description: the vectorizer is
# fitted on the job description alone, so only terms that occur in it become
# features when the resumes are transformed later.
# NOTE(review): with a single fitted document every term should end up with the
# same IDF weight, making the scores effectively normalised term frequencies —
# confirm this is intended.
vectorizer = TfidfVectorizer()
vectorizer.fit([cleaned_desc])  

In [36]:
# Vector of the job description in the fitted TF-IDF space.
X = vectorizer.transform([cleaned_desc])

# Score each resume against the job description; values are percentages
# rounded to two decimals, keyed by PDF file name.
file_val_pair = {}
for file_name, text in zip(df['File'], df['resume_cleaned']):
    Y = vectorizer.transform([text])
    file_val_pair[file_name] = round(cosine_similarity(X, Y)[0][0] * 100, 2)

In [37]:
# TF-IDF similarity per resume, as a percentage rounded to two decimals (not yet sorted).
file_val_pair

{'ACC1.pdf': 32.74,
 'AGG1.pdf': 31.95,
 'DES1.pdf': 29.59,
 'DES2.pdf': 34.84,
 'HR1.pdf': 35.66,
 'HR2.pdf': 58.24,
 'HR3.pdf': 48.19}

In [38]:
# Rank resumes by TF-IDF score, best match first (ties keep their original order).
sorted_file_val_pair = {
    name: file_val_pair[name]
    for name in sorted(file_val_pair, key=file_val_pair.get, reverse=True)
}
print(sorted_file_val_pair)

{'HR2.pdf': 58.24, 'HR3.pdf': 48.19, 'HR1.pdf': 35.66, 'DES2.pdf': 34.84, 'ACC1.pdf': 32.74, 'AGG1.pdf': 31.95, 'DES1.pdf': 29.59}


### Using BERT transformer for Cosine Similarity

In [39]:
from transformers import AutoTokenizer, AutoModel
import torch 

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_bert(model_name):
    """Load the tokenizer/model pair for ``model_name`` once and reuse it on later calls."""
    return AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name)


def bert_vectorizer(cleaned_desc, model_name="distilbert-base-uncased"):
    """Embed a text with a pre-trained transformer and return per-token vectors.

    Args:
        cleaned_desc: The (already cleaned) text to embed.
        model_name: Identifier of the pre-trained model to load. The default
            is the model this function has always used.

    Returns:
        A NumPy array holding the model's last hidden state with the batch
        dimension removed, i.e. one row per token of the input. Input longer
        than 512 tokens is truncated by the tokenizer.
    """
    # The tokenizer and model are cached: this function is called once for the
    # job description and once per resume, and reloading them on every call
    # repeats the same expensive work.
    tokenizer, model = _load_bert(model_name)

    inputs = tokenizer(cleaned_desc, padding=True, max_length=512, truncation=True, return_tensors="pt")

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    # A single text is passed in, so .squeeze(0) removes the batch dimension
    # and leaves a 2-D array suitable for cosine_similarity.
    return outputs.last_hidden_state.squeeze(0).numpy()

In [40]:
# Embed the job description once, then score each resume against it.
X = bert_vectorizer(cleaned_desc)

# NOTE(review): X and Y hold one row per token, so cosine_similarity(X, Y) is a
# token-by-token matrix and [0][0] compares only the first token of each text
# (presumably the tokenizer's [CLS] marker). Confirm this is the intended
# document-level score; pooling all rows would be the alternative.
file_bert = {}
for file_name, text in zip(df['File'], df['resume_cleaned']):
    Y = bert_vectorizer(text)
    file_bert[file_name] = round(cosine_similarity(X, Y)[0][0] * 100, 2)

print(file_bert)


{'ACC1.pdf': 88.41, 'AGG1.pdf': 87.66, 'DES1.pdf': 87.02, 'DES2.pdf': 89.87, 'HR1.pdf': 89.17, 'HR2.pdf': 91.5, 'HR3.pdf': 90.83}


In [41]:
# Rank resumes by BERT score, best match first (ties keep their original order).
sorted_file_val_pair_bert = {
    name: file_bert[name]
    for name in sorted(file_bert, key=file_bert.get, reverse=True)
}
print(sorted_file_val_pair_bert)

{'HR2.pdf': 91.5, 'HR3.pdf': 90.83, 'DES2.pdf': 89.87, 'HR1.pdf': 89.17, 'ACC1.pdf': 88.41, 'AGG1.pdf': 87.66, 'DES1.pdf': 87.02}


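Observations: 

- BERT (cosine similarity in %, highest first)
{'HR2.pdf': 91.5, 'HR3.pdf': 90.83, 'DES2.pdf': 89.87, 'HR1.pdf': 89.17, 'ACC1.pdf': 88.41, 'AGG1.pdf': 87.66, 'DES1.pdf': 87.02}

- TF-IDF (cosine similarity in %, highest first)
{'HR2.pdf': 58.24, 'HR3.pdf': 48.19, 'HR1.pdf': 35.66, 'DES2.pdf': 34.84, 'ACC1.pdf': 32.74, 'AGG1.pdf': 31.95, 'DES1.pdf': 29.59}

Both methods rank HR2 and HR3 first and second, and the two rankings differ only in the order of HR1 and DES2. The BERT scores are compressed into a narrow band (87.02 to 91.5), whereas the TF-IDF scores spread from 29.59 to 58.24 and therefore separate the top candidates (HR2, HR3) from the remaining resumes more clearly.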
