In [1]:
!pip install PyPDF2



In [2]:
!pip install transformers



# **Part - 1 PDF Data Extraction**

**Importing Libraries**

In [38]:
import os
import re
import PyPDF2
import string
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (if not already downloaded)
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Get List of Folders Containing CVs**

In [4]:
path = '/content/drive/MyDrive/Resumes'
folder_path = []

dir_list = os.listdir(path)

for _ in dir_list:
  folder_path.append(os.path.join(path, _))

**Get the file paths of all the CVs**

In [5]:
pdf_path = []

for i in folder_path:
  for root, dirs, files in os.walk(i):
      for file in files:
          if file.lower().endswith(".pdf"):
            file_path = os.path.join(root, file)
            pdf_path.append(file_path)

In [6]:
len(pdf_path)

2484

**Extracting Category(Job Role) from the PDF and Preprocessing it**

In [7]:
category_list = []

for path in pdf_path:
# Open the PDF file
  with open(path, 'rb') as pdf_file:
      pdf_reader = PyPDF2.PdfReader(pdf_file)

      # Extract text from the PDF
      pdf_text = ''
      for page in pdf_reader.pages:
          pdf_text += page.extract_text()

      # Define regular expressions for matching job category, skills, and education
      category_pattern = r'^[^\n]*'

      # Extract category, skills, and education using regex
      category_match = re.search(category_pattern, pdf_text, re.DOTALL)

      # Extracted information
      category = category_match.group(0).strip() if category_match else "None"

      category_list.append(category)


In [41]:
category_preprocessed = []

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize using SpaCy
    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Remove stopwords using NLTK and SpaCy
    stopwords_nltk = set(nltk.corpus.stopwords.words("english"))
    stopwords_spacy = nlp.Defaults.stop_words
    tokens = [token for token in tokens if token not in stopwords_nltk]
    tokens = [token for token in tokens if token not in stopwords_spacy]

    # Perform stemming using NLTK
    stemmer = nltk.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform lemmatization using SpaCy
    tokens = [token.lemma_ for token in doc]

    # Join the preprocessed words back into a string
    preprocessed_text = " ".join(tokens)
    return preprocessed_text

# Example usage
for text in category_list:
  preprocessed_tokens = preprocess_text(text)
  category_preprocessed.append(preprocessed_tokens)

In [42]:
category_preprocessed

['fitness attendant',
 'general manager',
 'manager   fitness instructor',
 'concierge',
 'fitness instructor',
 'group fitness coordinator',
 'mover',
 'sale associate',
 'rnrn team lead',
 'fitness director',
 'group fitness instructor',
 'recreation   sport coordinator',
 'receptionist and veterinary technician',
 'intern',
 'yoga instructor',
 'flight attendant',
 'senior exercise specialist   sport and recreation coordinator',
 'vice president',
 'o',
 'athletic director coach pe teacher',
 'certify master personal trainer',
 'medical scribe',
 'social medium agent',
 'operation associate',
 'bartender on call',
 'fitness staff',
 'physical therapy aide',
 'aba therapist',
 'avid tutor',
 'yoga instructor',
 'advocare distributor',
 'intern',
 'group exercise fitness instructor',
 'administrative office assistant',
 'fitness consultant',
 'sale associate',
 'professional fitness trainer group instructor',
 'owneroperator',
 'customer care representative',
 'guest lecturer',
 'reha

**Extracting Skills from the PDF and Preprocessing it**

In [10]:
skills_list = []

for path in pdf_path:
# Open the PDF file
  with open(path, 'rb') as pdf_file:
      pdf_reader = PyPDF2.PdfReader(pdf_file)

      # Extract text from the PDF
      pdf_text = ''
      for page in pdf_reader.pages:
          pdf_text += page.extract_text()

      # Define regular expressions for matching job category, skills, and education
      skills_pattern = r'Skills(?:\n|.)+?(?=\n|$)'

      # Extract category, skills, and education using regex
      skills_match = re.search(skills_pattern, pdf_text, re.DOTALL)

      # Extracted information
      skills = skills_match.group(0).strip().replace("Skills\n", "") if skills_match else "None"

      skills_list.append(skills)

In [39]:
skills_preprocessed = []

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Tokenize using SpaCy
    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Remove stopwords using NLTK and SpaCy
    stopwords_nltk = set(nltk.corpus.stopwords.words("english"))
    stopwords_spacy = nlp.Defaults.stop_words
    tokens = [token for token in tokens if token not in stopwords_nltk]
    tokens = [token for token in tokens if token not in stopwords_spacy]

    # Perform stemming using NLTK
    stemmer = nltk.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform lemmatization using SpaCy
    tokens = [token.lemma_ for token in doc]

    # Join the preprocessed words back into a string
    preprocessed_text = " ".join(tokens)
    return preprocessed_text

# Example usage
for input_text in skills_list:
  preprocessed_tokens = preprocess_text(input_text)
  skills_preprocessed.append(preprocessed_tokens)

In [40]:
skills_preprocessed

['none',
 'acquisition advertising budgeting',
 'balance client fast safety sale statistic',
 'patientfocuse care',
 'sale management',
 'ad cpr first aid guardian health education presentation relationship building seminar staff',
 'forklift operator hand truck inventory machinery building material read and interpret drawing safety service time management',
 'skill   ability management punctual',
 'asc qa emr pharmacy infection control metric risk management satisfaction staff trading training auditing instrumentation',
 'bill blood pressure brochure communication skill client email goal set promote health leadership director marketing market',
 'plan',
 'microsoft office program',
 'arap administrative billing system business process cpr',
 'benefit billing contract cpr certify credit client first aid inventory exchange packaging paint programming progress quality',
 'program development and implementation',
 'investigation financial audit digital evidence surveillance undercover fra

**Extracting Education from the PDF and Preprocessing it**

In [13]:
education_list = []

for path in pdf_path:
# Open the PDF file
  with open(path, 'rb') as pdf_file:
      pdf_reader = PyPDF2.PdfReader(pdf_file)

      # Extract text from the PDF
      pdf_text = ''
      for page in pdf_reader.pages:
          pdf_text += page.extract_text()

      # Define regular expressions for matching job category, skills, and education
      education_pattern = r'Education\s+([\s\S]*?)(?=\n(?:Summary|Organizations|Certifications|Professional|Affiliations|Publications|Company Name|Activities|Licenses|Language|Honors|Speaking|Selected Trainings|Skills|Experience|Highlights|Interests|Additional|Accomplishments|Personal Information)|\Z)'

      # Extract category, skills, and education using regex
      education_match = re.search(education_pattern, pdf_text, re.DOTALL)

      # Extracted information
      education = education_match.group(1).strip() if education_match else "None"

      education_list.append(education)

In [49]:
education_preprocessed = []

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove newline characters
    text = text.replace('\n', '')

    # Tokenize using SpaCy
    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Remove stopwords using NLTK and SpaCy
    stopwords_nltk = set(nltk.corpus.stopwords.words("english"))
    stopwords_spacy = nlp.Defaults.stop_words
    tokens = [token for token in tokens if token not in stopwords_nltk]
    tokens = [token for token in tokens if token not in stopwords_spacy]

    # Perform stemming using NLTK
    stemmer = nltk.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform lemmatization using SpaCy
    tokens = [token.lemma_ for token in doc]

    # Join the preprocessed words back into a string
    preprocessed_text = " ".join(tokens)
    return preprocessed_text

# Example usage
for text in education_list:
  preprocessed_tokens = preprocess_text(text)
  education_preprocessed.append(preprocessed_tokens)


In [50]:
education_preprocessed

['master of science   human nutrition    university of new haven ï¼ city   state   usaminor in nutritional genomicsthesis submit in partial fulfillment of requirement for degreeâ   gpabachelor of science   human nutrition and dietetic    university of jordan ï¼ city   jordan   gpa',
 'johnson c smith university   bachelor of science   business administration city   statebusiness administration concentration in marketing and sale management',
 'high school diploma    eagan high school ï¼ city   state',
 'and traininghigh school diploma   general   belton high school   city   state',
 'high school diploma   university of texas   city   state communicationsvolunteeringblue dog rescueb friend animal sanctuary',
 'and trainingbachelor degree   psychology government december   university of tampa psychology government healthy lifestyle principlesfoundation of strength and conditioning cpr first aid ae les mill bodypump guardian ad litem volunteer and advocacy training',
 'and trainingnorther

**Creating a List of CVs from the Extracted Information**

In [51]:
cvs_list = []
for i in range(len(pdf_path)):
  cvs_list.append(category_preprocessed[i]+" "+skills_preprocessed[i]+" "+education_preprocessed[i])

In [52]:
cvs_list

['fitness attendant none master of science   human nutrition    university of new haven ï¼ city   state   usaminor in nutritional genomicsthesis submit in partial fulfillment of requirement for degreeâ   gpabachelor of science   human nutrition and dietetic    university of jordan ï¼ city   jordan   gpa',
 'general manager acquisition advertising budgeting johnson c smith university   bachelor of science   business administration city   statebusiness administration concentration in marketing and sale management',
 'manager   fitness instructor balance client fast safety sale statistic high school diploma    eagan high school ï¼ city   state',
 'concierge patientfocuse care and traininghigh school diploma   general   belton high school   city   state',
 'fitness instructor sale management high school diploma   university of texas   city   state communicationsvolunteeringblue dog rescueb friend animal sanctuary',
 'group fitness coordinator ad cpr first aid guardian health education pres

# **Part 2 - Job Description Data Understanding**

**Importing Libraries**

In [53]:
import pandas as pd
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (if not already downloaded)
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Extract first 10 rows from the Dataset**

In [19]:
df = pd.read_csv('/content/drive/MyDrive/job_description.csv')
df = df.head(10)
display(df)

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."
5,DesignUps,designups is a nashville based design and inte...,Frontend Web Developer,892,"{\n ""Core Responsibilities"": ""Translate desi..."
6,"Equisolve, Inc.",about the position\n\nthe web designer is resp...,Remote Website Designer,3471,"{\n ""Core Responsibilities"": ""Provide design..."
7,Zander Insurance Agency,job description\n\nzander insurance group is o...,Web Designer,2896,"{\n ""Core Responsibilities"": ""Design compell..."
8,Tuff,tuff is a growth marketing team working with c...,Web Designer,5143,"{\n ""Core Responsibilities"": ""Work on variou..."
9,General Dynamics Information Technology,type of requisition regular\n\nclearance level...,SR. Web Designer,4023,"{\n ""Core Responsibilities"": ""Designs and bu..."


**Extract columns 'position_title' and 'job_description' into lists and preprocess them**

In [73]:
position_title = df['position_title'].astype('str').to_list()
job_description = df['model_response'].astype('str').to_list()

In [22]:
job_description_list = []

for i in range(len(position_title)):
  job_description_list.append(position_title[i]+job_description[i])

In [56]:
job_description_preprocessed = []

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Remove newline characters
    text = text.replace('\n', '')

    # Tokenize using SpaCy
    doc = nlp(text)
    tokens = [token.text for token in doc]

    # Remove stopwords using NLTK and SpaCy
    stopwords_nltk = set(nltk.corpus.stopwords.words("english"))
    stopwords_spacy = nlp.Defaults.stop_words
    tokens = [token for token in tokens if token not in stopwords_nltk]
    tokens = [token for token in tokens if token not in stopwords_spacy]

    # Perform stemming using NLTK
    stemmer = nltk.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform lemmatization using SpaCy
    tokens = [token.lemma_ for token in doc]

    # Join the preprocessed words back into a string
    preprocessed_text = " ".join(tokens)
    return preprocessed_text

# Example usage
for text in job_description_list:
  preprocessed_tokens = preprocess_text(text)
  job_description_preprocessed.append(preprocessed_tokens)

In [57]:
job_description_preprocessed

['sale specialist    core responsibility responsible for expand google workspace product adoption across an assign territory build relationship with customer to understand need and provide google workspace solution partner with account team to construct solution and grow business for google workspace   require skill bachelor degree or equivalent experience experience manage enterprise saas account and sale cycle    educational requirement bachelor degree or equivalent experience   experience level experience manage enterprise saas account and sale cycle   prefer qualification experience build strategic partnership with enterprise customer ability to work through a reseller ecosystem excellent communication and strategic thinking skill   compensation and benefit na',
 'apple solution consultant    core responsibility as an asc you will be highly influential in grow mind and market share of apple product while build longterm relationship with those who share your passion customer experie

# **Part 3 - Candidate Job Matching**

**Importing Libraries**

In [58]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

**Create Embeddings for CVs and Job Descriptions**

In [69]:
# Create embeddings for each tokenized text
cv_embeddings = []

for text in cvs_list:
    # Tokenize and encode the text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Perform a forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings (CLS token or average of all tokens)
    cls_embedding = outputs.last_hidden_state.mean(dim=1).numpy()[0]
    cv_embeddings.append(cls_embedding)

# Convert embeddings to a numpy array
cv_embeddings_array = np.array(cv_embeddings)

# # Now, 'embeddings_array' contains the embeddings for the tokenized text
# print(embeddings_array.shape)  # (number_of_text_samples, embedding_dimension)


In [70]:
# Create embeddings for each tokenized text
job_description_embeddings = []

for text in job_description_preprocessed:
    # Tokenize and encode the text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Perform a forward pass through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the embeddings (CLS token or average of all tokens)
    cls_embedding = outputs.last_hidden_state.mean(dim=1).numpy()[0]
    job_description_embeddings.append(cls_embedding)

# Convert embeddings to a numpy array
job_description_embeddings_array = np.array(job_description_embeddings)

# # Now, 'embeddings_array' contains the embeddings for the tokenized text
# print(embeddings_array.shape)  # (number_of_text_samples, embedding_dimension)


**Find Cosine Similarity, Rank CVs based on it and show the Top 5 CVs for a given Job Description**

In [72]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Assuming you have job_description_embeddings and cv_embeddings as NumPy arrays

# Choose a specific job description you want to compare against
chosen_job_description_idx = 4  # Replace with the index of the job description you want to compare

# Calculate cosine similarity between the chosen job description and all CVs
cosine_similarities = cosine_similarity(
    [job_description_embeddings_array[chosen_job_description_idx]], cv_embeddings_array
)

# Get the ranked indices of CVs based on similarity (descending order)
ranked_cv_indices = np.argsort(cosine_similarities[0])[::-1]

# Get the top 5 most similar CVs
top_5_cv_indices = ranked_cv_indices[:5]

# Print the top 5 CVs along with their cosine similarity scores
for i, cv_idx in enumerate(top_5_cv_indices, start=1):
    similarity_score = cosine_similarities[0][cv_idx]
    print(f"Top {i} CV (Similarity Score: {similarity_score}):")
    print(cvs_list[cv_idx])  # Assuming cv_data is a DataFrame with CV text
    print("\n")


Top 1 CV (Similarity Score: 0.9510120153427124):
lead information technology support specialist application department locate on the main campuschico center orland paradise and glenn countyrecommende modification or improvement for method procedure technique and equipment require for the business educationdegree program and computer lab result in time and cost saving for the departmentdesigne and maintain database of   year of graduate student follow their degree program and certificatesprovide effective instruction and tutoring to a widely diverse group of student in the follow career program microsoft office wordexcel access powerpoint and publisher computer application office assistant medical front office assistant legal officeassistant desktop publishing and medical transcription for certificationcollaborate with the technical support team to recommend new hardware and software to improve the performance of the computersmaintaine and update butte college business education website