In [2]:
import pandas as pd
import os
import PyPDF2
import re
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Extraction is best-effort: any exception raised while opening or parsing
    the file is reported with ``print`` and whatever text was collected before
    the failure (possibly an empty string) is returned instead of raising.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Text of the pages in order, joined with no separator.
    """
    text = ""
    try:
        with open(pdf_path, 'rb') as pdf_file:
            # PyPDF2 3.0 removed PdfFileReader / getNumPages / getPage /
            # extractText. With those names every call would fail and be
            # swallowed below, yielding "" for every file. Prefer the current
            # API and fall back to the legacy names only on old installs.
            reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
            pdf_reader = reader_cls(pdf_file)
            for page in pdf_reader.pages:
                extract = getattr(page, "extract_text", None) or page.extractText
                # Guard against a page yielding None so the concatenation
                # cannot fail part-way through a document.
                text += extract() or ""
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
    return text

# Function to extract category (job role) from text
def extract_category(text):
    """Pull the job-role heading that precedes the word "Summary" in ``text``.

    The first occurrence of some characters followed by whitespace and
    "Summary" (case-insensitive) is located; the characters before that
    whitespace are returned with surrounding whitespace stripped.

    Args:
        text: String to search.

    Returns:
        The stripped captured text, or ``None`` when no such pattern occurs.
    """
    found = re.search(r'(.+?)\s+Summary', text, flags=re.IGNORECASE)
    if found is None:
        return None
    role = found.group(1)
    return role.strip()

# Function to preprocess and calculate embeddings using DistilBERT
def calculate_embeddings(texts, tokenizer, model):
    """Embed each text independently and return the vectors as a list.

    Every text is tokenized on its own (truncated to at most 512 tokens),
    passed through ``model`` with gradient tracking disabled, and reduced to
    one vector by averaging ``last_hidden_state`` over dimension 1
    (presumably the token axis of a (batch, tokens, hidden) tensor).

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable returning keyword inputs for ``model``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        list: One NumPy array per input text, in input order.
    """
    def _embed(document):
        # Tokenize a single document; the result is unpacked straight into the model.
        encoded = tokenizer(document, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        # squeeze() drops the size-1 dimensions left after pooling.
        return pooled.squeeze().numpy()

    return [_embed(document) for document in texts]

# Load job descriptions from CSV dataset
# NOTE(review): absolute, machine-specific path — this cell only runs where
# this exact file exists; consider a configurable data directory.
job_descriptions_file = "C:/Users/USER/Desktop/training_data.csv"
job_descriptions_df = pd.read_csv(job_descriptions_file)
# Only the "job_description" column is used; it becomes a plain Python list.
job_descriptions = job_descriptions_df["job_description"].tolist()

# Initialize DistilBERT tokenizer and model
# Both are loaded from the same "distilbert-base-uncased" pretrained checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Process the provided dataset
# NOTE(review): second absolute, machine-specific path (same concern as above).
data = pd.read_csv("C:/Users/USER/Desktop/Resume.csv")  
# cv_texts and cv_categories are parallel lists taken from the same frame, so a
# position in one refers to the same CSV row in the other.
cv_texts = data["Resume_str"].tolist()
cv_categories = data["Category"].tolist()

# Calculate embeddings for job descriptions and CVs
# Each call returns one vector per input text, in input order.
job_description_embeddings = calculate_embeddings(job_descriptions, tokenizer, model)
cv_embeddings = calculate_embeddings(cv_texts, tokenizer, model)

# Calculate cosine similarity between job descriptions and CVs
# The result is indexed below as [job description index][CV index].
similarity_scores = cosine_similarity(job_description_embeddings, cv_embeddings)

# Rank CVs based on similarity scores for each job description
# argsort orders ascending, so each row is reversed to put the highest score
# first and then cut down to the first five CV indices.
top_cv_indices = similarity_scores.argsort(axis=1)[:, ::-1][:, :5]

# Print the top 5 CVs for each job description
# Job and rank numbers are printed 1-based; the CV number is the 0-based
# position of the CV in cv_texts.
for i, job_description in enumerate(job_descriptions):
    print(f"Job Description {i + 1}:")
    for j, cv_index in enumerate(top_cv_indices[i]):
        print(f"Top CV {j + 1}: CV {cv_index}, Similarity Score: {similarity_scores[i][cv_index]}")


Job Description 1:
Top CV 1: CV 299, Similarity Score: 0.9613177180290222
Top CV 2: CV 2225, Similarity Score: 0.9565382599830627
Top CV 3: CV 1305, Similarity Score: 0.9551481008529663
Top CV 4: CV 1235, Similarity Score: 0.9536933302879333
Top CV 5: CV 577, Similarity Score: 0.9527629613876343
Job Description 2:
Top CV 1: CV 604, Similarity Score: 0.9328286647796631
Top CV 2: CV 256, Similarity Score: 0.9261701107025146
Top CV 3: CV 2298, Similarity Score: 0.9244762063026428
Top CV 4: CV 661, Similarity Score: 0.9240624904632568
Top CV 5: CV 1305, Similarity Score: 0.922995924949646
Job Description 3:
Top CV 1: CV 2359, Similarity Score: 0.9511442184448242
Top CV 2: CV 2360, Similarity Score: 0.949389636516571
Top CV 3: CV 559, Similarity Score: 0.9489926099777222
Top CV 4: CV 1149, Similarity Score: 0.9487807750701904
Top CV 5: CV 1305, Similarity Score: 0.9485526084899902
Job Description 4:
Top CV 1: CV 1092, Similarity Score: 0.9493599534034729
Top CV 2: CV 1050, Similarity Score:

In [3]:
# Print the top 5 CVs for each job description
# This cell defines none of job_descriptions, top_cv_indices, similarity_scores
# or cv_categories itself; they must already exist in the kernel.
for i, job_description in enumerate(job_descriptions):
    print(f"Job Description {i + 1}:")
    top_indices = top_cv_indices[i][:5]  # Get the top 5 indices for this job description
    for j, cv_index in enumerate(top_indices):
        # Look up the score for this (job description, CV) pair.
        similarity_score = similarity_scores[i][cv_index]
        # The same CV index selects the category label recorded for that CV.
        candidate_category = cv_categories[cv_index]
        print(f"Top CV {j + 1}: CV {cv_index}, Similarity Score: {similarity_score}, Category: {candidate_category}")


Job Description 1:
Top CV 1: CV 299, Similarity Score: 0.9613177180290222, Category: INFORMATION-TECHNOLOGY
Top CV 2: CV 2225, Similarity Score: 0.9565382599830627, Category: BANKING
Top CV 3: CV 1305, Similarity Score: 0.9551481008529663, Category: DIGITAL-MEDIA
Top CV 4: CV 1235, Similarity Score: 0.9536933302879333, Category: DIGITAL-MEDIA
Top CV 5: CV 577, Similarity Score: 0.9527629613876343, Category: BUSINESS-DEVELOPMENT
Job Description 2:
Top CV 1: CV 604, Similarity Score: 0.9328286647796631, Category: BUSINESS-DEVELOPMENT
Top CV 2: CV 256, Similarity Score: 0.9261701107025146, Category: INFORMATION-TECHNOLOGY
Top CV 3: CV 2298, Similarity Score: 0.9244762063026428, Category: ARTS
Top CV 4: CV 661, Similarity Score: 0.9240624904632568, Category: BUSINESS-DEVELOPMENT
Top CV 5: CV 1305, Similarity Score: 0.922995924949646, Category: DIGITAL-MEDIA
Job Description 3:
Top CV 1: CV 2359, Similarity Score: 0.9511442184448242, Category: ARTS
Top CV 2: CV 2360, Similarity Score: 0.9493