# Step 1: Tokenize Text into Words

In [1]:
# Install required libraries
!pip install nltk

# Import necessary modules
import nltk
from nltk.tokenize import word_tokenize

# Download required resources
nltk.download('punkt')

# Example job description and CV content
job_description = "We are looking for a Python developer with experience in machine learning."
cv_text = "Experienced software engineer skilled in Python, Java, and machine learning."

# Tokenizing the text into words
job_tokens = word_tokenize(job_description)
cv_tokens = word_tokenize(cv_text)

# Display the tokens
print("Job Description Tokens:", job_tokens)
print("CV Tokens:", cv_tokens)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Job Description Tokens: ['We', 'are', 'looking', 'for', 'a', 'Python', 'developer', 'with', 'experience', 'in', 'machine', 'learning', '.']
CV Tokens: ['Experienced', 'software', 'engineer', 'skilled', 'in', 'Python', ',', 'Java', ',', 'and', 'machine', 'learning', '.']


# Step 2: Remove Stopwords and Punctuation

In [2]:
# Import necessary modules
import string
from nltk.corpus import stopwords
import nltk

# Download stopwords resource
nltk.download('stopwords')

# Get English stopwords and punctuation
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation

# Function to filter tokens by removing stopwords and punctuation
def clean_tokens(tokens):
    return [word for word in tokens if word.lower() not in stop_words and word not in punctuation]

# Example job description and CV content
job_description = "We are looking for a Python developer with experience in machine learning."
cv_text = "Experienced software engineer skilled in Python, Java, and machine learning."

# Tokenize text into words
job_tokens = word_tokenize(job_description)
cv_tokens = word_tokenize(cv_text)

# Clean the tokens
job_tokens_cleaned = clean_tokens(job_tokens)
cv_tokens_cleaned = clean_tokens(cv_tokens)

# Display cleaned tokens
print("Cleaned Job Description Tokens:", job_tokens_cleaned)
print("Cleaned CV Tokens:", cv_tokens_cleaned)


Cleaned Job Description Tokens: ['looking', 'Python', 'developer', 'experience', 'machine', 'learning']
Cleaned CV Tokens: ['Experienced', 'software', 'engineer', 'skilled', 'Python', 'Java', 'machine', 'learning']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Step 3: Lemmatization/Stemming and Text Normalization

In [3]:
# Install required libraries
!pip install nltk

# Import necessary modules for lemmatization
import nltk # This line was missing. Added to import the nltk library.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Download WordNet resource for lemmatization
nltk.download('punkt')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function for lemmatization and normalization
def lemmatize_and_normalize(tokens):
    # Normalize to lowercase and lemmatize
    return [lemmatizer.lemmatize(word.lower()) for word in tokens]

# Lemmatize and normalize the cleaned tokens
job_tokens_lemmatized = lemmatize_and_normalize(job_tokens_cleaned)
cv_tokens_lemmatized = lemmatize_and_normalize(cv_tokens_cleaned)

# Display lemmatized tokens
print("Lemmatized Job Description Tokens:", job_tokens_lemmatized)
print("Lemmatized CV Tokens:", cv_tokens_lemmatized)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Lemmatized Job Description Tokens: ['looking', 'python', 'developer', 'experience', 'machine', 'learning']
Lemmatized CV Tokens: ['experienced', 'software', 'engineer', 'skilled', 'python', 'java', 'machine', 'learning']


# Step 5: Segment CVs into Key Sections

In [4]:
# Example CV text for segmentation
cv_text = """
John Doe
Experienced software engineer skilled in Python and Java.
Skills: Python, Java, Machine Learning
Experience:
- Software Engineer at Company A (2018-2021)
- Developed applications using Python and Java
Education:
- B.Sc. in Computer Science, XYZ University
Certifications:
- Certified Python Developer
Summary:
- Passionate about developing scalable software solutions.
"""

# Function to segment the CV
def segment_cv(cv_text):
    sections = {
        "Skills": None,
        "Experience": None,
        "Education": None,
        "Certifications": None,
        "Summary": None,
    }

    # Split CV text into lines
    lines = cv_text.strip().split("\n")

    current_section = None

    for line in lines:
        line = line.strip()
        if line.startswith("Skills:"):
            current_section = "Skills"
            sections[current_section] = line.replace("Skills:", "").strip()
        elif line.startswith("Experience:"):
            current_section = "Experience"
            sections[current_section] = []
        elif line.startswith("Education:"):
            current_section = "Education"
            sections[current_section] = []
        elif line.startswith("Certifications:"):
            current_section = "Certifications"
            sections[current_section] = []
        elif line.startswith("Summary:"):
            current_section = "Summary"
            sections[current_section] = line.replace("Summary:", "").strip()
        else:
            if current_section in ["Experience", "Education", "Certifications"]:
                sections[current_section].append(line)

    # Convert Experience, Education, and Certifications lists to strings
    sections["Experience"] = "\n".join(sections["Experience"])
    sections["Education"] = "\n".join(sections["Education"])
    sections["Certifications"] = "\n".join(sections["Certifications"])

    return sections

# Segment the CV
cv_sections = segment_cv(cv_text)

# Display the segmented CV sections
for section, content in cv_sections.items():
    print(f"{section}: {content}")


Skills: Python, Java, Machine Learning
Experience: - Software Engineer at Company A (2018-2021)
- Developed applications using Python and Java
Education: - B.Sc. in Computer Science, XYZ University
Certifications: - Certified Python Developer
Summary: 


# Step 2: Text Representation.

2.i. Model Selection:

In [5]:
# Install necessary libraries
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer

# Load pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Example job description and CV sections from previous steps
job_description = "Looking for a Python developer with experience in machine learning."
cv_sections = {
    "Skills": "Python, Java, Machine Learning",
    "Experience": "Software Engineer at Company A (2018-2021)",
    "Education": "B.Sc. in Computer Science, XYZ University",
    "Certifications": "Certified Python Developer",
    "Summary": "Experienced software engineer skilled in Python and Java."
}

# Generate embeddings for the job description and CV sections
job_desc_embedding = model.encode(job_description)

cv_embeddings = {}
for section, content in cv_sections.items():
    cv_embeddings[section] = model.encode(content)

# Display the embeddings
print("Job Description Embedding:", job_desc_embedding)
print("CV Section Embeddings:")
for section, embedding in cv_embeddings.items():
    print(f"{section}: {embedding[:10]}...")  # Print first 10 elements for brevity


Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Job Description Embedding: [-3.95822942e-01 -7.88429916e-01 -1.26781374e-01 -1.61182314e-01
 -1.00961238e-01  9.43055078e-02 -2.58025587e-01  3.19373786e-01
 -5.20937264e-01 -5.54048978e-02 -3.53664249e-01  4.94888276e-02
 -7.76183978e-02  1.57976955e-01 -3.55736852e-01  2.29111999e-01
  4.85208295e-02  1.66648492e-01  2.30087012e-01 -4.11233097e-01
 -5.57387471e-01 -1.39920026e-01  6.10169291e-01 -1.73858747e-01
  2.47948259e-01 -1.63806409e-01  1.13581354e-02 -1.63399369e-01
  1.04615778e-01 -5.27797490e-02  6.86232865e-01 -2.43322318e-03
  3.66989613e-01 -1.70384154e-01 -3.68356526e-01  4.97611851e-01
 -3.91178191e-01  7.39782527e-02 -2.37683624e-01  2.03166023e-01
 -1.65987268e-01  5.37262224e-02 -3.11962396e-01  8.68798196e-02
 -3.82333815e-01 -3.59470636e-01 -1.51332375e-02  2.05379147e-02
  6.07893057e-02 -8.49068463e-02  9.36539024e-02 -6.77977502e-01
  1.96040943e-01 -1.36470878e+00  2.40235567e-01  3.24047089e-01
  4.77635652e-01  2.95421094e-01  9.87001583e-02 -3.55705231e-0

#Steps for Fine-Tuning BERT/RoBERTa/SBERT:

In [6]:
import torch
# Set device to 'cuda' to utilize GPU
device = torch.device("cuda")
model.to(device)
# Example: Assuming you have cv_embeddings and job_desc_embedding from previous steps
inputs = {
    "cv_embeddings": torch.tensor(list(cv_embeddings.values())).to(device),
    "job_desc_embedding": torch.tensor(job_desc_embedding).to(device)
}

# You can access the inputs like this:
cv_embeddings_tensor = inputs["cv_embeddings"]
job_desc_embedding_tensor = inputs["job_desc_embedding"]

  "cv_embeddings": torch.tensor(list(cv_embeddings.values())).to(device),


In [7]:
# Import necessary components
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Assuming job_descs and cv_sections are lists of strings
job_descs = [
    "Looking for a Python developer with experience in web development.",
    "Hiring a Data Scientist with strong ML skills."
]

cv_sections = [
    "Experienced Python developer with web dev skills.",
    "Skilled in machine learning with relevant project experience."
]

# Tokenize the pairs
inputs = tokenizer(job_descs, cv_sections, truncation=True, padding=True, return_tensors="pt")

# Move model and inputs to the same device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Move input tensors to the same device
inputs = {key: val.to(device) for key, val in inputs.items()}

# Forward pass (fine-tuning will need actual labels for job fit, just using dummy labels for now)
labels = torch.tensor([1, 0]).to(device)  # Move labels to the correct device
outputs = model(**inputs, labels=labels)

# Check the loss and logits
loss = outputs.loss
logits = outputs.logits
print(f"Loss: {loss}")
print(f"Logits: {logits}")


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Loss: 0.7335177659988403
Logits: tensor([[ 0.4018, -0.1433],
        [ 0.3470, -0.1782]], device='cuda:0', grad_fn=<AddmmBackward0>)


# Step 2.3: Embedding Representation

In [8]:
import time
from transformers import BertModel
import torch

# Load the fine-tuned BERT model without classification head
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()  # Set to evaluation mode

# Same job_descs and cv_sections
job_descs = [
    "Looking for a Python developer with experience in web development.",
    "Hiring a Data Scientist with strong ML skills."
]

cv_sections = [
    "Experienced Python developer with web dev skills.",
    "Skilled in machine learning with relevant project experience."
]

# Tokenize and move inputs to device
inputs = tokenizer(job_descs, cv_sections, truncation=True, padding=True, return_tensors="pt")
inputs = {key: val.to(device) for key, val in inputs.items()}

# Measure the time taken for embedding extraction
start_time = time.time()

# Extract embeddings
with torch.no_grad():
    outputs = model(**inputs)

end_time = time.time()

# Time taken
print(f"Time taken for embedding extraction: {end_time - start_time:.2f} seconds")

# Extract [CLS] token embeddings
job_desc_embeddings = outputs.last_hidden_state[:, 0, :]
cv_embeddings = outputs.last_hidden_state[:, 0, :]

# job_desc_embeddings and cv_embeddings are now dense vectors
from torch.nn.functional import cosine_similarity

# Calculate cosine similarity between job description embeddings and CV embeddings
cos_similarities = cosine_similarity(job_desc_embeddings, cv_embeddings)

# Print the cosine similarity scores
print(f"Cosine Similarities: {cos_similarities}")




Time taken for embedding extraction: 0.02 seconds
Cosine Similarities: tensor([1.0000, 1.0000], device='cuda:0')


#3. Relevancy Ranking (Section-wise):
1. Break Down CV into Sections

In [9]:
cv_sections = {
    "Skills": "Python, Java, Machine Learning, Deep Learning, Data Analysis, C, C++,C#,Ruby,Go",
    "Experience": "Software Engineer at Company A from 2018-2021. Worked on full-stack web development and machine learning projects.",
    "Education": "B.Sc. in Computer Science, XYZ University, Graduated in 2017 with First-Class Honors.",
    "Courses": "Completed courses in Advanced Machine Learning, Cloud Computing, and Big Data Analytics.",
    "Certifications": "Certified TensorFlow Developer, AWS Certified Solutions Architect, Oracle Java Certification.",
    "Summary": "A highly motivated software engineer with over 4 years of experience in software development, data science, and machine learning. Passionate about building scalable applications and leveraging AI to solve real-world problems.",
    "Objectives": "Looking for a challenging role that allows me to apply my technical skills and contribute to innovative solutions in the AI and software engineering space."
}


# Step 3.2: Extract Embeddings for Each Section.

In [10]:
from transformers import BertModel

# Load the pre-trained BERT model (instead of BERT for sequence classification)
model = BertModel.from_pretrained('bert-base-uncased')

# Move the model to the correct device
model.to(device)

# Function to extract embeddings for each section of the CV
def extract_section_embeddings(cv_sections, job_desc, model, tokenizer, device):
    section_embeddings = {}

    for section, text in cv_sections.items():
        # Tokenize the section text and job description
        inputs = tokenizer(text, job_desc, truncation=True, padding=True, return_tensors="pt")

        # Move inputs to the same device (GPU/CPU)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Forward pass to get embeddings
        with torch.no_grad():  # No need to calculate gradients
            outputs = model(**inputs)
            # Extract the pooled output (representation of the [CLS] token)
            embeddings = outputs.pooler_output

        # Store embeddings for each section
        section_embeddings[section] = embeddings

    return section_embeddings

# Example job description
job_desc = "Looking for a software engineer with experience in Python, machine learning, and cloud computing."

# Start the timer
start_time = time.time()

# Extract embeddings for each section of the CV
section_embeddings = extract_section_embeddings(cv_sections, job_desc, model, tokenizer, device)

# End the timer
end_time = time.time()

print(f"Time taken for embedding extraction: {end_time - start_time:.2f} seconds")

# Print the embeddings for each section
for section, embedding in section_embeddings.items():
    print(f"{section} Embedding: {embedding}")



Time taken for embedding extraction: 0.09 seconds
Skills Embedding: tensor([[-0.9629, -0.8630, -0.9988,  0.9612,  0.9548, -0.6859,  0.9628,  0.6901,
         -0.9911, -1.0000, -0.9252,  0.9900,  0.9843,  0.9562,  0.9505, -0.9140,
         -0.7059, -0.8826,  0.7133, -0.5995,  0.9182,  1.0000, -0.7594,  0.7618,
          0.8691,  0.9996, -0.9148,  0.9489,  0.9735,  0.8733, -0.8804,  0.6571,
         -0.9935, -0.7611, -0.9984, -0.9988,  0.8398, -0.9030, -0.6081, -0.5019,
         -0.9559,  0.8161,  1.0000,  0.3064,  0.9045, -0.7513, -1.0000,  0.6923,
         -0.9064,  0.9989,  0.9938,  0.9902,  0.7662,  0.8442,  0.8113, -0.7910,
          0.4884,  0.5560, -0.6823, -0.9113, -0.8801,  0.8198, -0.9868, -0.9340,
          0.9931,  0.9935, -0.7467, -0.7956, -0.6586,  0.5335,  0.9789,  0.6357,
         -0.6694, -0.9297,  0.9827,  0.7758, -0.9208,  1.0000, -0.8017, -0.9859,
          0.9896,  0.9927,  0.8971, -0.9598,  0.9738, -1.0000,  0.9426, -0.6350,
         -0.9929,  0.7854,  0.9018, -0.66

# step 3.4. Rank Sections Based on Similarity Score

In [11]:
import torch
import torch.nn.functional as F

# Assuming `section_embeddings` and `job_desc_embedding` are already computed from previous steps

# Function to calculate cosine similarity between two vectors
def compute_cosine_similarity(embedding1, embedding2):
    return F.cosine_similarity(embedding1, embedding2)

# Assuming you have already extracted the job description embedding (we'll reuse this)
job_desc_embedding = extract_section_embeddings({"JobDesc": job_desc}, job_desc, model, tokenizer, device)["JobDesc"]

# Compute cosine similarity for each section
section_names = list(cv_sections.keys())
cosine_similarities = []
for section in section_names:
    section_embedding = section_embeddings[section]
    similarity = compute_cosine_similarity(job_desc_embedding, section_embedding)
    cosine_similarities.append(similarity.item())  # Convert to scalar value

# Rank sections based on similarity score
ranked_sections = sorted(zip(section_names, cosine_similarities), key=lambda x: x[1], reverse=True)

# Print ranked sections
print("Ranked Sections based on similarity:")
for section, score in ranked_sections:
    print(f"{section}: {score:.4f}")



Ranked Sections based on similarity:
Courses: 0.9993
Summary: 0.9992
Objectives: 0.9991
Experience: 0.9990
Skills: 0.9937
Certifications: 0.9927
Education: 0.9779


# step 3.5 Soft Skills Matching:

In [12]:
import spacy
import torch
import torch.nn.functional as F

# Load spaCy's pre-trained NER model
nlp = spacy.load('en_core_web_sm')

# Define a set of common soft skills to look for
soft_skills = set(["teamwork", "leadership", "communication", "problem-solving", "adaptability", "creativity", "work ethic", "time management"])

# Function to extract soft skills from text using NER
def extract_soft_skills(text):
    doc = nlp(text)
    extracted_skills = set()

    for token in doc:
        if token.text.lower() in soft_skills:  # Check if the token matches predefined soft skills
            extracted_skills.add(token.text.lower())

    return extracted_skills

# Function to compute cosine similarity between two embeddings
def compute_cosine_similarity(embedding1, embedding2):
    return F.cosine_similarity(embedding1, embedding2)

# Example job description and CV sections
job_description = "We are looking for someone with excellent teamwork, communication, and leadership skills."
cv_sections = {
    "Skills": "Proficient in Python, communication, teamwork, and leadership.",
    "Experience": "Worked as a team lead with good problem-solving skills.",
    "Education": "Completed B.Sc. in Computer Science."
}

# 1. Extract soft skills from job description and CV sections
job_soft_skills = extract_soft_skills(job_description)
cv_soft_skill_matches = {}

for section, content in cv_sections.items():
    section_soft_skills = extract_soft_skills(content)
    # Calculate soft skill match score as the number of matching skills divided by total number of job soft skills
    matches = job_soft_skills.intersection(section_soft_skills)
    soft_skill_score = len(matches) / len(job_soft_skills) if job_soft_skills else 0
    cv_soft_skill_matches[section] = soft_skill_score

# 2. Assuming section_embeddings and job_desc_embedding are already computed from previous steps
# (For demonstration, reusing job description as embedding placeholder)
section_embeddings = {}  # Replace this with actual embeddings
job_desc_embedding = torch.rand(1, 768)  # Placeholder for the embedding of the job description

# List of CV section names
section_names = list(cv_sections.keys())

# 3. Compute cosine similarity for each CV section
cosine_similarities = []
for section in section_names:
    # Extract or compute section embeddings (replace with actual embeddings)
    section_embedding = torch.rand(1, 768)  # Placeholder, replace this line
    similarity = compute_cosine_similarity(job_desc_embedding, section_embedding)
    cosine_similarities.append(similarity.item())  # Convert to scalar

# 4. Rank sections based on similarity score
ranked_sections = sorted(zip(section_names, cosine_similarities), key=lambda x: x[1], reverse=True)

# 5. Combine section similarity scores and soft skill match scores
final_scores = {section: (similarity + cv_soft_skill_matches.get(section, 0)) / 2 for section, similarity in ranked_sections}

# 6. Print final relevancy scores
print("\nFinal relevancy scores (combining section similarity and soft skill matching):")
for section, score in final_scores.items():
    print(f"{section}: {score:.2f}")




Final relevancy scores (combining section similarity and soft skill matching):
Experience: 0.38
Skills: 0.88
Education: 0.37


# 4. Overall CV Ranking

In [13]:
# Define weights for each section
section_weights = {
    "Skills": 0.40,
    "Experience": 0.30,
    "Education": 0.15,
    "Certifications": 0.10,
    "Summary": 0.05
}

# Function to calculate overall score for a CV using weighted averaging
def calculate_overall_score(section_scores, section_weights):
    overall_score = 0.0
    for section, score in section_scores.items():
        weight = section_weights.get(section, 0)  # Default weight is 0 if section is not found in the weights
        overall_score += score * weight
    return overall_score

# Example: Suppose we have three CVs with section-wise relevancy scores
cv_scores = {
    "CV1": {
        "Skills": 0.88,
        "Experience": 0.70,
        "Education": 0.60,
        "Certifications": 0.55,
        "Summary": 0.75
    },
    "CV2": {
        "Skills": 0.90,
        "Experience": 0.85,
        "Education": 0.78,
        "Certifications": 0.60,
        "Summary": 0.80
    },
    "CV3": {
        "Skills": 0.82,
        "Experience": 0.65,
        "Education": 0.72,
        "Certifications": 0.50,
        "Summary": 0.70
    }
}

# Compute overall score for each CV
overall_scores = {cv: calculate_overall_score(scores, section_weights) for cv, scores in cv_scores.items()}

# Rank CVs based on overall scores
ranked_cvs = sorted(overall_scores.items(), key=lambda x: x[1], reverse=True)

# Print ranked CVs
print("Ranked CVs based on overall relevancy score:")
for cv, score in ranked_cvs:
    print(f"{cv}: {score:.4f}")


Ranked CVs based on overall relevancy score:
CV2: 0.8320
CV1: 0.7445
CV3: 0.7160


## 5. Handling Incomplete or Varying CV Formats:

The model should handle CVs with missing sections by assigning a default low relevancy score to missing sections (e.g., if the CV doesn’t have a certification, skills , experience section) now lets continue with that

In [14]:
# Define the default score for missing sections
DEFAULT_SCORE = 0.1

# Define the sections we expect in a CV
expected_sections = ["Skills", "Experience", "Education", "Certifications", "Courses", "Summary", "Objectives"]

# Example CVs with missing sections
cv_list = [
    {
        "Skills": "Python, Java, Machine Learning",
        "Experience": "Software Engineer at Company A from 2018-2021",
        "Education": "B.Sc. in Computer Science, XYZ University"
        # Missing Certifications, Courses, Summary, and Objectives
    },
    {
        "Skills": "HTML, CSS, JavaScript",
        "Experience": "Frontend Developer at Company B from 2020-2022",
        "Certifications": "Certified Frontend Developer",
        # Missing Education, Courses, Summary, and Objectives
    },
]

# Simulate some relevancy scores from previous ranking
section_scores_per_cv = [
    {"Skills": 0.88, "Experience": 0.90, "Education": 0.75},  # Missing Certifications, Courses, Summary, Objectives
    {"Skills": 0.85, "Experience": 0.80, "Certifications": 0.92}  # Missing Education, Courses, Summary, Objectives
]

# Handle missing sections by assigning default scores
def handle_missing_sections(section_scores, expected_sections, default_score=DEFAULT_SCORE):
    # Iterate over the expected sections
    for section in expected_sections:
        # If the section is missing, assign the default score
        if section not in section_scores:
            section_scores[section] = default_score
    return section_scores

# Process each CV and handle missing sections
updated_scores = []
for cv_scores in section_scores_per_cv:
    updated_scores.append(handle_missing_sections(cv_scores, expected_sections))

# Now calculate the final weighted score for each CV
def calculate_final_score(section_scores, weights):
    # Multiply each section's score by its weight and sum them up
    total_score = sum(section_scores[section] * weights.get(section, 0) for section in section_scores)
    return total_score

# Example weights for each section
weights = {
    "Skills": 0.4,
    "Experience": 0.3,
    "Education": 0.15,
    "Certifications": 0.1,
    "Courses": 0.05,
    "Summary": 0.05,
    "Objectives": 0.05
}

# Calculate the overall scores for each CV
final_scores = [calculate_final_score(cv_scores, weights) for cv_scores in updated_scores]

# Print the final scores for each CV
for idx, score in enumerate(final_scores):
    print(f"CV {idx+1} Final Score: {score:.4f}")


CV 1 Final Score: 0.7595
CV 2 Final Score: 0.7020


# 6. Postprocessing, we are implement two tasks:

Ranked list of CV sections for each CV:
For each CV, output the list of sections sorted by their relevancy score.

Ranked list of CVs based on overall job description fit:
Sort all the CVs by their final relevancy score and output them in order of their fit for the job description.

In [15]:
# 1. Postprocessing for Section-wise Ranking (for each CV)
def output_ranked_sections(cv_section_scores):
    for cv_name, section_scores in cv_section_scores.items():
        print(f"Ranked sections for {cv_name}:")
        # Sort sections by score
        ranked_sections = sorted(section_scores.items(), key=lambda x: x[1], reverse=True)
        for section, score in ranked_sections:
            print(f"{section}: {score:.4f}")
        print("\n")

# Example section scores (replace these with actual values from previous steps)
cv_section_scores = {
    "CV 1": {"Experience": 0.8, "Skills": 0.9, "Education": 0.7, "Certifications": 0.1},
    "CV 2": {"Experience": 0.7, "Skills": 0.85, "Education": 0.6, "Certifications": 0.1}
}

# Output ranked sections for each CV
output_ranked_sections(cv_section_scores)

# 2. Postprocessing for CV Ranking based on overall relevancy score
def output_ranked_cvs(cv_final_scores):
    # Sort CVs by their final score
    ranked_cvs = sorted(cv_final_scores.items(), key=lambda x: x[1], reverse=True)
    print("Ranked CVs based on overall job description fit:")
    for cv_name, score in ranked_cvs:
        print(f"{cv_name}: {score:.4f}")

# Example final CV scores (replace these with actual final scores from previous steps)
cv_final_scores = {
    "CV 1": 0.7595,
    "CV 2": 0.7020
}

# Output ranked list of CVs
output_ranked_cvs(cv_final_scores)


Ranked sections for CV 1:
Skills: 0.9000
Experience: 0.8000
Education: 0.7000
Certifications: 0.1000


Ranked sections for CV 2:
Skills: 0.8500
Experience: 0.7000
Education: 0.6000
Certifications: 0.1000


Ranked CVs based on overall job description fit:
CV 1: 0.7595
CV 2: 0.7020


# 7.Explainability (Bonus):

The model should provide explanations for the ranking by highlighting the specific keywords or skills that contributed most to the score.
For example, "This CV ranked high because it contains 'Python', 'machine learning', and '5 years of experience' which were mentioned in the job description.

In [16]:
# Define job description keywords (from earlier extraction)
job_keywords = ["Python", "machine learning", "5 years of experience", "teamwork", "leadership"]

# Extracted key skills/attributes from CVs (assumed extracted in earlier steps)
cv_keywords = {
    "CV 1": {
        "Skills": ["Python", "machine learning", "data science"],
        "Experience": ["5 years of experience", "software engineer"],
        "Education": ["B.Sc. in Computer Science"],
        "Certifications": ["AWS Certified Developer"]
    },
    "CV 2": {
        "Skills": ["Java", "Python", "leadership"],
        "Experience": ["3 years of experience", "team lead"],
        "Education": ["M.Sc. in Data Science"],
        "Certifications": ["Google Cloud Certified"]
    }
}

# Explanation logic: match CV sections with job description keywords
def generate_explanation(cv_name, cv_keywords, job_keywords):
    explanation = []
    for section, keywords in cv_keywords.items():
        matched_keywords = [kw for kw in keywords if kw in job_keywords]
        if matched_keywords:
            explanation.append(f"{section} matches: {', '.join(matched_keywords)}")
    return explanation

# Generate explainability report for each CV
for cv_name, keywords in cv_keywords.items():
    print(f"Explanation for {cv_name}:")
    explanation = generate_explanation(cv_name, keywords, job_keywords)
    if explanation:
        print("\n".join(explanation))
    else:
        print("No significant matches found.")
    print("\n")


Explanation for CV 1:
Skills matches: Python, machine learning
Experience matches: 5 years of experience


Explanation for CV 2:
Skills matches: Python, leadership




# 8.Dynamic Job Description Handling:

8.1If the job description changes, reprocess the CVs and update the rankings accordingly.

8.2Use embeddings to re-rank the sections and overall CV scores based on the new description.

In [17]:
# Assuming we already have the CV sections stored in `cv_sections` for both CV 1 and CV 2
# If not defined, here is how you can structure them (based on your existing sections):

cv_sections_cv1 = {
    "Skills": "Python, machine learning, data science",
    "Experience": "5 years in software development",
    "Education": "B.Sc. in Computer Science",
    "Certifications": "AWS Certified Solutions Architect"
}

cv_sections_cv2 = {
    "Skills": "Python, leadership, project management",
    "Experience": "3 years in cloud computing",
    "Education": "B.Tech. in Information Technology",
    "Certifications": "Google Cloud Certified Professional Architect"
}


# 8.1: Function to detect job description changes and reprocess CVs
def reprocess_cvs(cv_sections, job_desc, model, tokenizer, device):
    print("New job description detected. Reprocessing CVs...")

    # Step 1: Extract new embeddings for the updated job description
    job_desc_embedding = extract_section_embeddings({"JobDesc": job_desc}, job_desc, model, tokenizer, device)["JobDesc"]

    # Step 2: Extract section embeddings for each CV
    section_embeddings = extract_section_embeddings(cv_sections, job_desc, model, tokenizer, device)

    return job_desc_embedding, section_embeddings

# Example usage with a new job description:
new_job_desc = "Looking for a Software Engineer with expertise in Python, cloud computing, and leadership."

# Reprocess CV 1 and CV 2 based on the new job description
job_desc_embedding_cv1, section_embeddings_cv1 = reprocess_cvs(cv_sections_cv1, new_job_desc, model, tokenizer, device)
job_desc_embedding_cv2, section_embeddings_cv2 = reprocess_cvs(cv_sections_cv2, new_job_desc, model, tokenizer, device)

print("Reprocessing for new job description completed.")

New job description detected. Reprocessing CVs...
New job description detected. Reprocessing CVs...
Reprocessing for new job description completed.


In [18]:
# 8.2: Function to compute cosine similarity and re-rank sections based on new job description
def compute_and_rank_sections(job_desc_embedding, section_embeddings):
    # Step 3: Compute cosine similarity for each section with the new job description embedding
    section_names = list(section_embeddings.keys())
    cosine_similarities = []

    for section in section_names:
        section_embedding = section_embeddings[section]
        similarity = compute_cosine_similarity(job_desc_embedding, section_embedding)
        cosine_similarities.append(similarity.item())  # Convert to scalar value

    # Step 4: Rank sections based on similarity scores
    ranked_sections = sorted(zip(section_names, cosine_similarities), key=lambda x: x[1], reverse=True)

    return ranked_sections

# Compute and rank sections for CV 1
ranked_sections_cv1 = compute_and_rank_sections(job_desc_embedding_cv1, section_embeddings_cv1)

# Compute and rank sections for CV 2
ranked_sections_cv2 = compute_and_rank_sections(job_desc_embedding_cv2, section_embeddings_cv2)

# Output the results for CV 1
print("\nCV 1 Reprocessed Ranked Sections:")
for section, score in ranked_sections_cv1:
    print(f"{section}: {score:.4f}")

# Output the results for CV 2
print("\nCV 2 Reprocessed Ranked Sections:")
for section, score in ranked_sections_cv2:
    print(f"{section}: {score:.4f}")



CV 1 Reprocessed Ranked Sections:
Skills: 0.9919
Experience: 0.9870
Certifications: 0.9801
Education: 0.9681

CV 2 Reprocessed Ranked Sections:
Skills: 0.9976
Experience: 0.9972
Certifications: 0.9945
Education: 0.9843
