In [6]:
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [7]:
data = pd.read_csv(r'C:\Users\birdc\Desktop\merged.csv')

In [8]:
# Load pre-trained BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [9]:
# Build the candidate profile by combining relevant columns
def build_candidate_profile(row):
    profile_parts = [
        row['Skills - Proficient'] if 'Skills - Proficient' in row.index else '',
        row['Skills - Experienced'] if 'Skills - Experienced' in row.index else '',
        row['Skills - Familiar'] if 'Skills - Familiar' in row.index else '',
        row['Job Responsibilities (1st Experience)'] if 'Job Responsibilities (1st Experience)' in row.index else '',
        row['Project Description (1st Project)'] if 'Project Description (1st Project)' in row.index else '',
        row['Project Tech Stack (1st Project)'] if 'Project Tech Stack (1st Project)' in row.index else '',
        row['Project Description (2nd Project)'] if 'Project Description (2nd Project)' in row.index else '',
        row['Project Tech Stack (2nd Project)'] if 'Project Tech Stack (2nd Project)' in row.index else '',
        row['Cumulative GPA'] if 'Cumulative GPA' in row.index else '',
        row['Achievements'] if 'Achievements' in row.index else ''
    ]  
    # Join non-empty parts with spaces
    return ' '.join(str(part) for part in profile_parts if part)

# Apply the function to each row in the dataset
data['candidate_profile'] = data.apply(build_candidate_profile, axis=1)


In [10]:
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings


In [11]:
def calculate_similarity(job_description, candidate_profile, weight_skills=0.3, weight_experience=0.3,
                         weight_projects=0.2, weight_cgpa=0.1, weight_achievements=0.1):
    
    job_emb = get_embeddings(job_description)
    candidate_emb = get_embeddings(candidate_profile)
    
    # Calculate the similarity score and apply weights
    score = cosine_similarity(job_emb.detach().numpy(), candidate_emb.detach().numpy())[0][0]
    
    # Weighted final score
    final_score = (weight_skills * score + 
                   weight_experience * score + 
                   weight_projects * score + 
                   weight_cgpa * score + 
                   weight_achievements * score)
    return final_score


In [12]:
def fetch_top_candidates(job_description, top_n=5):
    data['similarity_score'] = data['candidate_profile'].apply(
        lambda x: calculate_similarity(job_description, x)
    )
    top_candidates = data.sort_values(by='similarity_score', ascending=False).head(top_n)
    return top_candidates[['Name', 'Contact Email', 'similarity_score', 'Skills - Proficient', 
                           'Project Description (1st Project)', 'Cumulative GPA', 'Achievements']]

# Example job description
job_description = "Looking for a software developer skilled in Python, machine learning, and data science, with strong academic background."

# Fetch and display top 5 candidates
top_candidates = fetch_top_candidates(job_description, top_n=5)
print(top_candidates)


                  Name                       Contact Email  similarity_score  \
29       Vaibhav Ahuja           ahujavaibhav825@gmail.com          0.816795   
131  Divyanshi Singhal  divyanshi.singhal.20cse@bmu.edu.in          0.807355   
124     Deeksha Mandal             mdeeksha.1603@gmail.com          0.806475   
142    Himanshu Bhalla        himanshubhalla2002@gmail.com          0.803549   
138     Hardikya Gupta     hardikya.gupta.20cse@bmu.edu.in          0.802618   

                               Skills - Proficient  \
29                    C/C++, JavaScript, CSS, HTML   
131  Java, Python, JavaScript, CSS, HTML, Solidity   
124                                   Java, Python   
142         ['C/C++', 'JavaScript', 'CSS', 'HTML']   
138                                         Python   

                     Project Description (1st Project) Cumulative GPA  \
29   Developed a dynamic platform for listing prope...        7.87/10   
131  Designed and developed a decentralized P2P 