In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib

In [11]:
df = pd.read_csv(r"C:\Users\Admin\OneDrive\Desktop\codedot\datasets\resumes.csv")

In [12]:
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [13]:
from bs4 import BeautifulSoup

def extract_text_from_html(html_str):
    soup = BeautifulSoup(html_str, "html.parser")
    return soup.get_text()

# Example usage
df['cleaned_resume_str'] = df['Resume_html'].apply(extract_text_from_html)


In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords and wordnet if not already done
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.lower().split()  # Tokenize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization and stopwords removal
    return ' '.join(words)

# Clean the resumes
df['cleaned_resume_str'] = df['Resume_str'].apply(clean_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the cleaned resumes
X = vectorizer.fit_transform(df['cleaned_resume_str'])


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity_score(uploaded_resume, model, vectorizer):
    # Clean and vectorize the uploaded resume
    cleaned_resume = clean_text(uploaded_resume)
    vectorized_resume = vectorizer.transform([cleaned_resume])

    # Compute similarity with the entire dataset
    similarities = cosine_similarity(vectorized_resume, model)
    return similarities[0]

# Example of how to get similarity score for an uploaded resume
uploaded_resume = "Some new resume text here."
similarity_scores = get_similarity_score(uploaded_resume, X, vectorizer)

# Get the top matching resume
# Get the top matching resume index
top_match_index = similarity_scores.argmax()

# Print the best match ID and score
print(f"Best match ID: {df.iloc[top_match_index]['ID']}, Score: {similarity_scores[top_match_index]}")



Best match ID: 19464810, Score: 0.1254323057868905


In [19]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach()

# Example: Get BERT embeddings for resume
embeddings = get_bert_embeddings(r"C:\Users\Admin\OneDrive\Desktop\codedot\backend\media\resume\bariankit_btech.pdf")


In [20]:
print(embeddings)

tensor([[ 2.0770e-01,  1.7433e-02,  4.7484e-01, -1.5006e-01,  6.6126e-01,
         -7.2864e-02,  1.0969e-01,  4.2956e-01, -1.1855e-01, -4.4913e-01,
         -6.9159e-01, -1.0007e-02, -2.2287e-01,  3.6764e-01, -3.3715e-03,
          5.4258e-01, -5.1718e-02,  3.8636e-01, -3.7451e-01,  2.2723e-01,
          1.5203e-01, -3.0267e-01, -3.0373e-01,  2.1957e-01,  5.5932e-01,
         -4.5304e-01, -3.1568e-01, -2.3016e-02, -6.8137e-01,  1.3290e-01,
         -4.6275e-03,  3.4408e-01,  2.2946e-01, -3.1857e-02, -9.2900e-02,
         -3.4470e-01, -1.3629e-01,  7.1286e-02,  1.9241e-01,  4.3713e-01,
         -1.6573e-01, -7.8014e-01,  1.2397e-01, -8.0440e-02,  7.1485e-02,
         -3.8785e-01, -1.4395e-01,  3.1597e-01,  2.1335e-01, -1.0892e-01,
         -4.3754e-01,  5.0521e-01,  3.5838e-02, -2.8736e-01,  1.3528e-01,
          5.1759e-01,  7.2787e-01, -6.8178e-01, -1.9461e-01,  9.2856e-02,
          7.1501e-02,  2.9422e-01,  1.8766e-02, -3.3032e-01, -1.7471e-01,
          1.2435e-01, -3.3067e-02,  7.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming you have a list of job descriptions
job_descriptions = [
    "Software Engineer with experience in Python, machine learning, and data analysis.",
    "HR Administrator with expertise in employee relations, recruiting, and training."
]

# Example candidate resumes
resumes = [
    "Experienced software developer specializing in Python and machine learning.",
    "HR professional with skills in employee relations and marketing."
]

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform job descriptions and resumes
job_embeddings = vectorizer.fit_transform(job_descriptions)
resume_embeddings = vectorizer.transform(resumes)

# Calculate similarity score between the first resume and the first job description
similarity_score = cosine_similarity(resume_embeddings[0], job_embeddings[0])

print(f"Similarity Score for Job 1 and Resume 1: {similarity_score[0][0]}")


Similarity Score for Job 1 and Resume 1: 0.725666580414649


In [22]:
def predict_job_fit(similarity_score, threshold=0.7):
    if similarity_score >= threshold:
        return "Good Fit"
    else:
        return "Not a Good Fit"

# Predict the job fit for Resume 1 with Job 1
predicted_fit = predict_job_fit(similarity_score[0][0])
print(f"Candidate Job Fit: {predicted_fit}")


Candidate Job Fit: Good Fit


In [23]:
def get_best_fit_for_jobs(resumes, job_descriptions):
    all_scores = []
    
    for i, resume in enumerate(resumes):
        resume_embedding = vectorizer.transform([resume])
        
        for j, job_desc in enumerate(job_descriptions):
            job_embedding = job_embeddings[j]
            similarity = cosine_similarity(resume_embedding, job_embedding)
            all_scores.append((i, j, similarity[0][0]))
    
    # Sort by similarity score (high to low)
    sorted_scores = sorted(all_scores, key=lambda x: x[2], reverse=True)
    
    return sorted_scores

# Get best job fit for each resume
best_fits = get_best_fit_for_jobs(resumes, job_descriptions)

# Display the top 3 best fits (Job-Resume pairs with highest similarity scores)
for fit in best_fits[:3]:
    print(f"Resume {fit[0]} is a good fit for Job {fit[1]} with similarity score: {fit[2]}")


Resume 1 is a good fit for Job 1 with similarity score: 0.728317612774094
Resume 0 is a good fit for Job 0 with similarity score: 0.725666580414649
Resume 1 is a good fit for Job 0 with similarity score: 0.2315705261229729
