In [1]:
!pip install sentence-transformers
!pip install nltk
!pip install scikit-learn
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

from sentence_transformers import SentenceTransformer

nltk.download('stopwords')
nltk.download('wordnet')

print("Libraries loaded successfully")

Libraries loaded successfully


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Upload Resume Dataset


In [3]:
from google.colab import files
uploaded = files.upload()

Saving Resume.csv to Resume (1).csv


In [4]:
df = pd.read_csv("Resume.csv")

print("Dataset loaded")
df.head()

Dataset loaded


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


Cleaning the Resume txt File

In [7]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return " ".join(words)

df['cleaned_resume'] = df['Resume_str'].apply(preprocess_text)

print("Text preprocessing completed")
df.head()

Text preprocessing completed


Unnamed: 0,ID,Resume_str,Resume_html,Category,cleaned_resume
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr administratormarketing associate hr adminis...
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR,hr specialist u hr operation summary versatile...
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr director summary year experience recruiting...
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr specialist summary dedicated driven dynamic...
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr manager skill highlight hr skill hr departm...


# Load BERT Model

In [8]:
model = SentenceTransformer('all-MiniLM-L6-v2')

print("BERT model loaded")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]



sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BERT model loaded


In [9]:
# Convert Resume to Embeddings
resume_embeddings = model.encode(
    df['cleaned_resume'].tolist(),
    show_progress_bar=True
)

print("Embeddings created")

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

Embeddings created


In [10]:
# Skill Extraction Score
skills = [
    "python","machine learning","deep learning","nlp",
    "sql","tensorflow","pytorch","data analysis",
    "excel","communication","leadership"
]

def skill_score(text):
    score = 0
    for skill in skills:
        if skill in text:
            score += 1
    return score

df['skill_score'] = df['cleaned_resume'].apply(skill_score)

print("Skill scoring completed")

Skill scoring completed


In [11]:
# Cluster Resumes
kmeans = KMeans(n_clusters=8, random_state=42)

df['cluster'] = kmeans.fit_predict(resume_embeddings)

score = silhouette_score(resume_embeddings, df['cluster'])

print("Clustering completed")
print("Silhouette Score:", score)

Clustering completed
Silhouette Score: 0.084229685


In [12]:
# Enter Job Description
job_description = """
Looking for Data Scientist with Python, Machine Learning,
Deep Learning, NLP, SQL, TensorFlow experience
"""

cleaned_jd = preprocess_text(job_description)

jd_embedding = model.encode([cleaned_jd])

similarity = cosine_similarity(jd_embedding, resume_embeddings)[0]

df['similarity_score'] = similarity

print("Similarity calculated")

Similarity calculated


In [13]:
# Calculate Final Score (0–10 Ranking)
scaler = MinMaxScaler()

df['similarity_score'] = scaler.fit_transform(df[['similarity_score']])
df['skill_score'] = scaler.fit_transform(df[['skill_score']])

df['final_score'] = (0.7 * df['similarity_score'] +
                     0.3 * df['skill_score']) * 10

print("Final scoring completed")

Final scoring completed


In [16]:
# Show Top Candidates
top_candidates = df.sort_values(by='final_score', ascending=False)

top_candidates[['Category','final_score']].head(10)

Unnamed: 0,Category,final_score
926,AGRICULTURE,10.0
1717,ENGINEERING,9.320492
1762,ENGINEERING,9.171363
1218,CONSULTANT,8.202506
1348,AUTOMOBILE,8.048127
1323,AUTOMOBILE,7.680626
844,FITNESS,7.478516
1194,CONSULTANT,7.434566
2184,BANKING,7.348338
2395,AVIATION,7.315899


In [17]:
df.to_csv("ranked_resumes.csv", index=False)

print("File saved")

files.download("ranked_resumes.csv")

File saved


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>