In [2]:
!pip install pandas
!pip install scikit-learn
!pip install sentence-transformers
!pip install PyPDF2 # For reading .pdf files
!pip install python-docx # For reading .docx files

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
     ---------------------------------------- 0.0/40.1 kB ? eta -:--:--
     ---------------------------------------- 40.1/40.1 kB 1.9 MB/s eta 0:00:00
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
   ---------------------------------------- 0.0/483.4 kB ? eta -:--:-

ERROR: Invalid requirement: '#'
ERROR: Invalid requirement: '#'


In [3]:
import pandas as pd
import os

# ---- 1. Define the correct paths to the CSV files ----
# Each CSV sits inside a folder that carries the same name: 'folder_name/file_name.csv'.
resume_path = 'Resume.csv/Resume.csv'
jobs_path = 'job_title_des.csv/job_title_des.csv'  # must match the filename on disk exactly

# ---- 2. Load the datasets ----
try:
    resume_df = pd.read_csv(resume_path)
    jobs_df = pd.read_csv(jobs_path)
except FileNotFoundError as missing:
    # A missing file is reported rather than re-raised, so the cell itself never
    # fails on it; resume_df / jobs_df may be left undefined in that case.
    print("❌ Error: Could not find the file.")
    print(f"Details: {missing}")
    print("\nPlease double-check that the file and folder names in the script match what you have.")
else:
    print("✅ Successfully loaded both datasets!")

    # ---- 3. Display the first few rows to inspect the data ----
    for heading, frame in (
        ("\n--- First 5 Resumes ---", resume_df),
        ("\n--- First 5 Job Descriptions ---", jobs_df),
    ):
        print(heading)
        print(frame.head())



✅ Successfully loaded both datasets!

--- First 5 Resumes ---
         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  

--- First 5 Job Descriptions ---
   Unnamed: 0             Job Title  \
0           0     Flutter Developer   
1           1      Django Developer   
2     

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np

# ---- 4. Generate Embeddings ----

# Step 1: fetch/load the pre-trained 'all-MiniLM-L6-v2' Sentence Transformer.
print("Loading the Sentence Transformer model... (This may take a moment on the first run)")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded successfully.")

# Step 2: pull the raw text columns out as plain Python lists.
# Missing values (NaN) are replaced with '' so every row is still passed to the encoder.
resume_column = resume_df['Resume_str'].fillna('')
job_column = jobs_df['Job Description'].fillna('')
resume_texts = resume_column.tolist()
job_texts = job_column.tolist()

# Step 3: encode the resumes (model.encode turns each text into a vector;
# the progress bar makes the batch progress visible).
print("\nGenerating embeddings for all resumes...")
resume_embeddings = model.encode(resume_texts, show_progress_bar=True)
resume_count = len(resume_embeddings)
print(f"✅ Generated embeddings for {resume_count} resumes.")

# Step 4: encode the job descriptions the same way.
print("\nGenerating embeddings for all job descriptions...")
job_embeddings = model.encode(job_texts, show_progress_bar=True)
job_count = len(job_embeddings)
print(f"✅ Generated embeddings for {job_count} job descriptions.")

# Step 5: persist both arrays so later cells can np.load them instead of re-encoding.
for filename, vectors in (
    ('resume_embeddings.npy', resume_embeddings),
    ('job_embeddings.npy', job_embeddings),
):
    np.save(filename, vectors)
print("\n✅ Embeddings saved to 'resume_embeddings.npy' and 'job_embeddings.npy'")



Loading the Sentence Transformer model... (This may take a moment on the first run)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully.

Generating embeddings for all resumes...


Batches:   0%|          | 0/78 [00:00<?, ?it/s]

✅ Generated embeddings for 2484 resumes.

Generating embeddings for all job descriptions...


Batches:   0%|          | 0/72 [00:00<?, ?it/s]

✅ Generated embeddings for 2277 job descriptions.

✅ Embeddings saved to 'resume_embeddings.npy' and 'job_embeddings.npy'


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ---- Step 5: Calculate Cosine Similarity ----

# 1. Load the embeddings that Step 4 saved to disk (faster than re-encoding).
print("Loading pre-computed embeddings...")
try:
    resume_embeddings = np.load('resume_embeddings.npy')
    job_embeddings = np.load('job_embeddings.npy')
    print("✅ Embeddings loaded successfully.")
except FileNotFoundError:
    print("❌ Error: Could not find .npy files. Please re-run Step 4 to generate embeddings.")
    # Best effort: keep going only if both arrays are still in kernel memory
    # (e.g. Step 4 ran earlier in this session). Otherwise stop here with the
    # original error instead of failing further down with an unrelated NameError.
    if 'resume_embeddings' not in globals() or 'job_embeddings' not in globals():
        raise

# The .npy files are a cache and can be stale. The ranking below maps matrix
# rows/columns back to resume_df / jobs_df by position, so the row counts must
# agree or the reported titles would belong to the wrong rows.
if len(resume_embeddings) != len(resume_df) or len(job_embeddings) != len(jobs_df):
    raise ValueError(
        "Embeddings do not match the loaded data "
        f"(resumes: {len(resume_embeddings)} vs {len(resume_df)}, "
        f"jobs: {len(job_embeddings)} vs {len(jobs_df)}). "
        "Please re-run Step 4 to regenerate them."
    )

# 2. Calculate the cosine similarity matrix.
# cosine_sim_matrix[i, j] is the similarity between the i-th resume and the j-th job.
print("\nCalculating cosine similarity matrix...")
cosine_sim_matrix = cosine_similarity(resume_embeddings, job_embeddings)
print(f"✅ Similarity matrix created with shape: {cosine_sim_matrix.shape}")


# ---- Step 6: Rank and Present Results ----

# Find the best job matches for one resume; index 0 (the first resume) is the example.
resume_index = 0

# 1. Similarity scores of the chosen resume against every job.
resume_scores = cosine_sim_matrix[resume_index]

# 2. Indices of the top N job matches, best first.
# np.argsort sorts ascending, so reverse first and then take the leading N.
# (Slicing with [-top_n:] before reversing would return *every* job for top_n == 0.)
top_n = 5
top_job_indices = np.argsort(resume_scores)[::-1][:max(top_n, 0)]

# 3. Display the results
print(f"\n--- Top {top_n} Job Matches for Resume #{resume_index} ---")

# Text of the resume being matched (kept available for the optional snippet below).
resume_text = resume_df['Resume_str'].iloc[resume_index]
print(f"\nOriginal Resume Category: {resume_df['Category'].iloc[resume_index]}")
# Optional: Print a snippet of the resume
# print(f"Resume Snippet: {resume_text[:200]}...\n")


for index in top_job_indices:
    # Look the job title up by position in the original jobs dataframe.
    job_title = jobs_df['Job Title'].iloc[index]

    # Express the cosine similarity as a percentage for display.
    match_score = resume_scores[index] * 100

    print(f"✅ Match: {match_score:.2f}% | Job Title: {job_title}")


Loading pre-computed embeddings...
✅ Embeddings loaded successfully.

Calculating cosine similarity matrix...
✅ Similarity matrix created with shape: (2484, 2277)

--- Top 5 Job Matches for Resume #0 ---

Original Resume Category: HR
✅ Match: 64.66% | Job Title: Database Administrator
✅ Match: 64.42% | Job Title: Database Administrator
✅ Match: 57.51% | Job Title: Database Administrator
✅ Match: 56.66% | Job Title: Database Administrator
✅ Match: 56.34% | Job Title: Database Administrator
