In [1]:
!conda install pip -y
!pip install -U sentence-transformers
!pip install tf-keras

Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [2]:
import re
from collections import defaultdict, Counter
import pandas as pd

## Install PDF Support (for PyMuPDF)

PyMuPDF (imported as `fitz`) is required for extracting resume text from uploaded PDF files.


In [5]:
# Uncomment if running in a fresh environment
#!pip install pymupdf

## Load the Datasets

In [7]:
# Resumes act as the queries; job postings are the documents being retrieved.
queries_df = pd.read_csv('../data/Resume.csv')
documents_df = pd.read_csv('../data/job_title_des.csv')

# Print the column names of both tables as a quick schema check.
print(f"Resume dataset columns: {queries_df.columns.tolist()}")
print(f"Job dataset columns: {documents_df.columns.tolist()}")

Resume dataset columns: ['ID', 'Resume_str', 'Resume_html', 'Category']
Job dataset columns: ['Document ID', 'Job Title', 'Job Description']


## Filter Resumes for Tech-Related Roles

In [12]:
# Upper-case and strip the category labels so the membership test below is exact.
queries_df['Category'] = queries_df['Category'].str.upper().str.strip()

# Keep only resumes in the target categories; copy so that columns added later
# live on the filtered frame and not on a view of queries_df.
target_categories = ['INFORMATION-TECHNOLOGY']
is_target_category = queries_df['Category'].isin(target_categories)
filtered_queries = queries_df.loc[is_target_category].copy()

## Basic Text Preprocessing

- Convert all text to lowercase for consistency.
- Strip leading and trailing whitespace.
- Apply these cleaning steps to relevant text columns (e.g., job descriptions, resume text).


In [15]:
def clean_text(text):
    """Lower-case a string and trim its surrounding whitespace.

    Any value that is not a ``str`` (for example ``None`` or a float) is
    mapped to the empty string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return lowered.strip()
# Add a cleaned copy of the free-text column to each table; the original
# 'Job Description' and 'Resume_str' columns are left untouched.
documents_df['cleaned_description'] = documents_df['Job Description'].map(clean_text)
filtered_queries['cleaned_text'] = filtered_queries['Resume_str'].map(clean_text)

## PDF Resume Text Extraction and Preprocessing

This section defines a helper function to extract text from a PDF resume and clean it using the same preprocessing as the existing resumes.

In [17]:
import fitz  # PyMuPDF
def extract_and_clean_pdf_resume(pdf_path):
    """Extract the text of a PDF resume and clean it with ``clean_text``.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages, concatenated in page order and then passed
        through ``clean_text``.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction fails part-way through (the original never closed it).
    with fitz.open(pdf_path) as doc:
        # join() builds the string once instead of re-allocating on every +=.
        raw_text = "".join(page.get_text() for page in doc)
    return clean_text(raw_text)

In [19]:
# Preview the first rows of each prepared table.
print("Resumes:")
for column in ('ID', 'cleaned_text', 'Category'):
    # One single-column frame per print.
    print(filtered_queries[[column]].head())

print("\nDocs:")
docs_preview = documents_df[['Job Title', 'cleaned_description']]
print(docs_preview.head())

Resumes:
           ID
217  36856210
218  21780877
219  33241454
220  25990239
221  16899268
                                          cleaned_text
217  information technology         summary     ded...
218  information technology specialist\tgs11       ...
219  information technology supervisor       summar...
220  information technology instructor       summar...
221  information technology manager/analyst        ...
                   Category
217  INFORMATION-TECHNOLOGY
218  INFORMATION-TECHNOLOGY
219  INFORMATION-TECHNOLOGY
220  INFORMATION-TECHNOLOGY
221  INFORMATION-TECHNOLOGY

Docs:
              Job Title                                cleaned_description
0     Flutter Developer  we are looking for hire experts flutter develo...
1      Django Developer  python/django (developer/lead) - job code(pdj ...
2      Machine Learning  data scientist (contractor)\n\nbangalore, in\n...
3         iOS Developer  job description:\n\nstrong framework outside o...
4  Full Stack Developer  jo

## Switching to Semantic Retrieval (LLM-based)

In this section, we switch to a **semantic retrieval approach** built on a pre-trained sentence-transformer embedding model (`all-MiniLM-L6-v2`). Instead of relying on exact token overlap, resumes and job descriptions are encoded as vectors in a shared embedding space and compared by cosine similarity, so texts with similar **meaning** can match even when they use different words.

### Key Difference:
- **TF-IDF**: Lexical overlap-based
- **LLM Embeddings**: Context-aware, meaning-based

In [22]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

# Pre-trained sentence-embedding model shared by the retrieval cells below.
llm_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Documents: job postings -------------------------------------------------
job_ids, job_titles, job_texts = (
    documents_df[column].tolist()
    for column in ('Document ID', 'Job Title', 'cleaned_description')
)
job_embeddings = llm_model.encode(
    job_texts,
    convert_to_tensor=True,
    normalize_embeddings=True,
)

# --- Queries: resumes --------------------------------------------------------
# Fall back to positional ids when the frame has no 'ID' column.
if 'ID' in filtered_queries.columns:
    resume_ids = filtered_queries['ID'].tolist()
else:
    resume_ids = list(range(len(filtered_queries)))
resume_texts = filtered_queries['cleaned_text'].tolist()
resume_embeddings = llm_model.encode(
    resume_texts,
    convert_to_tensor=True,
    normalize_embeddings=True,
)




## LLM: Match Jobs to Uploaded PDF Resume

This function uses semantic similarity (SentenceTransformers) to match a user-uploaded PDF resume to the most relevant job descriptions.

In [25]:
def match_pdf_resume_llm(pdf_path, job_df, llm_model, job_embeddings, top_k=10):
    """Rank job postings against a PDF resume and print the best matches.

    Args:
        pdf_path: Path of the PDF resume; read via ``extract_and_clean_pdf_resume``.
        job_df: DataFrame with a ``'Job Title'`` column. Rows are looked up by
            position, so row ``i`` must correspond to ``job_embeddings[i]``.
        llm_model: Object whose ``encode`` method turns text into an embedding
            tensor (called with ``convert_to_tensor=True`` and
            ``normalize_embeddings=True``).
        job_embeddings: Embeddings of the job descriptions, compared against
            the resume embedding with cosine similarity.
        top_k: Maximum number of matches to report. Values larger than the
            number of jobs are clamped instead of raising.

    Returns:
        The ``torch.topk`` result (``values`` are the similarity scores,
        ``indices`` are positions into ``job_df``).
    """
    # Local imports keep the function usable on its own, matching the other
    # helper functions in this notebook.
    import torch
    from sentence_transformers import util

    cleaned_resume = extract_and_clean_pdf_resume(pdf_path)
    resume_embedding = llm_model.encode(cleaned_resume, convert_to_tensor=True, normalize_embeddings=True)

    # cos_sim returns a (1, n_jobs) matrix for a single query; take its only row.
    cosine_scores = util.cos_sim(resume_embedding, job_embeddings)[0]

    # torch.topk raises if k exceeds the number of scores, so never ask for
    # more matches than there are jobs.
    k = min(top_k, cosine_scores.shape[0])
    top_results = torch.topk(cosine_scores, k=k)

    print("=== LLM Top Job Matches ===")
    for rank, (score, idx) in enumerate(zip(top_results.values, top_results.indices), start=1):
        job_idx = idx.item()  # tensor -> plain int for positional lookup
        title = job_df.iloc[job_idx]['Job Title']
        print(f"{rank}. {title} (Score: {score.item():.4f})")

    return top_results

In [27]:
def save_llm_results_for_all_resumes(filtered_queries, documents_df, llm_model, job_embeddings, output_path="llm_resume_to_jobs.csv", top_k=10):
    """Match every resume to its top jobs and write the ranking to a CSV file.

    Args:
        filtered_queries: DataFrame of resumes with a ``'cleaned_text'`` column
            and, optionally, an ``'ID'`` column. Without ``'ID'`` the row
            position is used as the resume id.
        documents_df: DataFrame of jobs with ``'Document ID'`` and
            ``'Job Title'`` columns. Rows are looked up by position, so row
            ``i`` must correspond to ``job_embeddings[i]``.
        llm_model: Object whose ``encode`` method turns text into an embedding
            tensor.
        job_embeddings: Embeddings of the job descriptions.
        output_path: Destination of the CSV file.
        top_k: Maximum number of jobs stored per resume. Values larger than the
            number of jobs are clamped instead of raising.

    Returns:
        None. Writes one row per (resume, matched job) pair with the columns
        ``Resume_ID, Rank, Matched_Job_ID, Job_Title, LLM_Score`` and prints a
        confirmation message.
    """
    from sentence_transformers import util
    import torch
    import pandas as pd

    resume_texts = filtered_queries['cleaned_text'].tolist()
    if 'ID' in filtered_queries.columns:
        resume_ids = filtered_queries['ID'].tolist()
    else:
        resume_ids = list(range(len(resume_texts)))

    # Pull the job columns out once instead of doing two DataFrame row lookups
    # for every single match inside the loops.
    job_ids = documents_df['Document ID'].tolist()
    job_titles = documents_df['Job Title'].tolist()

    output_columns = ['Resume_ID', 'Rank', 'Matched_Job_ID', 'Job_Title', 'LLM_Score']
    output_rows = []

    for resume_id, resume_text in zip(resume_ids, resume_texts):
        # Encode the resume as a query.
        resume_embedding = llm_model.encode(resume_text, convert_to_tensor=True, normalize_embeddings=True)

        # cos_sim returns a (1, n_jobs) matrix for a single query; take its only row.
        cosine_scores = util.cos_sim(resume_embedding, job_embeddings)[0]

        # torch.topk raises if k exceeds the number of scores.
        k = min(top_k, cosine_scores.shape[0])
        top_results = torch.topk(cosine_scores, k=k)

        ranked = zip(top_results.values.tolist(), top_results.indices.tolist())
        for rank, (score, job_pos) in enumerate(ranked, start=1):
            output_rows.append({
                'Resume_ID': resume_id,
                'Rank': rank,
                'Matched_Job_ID': job_ids[job_pos],
                'Job_Title': job_titles[job_pos],
                'LLM_Score': round(score, 4)
            })

    # Passing the columns explicitly keeps the CSV header even when there are
    # no rows (e.g. an empty resume table).
    llm_output_df = pd.DataFrame(output_rows, columns=output_columns)
    llm_output_df.to_csv(output_path, index=False)
    print(f"LLM-based matches saved to '{output_path}'")

In [29]:
def match_job_to_resumes_llm(job_index, job_text, job_title, resume_df, llm_model, resume_embeddings, top_k=10):
    """
    Given a job description, find and print the top-matching resumes.

    Args:
        job_index: Identifier of the job; used only in the printed header.
        job_text: Job description text that is encoded as the query.
        job_title: Job title; used only in the printed header.
        resume_df: DataFrame with an ``"ID"`` column. Rows are looked up by
            position, so row ``i`` must correspond to ``resume_embeddings[i]``.
        llm_model: Object whose ``encode`` method turns text into an embedding
            tensor.
        resume_embeddings: Embeddings of the resumes.
        top_k: Maximum number of resumes to print. Values larger than the
            number of resumes are clamped instead of raising.

    Returns:
        None. The ranking is printed to stdout.
    """
    from sentence_transformers import util
    import torch

    # Encode the job description as the query.
    job_embedding = llm_model.encode(job_text, convert_to_tensor=True, normalize_embeddings=True)

    # cos_sim returns a (1, n_resumes) matrix for a single query; take its only row.
    cosine_scores = util.cos_sim(job_embedding, resume_embeddings)[0]

    # torch.topk raises if k exceeds the number of scores, so never ask for
    # more matches than there are resumes.
    k = min(top_k, cosine_scores.shape[0])
    top_results = torch.topk(cosine_scores, k=k)

    # Fetch the id column once rather than materialising a row per match.
    resume_ids = resume_df["ID"].tolist()

    print(f"\n=== LLM Top Resume Matches for Job #{job_index}: {job_title} ===")
    ranked = zip(top_results.values.tolist(), top_results.indices.tolist())
    for rank, (score, resume_pos) in enumerate(ranked, start=1):
        print(f"{rank}. Resume ID: {resume_ids[resume_pos]} (Score: {score:.4f})")


In [31]:
# Demo: rank the filtered resumes against the first job posting.
sample_job = documents_df.iloc[0]
job_index, job_text, job_title = (
    sample_job[field]
    for field in ('Document ID', 'cleaned_description', 'Job Title')
)

match_job_to_resumes_llm(
    job_index=job_index,
    job_text=job_text,
    job_title=job_title,
    resume_df=filtered_queries,
    llm_model=llm_model,
    resume_embeddings=resume_embeddings,
    top_k=10,
)


=== LLM Top Resume Matches for Job #0: Flutter Developer ===
1. Resume ID: 39413067 (Score: 0.5122)
2. Resume ID: 37242217 (Score: 0.4407)
3. Resume ID: 11580408 (Score: 0.3829)
4. Resume ID: 17641670 (Score: 0.3729)
5. Resume ID: 26480367 (Score: 0.3721)
6. Resume ID: 36434348 (Score: 0.3617)
7. Resume ID: 15651486 (Score: 0.3609)
8. Resume ID: 25207620 (Score: 0.3573)
9. Resume ID: 22450718 (Score: 0.3555)
10. Resume ID: 37764298 (Score: 0.3544)


In [33]:
# Demo: rank job descriptions against the first filtered resume.
resume_row = filtered_queries.iloc[0]
resume_id, resume_text = resume_row["ID"], resume_row["cleaned_text"]

# Embed the resume with the same settings used for the job embeddings.
resume_embedding = llm_model.encode(
    resume_text,
    convert_to_tensor=True,
    normalize_embeddings=True,
)

# Similarity of this resume to every job; keep the ten highest scores.
cosine_scores = util.cos_sim(resume_embedding, job_embeddings)[0]
top_results = torch.topk(cosine_scores, k=10)

print(f"=== LLM Top Job Matches for Resume ID: {resume_id} ===")
ranked_matches = zip(top_results.values, top_results.indices)
for rank, (score, idx) in enumerate(ranked_matches, start=1):
    # idx is a position into documents_df, which is aligned with job_embeddings.
    job_title = documents_df["Job Title"].iloc[idx.item()]
    print(f"{rank}. {job_title} (Score: {score.item():.4f})")

=== LLM Top Job Matches for Resume ID: 36856210 ===
1. Network Administrator (Score: 0.6007)
2. Network Administrator (Score: 0.5993)
3. Software Engineer (Score: 0.5954)
4. Network Administrator (Score: 0.5944)
5. Network Administrator (Score: 0.5876)
6. Network Administrator (Score: 0.5853)
7. Network Administrator (Score: 0.5826)
8. Database Administrator (Score: 0.5802)
9. Network Administrator (Score: 0.5719)
10. Database Administrator (Score: 0.5686)


## Save LLM Results to CSV

In [36]:
# Write the top job matches for every filtered resume to the default CSV path.
save_llm_results_for_all_resumes(
    filtered_queries=filtered_queries,
    documents_df=documents_df,
    llm_model=llm_model,
    job_embeddings=job_embeddings,
)

LLM-based matches saved to 'llm_resume_to_jobs.csv'
