In [1]:
!conda install pip -y
!pip install -U sentence-transformers
!pip install tf-keras

Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [2]:
import re
from collections import defaultdict, Counter
import pandas as pd

## Install PDF Support (PyMuPDF)

Required for extracting resume text from uploaded PDF files.


In [4]:
# Uncomment if running in a fresh environment
!pip install pymupdf



## Load the Datasets

In [10]:
# Resumes act as the queries; job postings act as the documents.
queries_df = pd.read_csv('../data/Resume.csv')
documents_df = pd.read_csv('../data/job_title_des.csv')

# Show the column names of each frame as plain Python lists.
print("Resume dataset columns:", list(queries_df.columns))
print("Job dataset columns:", list(documents_df.columns))

Resume dataset columns: ['ID', 'Resume_str', 'Resume_html', 'Category']
Job dataset columns: ['Document ID', 'Job Title', 'Job Description']


## Filter Resumes for Tech-Related Roles

In [13]:
# Categories to keep, written in the normalized (upper-case, trimmed) form.
target_categories = ['INFORMATION-TECHNOLOGY']

# Normalize the category labels in place so the membership test below is
# insensitive to case and surrounding whitespace.
queries_df['Category'] = queries_df['Category'].str.upper().str.strip()

# Work on an independent copy so later column assignments do not touch queries_df.
is_target_category = queries_df['Category'].isin(target_categories)
filtered_queries = queries_df[is_target_category].copy()

## Basic Text Preprocessing

- Convert all text to lowercase for consistency.
- Strip leading and trailing whitespace.
- Apply these cleaning steps to relevant text columns (e.g., job descriptions, resume text).


In [16]:
def clean_text(text):
    """Lowercase a string and trim its leading/trailing whitespace.

    Any value that is not a ``str`` (e.g. ``None`` or a float NaN) is
    mapped to the empty string.
    """
    # Guard clause: non-string inputs have no text to clean.
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    return lowered.strip()

# Add a cleaned-text column to each frame by running clean_text on every value.
job_descriptions = documents_df['Job Description']
documents_df['cleaned_description'] = job_descriptions.map(clean_text)

resume_texts = filtered_queries['Resume_str']
filtered_queries['cleaned_text'] = resume_texts.map(clean_text)

## PDF Resume Text Extraction and Preprocessing

This section defines a helper function to extract text from a PDF resume and clean it using the same preprocessing as the existing resumes.

In [19]:
import fitz  # PyMuPDF
def extract_and_clean_pdf_resume(pdf_path):
    """Extract and clean text from a PDF resume using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of every page, concatenated in page order and then
        passed through ``clean_text``.
    """
    # Open the document as a context manager so it is closed even if
    # text extraction raises; the original never closed it.
    with fitz.open(pdf_path) as doc:
        # join() consumes the generator while the document is still open.
        text = "".join(page.get_text() for page in doc)
    return clean_text(text)


In [21]:
# Sanity check: print the first rows of the processed frames.
print("Resumes:")
# Each resume column is printed as its own single-column frame.
for preview_column in ('ID', 'cleaned_text', 'Category'):
    print(filtered_queries[[preview_column]].head())

print("\nDocs:")
docs_preview = documents_df[['Job Title', 'cleaned_description']]
print(docs_preview.head())

Resumes:
           ID
217  36856210
218  21780877
219  33241454
220  25990239
221  16899268
                                          cleaned_text
217  information technology         summary     ded...
218  information technology specialist\tgs11       ...
219  information technology supervisor       summar...
220  information technology instructor       summar...
221  information technology manager/analyst        ...
                   Category
217  INFORMATION-TECHNOLOGY
218  INFORMATION-TECHNOLOGY
219  INFORMATION-TECHNOLOGY
220  INFORMATION-TECHNOLOGY
221  INFORMATION-TECHNOLOGY

Docs:
              Job Title                                cleaned_description
0     Flutter Developer  we are looking for hire experts flutter develo...
1      Django Developer  python/django (developer/lead) - job code(pdj ...
2      Machine Learning  data scientist (contractor)\n\nbangalore, in\n...
3         iOS Developer  job description:\n\nstrong framework outside o...
4  Full Stack Developer  jo