In [33]:
!conda install pip -y
!pip install -U sentence-transformers
!pip install tf-keras

Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [34]:
import re
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

## Install PDF Support (for PyMuPDF)

Required for extracting resume text from uploaded PDF files.


In [36]:
# Uncomment if running in a fresh environment
!pip install pymupdf



## Load the Datasets

In [38]:
queries_df = pd.read_csv('../data/Resume.csv')  # Resumes as queries
documents_df = pd.read_csv('../data/job_title_des.csv')  # Jobs as documents

print("Resume dataset columns:", queries_df.columns.tolist())
print("Job dataset columns:", documents_df.columns.tolist())

Resume dataset columns: ['ID', 'Resume_str', 'Resume_html', 'Category']
Job dataset columns: ['Document ID', 'Job Title', 'Job Description']


## Filter Resumes for Tech-Related Roles

In [40]:
queries_df['Category'] = queries_df['Category'].str.upper().str.strip()
target_categories = ['INFORMATION-TECHNOLOGY']  
filtered_queries = queries_df[queries_df['Category'].isin(target_categories)].copy()

## Basic Text Preprocessing

- Convert all text to lowercase for consistency.
- Strip leading and trailing whitespace.
- Apply these cleaning steps to relevant text columns (e.g., job descriptions, resume text).


In [47]:
def clean_text(text):
    if isinstance(text, str):
        return text.lower().strip()
    return ""

documents_df['cleaned_description'] = documents_df['Job Description'].apply(clean_text)
filtered_queries['cleaned_text'] = filtered_queries['Resume_str'].apply(clean_text)

## PDF Resume Text Extraction and Preprocessing

This section defines a helper function to extract text from a PDF resume and clean it using the same preprocessing as the existing resumes.

In [50]:
import fitz  # PyMuPDF
def extract_and_clean_pdf_resume(pdf_path):
    """Extract and clean text from a PDF resume using PyMuPDF."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return clean_text(text)  


In [52]:
# Preview the processed DataFrames to confirm changes
print("Resumes:")
print(filtered_queries[['ID']].head())
print(filtered_queries[['cleaned_text']].head())
print(filtered_queries[['Category']].head())

print("\nDocs:")
print(documents_df[['Job Title', 'cleaned_description']].head())

Resumes:
           ID
217  36856210
218  21780877
219  33241454
220  25990239
221  16899268
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

## Building the Inverted Index

1. **Load Stopwords**  
   Load a predefined list of stopwords to exclude common, non-informative words from the index.

2. **Define Tokenizer**  
   Create a tokenizer function to split text into meaningful tokens.

3. **Build Inverted Index from Filtered Resumes**  
   Iterate through the filtered resumes and populate the inverted index, mapping each token to the list of document IDs in which it appears.


In [55]:
# load stopwords
with open("../data/stopwords_en.txt", "r", encoding="utf-8") as f:
    stopwords = set(word.strip().lower() for word in f if len(word.strip()) > 1)

# define tokenizer
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = text.split()
    return [t for t in tokens if len(t) > 1 and t not in stopwords and not t.isdigit()]

In [57]:
inverted_index = {}

for _, row in documents_df.iterrows():
    doc_id = row['Document ID']
    content = row['cleaned_description']

    token_counts = defaultdict(int)
    for token in tokenize(content):
        token_counts[token] += 1

    for token, count in token_counts.items():
        if token not in inverted_index:
            inverted_index[token] = {
                'total_freq': 0,
                'doc_freq': 0,
                'postings': []
            }
        inverted_index[token]['total_freq'] += count
        inverted_index[token]['doc_freq'] += 1
        inverted_index[token]['postings'].append((doc_id, count))


In [59]:
# Display a preview of the inverted index (first 15 terms by alphabetical order)
index_table = []
for token in sorted(inverted_index):
    entry = inverted_index[token]
    index_table.append({
        'Index Term': token,
        'Total Frequency': entry['total_freq'],
        'Document Frequency': entry['doc_freq'],
        'Postings (Job ID, Count)': entry['postings']
    })

df = pd.DataFrame(index_table)

df = df.sort_values(by='Total Frequency', ascending=False)

pd.set_option('display.max_colwidth', None)
print("=== Inverted Index Preview (Top 15 Frequent Terms) ===")
print(df.head(15))

=== Inverted Index Preview (Top 15 Frequent Terms) ===
          Index Term  Total Frequency  Document Frequency  \
6677      experience             9569                2115   
19614           work             4753                1743   
5270     development             4088                1570   
10184      knowledge             3174                1434   
17754           team             3030                1257   
16670       software             2868                1164   
5159          design             2821                1269   
19885          years             2688                1417   
4776            data             2638                 917   
16528         skills             2585                1291   
9871             job             2578                1591   
13838      preferred             2459                 993   
15167       required             2143                 987   
18643  understanding             2080                 994   
19637        working          

In [61]:
all_tokens = []
for text in documents_df['cleaned_description']:
    all_tokens.extend(tokenize(text))

print("\n=== Top 20 Most Frequent Tokens in Job Descriptions (Excluding Stopwords) ===")
print(Counter(all_tokens).most_common(20))


=== Top 20 Most Frequent Tokens in Job Descriptions (Excluding Stopwords) ===
[('experience', 9569), ('work', 4753), ('development', 4088), ('knowledge', 3174), ('team', 3030), ('software', 2868), ('design', 2821), ('years', 2688), ('data', 2638), ('skills', 2585), ('job', 2578), ('preferred', 2459), ('required', 2143), ('understanding', 2080), ('working', 1981), ('application', 1911), ('year', 1895), ('strong', 1849), ('web', 1810), ('code', 1725)]


In [63]:
import math

def compute_tfidf_scores(query, inverted_index, total_docs, top_k=10):
    query_tokens = tokenize(query)
    
    query_tf = defaultdict(int) # count query token frequencies
    for token in query_tokens:
        query_tf[token] += 1

    idf = {}
    for token in query_tf:
        if token in inverted_index:
            df = inverted_index[token]['doc_freq']
            idf[token] = math.log(total_docs / df)  # compute IDF for each token in query
        else:
            idf[token] = 0  # if unseen token then ignore it


    # ompute TF-IDF for the query and its norm
    query_tfidf = {}
    for token in query_tf:
        query_tfidf[token] = query_tf[token] * idf[token]

    query_norm = math.sqrt(sum(val**2 for val in query_tfidf.values()))  # ||Q||

    scores = defaultdict(float)    # dot product numerator
    doc_norms = defaultdict(float) # track ||D|| per doc

    for token, q_tfidf in query_tfidf.items():
        if token not in inverted_index:
            continue
        postings = inverted_index[token]['postings']
        idf_val = idf[token]

        for doc_id, tf in postings:
            d_tfidf = tf * idf_val
            scores[doc_id] += q_tfidf * d_tfidf    # dot product Q·D
            doc_norms[doc_id] += d_tfidf ** 2      # sum squares for ||D||

    # finalize cosine similarity
    cosine_scores = {}
    for doc_id in scores:
        doc_norm = math.sqrt(doc_norms[doc_id])
        if doc_norm == 0 or query_norm == 0:
            cosine_scores[doc_id] = 0
        else:
            cosine_scores[doc_id] = scores[doc_id] / (query_norm * doc_norm)

    ranked = sorted(cosine_scores.items(), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]

## TF-IDF: Match Jobs to Uploaded PDF Resume

This function uses the existing TF-IDF scoring system to retrieve the top-k job matches for a user-uploaded PDF resume.

In [66]:
def match_pdf_resume_tfidf(pdf_path, inverted_index, job_df, total_docs, top_k=10):
    cleaned_resume = extract_and_clean_pdf_resume(pdf_path)
    
    results = compute_tfidf_scores(
        query=cleaned_resume,
        inverted_index=inverted_index,
        total_docs=total_docs,
        top_k=top_k
    )
    
    print("=== TF-IDF Top Job Matches ===")
    for rank, (doc_id, score) in enumerate(results, 1):
        title = job_df.loc[doc_id, 'Job Title']
        print(f"{rank}. {title} (Score: {score:.4f})")
    
    return results

In [68]:
def match_job_to_resumes_tfidf(job_index, job_text, job_title, resumes_df, top_k=10):
    """
    Match a job description to top resumes using TF-IDF.
    """
    total_docs = len(resumes_df)
    
    # Build inverted index over resumes
    resume_inverted_index = {}
    for _, row in resumes_df.iterrows():
        doc_id = row['ID']
        content = row['cleaned_text']
        token_counts = defaultdict(int)
        for token in tokenize(content):
            token_counts[token] += 1
        for token, count in token_counts.items():
            if token not in resume_inverted_index:
                resume_inverted_index[token] = {
                    'total_freq': 0,
                    'doc_freq': 0,
                    'postings': []
                }
            resume_inverted_index[token]['total_freq'] += count
            resume_inverted_index[token]['doc_freq'] += 1
            resume_inverted_index[token]['postings'].append((doc_id, count))

    # Compute similarity
    results = compute_tfidf_scores(job_text, resume_inverted_index, total_docs, top_k=top_k)

    print(f"\n=== TF-IDF Top Resume Matches for Job #{job_index}: {job_title} ===")
    for rank, (doc_id, score) in enumerate(results, 1):
        print(f"{rank}. Resume ID: {doc_id} (Score: {score:.4f})")


In [23]:
# Example: Pick first job description to test TF-IDF-based reverse matching
sample_job = documents_df.iloc[0]
job_index = sample_job['Document ID']
job_text = sample_job['cleaned_description']
job_title = sample_job['Job Title']

# Match this job to resumes
match_job_to_resumes_tfidf(
    job_index=job_index,
    job_text=job_text,
    job_title=job_title,
    resumes_df=filtered_queries,
    top_k=10
)


=== TF-IDF Top Resume Matches for Job #0: Flutter Developer ===
1. Resume ID: 27058381 (Score: 0.4165)
2. Resume ID: 32959732 (Score: 0.3144)
3. Resume ID: 90867631 (Score: 0.3090)
4. Resume ID: 27372171 (Score: 0.2782)
5. Resume ID: 64017585 (Score: 0.2771)
6. Resume ID: 22450718 (Score: 0.2712)
7. Resume ID: 10553553 (Score: 0.2710)
8. Resume ID: 18159866 (Score: 0.2699)
9. Resume ID: 51363762 (Score: 0.2637)
10. Resume ID: 10265057 (Score: 0.2614)


In [70]:
# Pick a resume to test
resume_row = filtered_queries.iloc[0]
resume_id = resume_row["ID"]
query = resume_row["cleaned_text"]
total_docs = len(documents_df)

results = compute_tfidf_scores(query, inverted_index, total_docs, top_k=10)

print(f"=== Top Job Matches for Resume ID: {resume_id} ===")
for i, (doc_id, score) in enumerate(results, 1):
    job_title = documents_df[documents_df['Document ID'] == doc_id]['Job Title'].values[0]
    print(f"{i}. {job_title} (Score: {score:.4f})")


=== Top Job Matches for Resume ID: 36856210 ===
1. Database Administrator (Score: 0.4437)
2. Database Administrator (Score: 0.3455)
3. Database Administrator (Score: 0.3129)
4. Machine Learning (Score: 0.3092)
5. DevOps Engineer (Score: 0.2904)
6. Database Administrator (Score: 0.2877)
7. Network Administrator (Score: 0.2868)
8. Database Administrator (Score: 0.2841)
9. Network Administrator (Score: 0.2839)
10. DevOps Engineer (Score: 0.2809)


## Save TF-IDF Job Matches for All Resumes

This function loops through each resume in the dataset, retrieves the top matching job descriptions using TF-IDF cosine similarity, and saves the results to a CSV file.

### Output Columns:
- `Resume_ID`
- `Rank`
- `Matched_Job_ID`
- `Job_Title`
- `TFIDF_Score`

### Usage Example:
```python
save_tfidf_results_for_all_resumes(filtered_queries, documents_df, inverted_index)


In [73]:
def save_tfidf_results_for_all_resumes(filtered_queries, documents_df, inverted_index, output_path="tfidf_resume_to_jobs.csv", top_k=10):
    total_docs = len(documents_df)
    output_rows = []

    for i, row in filtered_queries.iterrows():
        resume_id = row.get("ID", i)
        resume_text = row['cleaned_text']

        results = compute_tfidf_scores(resume_text, inverted_index, total_docs, top_k=top_k)

        for rank, (job_id, score) in enumerate(results, start=1):
            job_row = documents_df[documents_df['Document ID'] == job_id]
            job_title = job_row['Job Title'].values[0] if not job_row.empty else "Unknown"

            output_rows.append({
                "Resume_ID": resume_id,
                "Rank": rank,
                "Matched_Job_ID": job_id,
                "Job_Title": job_title,
                "TFIDF_Score": score
            })

    tfidf_output_df = pd.DataFrame(output_rows)
    tfidf_output_df.to_csv(output_path, index=False)
    print(f"TF-IDF matches saved to '{output_path}'")

In [75]:
save_tfidf_results_for_all_resumes(filtered_queries, documents_df, inverted_index) 

TF-IDF matches saved to 'tfidf_resume_to_jobs.csv'


## Precision, Recall, and F1 Evaluation

In this section, we evaluate the effectiveness of our retrieval system using standard IR metrics:
- **Precision@k**: What proportion of the top-k results are relevant?
- **Recall@k**: What proportion of all relevant results were returned in the top-k?
- **F1@k**: Harmonic mean of Precision and Recall — balances both.

We evaluate each resume (Sue, RDH, Shang) against a manually selected set of job descriptions that are deemed relevant. Both **TF-IDF** and **LLM**-based models are used for comparison.


In [77]:
# Precision, Recall, F1 @ k
def precision_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    relevant_set = set(relevant)
    hits = sum(1 for item in retrieved_k if item in relevant_set)
    return hits / k

def recall_at_k(retrieved, relevant, k):
    retrieved_k = retrieved[:k]
    relevant_set = set(relevant)
    hits = sum(1 for item in retrieved_k if item in relevant_set)
    return hits / len(relevant) if relevant else 0

def f1_at_k(retrieved, relevant, k):
    p = precision_at_k(retrieved, relevant, k)
    r = recall_at_k(retrieved, relevant, k)
    return (2 * p * r) / (p + r) if (p + r) else 0


In [79]:
# Define relevant jobs for each resume
sue_relevant = documents_df[documents_df['Job Title'] == 'Machine Learning']['Document ID'].values.tolist()
RDH_relevant = documents_df[documents_df['Job Title'].isin(['DevOps Engineer','Software Engineer'])]['Document ID'].values.tolist()
shang_relevant = documents_df[documents_df['Job Title'].isin(['DevOps Engineer','Software Engineer'])]['Document ID'].values.tolist()

### Evaluation: 

In [86]:
# === Ground truth mapping ===
sue_relevant = documents_df[documents_df['Job Title'] == 'Machine Learning']['Document ID'].values.tolist()
rdh_relevant = documents_df[documents_df['Job Title'] == 'Cloud Engineer']['Document ID'].values.tolist()
shang_relevant = documents_df[documents_df['Job Title'] == 'Software Engineer']['Document ID'].values.tolist()

# === TF-IDF retrievals ===
Sue_tfidf_results = match_pdf_resume_tfidf("../data/Sue Yang Resume.pdf", inverted_index, documents_df, total_docs=len(documents_df))
RDH_tfidf_results = match_pdf_resume_tfidf("../data/RDH Resume.pdf", inverted_index, documents_df, total_docs=len(documents_df))
Shang_tfidf_results = match_pdf_resume_tfidf("../data/Shang Andrews Resume 0.1.3.pdf", inverted_index, documents_df, total_docs=len(documents_df))

# === Extract only the retrieved doc IDs ===
Sue_tfidf_retrieved = [item[0] for item in Sue_tfidf_results]
RDH_tfidf_retrieved = [item[0] for item in RDH_tfidf_results]
Shang_tfidf_retrieved = [item[0] for item in Shang_tfidf_results]

# === Precision, Recall, F1 @ 10 ===
print("\n=== TF-IDF Evaluation Metrics @10 ===")
print("Sue TF-IDF →",
      "P:", precision_at_k(Sue_tfidf_retrieved, sue_relevant, 10),
      "R:", recall_at_k(Sue_tfidf_retrieved, sue_relevant, 10),
      "F1:", f1_at_k(Sue_tfidf_retrieved, sue_relevant, 10))

print("RDH TF-IDF →",
      "P:", precision_at_k(RDH_tfidf_retrieved, rdh_relevant, 10),
      "R:", recall_at_k(RDH_tfidf_retrieved, rdh_relevant, 10),
      "F1:", f1_at_k(RDH_tfidf_retrieved, rdh_relevant, 10))

print("Shang TF-IDF →",
      "P:", precision_at_k(Shang_tfidf_retrieved, shang_relevant, 10),
      "R:", recall_at_k(Shang_tfidf_retrieved, shang_relevant, 10),
      "F1:", f1_at_k(Shang_tfidf_retrieved, shang_relevant, 10))


=== TF-IDF Top Job Matches ===
1. Django Developer (Score: 0.3518)
2. Machine Learning (Score: 0.3420)
3. PHP Developer (Score: 0.3250)
4. Java Developer (Score: 0.3189)
5. Machine Learning (Score: 0.3049)
6. Java Developer (Score: 0.2996)
7. Machine Learning (Score: 0.2883)
8. Machine Learning (Score: 0.2837)
9. JavaScript Developer (Score: 0.2836)
10. Machine Learning (Score: 0.2836)
=== TF-IDF Top Job Matches ===
1. Django Developer (Score: 0.4331)
2. Java Developer (Score: 0.3332)
3. Database Administrator (Score: 0.3291)
4. Full Stack Developer (Score: 0.3058)
5. JavaScript Developer (Score: 0.2922)
6. Database Administrator (Score: 0.2714)
7. DevOps Engineer (Score: 0.2492)
8. Database Administrator (Score: 0.2382)
9. Java Developer (Score: 0.2360)
10. Database Administrator (Score: 0.2330)
=== TF-IDF Top Job Matches ===
1. Database Administrator (Score: 0.3293)
2. DevOps Engineer (Score: 0.3187)
3. DevOps Engineer (Score: 0.3151)
4. DevOps Engineer (Score: 0.3137)
5. DevOps Engi