In [None]:
# Install required libraries
!pip install pymupdf spacy pandas
!python -m spacy download en_core_web_md

In [1]:
import fitz  # PyMuPDF
import spacy
import pandas as pd
import os
import re

print("Setup Complete!")

Setup Complete!


In [2]:
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

raw_text = extract_text_from_pdf("data/resume/Demo Resume.pdf")

In [3]:
raw_text

" \n \n \n \n \n \n PROFESSIONAL SUMMARY                                                                                                                                             .              \nBTech CSE student with strong programming skills and a keen interest in machine learning. Quick learner with problem-solving \nabilities and enthusiasm for applying ML to real-world solutions. \n EDUCATION                                                                                                                                                                        . \nBachelor’s of Technology | Computer Science 2023-2027    DIT University, Dehradun, Uttarakhand \nCGPA: 8.55 \nClass 12th (CBSE) |2022-2023 \n80.5 %| St. Luke’s Sr. Sec. School, Solan, Himachal Pradesh  \nClass 10th  (CBSE) | 2020-2021 \n75.6% |  St. Luke’s Sr. Sec. School, Solan, Himachal Pradesh \n SKILLS                                                                                                                      

In [4]:
# Install using conda's internal installer
!conda install -c conda-forge spacy-model-en_core_web_sm -y

# Restart the spacy registry in this session
import spacy
nlp = spacy.load("en_core_web_sm")
print("Model successfully loaded in Conda base!")

Jupyter detected...
3 channel Terms of Service accepted
Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: C:\Users\sharm\anaconda3

  added / updated specs:
    - spacy-model-en_core_web_sm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.11.12 |       h4c7d964_0         149 KB  conda-forge
    openssl-3.1.0              |       hcfcfb64_3         7.1 MB  conda-forge
    spacy-model-en_core_web_sm-3.1.0|     pyhd8ed1ab_0        13.4 MB  conda-forge
    ucrt-10.0.26100.0          |       h57928b3_0         678 KB  conda-forge
    ------------------------------------------------------------
                                           Total:        21.3 MB

The following NEW packages will be INSTALLED:

  spacy-model-en_co~ conda-forge/noarch::spa



Model successfully loaded in Conda base!


In [7]:
import en_core_web_sm  # Changed from md to sm
import re

# Load the small model that we successfully installed
nlp = en_core_web_sm.load()

def clean_text(text):
    # 1. Basic cleaning using Regex
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) 
    
    # 2. NLP processing
    doc = nlp(text)
    
    # 3. Tokenization, Lemmatization, and Stopword removal
    cleaned_tokens = [
        token.lemma_ for token in doc 
        if not token.is_stop and not token.is_punct and not token.is_space
    ]
    
    return " ".join(cleaned_tokens)

# Test the cleaner
test_text = "I am a Software Engineer with 5 years of experience in Python."
print(f"Cleaned output: {clean_text(test_text)}")

Cleaned output: software engineer year experience python


In [10]:
import os
import pandas as pd

resume_folder = "data/resume" 

resume_data = []

# check if folder exists, then loop through files
if os.path.exists(resume_folder):
    for filename in os.listdir(resume_folder):
        if filename.endswith(".pdf"):
            file_path = os.path.join(resume_folder, filename)
            
            raw_text = extract_text_from_pdf(file_path)

            cleaned_text = clean_text(raw_text)
            
            # Save results
            resume_data.append({
                "Filename": filename,
                "Cleaned_Text": cleaned_text
            })
    

    df = pd.DataFrame(resume_data)
    print(f"Successfully processed {len(df)} resumes.")
    display(df.head()) 
else:
    print(f"Error: The folder '{resume_folder}' does not exist. Please create it!")

Successfully processed 5 resumes.


Unnamed: 0,Filename,Cleaned_Text
0,Demo Resume.pdf,professional summary btech cse student strong ...
1,Demo Resume2.pdf,professional summary btech cse student strong ...
2,res.pdf,soham sharma sharmasohamgmailcom linkedin gith...
3,res2.pdf,soham sharma sharmasohamgmailcom linkedin gith...
4,res3.pdf,soham sharma sharmasohamgmailcom linkedin gith...


In [11]:
job_description = """
We are looking for a Software Engineer proficient in Python and Data Science. 
The ideal candidate should have experience with NLP, machine learning, 
and building web applications using Streamlit or Flask.
"""

cleaned_job_desc = clean_text(job_description)
print("Cleaned Job Description:")
print(cleaned_job_desc)

Cleaned Job Description:
look software engineer proficient python datum science ideal candidate experience nlp machine learn building web application streamlit flask


In [12]:
df.to_csv("processed_resumes.csv", index=False)
print("Phase 1 Complete! Data saved to processed_resumes.csv")

Phase 1 Complete! Data saved to processed_resumes.csv


PHASE 2: Ranking the resume's based on similarity

In [13]:
def calculate_similarity(resume_text, job_desc_text):
    resume_doc = nlp(resume_text)
    job_doc = nlp(job_desc_text)
    
    # spacy's built-in similarity uses cosine similarity
    return resume_doc.similarity(job_doc)

# apply the function to our DataFrame
df['Match_Score'] = df['Cleaned_Text'].apply(lambda x: calculate_similarity(x, cleaned_job_desc))

# sort the results so the best match is at the top
df = df.sort_values(by='Match_Score', ascending=False)

# Display the ranked candidates
print("--- Ranked Candidates ---")
display(df[['Filename', 'Match_Score']])

  return resume_doc.similarity(job_doc)


--- Ranked Candidates ---


Unnamed: 0,Filename,Match_Score
0,Demo Resume.pdf,0.940091
1,Demo Resume2.pdf,0.940091
2,res.pdf,0.921607
3,res2.pdf,0.921607
4,res3.pdf,0.921607


0.85+: Excellent match. The candidate has most of the required keywords and context.

0.60 - 0.80: Good match. Likely has the right background but might be missing specific tools.

Below 0.50: Poor match.

In [14]:
def get_matching_keywords(resume_text, job_desc_text):
    resume_words = set(resume_text.split())
    job_words = set(job_desc_text.split())
    
    # find the intersection of both sets
    common_words = resume_words.intersection(job_words)
    return ", ".join(list(common_words))

df['Key_Skills_Matched'] = df['Cleaned_Text'].apply(lambda x: get_matching_keywords(x, cleaned_job_desc))

df[['Filename', 'Match_Score', 'Key_Skills_Matched']]

Unnamed: 0,Filename,Match_Score,Key_Skills_Matched
0,Demo Resume.pdf,0.940091,"nlp, science, machine, software, datum, engine..."
1,Demo Resume2.pdf,0.940091,"nlp, science, machine, software, datum, engine..."
2,res.pdf,0.921607,"nlp, science, ideal, machine, datum, engineer,..."
3,res2.pdf,0.921607,"nlp, science, ideal, machine, datum, engineer,..."
4,res3.pdf,0.921607,"nlp, science, ideal, machine, datum, engineer,..."


PHASE 3: 


While this system provides an efficient way to rank candidates, it is subject to 'Algorithmic Bias.' In future iterations, I would implement Name Redaction and Weighting Adjustments to ensure the model focuses on skills rather than demographic indicators. Specifically, I would audit the model using 'Flip Tests' to ensure scores remain consistent regardless of gender-identifying language.

In [15]:
def anonymize_text(text):
    doc = nlp(text)
    anonymized_text = text
    
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            # replace the name with [REDACTED]
            anonymized_text = anonymized_text.replace(ent.text, "[REDACTED NAME]")
            
    return anonymized_text

sample = "John Doe is a great developer from New York."
print(anonymize_text(sample))

[REDACTED NAME] is a great developer from New York.
