In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
import pdfplumber
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the spaCy "en_core_web_md" pipeline once at module level; preprocess_text()
# below reuses this shared `nlp` object to tokenize the text it is given.
# The model package must already be installed, otherwise spacy.load() fails.
nlp = spacy.load("en_core_web_md")

def extract_text_from_pdf(pdf_path):
    """
    Extracts the text of every page of a PDF file.

    Pages without an extractable text layer (for example scanned images) are
    treated as empty instead of aborting the whole extraction.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - str: Text of all pages in document order, each page followed by a
      newline. An empty string if the PDF has no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for a page with no text in some
        # pdfplumber releases; fall back to "" so a single blank page cannot
        # raise TypeError when the newline is appended.
        page_texts = [(page.extract_text() or "") for page in pdf.pages]

    # One join instead of repeated `+=`, which re-copies the growing string
    # for every page.
    return "".join(f"{page_text}\n" for page_text in page_texts)

def preprocess_text(text):
    """
    Normalizes raw text into a lowercase, space-separated token string.
    Args:
    - text (str): Raw text to preprocess.

    Returns:
    - str: Lowercased token texts joined by single spaces, with punctuation
      and whitespace tokens dropped.
    """
    kept_tokens = []
    for token in nlp(text):
        # Skip anything the pipeline flags as punctuation or pure whitespace
        if token.is_punct or token.is_space:
            continue
        kept_tokens.append(token.text.lower())
    return " ".join(kept_tokens)

def compute_cosine_similarity(text1, text2):
    """
    Computes the TF-IDF cosine similarity between two texts.

    Args:
    - text1 (str): First text for comparison.
    - text2 (str): Second text for comparison.

    Returns:
    - float: Cosine similarity as a percentage. 0.0 when either text is
      empty/blank or when no TF-IDF vocabulary can be built from the pair.
    """
    # A blank document has nothing in common with anything, and two blank
    # documents would make TfidfVectorizer fail on an empty vocabulary
    # (typical for scanned PDFs with no text layer), so short-circuit here.
    if not (text1 and text1.strip()) or not (text2 and text2.strip()):
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError when tokenization leaves an empty
        # vocabulary (e.g. the texts hold only single-character tokens).
        return 0.0

    # Row 0 is text1, row 1 is text2; slicing keeps each row two-dimensional.
    similarity = cosine_similarity(vectors[0:1], vectors[1:2])
    # Cast so callers receive a plain Python float, scaled to a percentage.
    return float(similarity[0][0] * 100)

def extract_resume_text(resume_path):
    """
    Reads a resume PDF and returns its text in preprocessed form.
    Args:
    - resume_path (str): Path to the resume PDF.

    Returns:
    - str: Preprocessed resume text.
    """
    # Raw extraction feeds straight into the shared preprocessing step
    return preprocess_text(extract_text_from_pdf(resume_path))

def extract_job_description_text(job_description_path):
    """
    Reads a job description PDF and returns its text in preprocessed form.
    Args:
    - job_description_path (str): Path to the job description PDF.

    Returns:
    - str: Preprocessed job description text.
    """
    # Raw extraction feeds straight into the shared preprocessing step
    return preprocess_text(extract_text_from_pdf(job_description_path))

def main(resume_path, job_description_path):
    """
    Compares a resume against a job description and prints the similarity.
    Args:
    - resume_path (str): Path to the resume PDF.
    - job_description_path (str): Path to the job description PDF.
    """
    # Turn both PDFs into preprocessed text (resume first, then the job
    # description) and score the pair in a single step
    similarity_percentage = compute_cosine_similarity(
        extract_resume_text(resume_path),
        extract_job_description_text(job_description_path),
    )

    # Report the score with two decimal places
    print(f"Cosine Similarity: {similarity_percentage:.2f}%")

if __name__ == "__main__":
    # Input PDFs, given as relative paths.
    # NOTE(review): "path_to_job_description.pdf" reads like a placeholder
    # name — confirm it is the intended job description file.
    resume_path = "VaibhavRaiOffCampus.pdf"
    job_description_path = "path_to_job_description.pdf"

    # Compare the two documents; main() prints the similarity percentage
    main(resume_path, job_description_path)


Cosine Similarity: 33.10%
