In [1]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0


In [2]:
import spacy
# Fetch the "en_core_web_sm" model package so that spacy.load("en_core_web_sm") can find it.
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:

!pip install PyPDF2
!pip install pdfminer
import os
from io import StringIO
import PyPDF2
import docx
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

import spacy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained NLP model for entity recognition
# NOTE(review): `nlp` is not referenced by any code visible in this cell — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_file):
    """Extract text from a PDF file.

    Every page yielded by ``PDFPage.get_pages`` is run through a pdfminer
    ``TextConverter`` that writes into an in-memory buffer.

    Args:
        pdf_file: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The text accumulated in the buffer for all processed pages.

    Raises:
        OSError: If ``pdf_file`` cannot be opened.
        Any exception raised by pdfminer while reading pages propagates
        unchanged; the converter and the buffer are still closed first.
    """
    manager = PDFResourceManager()

    # The buffer is a context manager so it is released even when parsing fails.
    with StringIO() as text_io:
        converter = TextConverter(manager, text_io, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(pdf_file, 'rb') as file:
                pages = PDFPage.get_pages(
                    file,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the converter on both the success and the error path.
            converter.close()

        # Read the buffer after the converter is closed, before the buffer itself closes.
        return text_io.getvalue()

def extract_text_from_docx(docx_file):
    """Return the text of a DOCX file, with paragraphs separated by newlines."""
    document = docx.Document(docx_file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return '\n'.join(paragraph_texts)

def rank_candidates(job_description, resumes):
    """Rank candidates based on similarity with job description.

    The job description and all resumes are vectorized together with TF-IDF,
    then each resume is scored by cosine similarity to the job description.

    Args:
        job_description: Text of the job description.
        resumes: Iterable of resume texts (a list, tuple, or any other
            iterable of strings).

    Returns:
        numpy.ndarray: 0-based resume indices ordered from most to least
        similar. Empty when ``resumes`` is empty.
    """
    # Materialize once so tuples/generators work and emptiness can be checked.
    resumes = list(resumes)
    if not resumes:
        # Nothing to rank; without this guard cosine_similarity would be handed
        # a resume matrix with zero rows.
        return np.array([], dtype=np.intp)

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fit on the job description and resumes together so they share one vocabulary;
    # row 0 is the job description, rows 1.. are the resumes.
    tfidf_matrix = vectorizer.fit_transform([job_description] + resumes)

    # Calculate cosine similarity between job and each resume
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    # argsort is ascending, so reverse it to put the highest similarity first
    ranked_candidates = similarities.ravel().argsort()[::-1]

    return ranked_candidates

def analyze_job_description(job_description_file, resumes):
    """Print the resumes in ranked order for a job description stored as a PDF.

    Args:
        job_description_file: Path to the job description; it is always read
            with ``extract_text_from_pdf``.
        resumes: Resume texts, passed through to ``rank_candidates``.

    Returns:
        None. One ``Rank N: Resume M`` line is printed per resume, where both
        numbers are 1-based.
    """
    # Read the job description and score the resumes against it in one step
    ordering = rank_candidates(extract_text_from_pdf(job_description_file), resumes)

    # Report best match first
    for rank, idx in enumerate(ordering):
        print(f"Rank {rank+1}: Resume {idx + 1}")

# Example usage: one job description and the resumes to score against it
job_description_file = '/content/Document (9).pdf'
resume_files = [
    '/content/candidate1.pdf',
    '/content/candidate2.pdf',
    '/content/candidate3.pdf',
]
resumes = []

for resume_file in resume_files:
    file_extension = os.path.splitext(resume_file)[1].lower()

    # Anything that is neither DOCX nor PDF is read as UTF-8 plain text
    if file_extension == '.docx':
        resume_text = extract_text_from_docx(resume_file)
    elif file_extension == '.pdf':
        resume_text = extract_text_from_pdf(resume_file)
    else:
        with open(resume_file, 'r', encoding='utf-8') as file:
            resume_text = file.read()

    resumes.append(resume_text)

# Rank the loaded resumes against the job description and print the order
analyze_job_description(job_description_file, resumes)

Rank 1: Resume 2
Rank 2: Resume 1
Rank 3: Resume 3
