<a href="https://colab.research.google.com/github/SahilBeniwal22/Projects/blob/main/Resume_screening.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install pypdf2

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [2]:
import re
from PyPDF2 import PdfReader

# Clean text
def clean_text(text):
    text = re .sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.lower()

# Extract text from PDF
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return clean_text(text)

# Extract text from multiple resumes
def extract_text_from_multiple_resumes(file_paths):
    resume_texts = {}
    for file_path in file_paths:
        try:
            text = extract_text_from_pdf(file_path)
            resume_texts[file_path] = text
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    return resume_texts

# List of resume file paths
resume_files = [
    "/content/Resume.pdf",
    "/content/Aarti_Yadav_resume.pdf"
]

# Extract text for all resumes
resumes_texts = extract_text_from_multiple_resumes(resume_files)

# Display the first 500 characters of each resume's text
for file_path, text in resumes_texts.items():
    print(f"\nFile: {file_path}")
    print(f"Extracted Text (First 500 Characters): {text[:500]}")


File: /content/Resume.pdf
Extracted Text (First 500 Characters): sahil beniwal gurgaonharyana  91 8586858027 sahilbeny24gmailcom  httpswwwlinkedincominsahilbeniwal22  httpsgithubcomsahilbeniwal22 summary finalyear btech student specializing in aiml with handson experience in python designing rnn and cnn models actively developing skills in data structures and algorithms seeking opportunities for an internship or job in the field skills programming languages  java  python  html5  matlab  sql  dart libraries  frameworks  keras  numpy  pandas  scipy  dialogflow 

File: /content/Aarti_Yadav_resume.pdf
Extracted Text (First 500 Characters): aarti yadav data analyst 91 9667106415 2111aartiyadavgmailcom gurgaonharyana objective passionate and dedicated 3rdyear undergraduate pursuing a bachelors in computer applications proficient in data analysis python and sql with handson experience in projects involving data visualization and trend analysis eager to contribute to organizational growth whi

In [3]:
# Predefined list of skills
skills_list = ["python", "java", "sql","react","javascript","mongodb"]

# Extract skills from text
def extract_skills(text, skills_list):
    skills_found = [skill for skill in skills_list if skill in text]
    return skills_found

# Extract skills from multiple resumes
def extract_skills_from_multiple_resumes(resumes_texts, skills_list):
    skills_summary = {}
    for file_path, text in resumes_texts.items():
        skills_found = extract_skills(text, skills_list)
        skills_summary[file_path] = skills_found
    return skills_summary

# Example usage with multiple resumes
resume_files = [
    "/content/Resume.pdf",
    "/content/Aarti_Yadav_resume.pdf",
    "/content/Aarti's Resume-hackerresume (1).pdf"
]

# Extract text for all resumes
resumes_texts = extract_text_from_multiple_resumes(resume_files)

# Extract skills for all resumes
skills_summary = extract_skills_from_multiple_resumes(resumes_texts, skills_list)

# Display extracted skills for each resume
for file_path, skills in skills_summary.items():
    print(f"\nFile: {file_path}")
    print(f"Extracted Skills: {skills}")


Error reading /content/Aarti's Resume-hackerresume (1).pdf: [Errno 2] No such file or directory: "/content/Aarti's Resume-hackerresume (1).pdf"

File: /content/Resume.pdf
Extracted Skills: ['python', 'java', 'sql']

File: /content/Aarti_Yadav_resume.pdf
Extracted Skills: ['python', 'java', 'sql', 'javascript']


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Match resumes to a job description
def match_resumes_to_job(resumes, job_description):
    # Combine job description and resumes into one list
    documents = [job_description] + resumes
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Compute cosine similarity
    job_vector = tfidf_matrix[0]
    resume_vectors = tfidf_matrix[1:]
    similarities = cosine_similarity(job_vector, resume_vectors).flatten()

    # Rank resumes by similarity
    ranked_indices = similarities.argsort()[::-1]
    return ranked_indices, similarities

# Example usage
job_description = "Looking for a Python developer with experience in NLP and machine learning."

# List of resumes
resumes = [
    extract_text_from_pdf("/content/Resume.pdf"),
    extract_text_from_pdf("/content/Aarti_Yadav_resume.pdf")
]

# Match resumes to the job description
ranked_indices, similarities = match_resumes_to_job(resumes, job_description)

# Display similarity rankings
print("Ranking of Resumes Based on Job Description:")
for rank, idx in enumerate(ranked_indices):
    print(f"Rank {rank + 1}: Resume {idx + 1} - Similarity: {similarities[idx]:.2f}")


Ranking of Resumes Based on Job Description:
Rank 1: Resume 1 - Similarity: 0.21
Rank 2: Resume 2 - Similarity: 0.13


In [7]:
def resume_screening_pipeline(resume_files, job_description, skills_list):
    resumes_texts = []

    # Step 1: Extract and clean text from all resumes
    for file in resume_files:
        text = extract_text_from_pdf(file)
        resumes_texts.append(text)

    # Step 2: Extract skills from resumes
    skills_summary = [extract_skills(resume, skills_list) for resume in resumes_texts]

    # Step 3: Match resumes to the job description
    ranked_indices, similarities = match_resumes_to_job(resumes_texts, job_description)

    # Step 4: Compile results
    results = []
    for rank, idx in enumerate(ranked_indices):
        results.append({
            "Rank": rank + 1,
            "Resume": resume_files[idx],
            "Similarity": round(similarities[idx], 2),
            "Skills": skills_summary[idx]
        })

    return results


# Example usage
resume_files = [
    "/content/Resume.pdf",
    "/content/Aarti_Yadav_resume.pdf",
    # "/content/Ankur_Resume.pdf"  # Add more files as needed
]

# Define the job description and skills list
job_description = "Looking for a Python developer with experience in NLP and machine learning."
skills_list = ["python", "java", "sql", "html", "css", "nlp", "machine learning", "aws"]

# Run the screening pipeline
results = resume_screening_pipeline(resume_files, job_description, skills_list)

# Display results
for result in results:
    print(f"Rank: {result['Rank']}")
    print(f"Resume: {result['Resume']}")
    print(f"Similarity: {result['Similarity']}")
    print(f"Extracted Skills: {result['Skills']}")
    print("-" * 40)


Rank: 1
Resume: /content/Resume.pdf
Similarity: 0.21
Extracted Skills: ['python', 'java', 'sql', 'html']
----------------------------------------
Rank: 2
Resume: /content/Aarti_Yadav_resume.pdf
Similarity: 0.13
Extracted Skills: ['python', 'java', 'sql', 'html']
----------------------------------------
