In [1]:
import os
import pdfplumber
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy


c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
#extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return ''.join(page.extract_text() for page in pdf.pages if page.extract_text())


In [4]:
 #for preprocessing text
def preprocess(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


In [5]:
# Function to calculate similarity using keyword matching
def calculate_similarity(job_requirements, resume_text):
    # Create a CountVectorizer with binary values (1 if word is present, 0 otherwise)
    vectorizer = CountVectorizer(binary=True, stop_words=ENGLISH_STOP_WORDS)
    
    # Transform job requirements and resume text into binary vectors
    job_vector = vectorizer.fit_transform([job_requirements])
    resume_vector = vectorizer.transform([resume_text])

    # Calculate cosine similarity
    similarity = cosine_similarity(job_vector, resume_vector)[0][0]

    return similarity


In [6]:
 # path containing resumes
resume_folder_path = 'C:/Users/DELL/Desktop/res/resumes'


In [8]:
# job requirements
job_requirements = "SQL, javascript, HTML, Java, CSS, Python, communication skill, project management"


In [9]:
# Iterate through resumes and calculate similarity
print("Similarity Scores of All Resumes:")
similarity_scores = []  # List to store similarity scores
for filename in os.listdir(resume_folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(resume_folder_path, filename)
        resume_text = extract_text_from_pdf(pdf_path)
        processed_resume = preprocess(resume_text)

        # Calculate similarity using keyword matching
        similarity = calculate_similarity(job_requirements, processed_resume)
        similarity_scores.append((pdf_path, similarity * 100))  # Store the score

        print(f"Resume: {pdf_path}, Similarity Score: {similarity * 100:.2f}%")


Similarity Scores of All Resumes:
Resume: C:/Users/DELL/Desktop/res/resumes\10089434.pdf, Similarity Score: 77.46%
Resume: C:/Users/DELL/Desktop/res/resumes\10247517.pdf, Similarity Score: 70.71%
Resume: C:/Users/DELL/Desktop/res/resumes\11187796.pdf, Similarity Score: 63.25%
Resume: C:/Users/DELL/Desktop/res/resumes\94230796.pdf, Similarity Score: 54.77%
Resume: C:/Users/DELL/Desktop/res/resumes\VIDHI_RESUME.pdf, Similarity Score: 94.87%


In [10]:
# Set a threshold for similarity score
threshold = 70 
top_matches = [score for score in similarity_scores if score[1] >= threshold]


In [11]:
# Display the top matching resumes
print("\nShortlisted Resumes:")
for match in top_matches:
    print(f"Resume: {match[0]}, Similarity Score: {match[1]:.2f}%")



Shortlisted Resumes:
Resume: C:/Users/DELL/Desktop/res/resumes\10089434.pdf, Similarity Score: 77.46%
Resume: C:/Users/DELL/Desktop/res/resumes\10247517.pdf, Similarity Score: 70.71%
Resume: C:/Users/DELL/Desktop/res/resumes\VIDHI_RESUME.pdf, Similarity Score: 94.87%


In [18]:
nlp = spacy.load("en_core_web_sm")
def extract_requirements_from_resume(resume_text):
    doc = nlp(resume_text.lower()) 

    extracted_info = {
        "sql": False,
        "javascript": False,
        "html": False,
        "java": False,
        "communication skill":False,
        "project management":False
    }
    for sentence in doc.sents:
        processed_sentence = preprocess(sentence.text)
        if any(keyword in processed_sentence for keyword in ["sql", "javascript", "html","java","communication skill","project management"]):
            extracted_info["sql"] = True

        if "javascript" in processed_sentence:
            extracted_info["javascript"] = True

        if "html" in processed_sentence:
            extracted_info["html"] = True

        if "java" in processed_sentence:
            extracted_info["java"] = True

        if "communication skill" in processed_sentence:
            extracted_info["communication skill"] = True

        if "project management" in processed_sentence:
            extracted_info["project management"] = True
        
        return extracted_info

# Extract information from each shortlisted resume
for resume_path, _ in top_matches:
    resume_text = extract_text_from_pdf(resume_path)
    extracted_info = extract_requirements_from_resume(resume_text)
    
    print(f"\nExtracted skill from {resume_path}:")
    for key, value in extracted_info.items():
        print(f"{key.replace('_', ' ').title()}: {value}")



Extracted skill from C:/Users/DELL/Desktop/res/resumes\10089434.pdf:
Sql: False
Javascript: False
Html: False
Java: False
Communication Skill: False
Project Management: False

Extracted skill from C:/Users/DELL/Desktop/res/resumes\10247517.pdf:
Sql: False
Javascript: False
Html: False
Java: False
Communication Skill: False
Project Management: False

Extracted skill from C:/Users/DELL/Desktop/res/resumes\VIDHI_RESUME.pdf:
Sql: False
Javascript: False
Html: False
Java: False
Communication Skill: False
Project Management: False


In [14]:
import PyPDF2
import re
# PDF Text Extraction and Preprocessing
def extract_text_from_pdf(file_path):
    text = ''
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return text

def extract_and_preprocess_texts_from_folder(folder_path):
    pdf_texts = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            extracted_text = extract_text_from_pdf(file_path)
            cleaned_text = preprocess(extracted_text)
            pdf_texts[filename] = cleaned_text
    return pdf_texts

folder_path = 'C:/Users/DELL/Documents/intern/Task3/resumes'  # Replace with the path to your folder
pdf_texts = extract_and_preprocess_texts_from_folder(folder_path)

for filename, text in pdf_texts.items():
    print(f"Cleaned text from {filename}:")
    print(text)
    print("--------------------------------------------------\n")

Cleaned text from 10089434.pdf:
information technology technician summary versatile system administrator possessing superior troubleshooting skill networking issue end user problem network security experienced server management system analysis offering understanding infrastructure area independent focused taking systematic approach solving complex problem demonstrated exceptional technical knowledge skill working various team achieve shared goal objective highlight active directory group policy object powershell vbscript microsoft exchange vmware experience new technology product research office 365 azure storage management enterprise backup management disaster recovery experience information technology technician aug 2007 current company name city state migrating managing user account microsoft office 365 exchange online creating managing virtual machine system domain controller active directory federation service adfs microsoft window azure iaa creating managing storage microsoft win

In [19]:
# Skill extraction using regex
def extract_skills_regex(text, skill_dict):
    extracted_skills = set()
    for skill, variations in skill_dict.items():
        for variation in variations:
            if re.search(r'\b' + re.escape(variation) + r'\b', text, re.IGNORECASE):
                extracted_skills.add(skill)
    return extracted_skills

folder_path = 'C:/Users/DELL/Documents/intern/Task3/resumes'  
pdf_texts = extract_and_preprocess_texts_from_folder(folder_path)


skill_lemma_dict = {
    "python": ["python", "py", "pandas", "numpy", "scipy", "flask", "django"],
    "java": ["java", "jvm", "spring", "hibernate", "maven", "gradle"],
    "javascript": ["javascript", "js", "node.js", "react.js", "vue.js", "angular", "typescript"],
    "html": ["html"],
    "css": ["css", "sass", "less"],
    'communication skill': ['communication', 'interpersonal'],
    "sql": ["sql", "mysql", "postgresql", "oracle", "sql server"],
    
}

for filename, text in pdf_texts.items():
    skills_from_regex = extract_skills_regex(text, skill_lemma_dict)
    print(f"File: {filename}")
    print("Skills Extracted using Regex:", skills_from_regex)
    print("--------------------------------------------------\n")

File: 10089434.pdf
Skills Extracted using Regex: {'java', 'sql', 'html'}
--------------------------------------------------

File: 10247517.pdf
Skills Extracted using Regex: {'communication skill', 'sql'}
--------------------------------------------------

File: 11187796.pdf
Skills Extracted using Regex: {'communication skill'}
--------------------------------------------------

File: 94230796.pdf
Skills Extracted using Regex: {'communication skill'}
--------------------------------------------------

File: VIDHI_RESUME.pdf
Skills Extracted using Regex: {'python', 'html', 'javascript', 'java', 'communication skill', 'sql'}
--------------------------------------------------

