In [1]:
import os
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define the folder path
folder_path = 'resume dataset/data/test'

# Initialize an empty list to store the preprocessed resumes
preprocessed_resumes = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
	# Check if the file is a PDF
	if filename.endswith('.pdf'):
		with open(os.path.join(folder_path, filename), 'rb') as f:
			pdf = PyPDF2.PdfReader(f)
			text = ''
			for page in range(len(pdf.pages)):
				text += pdf.pages[page].extract_text()
			
			# Tokenize the text
			tokenized_text = word_tokenize(text)
			
			# Remove stop words
			stop_words = set(stopwords.words('english'))
			filtered_text = [word for word in tokenized_text if word not in stop_words]
			
			# Lemmatize words
			lemmatizer = WordNetLemmatizer()
			lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
			
			# Join the words back into a string
			preprocessed_resume = ' '.join(lemmatized_text)
			
			# Add the preprocessed resume to the list
			preprocessed_resumes.append(preprocessed_resume)

In [20]:
import nltk

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
for i in preprocessed_resumes:
    print(i)

DATA ANALYST Professional Summary Industrial Systems Engineering graduate , certified Base SAS Programmer Lean Six Sigma Green Belt strong background statistic , mathematics logical problem solving looking dynamic opportunity data driven field analytics statistical modeling . Core Qualifications Data Science Tools : R , Base SAS , Python ( Numpy , Pandas , Matplotlib , Scikit- learn ) , SPSS , Minitab , MATLAB , Apache Spark , SQL , MS Excel , MS Visio , Tableau MySQL , Oracle Database , Microsoft Access Key Competencies : Data Extraction , Data Wrangling , Data Analysis , Data Visualization , Regression Analysis ( Linear , Logistic Multinomial ) , Time Series Analysis , Association Rule Mining , Monte Carlo Simulation , Optimization , Random Forests Experience 07/2016 Current Data Analyst Company Name ï¼​ State 09/2015 05/2016 Student Manager Company Name ï¼​ State Undertook leadership advisory role training newcomer hone culinary behavioral skill . PROJECTS Classification Customers C

In [22]:
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [23]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def extract_keywords(text):
	# Tokenize the text
	tokens = word_tokenize(text)
	
	# Remove stop words
	stop_words = set(stopwords.words('english'))
	keywords = [word for word in tokens if word not in stop_words]
	
	# Use NLTK's FreqDist to get the most common words
	fdist = FreqDist(keywords)
	
	# Get the top 5 keywords
	keywords = [word for word, freq in fdist.most_common(5)]
	
	return keywords

def sentiment_analysis(text):
	# Use NLTK's SentimentIntensityAnalyzer
	sia = SentimentIntensityAnalyzer()
	
	# Get the sentiment scores
	scores = sia.polarity_scores(text)
	
	return scores

In [24]:
import spacy

# Load the spacy model for entity recognition
nlp = spacy.load("en_core_web_sm")

keywords_list = []
sentiment_scores_list = []
entities_list = []
pos_tags_list = []

# Loop through each preprocessed resume
for resume in preprocessed_resumes:
	# Extract keywords and phrases
	keywords = extract_keywords(resume)
	keywords_list.append(keywords)
	
	# Sentiment Analysis
	sentiment_scores = sentiment_analysis(resume)
	sentiment_scores_list.append(sentiment_scores)
	
	# Entity Recognition
	entities = []
	for entity in nlp(resume).ents:
		entities.append(entity.text)
	entities_list.append(entities)
	
	# Part-of-speech tagging
	pos_tags = nltk.pos_tag(word_tokenize(resume))
	pos_tags_list.append(pos_tags)

# Display the extracted features for each resume
for i, resume in enumerate(preprocessed_resumes):
	print(f"Resume {i+1}:")
	print("Keywords and phrases:", keywords_list[i])
	print("Sentiment analysis:", sentiment_scores_list[i])
	print("Entities:", entities_list[i])
	print("Part-of-speech tags:", pos_tags_list[i])



Resume 1:
Keywords and phrases: [',', '.', 'Data', 'Analysis', 'Engineering']
Sentiment analysis: {'neg': 0.02, 'neu': 0.85, 'pos': 0.131, 'compound': 0.9902}
Entities: ['Numpy', 'Pandas', 'Matplotlib', 'SPSS', 'Minitab', 'MATLAB', 'Apache Spark', 'SQL', 'MS Excel', 'MS Visio', 'Tableau', 'Oracle Database', 'Microsoft Access Key Competencies', 'Data Extraction', 'Data Wrangling', 'Data Analysis', 'Data Visualization , Regression Analysis', 'Time Series Analysis', 'State Undertook', 'PROJECTS Classification Customers Credit Card Company', 'Jan 2016', 'May 2016', 'Dec 2015', 'MATLAB', 'ExpertFit', 'Arena', '6.2 minute 1.8 minute', 'Statistical Analysis Defects Clutch Plate Manufacturing - A Six Sigma Study', 'Jan 2015', 'May 2015', 'DMAIC', 'Define , Measure', 'Improve Control', 'Minitab', '300 %', 'Random Forest', 'Public Storage', 'State University New York Industrial Systems Engineering', '3.51/4.00', 'May 2014', 'Mechanical Engineering Osmania University Mechanical Engineering', 'Ski

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


job_descriptions = ["Currently pursuing a Bachelor’s or Master’s degree in Computer Science, Software Engineering, or a related field.Strong programming skills Familiarity with API development. Experience with version control systems (e.g., Git) Excellent problem-solving abilities and a strong desire to learn in a fast-paced startup environment. Effective communication skills and the ability to work both independently and collaboratively in a team."]

required_skills = ["Java"]

# Create a CountVectorizer object
vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer to the preprocessed resumes and job descriptions
vectorizer.fit(preprocessed_resumes + job_descriptions)

# Transform the resumes and job descriptions into vectors
resume_vectors = vectorizer.transform(preprocessed_resumes)
job_description_vectors = vectorizer.transform(job_descriptions)

# Calculate the cosine similarity between resume vectors and job description vectors
cosine_similarities = cosine_similarity(resume_vectors, job_description_vectors)

# Print the cosine similarity scores
for i, similarity in enumerate(cosine_similarities):
	print(f"Resume {i+1} and Job Description {i+1} similarity: {similarity}")

# Repeat the process for required skills/qualifications
required_skills_vectors = vectorizer.transform(required_skills)
cosine_similarities_skills = cosine_similarity(resume_vectors, required_skills_vectors)

for i, similarity in enumerate(cosine_similarities_skills):
	print(f"Resume {i+1} and Required Skills {i+1} similarity: {similarity}")

Resume 1 and Job Description 1 similarity: [0.88646647]
Resume 2 and Job Description 2 similarity: [0.95986936]
Resume 3 and Job Description 3 similarity: [0.937353]
Resume 4 and Job Description 4 similarity: [0.93042877]
Resume 5 and Job Description 5 similarity: [0.92611099]
Resume 6 and Job Description 6 similarity: [0.96127017]
Resume 7 and Job Description 7 similarity: [0.90409442]
Resume 8 and Job Description 8 similarity: [0.95456672]
Resume 9 and Job Description 9 similarity: [0.95700881]
Resume 10 and Job Description 10 similarity: [0.93646529]
Resume 11 and Job Description 11 similarity: [0.96487334]
Resume 12 and Job Description 12 similarity: [0.91549355]
Resume 13 and Job Description 13 similarity: [0.86243017]
Resume 14 and Job Description 14 similarity: [0.85674442]
Resume 15 and Job Description 15 similarity: [0.8653104]
Resume 16 and Job Description 16 similarity: [0.85723119]
Resume 17 and Job Description 17 similarity: [0.85664647]
Resume 18 and Job Description 18 si

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# ... (rest of your code)

# Create a CountVectorizer object
vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer to the preprocessed resumes and job descriptions
vectorizer.fit(preprocessed_resumes + job_descriptions)

# Transform the resumes and job descriptions into vectors
resume_vectors = vectorizer.transform(preprocessed_resumes)
job_description_vectors = vectorizer.transform(job_descriptions)

# Calculate the cosine similarity between resume vectors and job description vectors
cosine_similarities = cosine_similarity(resume_vectors, job_description_vectors)

# Print the cosine similarity scores
for i, similarity in enumerate(cosine_similarities):
    print(f"Resume {i+1} and Job Description similarity: {similarity[0]}")

Resume 1 and Job Description similarity: 0.886466465096174
Resume 2 and Job Description similarity: 0.9598693627975485
Resume 3 and Job Description similarity: 0.9373529986209265
Resume 4 and Job Description similarity: 0.9304287748921176
Resume 5 and Job Description similarity: 0.9261109868402054
Resume 6 and Job Description similarity: 0.9612701665379259
Resume 7 and Job Description similarity: 0.9040944205165665
Resume 8 and Job Description similarity: 0.9545667170869343
Resume 9 and Job Description similarity: 0.9570088148082956
Resume 10 and Job Description similarity: 0.9364652893423902
Resume 11 and Job Description similarity: 0.9648733376050865
Resume 12 and Job Description similarity: 0.9154935457740526
Resume 13 and Job Description similarity: 0.8624301737201348
Resume 14 and Job Description similarity: 0.8567444189367546
Resume 15 and Job Description similarity: 0.8653103959557913
Resume 16 and Job Description similarity: 0.8572311875241316
Resume 17 and Job Description simi