In [1]:
import PyPDF2
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pymongo import MongoClient

#Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

#Function to match keywords in the extracted text
def match_keywords(text, keywords):
    found_keywords = []
    for keyword in keywords:
        if re.search(rf'\b{keyword}\b', text, re.IGNORECASE):
            found_keywords.append(keyword)
    return found_keywords

#Connect to MongoDB Atlas
client = MongoClient('mongodb+srv://sidd843657:faJXWLIOSHaRPNOQ@conversationalai.nbnymmx.mongodb.net/?retryWrites=true&w=majority&appName=ConversationalAI')  # Replace with your MongoDB Atlas connection string
db1 = client['ESCO_Skills']  # Replace with your database name
db2 = client['job_database']

#Path to the resume PDF
pdf_file_path = 'resume.pdf' 

#Extract text from the resume PDF
extracted_text = extract_text_from_pdf(pdf_file_path)

#Load the skills data from MongoDB collection
skills_collection = db1['Skills'] 
skills_cursor = skills_collection.find({})
skills = [doc['preferredLabel'] for doc in skills_cursor]

#Find matching keywords
matched_keywords = match_keywords(extracted_text, skills)
print("Keywords found in the resume:", matched_keywords)

#If no keywords are matched, we can exit the script
if not matched_keywords:
    print("No matching keywords found in the resume.")
    exit()

#Combine matched keywords into a single string for vectorization
resume_text = ' '.join(matched_keywords)

#Load the job listings data from MongoDB collection
jobs_collection = db2['Job_Listings']
job_listings_cursor = jobs_collection.find({})
job_listings = pd.DataFrame(list(job_listings_cursor))

#Extract job descriptions from the dataset
job_descriptions = job_listings['Description'].tolist()

#Combine resume and job descriptions for vectorization
texts = [resume_text] + job_descriptions

#Vectorize text using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

#Compute cosine similarity between the resume and each job description
cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

#Add similarity scores to the job listings
job_listings['similarity'] = cosine_similarities

#Sort job listings by similarity in descending order and get the top 5 matches
top_matches = job_listings.sort_values(by='similarity', ascending=False).head(5)

# Display the top 5 matches with Job ID, Job Title, Company Name, Link, and similarity score
print(top_matches[['Job ID', 'Job Title', 'Company', 'Link', 'similarity']])


Keywords found in the resume: ['JavaScript', 'SQL', 'computer science', 'C++', 'CSS']
                Job ID                                          Job Title  \
1633  1f8d3ca8ab827223                         Senior Agile Web Developer   
1884  991b1e7b13f779ac  Data Engineering, Data Warehouses, Business In...   
1570  84dfe342e0f025c5                               Full Stack Developer   
1803  84dfe342e0f025c5                               Full Stack Developer   
1872  84dfe342e0f025c5                               Full Stack Developer   

                          Company  \
1633                 Adroitpeople   
1884  Sonra Intelligence Limited.   
1570             EX Squared LATAM   
1803             EX Squared LATAM   
1872             EX Squared LATAM   

                                                   Link  similarity  
1633  https://ie.indeed.com/viewjob?jk=1f8d3ca8ab827223    0.185470  
1884  https://ie.indeed.com/viewjob?jk=991b1e7b13f779ac    0.166681  
1570  https://ie.i

In [2]:
from sklearn.model_selection import KFold
import numpy as np

def get_similarity_scores(train_data, test_data, resume_text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([resume_text] + train_data['Job Description'].tolist())
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    return cosine_similarities

kf = KFold(n_splits=5)
similarity_scores = []

for train_index, test_index in kf.split(job_listings):
    train_data = job_listings.iloc[train_index]
    test_data = job_listings.iloc[test_index]
    
    scores = get_similarity_scores(train_data, test_data, resume_text)
    similarity_scores.extend(scores)

print(f"Cross-validated similarity scores: {similarity_scores}")


Cross-validated similarity scores: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03642058199640866, 0.0, 0.0, 0.05788021031835727, 0.0, 0.03006247278138807, 0.0, 0.0, 0.03440510869930476, 0.024074676670137463, 0.020578568572901107, 0.0, 0.020681684187337106, 0.0, 0.0, 0.013207217631328132, 0.009818253674986184, 0.025868235190421862, 0.03277354930972456, 0.020681684187337106, 0.0, 0.0, 0.012050824244223867, 0.025036639174075438, 0.02648499269809648, 0.0, 0.0, 0.03731872905235078, 0.015636456984026038, 0.0, 0.0, 0.024074676670137463, 0.01838235699195699, 0.0, 0.024074676670137463, 0.0, 0.0, 0.0, 0.0, 0.053317123712428256, 0.13961519796162175, 0.0609553906032536, 0.0, 0.011894259001734832, 0.015868868151158136, 0.0, 0.015762855431271695, 0.0, 0.0, 0.030303415276914866, 0.0, 0.0, 0.01851205088299576, 0.0, 0.07475525320120363, 0.04406031905368911, 0.016266366889292744, 0.0, 0.0, 0.02648499269809648, 0.0, 0.0, 0.0445017247703981, 0.019301360180696375, 0.0, 0.0, 0.03381527103388324, 0.0, 0.03499

In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Example: Let's assume you have a list of relevant job IDs for the resume
relevant_job_ids = ['84dfe342e0f025c5', '991b1e7b13f779ac', '8fe83522aacc9acc', 'c50dc7492ba30129']  

# Get the job IDs of the top 5 matches
top_5_job_ids = top_matches['Job ID'].tolist()

# Generate true labels and predicted labels
true_labels = [1 if job_id in relevant_job_ids else 0 for job_id in job_listings['Job ID']]
predicted_labels = [1 if job_id in top_5_job_ids else 0 for job_id in job_listings['Job ID']]

# Calculate precision, recall, and F1 score
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f"Precision: {precision:.2f}") 
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")




Precision: 0.80
Recall: 0.67
F1 Score: 0.73
