In [9]:
import os
import PyPDF2
import re
def extract_cv_details(pdf_path):
    cv_details = {}
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text = page.extract_text()
        
            category = re.search(r'Category\s*(.*?)\n', text, re.IGNORECASE)
            skills = re.search(r'Skills\s*(.*?)\n', text, re.IGNORECASE)
            education = re.search(r'Education\s*(.*?)\n', text, re.IGNORECASE)
            if category:
                cv_details['Category'] = category.group(1).strip()
            if skills:
                cv_details['Skills'] = skills.group(1).strip()
            if education:
                cv_details['Education'] = education.group(1).strip()
    return cv_details
def process_cvs_in_directory(directory_path):
    cv_details_list = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            cv_details = extract_cv_details(pdf_path)
            cv_details_list.append(cv_details)
    return cv_details_list
cv_directory = r"C:\Users\reddy\Desktop\INPUT"
cv_details_list = process_cvs_in_directory(cv_directory)
print(cv_details_list)


[{'Skills': 'ADA compliance, auditing, computer programming, contracts, CPR, credit, customer satisfaction, customer', 'Education': 'al Background'}, {'Category': 'Silver Paragon Award', 'Education': '1994', 'Skills': 'academic, ads, advertising, banners, brochures, budget, conferences, special events, market research, marketing, materials, newspaper, office'}, {'Skills': 'and continue to gain experience in the aerospace industry to advance in my career.', 'Education': 'and Training'}, {'Education': 'al attainment and professional experience in Energy Engineering, Project Engineering, Building Energy Systems, Energy Conservation,', 'Skills': 'Sales, Proposals, Solutions, Commercial Buildings, Million, Sales And, Contracts, Ecms, Energy Conservation, Energy Solutions, Industrial'}, {'Skills': 'classes,Â became a lead and trained and supervised', 'Education': ', and community referrals. Per attorney request, evaluated, wrote'}, {'Skills': 'Microsoft Office Suite: Word, Excel, Publisher, 

In [16]:


import datasets

dataset = datasets.load_dataset("jacob-hugging-face/job-descriptions")
job_descriptions = dataset['train']['job_description'][:15]
for idx, description in enumerate(job_descriptions):
    print(f"Job Description {idx + 1}:\n{description}\n")


Downloading readme: 100%|██████████| 24.0/24.0 [00:00<?, ?B/s]
Downloading data: 100%|██████████| 3.77M/3.77M [00:01<00:00, 1.89MB/s]
Downloading data files: 100%|██████████| 1/1 [00:02<00:00,  2.02s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 47.36it/s]
Generating train split: 853 examples [00:00, 6318.73 examples/s]


Job Description 1:
minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organiz

In [19]:

from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def text_to_embedding(text):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)
    return outputs.last_hidden_state.mean(dim=1)

def calculate_similarity(embedding1, embedding2):
    sim = cosine_similarity(embedding1, embedding2)
    return sim[0][0]
top_matches = {}
for job_desc in job_descriptions:
    job_embedding = text_to_embedding(job_desc)
    matches = []

    for cv_details in cv_details_list:
        cv_text = cv_details.get('Category', '') + ' ' + cv_details.get('Skills', '') + ' ' + cv_details.get('Education', '')
        cv_embedding = text_to_embedding(cv_text)
        similarity = calculate_similarity(job_embedding, cv_embedding)
        matches.append((cv_details, similarity))

    matches.sort(key=lambda x: x[1], reverse=True)
    top_matches[job_desc] = matches[:5]
for job_desc, matches in top_matches.items():
    print(f"Job Description: {job_desc}\n")
    for cv_details, similarity in matches:
        print(f"Similarity Score: {similarity}")
        print(f"CV Details: {cv_details}\n")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Job Description: minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organizat

In [18]:
from tabulate import tabulate
for job_desc, matches in top_matches.items():
    print("=" * 50)
    print(f"Job Description: {job_desc}\n")
    table_data = []

    for cv_details, similarity in matches:
        table_data.append([f"Similarity Score: {similarity}", f"CV Details: {cv_details}"])

    print(tabulate(table_data, headers=["", ""], tablefmt="fancy_grid"))


Job Description: minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organizat