In [None]:
!pip install transformers scikit-learn
!pip install PyPDF2
!pip install os-sys
!pip install regex
!pip install pdfplumber
!pip install datasets

In [None]:
import os
import re
import PyPDF2
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are concatenated in document order with no separator between them.
    A page for which ``extract_text()`` yields ``None`` (some pdfplumber
    versions do this for pages without extractable text) contributes an empty
    string instead of raising ``TypeError`` during concatenation.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page text, or ``""`` when no page has any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against None; join builds the result in one pass.
        return "".join(page.extract_text() or "" for page in pdf.pages)

def _first_line_after_heading(text, heading):
    """Return the stripped line that follows ``heading``, or "Not found".

    ``heading`` must be immediately followed by a newline in ``text``; the
    first such occurrence is used. Unlike a pattern that demands a trailing
    newline after the captured line, this also accepts a line that runs to
    the very end of ``text``. A heading with nothing but whitespace after it
    at the end of ``text`` is reported as "Not found".
    """
    match = re.search(re.escape(heading) + r'\n([^\n]*)', text)
    if match is None:
        return "Not found"
    value = match.group(1).strip()
    # The captured line ended at end-of-text and is blank: no content at all.
    if not value and match.end() == len(text):
        return "Not found"
    return value


def extract_information(text):
    """Pull the category, skills and education lines out of resume text.

    Args:
        text: Resume text with lines separated by ``"\\n"``.

    Returns:
        dict: Three string entries:
            * ``"Category"``  - the first line of ``text``, stripped.
            * ``"Skills"``    - the single line following a ``Skills`` line
              break, stripped, or ``"Not found"``.
            * ``"Education"`` - the single line following an ``Education``
              line break, stripped, or ``"Not found"``.
    """
    category = text.split('\n', 1)[0].strip()

    return {
        "Category": category,
        "Skills": _first_line_after_heading(text, "Skills"),
        "Education": _first_line_after_heading(text, "Education")
    }

pdf_folder = "data_resume/"

# os.listdir returns names in arbitrary order; sorting keeps the printed
# report reproducible from one run (or machine) to the next.
for filename in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so e.g. "CV.PDF" is not skipped.
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    text = extract_text_from_pdf(pdf_path)
    information = extract_information(text)

    print(f"File: {filename}")
    print(f"Category (Job role): {information['Category']}")
    print(f"Skills: {information['Skills']}")
    print(f"Education: {information['Education']}")
    print("\n")


In [None]:
from datasets import load_dataset

dataset = load_dataset("jacob-hugging-face/job-descriptions")

# Number of job descriptions taken from the start of the "train" split.
num_descriptions_to_fetch = 15
# Slice the rows first and pick the column afterwards: only the requested
# rows are read, instead of loading the whole column just to keep a few items.
descriptions = dataset["train"][:num_descriptions_to_fetch]["job_description"]

# Print each fetched description, numbered from 1, followed by a separator.
for idx, description in enumerate(descriptions, start=1):
    print(f"Job Description {idx}:\n")
    print(description)
    print("\n" + "=" * 50 + "\n")


In [6]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")


def embed_text(text):
    """Return a mean-pooled DistilBERT embedding of ``text`` as a NumPy array.

    The text is tokenized as a single sequence with ``truncation=True``, so
    input beyond the tokenizer's maximum length does not contribute to the
    vector. The token vectors of ``last_hidden_state`` are averaged over the
    sequence axis and the singleton batch axis is squeezed away.
    """
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():  # inference only: skip gradient tracking
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()


# Slice the rows first, then select the column. Indexing the column once per
# item re-reads the entire column each time and raises IndexError when the
# split holds fewer rows than requested.
job_descriptions = dataset["train"][:num_descriptions_to_fetch]["job_description"]
job_description_embeddings = [embed_text(description) for description in job_descriptions]

cv_embeddings = []
cv_information_list = []

# Sorted listing gives a reproducible CV order; both lists are filled in the
# same iteration so index k refers to the same CV in each.
for filename in sorted(os.listdir(pdf_folder)):
    # Case-insensitive extension test so e.g. "CV.PDF" is included.
    if not filename.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(pdf_folder, filename)
    text = extract_text_from_pdf(pdf_path)
    cv_information_list.append(extract_information(text))
    cv_embeddings.append(embed_text(text))

# Stack the per-document vectors into arrays with one row per document.
job_description_embeddings = np.array(job_description_embeddings)
cv_embeddings = np.array(cv_embeddings)

In [None]:
# Cosine similarity of every job-description embedding against every CV
# embedding; indexed below as [job description][CV].
similarity_scores = cosine_similarity(job_description_embeddings, cv_embeddings)

# Ascending argsort along each row, flipped so the highest score comes first.
top_cv_indices = np.flip(np.argsort(similarity_scores, axis=1), axis=1)

separator = "=" * 50 + "\n"
for job_number, description in enumerate(job_descriptions, start=1):
    row_scores = similarity_scores[job_number - 1]
    best_matches = top_cv_indices[job_number - 1][:5]

    print(f"Job Description {job_number}:\n")
    print(description)
    print("\nTop 5 CV Matches:")
    for rank, cv_index in enumerate(best_matches, start=1):
        category = cv_information_list[cv_index]['Category']
        print(f"CV {rank}: {category} (Similarity Score: {row_scores[cv_index]:.4f})")
    print(separator)