In [3]:
import os
import re
import string
import pandas as pd
import numpy as np
import pdfplumber
import torch
import contractions
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Extract from PDF

In [4]:
def extract_information(pdf_path):
    """Extract all text from a PDF file using pdfplumber."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            resume_text = ""
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    resume_text += " " + page_text
        return resume_text.strip()
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""

In [5]:
text = extract_information('Shivansh_Tyagi_Resume.pdf')

In [6]:
print(text)

Shivansh Tyagi
# shivelite2003@gmail.com (cid:132) +91-7456833022 (cid:240) Shivansh Tyagi § ShivanshTyagi9
Summary
SkilledinPython,dataanalysis,andmachinelearning,withexperienceinbuildingpredictivemodels,automation
tools, and interactive applications.
Education
Raj Kumar Goel Institute of Technology Ghaziabad, UP
B.Tech in Computer Science and Engineering Expeted Graduation, 2026
◦ Concentrations: Computational Fundamentals
◦ CGPA: 8
◦ Coursework: Data Structure and Algorithms, Operating System, Database Management System, Com-
puter Architecture, Machine Learning, OOPS
Projects
Handwritten-Digit-Recognition github repo 2
◦ Used TensorFlow and Keras to develop a Convolutional Neural Network (CNN) for handwritten digit
recognition with the MNIST dataset, achieving 99.12 accuracy, enhancing classification performance and
model efficiency.
◦ Used MNIST dataset to train the model.
◦ Tools used: Python, NumPy, Pandas, Tensorflow / Keras, Matplotlib
Fruits and Vegetable Classifier github re

In [7]:
def extract_details(resume_text):
    """Extract Skills and Education sections using regex."""
    # Pattern for Skills (assuming header "Skills" followed by newline)
    skills_pattern = r'Skills\n([\s\S]*?)(?=\n[A-Z]|$)'
    # Pattern for Education (assuming header "Education" followed by newline)
    education_pattern = r'Education\n([\s\S]*?)(?=\n[A-Z][a-z]*\n|$)'
    
    skills_match = re.findall(skills_pattern, resume_text, re.DOTALL)
    education_match = re.findall(education_pattern, resume_text, re.DOTALL)
    
    skills = skills_match[0] if skills_match else ""
    education = education_match[0] if education_match else ""
    
    return {'Skills': skills, 'Education': education}

In [8]:
extracted = extract_details(text)

In [9]:
print(extracted)

{'Skills': 'Languages: Python, Java, C', 'Education': 'Raj Kumar Goel Institute of Technology Ghaziabad, UP\nB.Tech in Computer Science and Engineering Expeted Graduation, 2026\n◦ Concentrations: Computational Fundamentals\n◦ CGPA: 8\n◦ Coursework: Data Structure and Algorithms, Operating System, Database Management System, Com-\nputer Architecture, Machine Learning, OOPS'}


In [10]:
def text_cleaning(text: str) -> str:
    """Clean and normalize text for embedding."""
    if pd.isnull(text):
        return ""
    # Lowercase and trim
    text = text.lower().strip()
    # Expand contractions
    text = contractions.fix(text)
    # Remove URLs, emails, phone numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text)
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.strip()

In [11]:
%%time
def build_candidate_dataset(data_folder='data'):
    """
    Walk through the data folder (with subfolders by category),
    extract resume text from each PDF, extract details (Skills, Education),
    and store metadata.
    """
    candidate_data = []
    for category in os.listdir(data_folder):
        category_path = os.path.join(data_folder, category)
        if os.path.isdir(category_path):
            for file in os.listdir(category_path):
                if file.lower().endswith('.pdf'):
                    pdf_path = os.path.join(category_path, file)
                    full_text = extract_information(pdf_path)
                    details = extract_details(full_text)
                    # Use filename (without extension) as unique ID
                    candidate_id = os.path.splitext(file)[0]
                    candidate_data.append({
                        'ID': candidate_id,
                        'Category': category,
                        'Skills': details.get('Skills', ''),
                        'Education': details.get('Education', '')
                    })
    # Create DataFrame
    df = pd.DataFrame(candidate_data)
    # Optionally, remove resumes where both Skills and Education are missing
    df = df[~((df['Skills'] == "") & (df['Education'] == ""))].reset_index(drop=True)
    # Concatenate Skills and Education to form the complete CV text
    df['CV'] = (df['Skills'] + " " + df['Education']).fillna("")
    # Clean the concatenated CV text
    df['CV'] = df['CV'].apply(text_cleaning)
    return df

CPU times: total: 0 ns
Wall time: 0 ns


In [12]:
cv_df = build_candidate_dataset(data_folder='data')
print("Candidate dataset shape:", cv_df.shape)

Candidate dataset shape: (2460, 5)


In [13]:
cv_df.to_csv('./pdf_extracted_skills_education.csv', index=False)

In [14]:
jd_data = load_dataset('jacob-hugging-face/job-descriptions', split="train")
jd_df = pd.DataFrame(jd_data)
# Clean the job description text
jd_df['clean_jd'] = jd_df['job_description'].apply(text_cleaning)
# For the prototype, we use a subset (e.g. first 15 JDs)
num_jds = 15
job_descriptions = jd_df['clean_jd'][:num_jds].tolist()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|███████████████████████████████████████████████| 853/853 [00:00<00:00, 6147.49 examples/s]


In [15]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.eval()  # Set model to evaluation mode

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [16]:
def get_embedding(text):
    """Generate an embedding for a given text using mean pooling."""
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)
    # Mean pooling over the sequence length dimension
    embedding = outputs.last_hidden_state.mean(dim=1).numpy()[0]
    return embedding

In [17]:
%%time
print("Generating embeddings for job descriptions...")
jd_embeddings = [get_embedding(jd) for jd in job_descriptions]
print("Generating embeddings for candidate CVs...")
resume_embeddings = [get_embedding(cv) for cv in cv_df['CV'].tolist()]

Generating embeddings for job descriptions...
Generating embeddings for candidate CVs...
CPU times: total: 13min 20s
Wall time: 2min 6s


In [18]:
print("JD embedding shape:", jd_embeddings[0].shape)
print("Resume embedding shape:", resume_embeddings[0].shape)

JD embedding shape: (768,)
Resume embedding shape: (768,)


In [19]:
similarity_matrix = cosine_similarity(jd_embeddings, resume_embeddings)

# For each JD, pick the top N candidate indices based on similarity
num_top_candidates = 5
top_candidates = {}

for jd_index, scores in enumerate(similarity_matrix):
    candidate_indices = np.argsort(scores)[::-1][:num_top_candidates]
    top_candidates[jd_index] = [(idx, scores[idx]) for idx in candidate_indices]

In [21]:
for jd_index, candidate_list in top_candidates.items():
    position_title = jd_df.loc[jd_index, 'position_title'] if jd_index < len(jd_df) else "N/A"
    print(f"\nTop candidates for JD {jd_index+1} - Position: {position_title}")
    for candidate_index, score in candidate_list:
        candidate_info = cv_df.iloc[candidate_index]
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f} - {candidate_info['Category']}/{candidate_info['ID']}.pdf")


Top candidates for JD 1 - Position: Sales Specialist
  Candidate 1942 - Similarity Score: 0.9415 - HR/18827609.pdf
  Candidate 291 - Similarity Score: 0.9388 - AGRICULTURE/62994611.pdf
  Candidate 28 - Similarity Score: 0.9377 - ACCOUNTANT/16237710.pdf
  Candidate 1796 - Similarity Score: 0.9314 - HEALTHCARE/10466208.pdf
  Candidate 2145 - Similarity Score: 0.9303 - PUBLIC-RELATIONS/12237267.pdf

Top candidates for JD 2 - Position: Apple Solutions Consultant
  Candidate 168 - Similarity Score: 0.9236 - ADVOCATE/22391901.pdf
  Candidate 904 - Similarity Score: 0.9165 - BUSINESS-DEVELOPMENT/95382114.pdf
  Candidate 1724 - Similarity Score: 0.9159 - FITNESS/21238396.pdf
  Candidate 950 - Similarity Score: 0.9155 - CHEF/21869994.pdf
  Candidate 482 - Similarity Score: 0.9146 - ARTS/54100393.pdf

Top candidates for JD 3 - Position: Licensing Coordinator - Consumer Products
  Candidate 2145 - Similarity Score: 0.9496 - PUBLIC-RELATIONS/12237267.pdf
  Candidate 1186 - Similarity Score: 0.945