In [2]:
!apt-get update
!apt-get install -y tesseract-ocr
!pip install transformers pandas scikit-learn pdfplumber pytesseract Pillow torch torchvision torchaudio


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [60.9 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,503 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,545 kB]
Get:13 http://archive.ubuntu.com/ubuntu

In [3]:
!pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.1


In [4]:
import pdfplumber
import pytesseract
from sklearn.utils import resample
from sklearn.metrics import classification_report
from PIL import Image
import fitz
import re
from google.colab import files
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score



In [5]:
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

In [6]:
# The function extract text from PDF using pdfplumber and pytesseract
def extract_text_from_pdf(pdf_path):
    extracted_text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    extracted_text += text
                else:
                    extracted_text += extract_text_from_images(pdf_path)
    except Exception as e:
        extracted_text += extract_text_from_images(pdf_path)
    return extracted_text


In [7]:

def extract_text_from_images(pdf_path):
    extracted_text = ""
    try:
        pdf_document = fitz.open(pdf_path)
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap()
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            extracted_text += pytesseract.image_to_string(image)
    except Exception as e:
        pass
    return extracted_text

In [8]:
# The function is used to parse specific sections of the resume using regex
def parse_resume(text):
    data = {}
    education_pattern = re.compile(r"EDUCATION\s*([\s\S]+?)(?=SKILLS|WORK EXPERIENCE|$)", re.IGNORECASE)
    education_match = education_pattern.search(text)
    data["education"] = education_match.group(1).strip() if education_match else "Not Found"
    experience_pattern = re.compile(r"(WORK EXPERIENCE|EXPERIENCE)\s*([\s\S]+?)(?=PROJECTS|$)", re.IGNORECASE)
    experience_match = experience_pattern.search(text)
    data["experience"] = experience_match.group(2).strip() if experience_match else "Not Found"
    project_pattern = re.compile(r"(PROJECTS)\s*([\s\S]+?)(?=SKILLS|$)", re.IGNORECASE)
    project_match = project_pattern.search(text)
    data["projects"] = project_match.group(2).strip() if project_match else "Not Found"
    skills_pattern = re.compile(r"(SKILLS|TECHNICAL SKILLS)\s*([\s\S]+?)(?=WORK EXPERIENCE|$)", re.IGNORECASE)
    skills_match = skills_pattern.search(text)
    data["technical_skills"] = skills_match.group(2).strip() if skills_match else "Not Found"
    return data

In [9]:
def fine_tune_bert(dataset_path):
    data = pd.read_csv(dataset_path)
    labels = pd.factorize(data["Category"])[0]
    category_mapping = dict(enumerate(pd.factorize(data["Category"])[1]))


In [10]:
# The function is used to fine tune the BERT model for job classification
def fine_tune_bert(dataset_path):
    # Load the dataset from a CSV file
    data = pd.read_csv(dataset_path)

    # Encode the target labels as numeric values and create a category mapping dictionary
    labels = pd.factorize(data["Category"])[0]
    category_mapping = dict(enumerate(pd.factorize(data["Category"])[1]))

    # Define a custom PyTorch Dataset class for processing text data
    class JobDataset(torch.utils.data.Dataset):
        def __init__(self, texts, labels, tokenizer, max_len):
            """
            Initializes the dataset class.

            Args:
                texts (list): List of input texts.
                labels (list): List of corresponding labels.
                tokenizer (BertTokenizer): Tokenizer for BERT.
                max_len (int): Maximum sequence length for tokenization.
            """
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_len = max_len

        def __len__(self):
            """Returns the total number of samples in the dataset."""
            return len(self.texts)

        def __getitem__(self, idx):
            """
            Returns a single data point as input_ids, attention_mask, and label.

            Args:
                idx (int): Index of the data point.

            Returns:
                dict: Contains tokenized input_ids, attention_mask, and labels.
            """
            text = self.texts[idx]
            label = self.labels[idx]

            # Tokenize the input text using BERT's tokenizer
            encoding = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,   # Add [CLS] and [SEP] tokens
                max_length=self.max_len,   # Truncate/pad to the specified max length
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
                return_tensors="pt",
            )
            return {
                "input_ids": encoding["input_ids"].squeeze(0),
                "attention_mask": encoding["attention_mask"].squeeze(0),
                "labels": torch.tensor(label, dtype=torch.long),
            }

    # Load the pre-trained BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Create the dataset and split it into training and validation sets (80-20 split)
    dataset = JobDataset(data["Resume"], labels, tokenizer, max_len=128)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    # Load the pre-trained BERT model with a classification head
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(category_mapping))

    # Define a function to compute evaluation metrics: Accuracy and F1-score
    def compute_metrics(eval_pred):
        """
        Computes accuracy and F1-score for model evaluation.

        Args:
            eval_pred (tuple): Contains logits and true labels.

        Returns:
            dict: A dictionary containing accuracy and F1-score.
        """
        logits, labels = eval_pred
        preds = logits.argmax(axis=1)  # Convert logits to predicted labels
        accuracy = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds, average="weighted")
        return {"accuracy": accuracy, "f1_score": f1}

    # Define training arguments for the Trainer API
    training_args = TrainingArguments(
        output_dir="./results",              # Directory to save results
        evaluation_strategy="epoch",         # Evaluate the model at the end of each epoch
        save_strategy="epoch",               # Save the model checkpoint at the end of each epoch
        save_total_limit=2,                  # Limit the total saved checkpoints
        learning_rate=2e-5,                  # Learning rate for optimization
        per_device_train_batch_size=16,      # Batch size for training
        per_device_eval_batch_size=16,       # Batch size for evaluation
        num_train_epochs=20,                 # Total number of training epochs
        weight_decay=0.01,                   # Weight decay for regularization
        logging_dir="./logs",                # Directory for logging
        logging_steps=10,                    # Log every 10 steps
        load_best_model_at_end=True,         # Load the best model based on validation loss
        metric_for_best_model="eval_loss",   # Metric to monitor for early stopping
        greater_is_better=False,             # Lower eval_loss is better
        report_to="none",                    # Disable reporting to external loggers
        save_steps=500,                      # Save checkpoint every 500 steps
        lr_scheduler_type="reduce_lr_on_plateau",  # Adjust learning rate on plateau
    )

    # Initialize the Hugging Face Trainer
    trainer = Trainer(
        model=model,                       # BERT model for sequence classification
        args=training_args,                # Training arguments
        train_dataset=train_dataset,       # Training dataset
        eval_dataset=val_dataset,          # Validation dataset
        tokenizer=tokenizer,               # Tokenizer for preprocessing
        compute_metrics=compute_metrics,   # Function to compute evaluation metrics
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Early stopping callback
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model and tokenizer
    model.save_pretrained("./fine_tuned_bert")
    tokenizer.save_pretrained("./fine_tuned_bert")

    # Return the trained model, tokenizer, and category mapping
    return model, tokenizer, category_mapping

In [24]:
def generate_recommendations(resume_text, job_data, model, tokenizer, max_len=128):
    """
    Generates job recommendations by comparing resume text embeddings with job description embeddings.

    Args:
        resume_text (str): The parsed resume text.
        job_data (DataFrame): A DataFrame containing job descriptions.
        model (BERT): The fine-tuned BERT model for embedding generation.
        tokenizer (BertTokenizer): The tokenizer corresponding to the BERT model.
        max_len (int, optional): Maximum sequence length for tokenization. Default is 128.

    Returns:
        DataFrame: Top 10 job recommendations sorted by similarity score.
    """
    # Check if GPU is available; otherwise, fallback to CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move the model to the specified device

    # Encode and tokenize the input resume text
    resume_encoding = tokenizer.encode_plus(
        resume_text,
        max_length=max_len,
        add_special_tokens=True,  # Add [CLS] and [SEP] tokens
        padding="max_length",     # Pad to max length
        truncation=True,          # Truncate sequences longer than max_len
        return_attention_mask=True,
        return_tensors="pt",      # Return PyTorch tensors
    )

    # Move the encoded tokens to the appropriate device
    resume_encoding = {key: value.to(device) for key, value in resume_encoding.items()}

    #Generate resume embeddings using the BERT model
    with torch.no_grad():  # Disable gradient calculation for inference
        resume_embedding = model.bert(
            input_ids=resume_encoding["input_ids"],
            attention_mask=resume_encoding["attention_mask"],
        ).last_hidden_state.mean(dim=1).cpu().numpy()  # Compute the mean of the last hidden states

    # Generate embeddings for all job descriptions
    job_embeddings = []  # List to store job embeddings
    for description in job_data["Resume"]:  # Iterate through job descriptions
        # Tokenize and encode each job description
        job_encoding = tokenizer.encode_plus(
            description,
            max_length=max_len,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Move the encoded tokens to the appropriate device
        job_encoding = {key: value.to(device) for key, value in job_encoding.items()}

        # Generate embeddings for each job description
        with torch.no_grad():
            job_embedding = model.bert(
                input_ids=job_encoding["input_ids"],
                attention_mask=job_encoding["attention_mask"],
            ).last_hidden_state.mean(dim=1).cpu().numpy()

        # Append the job embedding to the list
        job_embeddings.append(job_embedding)

    #Convert the list of job embeddings to a NumPy array
    job_embeddings = torch.tensor(job_embeddings, dtype=torch.float32).squeeze().numpy()

    #Calculate cosine similarity between the resume and job embeddings
    similarities = cosine_similarity(resume_embedding, job_embeddings)

    #Add similarity scores to the job dataset
    job_data["Similarity"] = similarities.flatten()  # Flatten similarity scores to 1D array

    #Remove duplicate job descriptions (if any) and sort by similarity
    job_data = job_data.drop_duplicates(subset=["Resume"])
    recommendations = job_data.sort_values(by="Similarity", ascending=False).head(10).reset_index(drop=True)

    return recommendations



In [12]:
uploaded = files.upload()
pdf_file_path = list(uploaded.keys())[0]
resume_text = extract_text_from_pdf(pdf_file_path)
parsed_data = parse_resume(resume_text)
print("Parsed Resume:")
for key, value in parsed_data.items():
    print(f"{key.capitalize()}: {value}")


Saving resume.pdf to resume.pdf
Parsed Resume:
Education: Masters of Science in Computer Engineering Aug 2023 - Dec 2024
Virginia Tech, Blacksburg CGPA: 4.0
B.Tech in Electronics and Communication Engineering Sep 2017 - May 2021
National Insititute of Technology, Karnataka CGPA: 7.35/10
Professional Experience
RTL Systems Design Intern, Skyworks Solutions, Austin May 2024 - Aug 2024
• Designed Skyworks proprietary communication protocol test driver on Zynq ZCU 102 FPGA, used Vivado flow to
generate bit stream and VBScript to test driver on PetaLinux Environment
• Executed Silicon Post-Validation tests on prototype test chip for Skyworks Solutions
Embedded Hardware Engineer, Bharat Electronics Limited, Bangalore Oct 2021 - Jun 2023
• Designed and Tested Microchip PIC microcontroller based Display System for aerospace applications using OrCAD
• Tested with I2C interface and developed Embedded C code that would communicate with board
• Designed and Tested 24v down-converted circuit board 

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [14]:
uploaded_dataset = files.upload()
dataset_path = list(uploaded_dataset.keys())[0]
model, tokenizer, category_mapping = fine_tune_bert(dataset_path)

Saving Updated_Job_Dataset.csv to Updated_Job_Dataset.csv


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,3.1008,2.964345,0.391667,0.3279
2,2.6111,2.333276,0.575,0.54422
3,2.1336,1.746516,0.708333,0.685526
4,1.5241,1.442227,0.7,0.686422
5,1.4636,1.258203,0.716667,0.696842
6,1.2234,1.155845,0.7,0.694697
7,1.0583,1.105559,0.708333,0.696795
8,1.033,1.084383,0.708333,0.699074
9,1.1577,1.076729,0.708333,0.700926
10,1.0122,1.073672,0.708333,0.700556


In [20]:
job_data_uploaded = files.upload()
job_data_path = list(job_data_uploaded.keys())[0]
job_data = pd.read_csv(job_data_path)


Saving Updated_Job_Dataset.csv to Updated_Job_Dataset (1).csv


In [21]:
parsed_resume_text = (
    f"Education: {parsed_data['education']} "
    f"Experience: {parsed_data['experience']} "
    f"Projects: {parsed_data['projects']} "
    f"Skills: {parsed_data['technical_skills']}"
)

In [28]:
recommendations = generate_recommendations(parsed_resume_text, job_data, model, tokenizer)
print("\nTop Recommended Jobs:")
print(recommendations[["Category", "Similarity"]])


Top Recommended Jobs:
            Category  Similarity
0  Backend Developer    0.500571
1  Backend Developer    0.463420
2  Backend Developer    0.462671
3  Backend Developer    0.461088
4  Backend Developer    0.460023
5  Backend Developer    0.459191
6  Backend Developer    0.457737
7  Backend Developer    0.452712
8  Backend Developer    0.452524
9  Backend Developer    0.452454
