Understanding the distinct roles these models play is key to harnessing their potential:

BERT excels in tasks such as sentiment analysis, question answering, and named entity recognition, where word-level granularity is crucial.
Sentence Transformers are the preferred choice for semantic similarity assessments, text matching, and document retrieval tasks, where capturing the essence of entire sentences or paragraphs is essential.

In [10]:
%pip install sentence-transformers
%pip install transformers
%pip install datasets
%pip install faiss
%pip install torch

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m396.7 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torchvision (from sentence-transformers)
  Downloading torchvision-0.16.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.6 kB)
Downloading torchvision-0.16.1-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[

In [17]:
import pandas as pd
import re
import nltk
from datasets import load_dataset

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from transformers import AutoTokenizer, AutoModel

from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
import logging
import torch

In [12]:
# Fetch the resume samples from the Hugging Face Hub, convert the train split
# to a DataFrame, drop rows with missing values and keep the first 10 rows
# as a small working sample.
raw_splits = load_dataset("talanAI/resumesamples")
dataset = pd.DataFrame(raw_splits['train']).dropna()[:10]

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokens = tokenizer(dataset['Resume'].tolist(), return_tensors='pt', truncation=True, padding=True)



In [16]:
# Candidate pre-trained checkpoints:
# multi-qa-mpnet-base-dot-v1  <= best quality
# all-mpnet-base-v2
# multi-qa-distilbert-cos-v1
# multi-qa-MiniLM-L6-cos-v1 <= faster

# Fine-tune the model with the unsupervised denoising auto-encoder objective (TSDAE).
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

# DenoisingAutoEncoderDataset tokenizes with nltk when it deletes words to create
# the noisy inputs, so the tokenizer data has to be available locally.
# Newer nltk releases look for "punkt_tab", older ones for "punkt".
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)

# Load a pre-trained transformer model.
# models.Transformer loads through Hugging Face AutoModel, which needs the full
# hub id including the organisation prefix (the short name only resolves when
# passed directly to SentenceTransformer(...)).
model_name = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Train on the resumes loaded in the dataset cell above.
sentences = dataset['Resume'].tolist()
assert sentences, "No resumes available for fine-tuning"

# DenoisingAutoEncoderLoss expects (noisy sentence, original sentence) pairs.
# The denoising dataset generates them on the fly; a DataLoader over raw
# strings cannot be collated by model.fit.
train_dataset = DenoisingAutoEncoderDataset(sentences)
train_data = DataLoader(train_dataset, shuffle=True, batch_size=16)

# Define the training procedure (the decoder is tied to the encoder by default)
train_loss = losses.DenoisingAutoEncoderLoss(model)

# Scale the warm-up to the actual number of optimizer steps. A fixed warm-up
# longer than the whole run keeps the learning rate at (almost) zero, so the
# model would never be updated on a small dataset.
num_epochs = 1
total_steps = len(train_data) * num_epochs
warmup_steps = int(0.1 * total_steps)

# Train the model
model.fit(train_objectives=[(train_data, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps)

.gitattributes: 100%|██████████| 737/737 [00:00<00:00, 6.06MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.51MB/s]
README.md: 100%|██████████| 11.5k/11.5k [00:00<00:00, 21.7MB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 1.24MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 447kB/s]
data_config.json: 100%|██████████| 25.5k/25.5k [00:00<00:00, 47.0MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:04<00:00, 19.0MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 252kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 732kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 13.5MB/s]
tokenizer_config.json: 100%|██████████| 383/383 [00:00<00:00, 2.17MB/s]
train_script.py: 100%|██████████| 13.8k/13.8k [00:00<00:00, 53.3MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 967kB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 2.49MB/s]


In [15]:
# Encode every resume into an embedding tensor, then compare all resumes
# against each other with cosine similarity.
resume_texts = dataset['Resume'].tolist()
embeddings = model.encode(resume_texts, convert_to_tensor=True)
cosine_scores = util.cos_sim(embeddings, embeddings)

In [5]:
# # Closer to 1 means more similar, closer to 0 means unrelated, closer to -1 means opposite
# cosine_sim_1 = torch.nn.functional.cosine_similarity(hr_admin_vector.unsqueeze(0), hr_assistant_vector.unsqueeze(0))
# print("Cosine similarity:", cosine_sim_1.item())

# cosine_sim_2 = torch.nn.functional.cosine_similarity(hr_admin_vector.unsqueeze(0), hr_executive_vector.unsqueeze(0))
# print("Cosine similarity:", cosine_sim_2.item())

# cosine_sim_3 = torch.nn.functional.cosine_similarity(hr_assistant_vector.unsqueeze(0), hr_executive_vector.unsqueeze(0))
# print("Cosine similarity:", cosine_sim_3.item())


Cosine similarity: 0.8887186050415039
Cosine similarity: 0.9065752625465393
Cosine similarity: 0.9094988703727722
