<a href="https://colab.research.google.com/github/Sweta-Das/LangChain-HuggingFace-LLM/blob/SentenceTransformers/Symmetric_BE_ST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %%capture
# ^ Uncomment the line above to hide the installer output (cell magics go on the first line of a cell).
# Install the libraries this notebook depends on; no versions are pinned.
%pip -q install langchain sentence-transformers transformers

In [None]:
from langchain import HuggingFaceHub
import numpy as np
import sys, random
import time
import os

In [None]:
# Expose the Hugging Face access token through the HUGGINGFACEHUB_API_TOKEN
# environment variable.
# A token that is already present in the environment is left untouched, so a
# real credential is never overwritten by the placeholder string below.
# Prefer exporting the token outside the notebook; do not commit a real token here.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'HUGGINGFACEHUB_API_TOKEN'

### Symmetric Semantic Search Bi-Encoder

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from scipy.spatial.distance import cosine

# Load the tokenizer and the model - the package takes care of downloading them automatically
# For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-125M-weightedmean-nli-bitfit")
model = AutoModel.from_pretrained("Muennighoff/SGPT-125M-weightedmean-nli-bitfit")
# Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
model.eval()

# Tokenize input texts (shorter texts are padded to the longest one in the batch)
texts = [
    "deep learning",
    "artificial intelligence",
    "deep diving",
    "artificial snow",
]
batch_tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    # Get hidden state of shape [bs, seq_len, hid_dim].
    # Only the final layer is pooled below, so the per-layer hidden states
    # (output_hidden_states=True) are not requested.
    last_hidden_state = model(**batch_tokens, return_dict=True).last_hidden_state

# Get weights of shape [bs, seq_len, hid_dim]: token position i (1-based) gets weight i
weights = (
    torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
    .unsqueeze(0)
    .unsqueeze(-1)
    .expand(last_hidden_state.size())
    .float().to(last_hidden_state.device)
)

# Get attn mask of shape [bs, seq_len, hid_dim]; masked (padding) positions contribute 0
input_mask_expanded = (
    batch_tokens["attention_mask"]
    .unsqueeze(-1)
    .expand(last_hidden_state.size())
    .float()
)

# Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

embeddings = sum_embeddings / sum_mask

# Calculate cosine similarities of the first text against each of the others
# Cosine similarities are in [-1, 1]. Higher means more similar
# (scipy's `cosine` is a distance, hence the `1 - ...`)
cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3 = [
    1 - cosine(embeddings[0], embeddings[other]) for other in (1, 2, 3)
]

for other, similarity in zip((1, 2, 3), (cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3)):
    print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[other], similarity))

tokenizer_config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

Cosine similarity between "deep learning" and "artificial intelligence" is: 0.591
Cosine similarity between "deep learning" and "deep diving" is: 0.563
Cosine similarity between "deep learning" and "artificial snow" is: 0.370


In [None]:
# Compare the second text (texts[1]) against every other text in the list.
anchor = 1
others = (0, 2, 3)

# scipy's `cosine` is a distance, so similarity = 1 - distance
cosine_sim_1_0, cosine_sim_1_2, cosine_sim_1_3 = [
    1 - cosine(embeddings[anchor], embeddings[j]) for j in others
]

for j, sim in zip(others, (cosine_sim_1_0, cosine_sim_1_2, cosine_sim_1_3)):
    print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[anchor], texts[j], sim))

Cosine similarity between "artificial intelligence" and "deep learning" is: 0.591
Cosine similarity between "artificial intelligence" and "deep diving" is: 0.365
Cosine similarity between "artificial intelligence" and "artificial snow" is: 0.497


In [None]:
# Display the tokenizer output: `input_ids` and `attention_mask` for the four texts.
# In the output below, padded positions carry attention_mask 0.
batch_tokens

{'input_ids': tensor([[22089,  4673, 50256],
        [  433,  9542,  4430],
        [22089, 23186, 50256],
        [  433,  9542,  6729]]), 'attention_mask': tensor([[1, 1, 0],
        [1, 1, 1],
        [1, 1, 0],
        [1, 1, 1]])}

### Symmetric Semantic Search Bi-Encoder with Sentence Transformers

In [None]:
# Install sentence-transformers directly from its GitHub repository (no tag or commit is pinned).
# NOTE(review): the first cell already installs sentence-transformers via %pip - confirm this second install is still needed.
%pip install git+https://github.com/UKPLab/sentence-transformers.git

In [None]:
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

# Same four phrases as in the manual-pooling example earlier in the notebook.
texts = ["deep learning", "artificial intelligence", "deep diving", "artificial snow"]

# Load the same checkpoint through SentenceTransformer and encode all texts in one call.
model = SentenceTransformer("Muennighoff/SGPT-125M-weightedmean-nli-bitfit")
embeddings = model.encode(texts)

# Similarity of the first text to each of the remaining ones
# (scipy's `cosine` is a distance, so similarity = 1 - distance).
query_idx = 0
candidate_idxs = (1, 2, 3)
cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3 = [
    1 - cosine(embeddings[query_idx], embeddings[idx]) for idx in candidate_idxs
]

for idx, score in zip(candidate_idxs, (cosine_sim_0_1, cosine_sim_0_2, cosine_sim_0_3)):
    print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[query_idx], texts[idx], score))

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/116k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Cosine similarity between "deep learning" and "artificial intelligence" is: 0.591
Cosine similarity between "deep learning" and "deep diving" is: 0.563
Cosine similarity between "deep learning" and "artificial snow" is: 0.370


**Referenced from:**<br>
[**SGPT Symmetric Bi-Encoder Sentence Transformer**](https://huggingface.co/bigscience-data/sgpt-bloom-1b7-nli)