## Installation

In [1]:
# This notebook is a playground for experimenting with the embedding layer.
# Learn about Instructor embeddings: https://instructor-embedding.github.io/
# See also this Colab notebook: https://colab.research.google.com/drive/1P7ivNLMosHyG7XOHmoh7CoqpXryKy3Qt?usp=sharing


## Get started

In [2]:
import pandas as pd

In [3]:

from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
import plotly.express as px

def cluster_embeddings(embeddings, n_clusters=5):
    """Fit a MiniBatchKMeans model on ``embeddings`` and return the fitted estimator.

    Args:
        embeddings: Data passed straight to ``MiniBatchKMeans.fit``.
        n_clusters: Number of clusters to form (defaults to 5).

    Returns:
        The fitted ``MiniBatchKMeans`` instance.
    """
    # The seed is pinned via random_state=0.
    kmeans = MiniBatchKMeans(
        n_init="auto",
        random_state=0,
        n_clusters=n_clusters,
    )
    # fit() hands back the estimator itself, so the fitted model is returned directly.
    return kmeans.fit(embeddings)

def plot_clusters(embeddings, plot_title, cluster_labels, hover_data):
    """Project ``embeddings`` to 2D with PCA and show them as a Plotly scatter plot.

    Args:
        embeddings: Data handed to ``PCA.fit_transform``.
        plot_title: Title of the figure.
        cluster_labels: Values used to colour the points.
        hover_data: Extra per-point data for the hover tooltip; it is wrapped
            in a single-element list before being passed to Plotly.
    """
    # Reduce the embeddings to their first two principal components.
    coords_2d = PCA(n_components=2).fit_transform(embeddings)
    x_coords, y_coords = coords_2d[:, 0], coords_2d[:, 1]

    # Red -> white -> blue gradient used for the colour values.
    red_white_blue = [(0, 'red'), (0.5, 'white'), (1, 'blue')]

    scatter_fig = px.scatter(
        x=x_coords,
        y=y_coords,
        color=cluster_labels,
        hover_data=[hover_data],
        color_continuous_scale=red_white_blue,
        title=plot_title,
    )

    scatter_fig.show()


In [8]:
# from InstructorEmbedding import INSTRUCTOR
# model_instructor_large = INSTRUCTOR('hkunlp/instructor-large')
# sentences = []
# words = ["king", "queen", "man", "woman"]
# for word in words:
#     sentences.append(['Represent the gendered nouns: ', word])

# embeddings = model_instructor_large.encode(sentences)
# print ("Embeddings calculated")
# print(embeddings[0])

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Words whose embeddings are compared and plotted further down.
words = ["king", "queen", "man", "woman"]

# Load the 'thenlper/gte-base' SentenceTransformer model and encode every word in one call.
model = SentenceTransformer('thenlper/gte-base')
embeddings = model.encode(words)

def similarity_of_words(word1:str, word2:str):
    """Print the cosine similarity between the embeddings of two words.

    Both words are encoded in a single call to the module-level ``model``
    and the resulting pair of vectors is compared with ``cos_sim``.

    Args:
        word1: First word to embed.
        word2: Second word to embed.
    """
    first_vec, second_vec = model.encode([word1, word2])
    score = cos_sim(first_vec, second_vec)
    print(f"Similarity of {word1} and {word2} is {score}")

# Word pairs to compare, in the order their similarities are reported.
word_pairs = [
    ("king", "queen"),
    ("man", "woman"),
    ("king", "man"),
    ("queen", "woman"),
]
for first_word, second_word in word_pairs:
    similarity_of_words(first_word, second_word)

# Show the raw embedding vector of the first entry in `words` ("king").
print("Embedding of king is: ", embeddings[0])


Similarity of king and queen is tensor([[0.8485]])
Similarity of man and woman is tensor([[0.8427]])
Similarity of king and man is tensor([[0.8048]])
Similarity of queen and woman is tensor([[0.8088]])
Embedding of king is:  [-9.90746077e-04 -1.00354496e-02  1.66349635e-02 -2.58153286e-02
  5.89105710e-02 -6.32930826e-03  5.24831228e-02  1.56586093e-03
 -2.22019292e-02 -5.50420992e-02 -3.01425089e-03 -3.94923538e-02
 -5.57869673e-02  9.08218906e-04  2.18543736e-03  4.94766496e-02
  6.53793141e-02 -4.44121519e-03  2.95886062e-02 -4.39819768e-02
  4.01348947e-03 -2.63583940e-03  3.43802981e-02  4.38258089e-02
  2.20788340e-03 -1.82565451e-02 -1.37781510e-02 -3.03469803e-02
 -5.89865632e-02  1.22006480e-02  2.71383766e-02 -1.54632404e-02
  6.30066358e-03 -1.78010240e-02 -8.19579605e-03 -1.93056222e-02
  2.92157382e-02 -2.55258996e-02 -3.26940743e-03 -8.29239655e-03
 -1.34332841e-02 -6.11021998e-04  1.36845009e-02 -1.48683591e-02
 -2.32913624e-02 -4.11055237e-03  2.59014848e-03  1.88581534

In [7]:

# Split the word embeddings into two clusters, then plot them with the words as hover text.
clustering_model = cluster_embeddings(embeddings=embeddings, n_clusters=2)
plot_clusters(
    embeddings=embeddings,
    plot_title="Embeddings",
    cluster_labels=clustering_model.labels_,
    hover_data=words,
)