In [None]:

import numpy as np
from sentence_transformers import SentenceTransformer
import ipywidgets as widgets
from matplotlib import pyplot as plt
import sys
import os
sys.path.append(os.path.abspath(".."))

In [None]:
# Notebook-wide state: the sentences under analysis and the active input mode.
sentences = []
mode = "Test"


def on_value_change(change):
    """Store the newly selected option in the module-level ``mode``.

    ``change`` is the mapping handed to the callback by the widget observer;
    only its ``'new'`` entry is read.
    """
    globals()["mode"] = change['new']

# Radio selector for the input mode; every change of its value is forwarded
# to on_value_change, which updates the global `mode`.
radio = widgets.RadioButtons(description="Select:", value='Test', options=['Test', 'Query', 'Input'])
radio.observe(on_value_change, names="value")
display(radio)

In [None]:
def _split_sentences(text):
    """Split ``text`` on ". " and drop pieces that are empty or whitespace-only.

    The previous ``text.split(". ")[:-1]`` always discarded the last piece,
    which loses a real sentence whenever the text does not end in ". " and
    yields an empty list for single-sentence input. Filtering blank pieces
    removes a trailing remnant without dropping content.
    """
    return [piece for piece in text.split(". ") if piece.strip()]


if mode == "Test":
    sentences = [
        "Given an input text, it outputs a vector which captures the semantic information",
        "By default, input text longer than 256 word pieces is truncated."
    ]
elif mode == "Query":
    # Project-local helpers, imported lazily because only this mode needs them.
    from elastic import parse_queries, query, elasticsearch_client
    import metrics

    # An empty (or whitespace-only) answer selects query 0.
    raw_query_number = input("Query number: ").strip()
    q = int(raw_query_number) if raw_query_number else 0

    query_list = parse_queries("../collection")[q:q+1]
    if len(query_list) == 0:
        # Fail here with a clear message instead of an IndexError further down.
        raise ValueError(f"No query at index {q}")

    docs, multiple_query_results = query(elasticsearch_client("../credentials.json", "../http_ca.crt"), query_list)
    # Only the first returned document of the selected query is analysed.
    sentences = _split_sentences(docs[0][0])
    print(f"{query_list[0].text}")
    print(f"Returned doc {multiple_query_results[0][0]} with relevance {metrics.relevance(multiple_query_results[0][0], query_list[0])}")
elif mode == "Input":
    doc = input("Give text: ")
    sentences = _split_sentences(doc)

if not sentences:
    # Every later cell indexes into the embeddings; stop early with a clear reason.
    raise ValueError("No sentences to embed: the selected text is empty.")

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# One embedding vector per sentence, in the same order as `sentences`.
embeddings = model.encode(sentences)

Docs to try: 30, 80.

In [None]:
# Show each sentence, stripped of surrounding whitespace, followed by a blank line.
for sentence in sentences:
    print(sentence.strip(), end="\n\n")

In [None]:
def cosine_sim(vec1, vec2) -> float:
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    Computed as their dot product divided by the product of their norms
    (``np.linalg.norm``). No guard is applied for zero-norm inputs.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    inner_product = np.dot(vec1, vec2)
    return inner_product / norm_product

# Similarity of every sentence embedding to the one that follows it, so
# series[i] compares sentence i with sentence i + 1.
series = [
    cosine_sim(current, following)
    for current, following in zip(embeddings[:-1], embeddings[1:])
]

# One value per line.
print(*series, sep="\n")

## Time series of sentence similarity
---

In [None]:
fig, ax = plt.subplots()

# series[i] is the similarity between sentence i and sentence i + 1.
# With where='post' a value is held from its own x up to the next x, so the
# last value never gets a horizontal segment (and a single value draws
# nothing at all). Repeating the last value at one extra x position gives
# every value a visible segment covering [i, i + 1).
similarities = np.asarray(series, dtype=float)
if similarities.size:
    ax.step(
        np.arange(similarities.size + 1),
        np.append(similarities, similarities[-1]),
        where='post',
    )
ax.set_xticks(np.arange(len(series)))
ax.set_xlabel("Sentence pair i (sentence i vs. sentence i + 1)")
ax.set_ylabel("Cosine similarity")
ax.set_title("Similarity between consecutive sentences")
plt.show()

## Gradient
---

In [None]:
def normalize(arr):
    """Min-max scale the absolute values of ``arr`` into the range [0, 1].

    The sign of each element is discarded first (``abs``), then the
    magnitudes are rescaled so the smallest maps to 0 and the largest to 1.

    Args:
        arr: iterable of numbers.

    Returns:
        A list with one scaled value per input element. An empty input gives
        an empty list. If all magnitudes are equal there is no spread to
        scale, so a list of zeros is returned instead of dividing by zero.
    """
    magnitudes = [abs(elem) for elem in arr]
    if not magnitudes:
        return []

    minval = min(magnitudes)
    span = max(magnitudes) - minval
    if span == 0:
        return [0.0 for _ in magnitudes]

    return [(elem - minval) / span for elem in magnitudes]

# Numerical derivative of the similarity series over the pair index.
# np.gradient needs at least two samples; with fewer (e.g. the two-sentence
# "Test" mode, which yields a single similarity) there is no change to
# measure, so the gradient is taken to be zero instead of raising.
if len(series) >= 2:
    dy = np.gradient(series, np.arange(len(series)))
else:
    dy = np.zeros(len(series))

fig2, ax2 = plt.subplots()

# normalize() rescales the gradient magnitudes (absolute values) to [0, 1].
# It is skipped for an empty gradient because there is nothing to scale.
gradient_magnitude = normalize(dy) if len(dy) else []
if len(gradient_magnitude):
    # With where='post' the last value would get no horizontal segment;
    # repeat it at one extra x position so every value is visible.
    ax2.step(
        np.arange(len(gradient_magnitude) + 1),
        np.append(gradient_magnitude, gradient_magnitude[-1]),
        where='post',
    )
ax2.set_xticks(np.arange(len(series)))
ax2.set_xlabel("Sentence pair i (sentence i vs. sentence i + 1)")
ax2.set_ylabel("Normalized |gradient| of similarity")
ax2.set_title("Change in similarity between consecutive sentence pairs")
plt.show()

# Continue

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from kmedoids import KMedoids
import hdbscan
from kneed import KneeLocator
from sklearn.decomposition import PCA

# Cluster count for the disabled experiments below; the active K-Medoids code
# does not read it.
num_clusters = 2

# The two experiments below are disabled. They used to live in bare
# triple-quoted strings (no-op expression statements) and contained stray
# quote characters that would have been syntax errors once re-enabled; they
# are kept here as ordinary comments with those quotes removed.
# NOTE: in both snippets `clusters` is created but never filled from the
# labels, so the loop would print empty lists.

# --- Disabled experiment 1: hierarchical clustering (Ward linkage) ---
# # Generate the linkage matrix
# Z = linkage(embeddings, method='ward')
# print(Z)
# labels = fcluster(Z, t=num_clusters, criterion='maxclust')
#
# # Organize each cluster elements into lists
# clusters = [[] for _ in range(num_clusters)]
#
# for i, cluster in enumerate(clusters):
#     print(f"Cluster {i:02}: {cluster}")
#
# # Evaluate clustering
# score = silhouette_score(embeddings, labels)
# print(f"\nSilhouette Score: {score:.3f}")

# --- Disabled experiment 2: HDBSCAN on precomputed cosine distances ---
# The fitted object is named `hdbscan_model` (it was `model`) so that
# re-enabling this does not overwrite the SentenceTransformer `model`, which
# a later cell still uses to encode the query.
# dista = (1 - cosine_similarity(embeddings)).astype(np.float64)
#
# clustering = hdbscan.HDBSCAN(min_cluster_size=2, metric="precomputed")
# hdbscan_model = clustering.fit(dista)
# print(hdbscan_model.labels_)
#
# clusters = [[] for _ in range(num_clusters)]
#
# for i, cluster in enumerate(clusters):
#     print(f"Cluster {i:02}: {cluster}")
#
# score = silhouette_score(embeddings, hdbscan_model.labels_)
# print(f"\nSilhouette Score: {score:.3f}")
# Pairwise cosine distances between the sentence embeddings; KMedoids is fed
# this matrix directly (metric="precomputed").
dista = cosine_distances(embeddings)

# Fixed seed so repeated runs produce the same clustering.
CLUSTER_SEED = 42
# Manual cluster count. This used to be a bare `optimal_k = 3` that silently
# replaced the detected elbow; it is kept as the default so results do not
# change. Set it to None to use the detected elbow instead.
MANUAL_K = 3

inertia = []
K_range = list(range(1, len(sentences)))

# Fit one model per candidate cluster count and record its inertia.
for k in K_range:
    clustering = KMedoids(n_clusters=k, metric="precomputed", random_state=CLUSTER_SEED)
    clustering_model = clustering.fit(dista)
    inertia.append(clustering_model.inertia_)
    print(clustering_model.inertia_)

# Elbow detection needs a curve; with fewer than two candidate counts there
# is none, so it is skipped. `elbow` may also be None when no elbow is found.
detected_k = None
if len(K_range) >= 2:
    knee_locator = KneeLocator(K_range, inertia, curve="convex", direction="decreasing")
    detected_k = knee_locator.elbow
print(f"Detected elbow: {detected_k}")

optimal_k = MANUAL_K if MANUAL_K is not None else detected_k
if optimal_k is None:
    # Neither a manual value nor an elbow is available: use a single cluster.
    optimal_k = 1
# Never ask for more clusters than there are sentences (the hard-coded 3
# previously failed for short inputs such as the two-sentence "Test" mode).
optimal_k = max(1, min(int(optimal_k), len(sentences)))
print(optimal_k)

fig, ax = plt.subplots()
ax.plot(K_range, inertia, "ro--")
ax.set_xlabel("Number of clusters k")
ax.set_ylabel("Inertia")
ax.set_title("K-Medoids inertia per cluster count")
plt.show()

# Final clustering with the chosen cluster count.
clustering = KMedoids(n_clusters=optimal_k, metric="precomputed", random_state=CLUSTER_SEED)
clustering_model = clustering.fit(dista)
medoids = clustering_model.medoid_indices_
print(f"Clustering: {clustering_model.labels_}")
print(f"Medoids: {medoids}")

Here's another idea: let's compare the query with each sentence.

In [None]:
# Rank the sentences by their similarity to the query (Query mode only;
# relies on `query_list` from the cell that loaded the sentences).
if mode == "Query":
    print(f"Query: {query_list[0].text}")
    query_vec = model.encode(query_list[0].text)

    # Cosine similarity between the query and every sentence embedding.
    # The norms were previously computed but never applied, which left a raw
    # dot product. Dividing by them makes the score independent of vector
    # length (for already unit-length embeddings the values barely change).
    embedding_norms = np.linalg.norm(embeddings, axis=1)
    query_sim = np.dot(embeddings, query_vec) / (embedding_norms * np.linalg.norm(query_vec))
    print(query_sim)

    # Sort by index rather than looking each sentence up with
    # sentences.index(): that lookup is quadratic and gives duplicate
    # sentences the score of their first occurrence. A stable sort on the
    # negated scores keeps the original order among ties.
    ranking = np.argsort(-query_sim, kind="stable")
    sorted_sentences = [sentences[i] for i in ranking]
    print(sorted_sentences)

Visualize clusters and query

In [None]:
# Project the sentence embeddings onto their first two principal components.
pca_model = PCA(n_components=2)

S = pca_model.fit_transform(embeddings)

if mode == "Query":
    # Project the query with the PCA fitted on the sentences so it shares
    # their coordinate system.
    query_vec_reduced = pca_model.transform(query_vec.reshape(1, -1))
    print(query_vec_reduced)

fig, ax = plt.subplots()

# Medoid sentences are drawn as crosses, all other sentences as circles.
markers = ["x" if i in medoids else "o" for i in range(len(sentences))]

colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

for i, elem in enumerate(S):
    # Wrap around the colour cycle so more clusters than colours cannot
    # raise an IndexError.
    color = colors[clustering_model.labels_[i] % len(colors)]
    if markers[i] == "x":
        # "x" is an unfilled marker: passing edgecolors makes matplotlib warn
        # and ignore it, so it is only set for the filled circles.
        ax.scatter(elem[0], elem[1], c=color, alpha=0.7, marker="x")
    else:
        ax.scatter(elem[0], elem[1], c=color, edgecolors='k', alpha=0.7, marker="o")

if mode == "Query":
    ax.scatter(query_vec_reduced[0, 0], query_vec_reduced[0, 1], c="red", edgecolors='k', alpha=0.7, marker="^")

ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
ax.set_title("Sentence clusters (x = medoid, o = member, ^ = query)")
plt.show()

print(clustering_model.labels_)

# silhouette_score is only defined for 2 .. n_samples - 1 distinct labels and
# raises outside that range, so report that instead of crashing.
n_distinct_labels = len(np.unique(clustering_model.labels_))
if 2 <= n_distinct_labels <= len(sentences) - 1:
    print(silhouette_score(dista, clustering_model.labels_, metric="precomputed"))
else:
    print(f"Silhouette score undefined for {n_distinct_labels} cluster(s) over {len(sentences)} sentence(s)")