# 嵌入向量(Embedding Vector)

In [3]:
from sentence_transformers import SentenceTransformer

texts = [
  '翠花买了浅蓝色的鱼',
  '翠花买了浅蓝橙色的鱼',
  '猫在商店吃了一条鱼',
  '翠花去了商店。翠花买了一只虫子。翠花看到一条鱼',
  '它对这个虫子喵喵叫了一声，它现在仍然在对这只虫子和这条鱼喵喵叫',
  '这只猫在鱼店里。这只猫是橙色的。这只猫正在对这条鱼喵喵叫。',
  '翠花是鱼'  
]

model = SentenceTransformer('./models/all-MiniLM-L6-v2')

embeddings = model.encode(texts)
embeddings.shape

(7, 384)

In [5]:
embeddings

array([[-0.00161931,  0.03403834,  0.08776227, ...,  0.06558479,
         0.06246207,  0.01248462],
       [-0.00181392,  0.02937774,  0.08251015, ...,  0.06615598,
         0.06103326,  0.01877187],
       [ 0.06052787,  0.06912147,  0.05274523, ...,  0.01745084,
        -0.02436572, -0.01723605],
       ...,
       [-0.04785175,  0.06157192,  0.03304739, ...,  0.08097716,
         0.00575176, -0.02092445],
       [ 0.05531085,  0.09378336,  0.06028539, ...,  0.08966372,
        -0.0551356 , -0.01390944],
       [-0.01261294,  0.01824456,  0.07638395, ...,  0.05835899,
         0.05905233,  0.02011564]], shape=(7, 384), dtype=float32)

In [37]:
import numpy as np

In [6]:
from sklearn.preprocessing import StandardScaler

dat = StandardScaler().fit_transform(embeddings)

In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

clustering_model = KMeans(n_clusters=2, random_state=42)
clustering_model.fit(dat)
cluster_assignment = clustering_model.labels_

for doc, cluster_id in zip(texts, cluster_assignment):
    print(f"Cluster {cluster_id}: {doc}")

silhouette_avg = silhouette_score(embeddings, cluster_assignment)
print(f"\nSilhouette Score: {silhouette_avg:.4f}")

Cluster 1: 翠花买了浅蓝色的鱼
Cluster 1: 翠花买了浅蓝橙色的鱼
Cluster 0: 猫在商店吃了一条鱼
Cluster 1: 翠花去了商店。翠花买了一只虫子。翠花看到一条鱼
Cluster 0: 它对这个虫子喵喵叫了一声，它现在仍然在对这只虫子和这条鱼喵喵叫
Cluster 0: 这只猫在鱼店里。这只猫是橙色的。这只猫正在对这条鱼喵喵叫。
Cluster 1: 翠花是鱼

Silhouette Score: 0.2724
