# 嵌入向量(Embedding Vector)

In [2]:
from sentence_transformers import SentenceTransformer

texts = [
  '翠花买了浅蓝色的鱼',
  '翠花买了浅蓝橙色的鱼',
  '猫在商店吃了一条鱼',
  '翠花去了商店。翠花买了一只虫子。翠花看到一条鱼',
  '它对这个虫子喵喵叫了一声，它现在仍然在对这只虫子和这条鱼喵喵叫',
  '这只猫在鱼店里。这只猫是橙色的。这只猫正在对这条鱼喵喵叫。',
  '翠花是鱼'  
]

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(texts)
embeddings.shape

(7, 384)

In [3]:
embeddings

array([[-0.00161932,  0.03403835,  0.08776221, ...,  0.06558478,
         0.06246207,  0.01248464],
       [-0.00181394,  0.02937771,  0.08251016, ...,  0.06615596,
         0.06103329,  0.01877186],
       [ 0.0605279 ,  0.06912154,  0.05274524, ...,  0.01745085,
        -0.02436568, -0.01723605],
       ...,
       [-0.04785176,  0.0615719 ,  0.0330474 , ...,  0.0809772 ,
         0.00575183, -0.02092442],
       [ 0.05531083,  0.09378334,  0.06028535, ...,  0.08966375,
        -0.05513557, -0.01390944],
       [-0.01261296,  0.01824458,  0.07638392, ...,  0.05835902,
         0.05905231,  0.02011567]], dtype=float32)

In [4]:
from sklearn.preprocessing import StandardScaler

dat = StandardScaler().fit_transform(embeddings)

In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

clustering_model = KMeans(n_clusters=2, random_state=42)
clustering_model.fit(dat)
cluster_assignment = clustering_model.labels_

for doc, cluster_id in zip(texts, cluster_assignment):
    print(f"Cluster {cluster_id}: {doc}")

silhouette_avg = silhouette_score(embeddings, cluster_assignment)
print(f"\nSilhouette Score: {silhouette_avg:.4f}")

Cluster 1: 翠花买了浅蓝色的鱼
Cluster 1: 翠花买了浅蓝橙色的鱼
Cluster 0: 猫在商店吃了一条鱼
Cluster 1: 翠花去了商店。翠花买了一只虫子。翠花看到一条鱼
Cluster 0: 它对这个虫子喵喵叫了一声，它现在仍然在对这只虫子和这条鱼喵喵叫
Cluster 0: 这只猫在鱼店里。这只猫是橙色的。这只猫正在对这条鱼喵喵叫。
Cluster 1: 翠花是鱼

Silhouette Score: 0.2724
