In [1]:
! nvcc -V

zsh:1: command not found: nvcc


In [2]:
import torch
from transformers import AutoModel

device = "mps" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to(device)

  from .autonotebook import tqdm as notebook_tqdm
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- configuration_xlm_roberta.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/jinaai/xlm-roberta-flash-implementation:
- mha.py
- rotary.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingf

In [3]:
texts = [
    "Follow the white rabbit.",  # English
    "Sigue al conejo blanco.",  # Spanish
    "Suis le lapin blanc.",  # French
    "跟着白兔走。",  # Chinese
    "اتبع الأرنب الأبيض.",  # Arabic
    "Folge dem weißen Kaninchen.",  # German
]

In [4]:
embeddings = model.encode(texts, task="text-matching")

In [5]:
print(embeddings[0] @ embeddings[1].T)

0.70863205


# 相似度計算

In [6]:
from torch.nn.functional import cosine_similarity

emb1 = model.encode(["Follow the white rabbit."], task="text-matching")
emb2 = model.encode(["Sigue al conejo blanco."], task="text-matching")

similarity = cosine_similarity(torch.tensor(emb1), torch.tensor(emb2))
print(f"相似度: {similarity.item()}")

相似度: 0.7086319923400879


# 分類

In [7]:
texts = ["I love this movie!", "This is the worst experience ever."]
embeddings = model.encode(texts, task="text-matching")

In [8]:
import torch.nn as nn
import torch.optim as optim

class SimpleClassifier(nn.Module):
    def __init__(self, input_dim):
        super(SimpleClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, 2)  # 2 classes (Positive, Negative)

    def forward(self, x):
        return self.fc(x)

classifier = SimpleClassifier(input_dim=embeddings.shape[1]).to(device)
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# 假設標籤：0 (Negative), 1 (Positive)
labels = torch.tensor([1, 0], dtype=torch.long).to(device)
outputs = classifier(torch.tensor(embeddings).to(device))
loss = loss_fn(outputs, labels)
loss.backward()
optimizer.step()

print("訓練 loss:", loss.item())

訓練 loss: 0.6902755498886108


# 向量搜尋

In [10]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

corpus = [
    "I love watching movies.",
    "Cinema is my favorite place.",
    "I enjoy hiking in the mountains.",
    "This food is really delicious!",
    "The economy is experiencing a downturn.",
    "Scientists discovered a new exoplanet.",
]

corpus_embeddings = model.encode(corpus, task="text-matching")
knn = NearestNeighbors(n_neighbors=2, metric="cosine").fit(corpus_embeddings)

query = ["Me gustan las películas."]  # 西班牙語: "I like movies."

query_embedding = model.encode(query, task="text-matching")
distances, indices = knn.kneighbors(query_embedding)
print("最相似的文本:", [corpus[i] for i in indices[0]])

最相似的文本: ['I love watching movies.', 'Cinema is my favorite place.']
