In [6]:
!pip install -U sentence-transformers git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview
!pip install datasets

Collecting git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview
  Cloning https://github.com/huggingface/transformers (to revision v4.56.0-Embedding-Gemma-preview) to /private/var/folders/s3/9prh7z6n5zz47n7rm8bx03fm0000gn/T/pip-req-build-_v_kejlc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /private/var/folders/s3/9prh7z6n5zz47n7rm8bx03fm0000gn/T/pip-req-build-_v_kejlc
  Running command git checkout -q 60b68e304cf4b6569b0660a13b558b929d4b0e77
  Resolved https://github.com/huggingface/transformers to commit 60b68e304cf4b6569b0660a13b558b929d4b0e77
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To updat

In [2]:
import torch
from sentence_transformers import SentenceTransformer

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "google/embeddinggemma-300M"
model = SentenceTransformer(model_id).to(device=device)

print(f"Device: {model.device}")
print(model)
print("Total number of parameters in the model:", sum([p.numel() for _, p in model.named_parameters()]))

Device: cpu
SentenceTransformer(
  (0): Transformer({'max_seq_length': 2048, 'do_lower_case': False, 'architecture': 'Gemma3TextModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 768, 'out_features': 3072, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (3): Dense({'in_features': 3072, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
  (4): Normalize()
)
Total number of parameters in the model: 307581696


In [3]:
from datasets import Dataset

dataset = [
    ["NISA口座を開設するにはどうすればよいですか？", "新しい非課税投資口座を開始する手続きは何ですか？", "通常の貯蓄口座の残高を確認したいです。"],
    ["住宅ローンの早期返済に手数料はかかりますか？", "家のローンを早く返済した場合、何か費用は発生しますか？", "この投資信託の管理手数料はいくらですか？"],
    ["医療保険の補償範囲はどのようになっていますか？", "健康保険プランの給付について教えてください。", "私の生命保険の解約ポリシーはどうなっていますか？"],
]

# リストベースのデータセットを辞書のリストに変換します。
data_as_dicts = [ {"anchor": row[0], "positive": row[1], "negative": row[2]} for row in dataset ]

# 辞書のリストからHugging Faceの`Dataset`オブジェクトを作成します。
train_dataset = Dataset.from_list(data_as_dicts)
print(train_dataset)

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 3
})


In [4]:
task_name = "STS"

def get_scores(query, documents):
  # model.encode()を呼び出して埋め込みを計算します
  query_embeddings = model.encode(query, prompt=task_name)
  doc_embeddings = model.encode(documents, prompt=task_name)

  # 埋め込みの類似度を計算します
  similarities = model.similarity(query_embeddings, doc_embeddings)

  for idx, doc in enumerate(documents):
    print("Document: ", doc, "-> 🤖 Score: ", similarities.numpy()[0][idx])

query = "非課税の積立投資を始めたいのですが、どうすればよいでしょうか？"
documents = ["NISA口座の開設", "通常貯蓄口座の開設", "住宅ローン申込ガイド"]

get_scores(query, documents)

Document:  NISA口座の開設 -> 🤖 Score:  0.5614742
Document:  通常貯蓄口座の開設 -> 🤖 Score:  0.6190605
Document:  住宅ローン申込ガイド -> 🤖 Score:  0.48814344


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from transformers import TrainerCallback

loss = MultipleNegativesRankingLoss(model)

args = SentenceTransformerTrainingArguments(
    # 必須パラメータ:
    output_dir="my-embedding-gemma",
    # オプションの訓練パラメータ:
    prompts=model.prompts[task_name],    # モデルのプロンプトを使用して訓練
    num_train_epochs=5,
    per_device_train_batch_size=1,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # オプションの追跡/デバッグパラメータ:
    logging_steps=train_dataset.num_rows,
    report_to="none",
)

class MyCallback(TrainerCallback):
    "エポック終了時にモデルを評価するコールバック"
    def __init__(self, evaluate):
        self.evaluate = evaluate # 評価関数

    def on_log(self, args, state, control, **kwargs):
        # テキスト生成を使用してモデルを評価
        print(f"ステップ {state.global_step} が完了しました。評価を実行中:")
        self.evaluate()

def evaluate():
  get_scores(query, documents)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=loss,
    callbacks=[MyCallback(evaluate)]
)
trainer.train()

In [6]:
get_scores(query, documents)

Document:  NISA口座の開設 -> 🤖 Score:  0.7229382
Document:  通常貯蓄口座の開設 -> 🤖 Score:  0.6325414
Document:  住宅ローン申込ガイド -> 🤖 Score:  0.42285493


In [7]:
# Push to Hub
model.push_to_hub("my-embedding-gemma")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...0gn/T/tmprx1p97w6/model.safetensors:   0%|          |  555kB / 1.21GB            

  ...0000gn/T/tmprx1p97w6/tokenizer.json:  99%|#########9| 33.1MB / 33.4MB            

  ...prx1p97w6/2_Dense/model.safetensors:  12%|#1        | 1.10MB / 9.44MB            

  ...prx1p97w6/3_Dense/model.safetensors:  12%|#1        | 1.10MB / 9.44MB            

'https://huggingface.co/shoya321/my-embedding-gemma/commit/9823a7d5174d405e363b753d28357570b48439f9'