In [2]:
import os, math, random
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from numpy.linalg import norm
import torch
from huggingface_hub import login
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader

if "HUGGINGFACE_HUB_TOKEN" in os.environ:
    try:
        login(token=os.environ["HUGGINGFACE_HUB_TOKEN"])
    except Exception:
        pass

MODEL_ID = "google/embeddinggemma-300m"
CSV_PATH = "pairs.csv"
OUTPUT_DIR = "ft-embeddinggemma"

SEED = 42

random.seed(SEED); np.random.seed(SEED)

USE_AMP = torch.cuda.is_available()
DEVICE  = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")

# ---- 居酒屋ドリンク予測データ読み込み ----
# データ少ないから全部学習データにする
train_df = pd.read_csv(CSV_PATH).drop_duplicates()

train_samples = [
    InputExample(texts=[r.query.strip(), r.item.strip()])
    for r in train_df.itertuples(index=False)
]
# train_samples = [
#     InputExample(texts=["唐揚げ", "ハイボール"]),
#     InputExample(texts=["枝豆", "ビール"]),
#     ...
# ]


In [None]:
EPOCHS = 2
BASE_BATCH_SIZE = 4
# MAX_SEQ_LEN = 64
# ---- Model ----
model = SentenceTransformer(MODEL_ID, device=DEVICE)
# model.max_seq_length = MAX_SEQ_LEN

# ---- DataLoader ----
BATCH_SIZE = min(BASE_BATCH_SIZE, max(1, len(train_samples)))
train_loader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

# ---- Loss ----
# バッチ内の正解以外をNegativeとして扱う
# アンカー：「唐揚げ」
# ポジティブ：「ハイボール」
# ネガティブ：「枝豆」「冷奴」「レモンサワー」など
# 唐揚げとハイボールの距離を近づけて、それ以外の距離を遠ざける指標
train_loss = losses.MultipleNegativesRankingLoss(model)


steps_per_epoch = max(1, len(train_loader))
WARMUP_RATIO = 0.1
warmup_steps = int(steps_per_epoch * EPOCHS * WARMUP_RATIO)

# ---- 学習 ----
model.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps, # 学習率を徐々に上げていくスケジュール設定らしい
    output_path=OUTPUT_DIR,
)

print(f"✅ saved to: {OUTPUT_DIR}")

In [3]:
def rank(q, docs, m, topk=10):
    import numpy as np
    from numpy.linalg import norm
    qv = m.encode_query(q)
    dv = m.encode_document(docs)
    sims = (qv @ dv.T) / (norm(qv) * norm(dv, axis=1))
    order = np.argsort(-sims)[:topk]
    return [(docs[i], float(sims[i])) for i in order]

# CSVから「item（=ドリンク）」のユニーク集合を候補に
corpus_texts = sorted(set(train_df["item"].astype(str).tolist()))
seen = set()
demo_candidates = []
for x in corpus_texts:
    if x not in seen:
        demo_candidates.append(x); seen.add(x)

demo_queries = ["唐揚げ", "チーズ唐揚げ"]

base = SentenceTransformer(MODEL_ID, device=DEVICE, use_auth_token=True)
ft   = SentenceTransformer(OUTPUT_DIR, device=DEVICE)

def show_demo(q):
    print(f"\n### Query: {q}")
    print("Before:")
    for t, s in rank(q, demo_candidates, base, topk=5):
        print(f"  {s:.3f}  {t}")
    print("After:")
    for t, s in rank(q, demo_candidates, ft, topk=5):
        print(f"  {s:.3f}  {t}")

for q in demo_queries:
    show_demo(q)





### Query: 唐揚げ
Before:


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0.251  熱燗
  0.232  焼酎
  0.209  ビール
  0.200  チューハイ
  0.196  ホッピー
After:
  0.555  ハイボール
  0.407  焼酎
  0.380  チューハイ
  0.349  ホッピー
  0.321  泡盛

### Query: チーズ唐揚げ
Before:
  0.213  チューハイ
  0.200  ビール
  0.195  焼酎
  0.192  熱燗
  0.176  生ビール
After:
  0.515  ハイボール
  0.332  チューハイ
  0.301  泡盛
  0.296  ホッピー
  0.295  焼酎
