In [4]:
import os, math, random
import warnings

import numpy as np
import pandas as pd
import torch
from huggingface_hub import login
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer, losses, InputExample
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Authenticate against the Hugging Face Hub only when a non-empty token is
# supplied through the environment. Login stays best-effort (a failure does
# not abort the notebook), but it is no longer silent: a swallowed failure
# would otherwise only show up later as an opaque model-download error.
hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
if hf_token:
    try:
        login(token=hf_token)
    except Exception as exc:
        warnings.warn(
            f"Hugging Face login failed ({type(exc).__name__}: {exc}); "
            "downloading gated models may fail."
        )

# ---- Configuration ----
MODEL_ID = "google/embeddinggemma-300m"   # base checkpoint to fine-tune
CSV_PATH = "pairs.csv"                    # input CSV; must provide "query" and "item" columns
OUTPUT_DIR = "ft-embeddinggemma"          # directory the fine-tuned model is written to

EPOCHS = 2
BASE_BATCH_SIZE = 4    # upper bound; the effective batch size is capped by the dataset size
MAX_SEQ_LEN = 64       # assigned to model.max_seq_length
LR = 2e-5
WARMUP_RATIO = 0.1     # fraction of the total training steps used for warmup
SEED = 42

# Seed every RNG this notebook relies on. PyTorch must be seeded as well:
# seeding only `random` and NumPy leaves torch-driven randomness (e.g. the
# shuffling DataLoader) different on every run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---- Device / AMP (mixed precision is enabled on CUDA only) ----
# Preference order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
USE_AMP = DEVICE == "cuda"

# ---- Data ----
# Load the (query, item) pairs. Whitespace is normalized *before*
# de-duplicating so rows that differ only by stray spaces collapse into one
# pair, and rows whose text is blank after stripping are dropped.
df = (
    pd.read_csv(CSV_PATH)
    .dropna(subset=["query", "item"])
    .astype(str)
    .assign(
        query=lambda d: d["query"].str.strip(),
        item=lambda d: d["item"].str.strip(),
    )
    .loc[lambda d: (d["query"] != "") & (d["item"] != "")]
    .drop_duplicates(subset=["query", "item"])
)

# Hold out 20% as a dev split only when there are more than 5 rows;
# otherwise train on everything and leave the dev split empty.
if len(df) > 5:
    train_df, dev_df = train_test_split(df, test_size=0.2, random_state=SEED, shuffle=True)
else:
    train_df, dev_df = df, df.iloc[0:0]

# One positive (query, item) pair per training example.
train_samples = [
    InputExample(texts=[query_text, item_text])
    for query_text, item_text in zip(train_df["query"], train_df["item"])
]
assert len(train_samples) > 0, "train_samples が 0 件です。CSV 行数を増やしてください。"

# ---- Model ----
model = SentenceTransformer(MODEL_ID, device=DEVICE)
model.max_seq_length = MAX_SEQ_LEN  # apply the configured sequence-length limit

# ---- Dataloader ----
# The batch size never exceeds the number of training samples and never drops below 1.
n_train = len(train_samples)
BATCH_SIZE = min(BASE_BATCH_SIZE, max(1, n_train))
loader_options = dict(
    shuffle=True,
    batch_size=BATCH_SIZE,
    drop_last=False,
    num_workers=0,
    pin_memory=False,
)
train_loader = DataLoader(train_samples, **loader_options)

# ---- Loss ----
train_loss = losses.MultipleNegativesRankingLoss(model)

# ---- Warmup schedule ----
# Warmup covers WARMUP_RATIO of all optimizer steps (at least one step per epoch is assumed).
steps_per_epoch = max(1, len(train_loader))
total_steps = steps_per_epoch * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

# ---- Training (evaluator disabled to save memory) ----
fit_options = {
    "train_objectives": [(train_loader, train_loss)],
    "epochs": EPOCHS,
    "warmup_steps": warmup_steps,
    "optimizer_params": {"lr": LR},
    "output_path": OUTPUT_DIR,
    "evaluator": None,
    "evaluation_steps": 0,
    "checkpoint_path": None,
    "use_amp": USE_AMP,  # True only when running on CUDA
}
model.fit(**fit_options)

print(f"✅ saved to: {OUTPUT_DIR}")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



RuntimeError: MPS backend out of memory (MPS allocated: 7.86 GiB, other allocations: 9.75 GiB, max allowed: 18.13 GiB). Tried to allocate 768.00 MiB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
def rank(q, docs, m, topk=10):
    """Rank ``docs`` against the query ``q`` by cosine similarity of their embeddings.

    Args:
        q: Query text, passed to ``m.encode_query``.
        docs: Sequence of candidate texts, passed to ``m.encode_document``.
        m: Model object exposing ``encode_query`` and ``encode_document``.
        topk: Maximum number of results to return.

    Returns:
        A list of ``(doc, similarity)`` tuples sorted by descending
        similarity. The list is empty when ``docs`` is empty.
    """
    # Nothing to rank: return early instead of failing on an empty embedding matrix.
    if len(docs) == 0:
        return []
    qv = m.encode_query(q)
    dv = m.encode_document(docs)
    # Cosine similarity = dot product / product of norms. The denominator is
    # floored so an all-zero embedding scores 0 instead of NaN.
    denom = np.maximum(norm(qv) * norm(dv, axis=1), 1e-12)
    sims = (qv @ dv.T) / denom
    order = np.argsort(-sims)[:topk]
    return [(docs[i], float(sims[i])) for i in order]

# Candidate corpus: the unique "item" (= drink) values from the CSV.
# Items are stripped the same way the training pairs are, so the demo ranks
# exactly the strings the model was tuned on; blank entries are discarded.
corpus_texts = sorted({text.strip() for text in df["item"].astype(str)} - {""})
# A sorted set is already duplicate-free, so no second de-duplication pass is needed.
demo_candidates = list(corpus_texts)

demo_queries = ["枝豆", "餃子", "刺身"]

# Fail early with a clear message if training has not produced a model yet;
# otherwise a missing directory name is treated as a Hub model id and the
# resulting error is misleading.
assert os.path.isdir(OUTPUT_DIR), (
    f"Fine-tuned model not found at {OUTPUT_DIR!r}; run the training cell first."
)

# Load the base checkpoint exactly like the training cell does. The deprecated
# `use_auth_token` argument is dropped; credentials stored by the Hub login
# are expected to be picked up by default.
base = SentenceTransformer(MODEL_ID, device=DEVICE)
ft   = SentenceTransformer(OUTPUT_DIR, device=DEVICE)

def show_demo(q):
    """Print the top-5 candidates for ``q`` from the base model, then from the fine-tuned model."""

    def _print_top(heading, encoder):
        # One "score  text" line per ranked candidate under the given heading.
        print(heading)
        for text, score in rank(q, demo_candidates, encoder, topk=5):
            print(f"  {score:.3f}  {text}")

    print(f"\n### Query: {q}")
    _print_top("Before:", base)
    _print_top("After:", ft)

# Run the before/after comparison for every demo query.
for demo_query in demo_queries:
    show_demo(demo_query)





### Query: 枝豆
Before:
  0.281  冷酒
  0.259  日本酒
  0.259  焼酎
  0.256  ホッピー
  0.250  熱燗
After:
  0.536  コーラ
  0.506  生ビール
  0.501  ビール
  0.478  ホッピー
  0.421  ハイボール

### Query: 餃子
Before:
  0.268  泡盛
  0.248  冷酒
  0.231  焼酎
  0.228  熱燗
  0.213  日本酒
After:
  0.575  生ビール
  0.526  ビール
  0.502  ホッピー
  0.459  マッコリ
  0.458  チューハイ

### Query: 刺身
Before:
  0.274  冷酒
  0.261  日本酒
  0.257  熱燗
  0.240  生ビール
  0.238  焼酎
After:
  0.535  冷酒
  0.499  熱燗
  0.489  日本酒
  0.468  緑茶ハイ
  0.462  チューハイ
