In [None]:
# import pandas as pd

In [None]:
# df = pd.read_csv('./data/animelist-dataset/users-score-2023.csv')

In [None]:
# df.head()

In [None]:
# df.__len__()

In [None]:
# df = df.drop(['Username','Anime Title'],axis=1)

In [None]:
# df.head()

In [None]:
# u_encoded, u_class = pd.factorize(df["user_id"], sort=True)   # user_idx
# i_encoded, i_class = pd.factorize(df["anime_id"], sort=True)  # item_idx

# print("--------\n데이터를 유저 인코딩")
# print(u_encoded[:10])
# print("---------\n유저 원본 클래스(집합)")
# print(u_class[:10])
# print("---------\n데이터를 애니 인코딩")
# print(i_encoded[:10])
# print("---------\n애니 원본 클래스(집합)")
# print(i_class[:10])


In [None]:
# print(f'{df.memory_usage(deep=True).sum() / 1024**2 :.2f}MB') # 단위: MB

In [None]:
import pandas as pd

# Load only the three columns the recommender uses, then drop rows with
# missing values and rows whose rating is not strictly positive.
ratings = pd.read_csv(
    "data/animelist-dataset/users-score-2023.csv",
    usecols=["user_id", "anime_id", "rating"],
)
ratings = ratings.dropna()
ratings = ratings.loc[ratings["rating"] > 0]

# Preview the cleaned frame.
ratings.head()

In [None]:
# Number of ratings per user, most active first; head(80000) keeps at most the first 80,000 users.
ratings['user_id'].value_counts().head(80000)

In [None]:
# Users with the most ratings: keep the user_ids of the 500 most active users.
top_users = ratings['user_id'].value_counts().head(500).index
print(top_users)
print(top_users.shape)

In [None]:
# Keep only the ratings written by the selected top users.
filtered = ratings[ratings["user_id"].isin(top_users)]

In [None]:
# Display the filtered ratings.
filtered


In [None]:
# Encode raw ids as contiguous integer codes. `user_ids` / `item_ids` hold one code per row;
# `users` / `items` hold the unique original ids, so items[code] recovers the anime_id.
user_ids, users = pd.factorize(filtered['user_id'])
item_ids, items = pd.factorize(filtered['anime_id'])

In [None]:
# Attach the integer codes as new columns (assign returns a new frame, which is rebound to `filtered`).
filtered = filtered.assign(user_idx=user_ids, item_idx=item_ids)

In [None]:
# Display the three columns that are fed to the model.
filtered[['user_idx','item_idx','rating']]

In [None]:
# Sizes of the embedding tables: number of distinct users and distinct anime after filtering.
n_users, n_items = len(users), len(items)
print(n_users)
print(n_items)

In [None]:
# Display the filtered ratings again, now including the user_idx / item_idx columns.
filtered

In [None]:
from sklearn.model_selection import train_test_split

# First split: 70% train / 30% held out, stratified by user so each user's ratings
# are spread proportionally across both parts.
train_x, tmp_df = train_test_split(
    filtered[['user_idx','item_idx','rating']],
    test_size=0.3,
    random_state=42,
    stratify=filtered['user_idx']
)
# Second split: divide the held-out 30% evenly into validation and test (15% / 15% overall),
# again stratified by user.
valid_x, test_x = train_test_split(
    tmp_df,
    test_size=0.5,
    random_state=42,
    stratify=tmp_df['user_idx']
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class RatingsDataset(Dataset):
    """Tensor-backed dataset of (user index, item index, rating) triples.

    The frame passed in must provide the columns ``user_idx``, ``item_idx``
    and ``rating``. Each column is converted to a tensor once at construction
    (indices as ``torch.long``, ratings as ``torch.float32``).
    """

    def __init__(self, df):
        super().__init__()

        def column_tensor(name, dtype):
            # Copy one DataFrame column into a tensor of the requested dtype.
            return torch.tensor(df[name].values, dtype=dtype)

        self.users = column_tensor('user_idx', torch.long)
        self.items = column_tensor('item_idx', torch.long)
        self.ratings = column_tensor('rating', torch.float32)

    def __len__(self):
        """Number of rating rows."""
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        """Return the (user, item, rating) tensors stored at position ``idx``."""
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample

# Only the training loader shuffles; validation and test keep their row order
# and use a larger batch size.
train_loader = DataLoader(RatingsDataset(train_x), batch_size=4096, shuffle=True)
valid_loader = DataLoader(RatingsDataset(valid_x), batch_size=8192)
test_loader = DataLoader(RatingsDataset(test_x), batch_size=8192)

In [None]:
# Print the shape and the raw tensors of each split, in train / valid / test order.
for loader in (train_loader, valid_loader, test_loader):
    split = loader.dataset
    print(split.users.shape)
    print(split.users)
    print(split.items)
    print(split.ratings)

In [None]:
class MatrixFactorization(torch.nn.Module):
    """Matrix-factorisation rating model without bias terms.

    A predicted rating is the dot product of a user embedding and an item
    embedding.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        factors: embedding dimensionality (default 32).
    """

    def __init__(self, num_users, num_items, factors=32):
        super().__init__()
        self.user_factors = torch.nn.Embedding(num_users, factors)  # (num_users, factors)
        self.item_factors = torch.nn.Embedding(num_items, factors)  # (num_items, factors)
        # Overwrite the default initialisation with small Gaussian noise (std=0.05).
        for table in (self.user_factors, self.item_factors):
            torch.nn.init.normal_(table.weight, std=0.05)

    def forward(self, user_idx, item_idx):
        """Return one predicted rating per (user_idx[i], item_idx[i]) pair."""
        user_vecs = self.user_factors(user_idx)  # (batch_size, factors)
        item_vecs = self.item_factors(item_idx)  # (batch_size, factors)
        # Element-wise product summed over the factor axis -> (batch_size,)
        return torch.sum(user_vecs * item_vecs, dim=1)

In [None]:
# Use Apple's Metal (MPS) backend when it is available, otherwise the CPU.
# torch.backends.mps.is_available() is the documented availability check and is
# the same call the fine-tuning cell of this notebook uses.
device = "mps" if torch.backends.mps.is_available() else "cpu"
model = MatrixFactorization(n_users, n_items, factors=64).to(device)
# Adam with a small weight decay; the loss is MSE, so sqrt(mean loss) is the RMSE.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = torch.nn.MSELoss()

# Per-epoch RMSE history; the training loop appends to these and the plot cell reads them.
train_rmse_list = []
valid_rmse_list = []

In [None]:
# Show which device was selected.
device

In [None]:
# Train for 10 epochs; after each epoch record the RMSE on the training and
# validation splits.
for epoch in range(10):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    for users_batch, items_batch, ratings_batch in train_loader:
        users_batch, items_batch = users_batch.to(device), items_batch.to(device)
        ratings_batch = ratings_batch.to(device).float()

        optimizer.zero_grad()
        preds = model(users_batch, items_batch)
        loss = criterion(preds, ratings_batch)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so a smaller final batch
        # does not skew the epoch average.
        total_loss += loss.item() * len(ratings_batch)

    train_rmse = (total_loss / len(train_x)) ** 0.5
    train_rmse_list.append(train_rmse)

    # ---- validation pass (no gradients) ----
    model.eval()
    total_valid = 0.0
    with torch.no_grad():
        for users_batch, items_batch, ratings_batch in valid_loader:
            users_batch, items_batch = users_batch.to(device), items_batch.to(device)
            ratings_batch = ratings_batch.to(device).float()

            preds = model(users_batch, items_batch)
            loss = criterion(preds, ratings_batch)
            total_valid += loss.item() * len(ratings_batch)

    valid_rmse = (total_valid / len(valid_x)) ** 0.5
    valid_rmse_list.append(valid_rmse)

    print(f"[Epoch: {epoch+1:03d}] train RMSE {train_rmse:.3f} | valid RMSE {valid_rmse:.3f}")

In [None]:
# Persist only the learned parameters (state_dict) to mf_weight.pt in the working directory.
torch.save(model.state_dict(), "mf_weight.pt")

In [None]:
# Build a pseudo-user vector from three item indices.
# idx must be a long tensor on the same device as the model.
idx = torch.tensor([1, 44, 12], device=device, dtype=torch.long)

# Embeddings of the three items: shape (3, factors).
item_vecs = model.item_factors(idx)

# 1) Plain average over the three items: shape (factors,).
user_vec = item_vecs.mean(dim=0)

print(user_vec.detach().cpu().numpy())

In [None]:

# Score every item against the averaged pseudo-user vector. This is inference
# only, so no autograd graph is needed.
with torch.no_grad():
    scores = model.item_factors.weight @ user_vec  # one score per item index
print(scores)

# Indices of the 20 highest-scoring items, best first.
# (The previous `argsort()[:-1][:20]` sorted ascending and therefore returned
# the 20 LOWEST-scoring items.)
top_items = scores.argsort(descending=True)[:20]
print(top_items)
# Map the internal item indices back to the original anime_id values.
recommended_anime_ids = [items[int(idx)] for idx in top_items]
print(recommended_anime_ids)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# ───────────────────────────────────────────────
# seaborn style settings
# ───────────────────────────────────────────────
# NOTE(review): "AppleGothic" is presumably a macOS-only font; on other platforms
# the Korean title/legend text may not render — confirm.
sns.set_theme(
    font="AppleGothic",
    style="darkgrid",
    rc={
    "axes.unicode_minus": False, # avoid broken rendering of Korean text / minus signs
    # "axes.facecolor": "#2b2b2b",  
    # "figure.facecolor": "#2b2b2b",
    # "savefig.facecolor": "#2b2b2b",
    # "grid.color": "#4f4f4f",
    # "axes.edgecolor": "#DDDDDD",
    # "axes.labelcolor": "#DDDDDD",
    # "text.color": "#DDDDDD",
    # "xtick.color": "#DDDDDD",
    # "ytick.color": "#DDDDDD"
    }
)  

# Collect the histories into a long-form DataFrame (one row per epoch and split),
# which is the layout seaborn handles most naturally.
epochs = range(1, len(train_rmse_list) + 1)
df = pd.DataFrame({
    "Epoch": list(epochs) * 2,
    "RMSE": train_rmse_list + valid_rmse_list,
    "Type": ["Train"] * len(train_rmse_list) + ["Valid"] * len(valid_rmse_list)
})

# ───────────────────────────────────────────────
# Visualisation
# ───────────────────────────────────────────────
plt.figure(figsize=(10, 5))
sns.lineplot(
    data=df, x="Epoch", y="RMSE", hue="Type", style="Type",
    markers=True, dashes=False,
    palette=["#A3C4F3", "#F7C6C7"]
)
plt.title("Matrix Factorization 학습 곡선 (RMSE)", fontsize=10)
plt.xlabel("Epoch", fontsize=10)
plt.ylabel("RMSE", fontsize=10)
plt.legend(title="데이터 구분")
plt.tight_layout()
plt.show()

In [None]:
# Final evaluation on the held-out test split (evaluation mode, no gradients).
model.eval()
total_test = 0.0
with torch.no_grad():
    for users_batch, items_batch, ratings_batch in test_loader:
        preds = model(users_batch.to(device), items_batch.to(device))
        loss = criterion(preds, ratings_batch.to(device))
        # Batch-mean loss weighted by batch size, accumulated over the split.
        total_test += loss.item() * len(ratings_batch)

# Root of the size-weighted mean squared error over the whole test split.
test_rmse = (total_test / len(test_x)) ** 0.5

print(f"Test RMSE {test_rmse:.3f}")

# Top-20 recommendation for one encoded user. The "e.g." values in the trailing
# comments are illustrative, not recorded outputs.
user_example = 0  # e.g. the first encoded user
with torch.no_grad():  # inference only
    user_vec = model.user_factors.weight[user_example]  # e.g. user_vec[:5] -> tensor([0.078, -0.012, 0.044, ...])
    scores = (model.item_factors.weight @ user_vec).cpu().numpy()  # e.g. scores[:5] -> array([7.2 , 6.8 , 6.4 , ...])
# argsort is ascending, so reverse it and keep the first 20 -> highest scores first.
top_items = scores.argsort()[::-1][:20]  # e.g. top_items -> array([ 105,  320,  250, ...])
# Map internal item indices back to the original anime_id values.
recommended_anime_ids = [items[idx] for idx in top_items]  # e.g. recommended_anime_ids -> [5114, 9253, 32281, ...]
recommended_anime_ids  # e.g. [5114, 9253, 11061, 30276, 28977, 21, 11061, 199, 6547, 22535]

In [None]:
# Load the anime metadata table, used to look up the recommended anime_id values.
anime = pd.read_csv("data/animelist-dataset/anime-dataset-2023.csv")

In [None]:
# Preview the metadata table.
anime.head()

In [None]:
# Metadata rows of the recommended anime. isin() keeps the table's own row order,
# not the ranking order of recommended_anime_ids.
filtered_anime = anime[anime["anime_id"].isin(recommended_anime_ids)]


In [None]:
# Display the metadata of the recommended anime.
filtered_anime

In [None]:
import pandas as pd
# Re-read the full user-score file with every column (the earlier `ratings` load kept only three).
a = pd.read_csv("data/animelist-dataset/users-score-2023.csv")

In [None]:
# Preview the full user-score table.
a.head()

In [None]:
# pip install sentence-transformers
import re
import numpy as np
import pandas as pd
from typing import List, Tuple
from sentence_transformers import SentenceTransformer

# ---------- 0) Noise-tolerant normalisation ----------
# Keeps only Hangul syllables, hiragana, katakana, CJK ideographs, ASCII letters,
# digits and whitespace; everything else counts as noise.
NOISE_KEEP = re.compile(r"[^0-9A-Za-z\uAC00-\uD7A3\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\s]+")

def normalize_title(s: str) -> str:
    """Return a lower-cased, noise-free form of a title for matching.

    Parenthesised segments and runs of disallowed characters are each replaced
    by a space, then whitespace is collapsed and trimmed.
    """
    lowered = s.lower()
    without_parens = re.sub(r"\(.*?\)", " ", lowered)   # drop "(...)" subtitles
    kept = NOISE_KEEP.sub(" ", without_parens)          # drop disallowed characters
    return " ".join(kept.split())                       # collapse and trim whitespace

# ---------- 1) Example master list (original titles only; swap in a CSV if needed) ----------
# The anime_id values here are sequential demo ids local to this cell.
titles = [
    # Naruto-verse
    {"anime_id": 1,  "title": "Naruto"},
    {"anime_id": 2,  "title": "Naruto Shippuden"},
    {"anime_id": 3,  "title": "Boruto: Naruto Next Generations"},
    # Big shonen
    {"anime_id": 4,  "title": "One Piece"},
    {"anime_id": 5,  "title": "Bleach"},
    {"anime_id": 6,  "title": "Bleach: Thousand-Year Blood War"},
    {"anime_id": 7,  "title": "Dragon Ball Z"},
    {"anime_id": 8,  "title": "JoJo's Bizarre Adventure"},
    {"anime_id": 9,  "title": "Slam Dunk"},
    {"anime_id": 10, "title": "Detective Conan"},
    # Modern hits
    {"anime_id": 11, "title": "Attack on Titan"},
    {"anime_id": 12, "title": "Demon Slayer: Kimetsu no Yaiba"},
    {"anime_id": 13, "title": "Jujutsu Kaisen"},
    {"anime_id": 14, "title": "My Hero Academia"},
    {"anime_id": 15, "title": "Chainsaw Man"},
    {"anime_id": 16, "title": "SPY×FAMILY"},
    {"anime_id": 17, "title": "Haikyu!!"},
    {"anime_id": 18, "title": "Blue Lock"},
    {"anime_id": 19, "title": "Oshi no Ko"},
    {"anime_id": 20, "title": "Frieren: Beyond Journey's End"},
    # Classics
    {"anime_id": 21, "title": "Fullmetal Alchemist: Brotherhood"},
    {"anime_id": 22, "title": "Death Note"},
    {"anime_id": 23, "title": "Neon Genesis Evangelion"},
    {"anime_id": 24, "title": "Steins;Gate"},
    {"anime_id": 25, "title": "Made in Abyss"},
    # SAO / Re:Zero / Mushoku
    {"anime_id": 26, "title": "Sword Art Online"},
    {"anime_id": 27, "title": "Re:Zero − Starting Life in Another World"},
    {"anime_id": 28, "title": "Mushoku Tensei: Jobless Reincarnation"},
    # Sports / Misc
    {"anime_id": 29, "title": "Kaguya-sama: Love Is War"},
    {"anime_id": 30, "title": "Vinland Saga"},
    {"anime_id": 31, "title": "Dr. Stone"},
    {"anime_id": 32, "title": "Mob Psycho 100"},
    # Your examples
    {"anime_id": 33, "title": "Dandadan"},
]

# One row per title, with columns anime_id and title.
titles_df = pd.DataFrame(titles)

# ---------- 2) Build the index ----------
def build_index(df: pd.DataFrame, model_name: str = "paraphrase-multilingual-mpnet-base-v2"):
    """Embed every title of ``df`` and return what a search needs.

    Args:
        df: frame with ``anime_id`` and ``title`` columns.
        model_name: SentenceTransformer model to load.

    Returns:
        (model, emb, docs, id_map): the loaded encoder, the float32 embedding
        matrix requested with normalize_embeddings=True (one row per title),
        the normalised titles, and the anime_id of each row.
    """
    id_map = df["anime_id"].tolist()
    docs = list(map(normalize_title, df["title"].tolist()))
    model = SentenceTransformer(model_name)
    emb = model.encode(docs, normalize_embeddings=True).astype("float32")  # (N, d)
    return model, emb, docs, id_map

# NOTE: this rebinds the global `model`, which earlier cells bound to the
# MatrixFactorization network; re-running those cells after this one will pick
# up the sentence encoder instead.
model, emb, docs, id_map = build_index(titles_df)

# ---------- 3) Single-query search ----------
def search_title(q: str, k: int = 5, cutoff: float = 0.55) -> List[Tuple[int, float, str]]:
    """Return up to ``k`` indexed titles most similar to the query ``q``.

    Uses the module-level ``model``, ``emb``, ``docs`` and ``id_map`` built by
    ``build_index``.

    Args:
        q: raw query text; it is normalised with ``normalize_title`` first.
        k: maximum number of hits; values <= 0 yield an empty list.
        cutoff: minimum similarity a hit must reach to be returned.

    Returns:
        List of (anime_id, score, matched_normalised_title), best match first.
        Empty when nothing reaches ``cutoff``, when ``k`` <= 0, or when the
        index is empty.
    """
    # Guard the degenerate cases up front: np.argpartition rejects an empty
    # array, and a negative k would silently select the wrong slice.
    if k <= 0 or len(emb) == 0:
        return []

    qn = normalize_title(q)
    qv = model.encode([qn], normalize_embeddings=True).astype("float32")[0]
    sims = emb @ qv  # dot product against every indexed title (cosine for unit vectors)
    k = min(k, len(sims))
    # argpartition finds the k best candidates without a full sort; only those
    # k are then ordered by descending similarity.
    idx = np.argpartition(-sims, k - 1)[:k]
    idx = idx[np.argsort(-sims[idx])]
    return [(id_map[i], float(sims[i]), docs[i]) for i in idx if sims[i] >= cutoff]

# ---------- 4) Batch search (several queries at once) ----------
def batch_search(queries: List[str], k: int = 5, cutoff: float = 0.55):
    """Run a top-``k`` title search for every query with one encoder call.

    Uses the module-level ``model``, ``emb``, ``docs`` and ``id_map`` built by
    ``build_index``.

    Args:
        queries: raw query strings; each is normalised with ``normalize_title``.
        k: maximum number of hits per query; values <= 0 yield empty hit lists.
        cutoff: minimum similarity a hit must reach to be returned.

    Returns:
        One list per query (same order as ``queries``), each holding
        (anime_id, score, matched_normalised_title) tuples, best match first.
    """
    # Degenerate inputs: nothing to encode, or nothing that could be returned.
    if not queries:
        return []
    if k <= 0 or len(emb) == 0:
        return [[] for _ in queries]

    qn = [normalize_title(q) for q in queries]
    qv = model.encode(qn, normalize_embeddings=True).astype("float32")  # (B, d)
    sims = qv @ emb.T  # (B, N)
    kk = min(k, sims.shape[1])  # the same for every row, so computed once
    results = []
    for row in sims:
        # Pick the kk best candidates cheaply, then order just those.
        idx = np.argpartition(-row, kk - 1)[:kk]
        idx = idx[np.argsort(-row[idx])]
        results.append([(id_map[i], float(row[i]), docs[i]) for i in idx if row[i] >= cutoff])
    return results

# ---------- 5) Demo ----------
# Noisy Korean queries; the trailing comment on each line is the title it is meant to match.
queries = [
    "나루토 질풍전ㅣㄴㅁ;ㅣㅇ;",   # → Naruto Shippuden
    "단다단1-₩129812ㅑ",          # → Dandadan
    "귀멸의 칼날",                 # → Demon Slayer: Kimetsu no Yaiba (Korean title)
    "진격의거인!!",               # → Attack on Titan
    "강철의 연금술사 브라더후드",   # → Fullmetal Alchemist: Brotherhood
    "스파이 패밀리",               # → SPY×FAMILY
    "재:제로",                     # → Re:Zero − Starting Life in Another World
]
# Search with a slightly lower cutoff (0.50) and print at most the three best hits per query.
for q, hits in zip(queries, batch_search(queries, k=5, cutoff=0.50)):
    print(q, "->", hits[:3])

In [None]:
import pandas as pd
# Reads the full user-score file again (same path and arguments as the earlier `a = ...` cell).
a = pd.read_csv("data/animelist-dataset/users-score-2023.csv")

In [None]:
# Preview the full user-score table.
a.head()

In [None]:
# Distinct (anime_id, title) pairs that occur in the score table.
anime_pairs = set(zip(a["anime_id"], a["Anime Title"]))
# Prints the entire set, which can be a very long output.
print(anime_pairs)

In [None]:
# Print only the title of every pair (set iteration order, so not sorted).
print(list(t[1] for t in list(anime_pairs)))

In [None]:
# pip install sentence-transformers
import re
import numpy as np
import pandas as pd
from typing import List, Tuple, Optional, Iterable
from sentence_transformers import SentenceTransformer

# ---------------- 0) Normalisation ----------------
# Characters outside this whitelist (ASCII letters/digits, Hangul syllables,
# hiragana, katakana, CJK ideographs, whitespace) are treated as noise.
NOISE_KEEP = re.compile(r"[^0-9A-Za-z\uAC00-\uD7A3\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\s]+")

def normalize_title(s: str) -> str:
    """Lower-case a title and strip everything that hinders matching.

    Applied in order: remove parenthesised segments, remove non-whitelisted
    characters, collapse whitespace runs; the result is trimmed.
    """
    cleanup_steps = (
        re.compile(r"\(.*?\)"),  # parenthesised subtitles
        NOISE_KEEP,              # characters outside the whitelist
        re.compile(r"\s+"),      # whitespace runs
    )
    text = s.lower()
    for pattern in cleanup_steps:
        text = pattern.sub(" ", text)
    return text.strip()

# ---------------- 1) anime_pairs -> index ----------------
def build_index_from_pairs(anime_pairs: Iterable[Tuple[int, str]],
                           model_name: str = "paraphrase-multilingual-mpnet-base-v2"):
    """Embed the titles of (anime_id, title) pairs for similarity search.

    Args:
        anime_pairs: iterable of (anime_id, title) tuples.
        model_name: SentenceTransformer model to load.

    Returns:
        (model, emb, docs, id_map, title_map): the loaded encoder, the float32
        embedding matrix requested with normalize_embeddings=True, the
        normalised titles, and the anime_id / original title of each row.
    """
    frame = pd.DataFrame(anime_pairs, columns=["anime_id", "title"])
    title_map = frame["title"].tolist()  # original titles, to report matches readably
    id_map = frame["anime_id"].tolist()
    docs = list(map(normalize_title, title_map))
    model = SentenceTransformer(model_name)
    emb = model.encode(docs, normalize_embeddings=True).astype("float32")  # (N, d)
    return model, emb, docs, id_map, title_map

# ---------------- 2) Top-1 search ----------------
def search_top1(q: str,
                model: SentenceTransformer,
                emb: np.ndarray,
                docs: List[str],
                id_map: List[int],
                title_map: List[str],
                cutoff: float = 0.55) -> Optional[Tuple[int, str, float, str]]:
    """Find the single indexed title most similar to the query ``q``.

    Args:
        q: raw query text; normalised with ``normalize_title`` before encoding.
        model: encoder used to embed the query.
        emb: embedding matrix of the indexed titles, one row per title.
        docs: normalised titles, aligned with the rows of ``emb``.
        id_map: anime_id of each row.
        title_map: original title of each row.
        cutoff: similarity below which the best match is rejected.

    Returns:
        (anime_id, original_title, score, matched_norm_title), or None when
        the best score is below ``cutoff``.
    """
    query_vec = model.encode([normalize_title(q)], normalize_embeddings=True).astype("float32")[0]
    similarities = emb @ query_vec  # (N,)
    best = int(np.argmax(similarities))
    best_score = float(similarities[best])
    if best_score < cutoff:
        return None
    return id_map[best], title_map[best], best_score, docs[best]

def batch_top1(queries: List[str],
               model: SentenceTransformer,
               emb: np.ndarray,
               docs: List[str],
               id_map: List[int],
               title_map: List[str],
               cutoff: float = 0.55):
    """Run ``search_top1`` for each query.

    Returns:
        List of (query, hit) pairs in input order, where ``hit`` is whatever
        ``search_top1`` returned for that query (a tuple or None).
    """
    return [
        (q, search_top1(q, model, emb, docs, id_map, title_map, cutoff=cutoff))
        for q in queries
    ]

# ---------------- 3) Example usage ----------------
# Uses the (id, title) set `anime_pairs` built in an earlier cell.

# NOTE: rebinds the globals `model`, `emb`, `docs` and `id_map` set by earlier cells.
model, emb, docs, id_map, title_map = build_index_from_pairs(anime_pairs)

# Noisy Korean queries; the trailing comment on each line is the title it is meant to match.
queries = [
    "나루토 질풍전ㅣㄴㅁ;ㅣㅇ;",   # → Naruto Shippuden
    "단다단1-₩129812ㅑ",          # → Dandadan
    "귀멸의 칼날",                 # → Demon Slayer: Kimetsu no Yaiba
    "진격의거인!!",               # → Attack on Titan
    "강철의 연금술사 브라더후드",   # → Fullmetal Alchemist: Brotherhood
    "스파이 패밀리",               # → SPY×FAMILY
    "재:제로",                     # → Re:Zero − Starting Life in Another World
]

results = batch_top1(queries, model, emb, docs, id_map, title_map, cutoff=0.50)

# Report each query with its best match, or that nothing reached the cutoff.
for q, hit in results:
    if hit is None:
        print(f"{q} -> None (below cutoff)")
    else:
        aid, title, score, matched_norm = hit
        print(f"{q} -> anime_id={aid}, title='{title}', score={score:.3f}, matched='{matched_norm}'")

In [None]:
import pandas as pd

# Load only the 'Other name' column of the anime metadata.
animet = pd.read_csv("data/animelist-dataset/anime-dataset-2023.csv",usecols=['Other name'])

In [None]:
# All 'Other name' values as a plain list, used as translation input below.
# NOTE(review): missing values would arrive as non-string NaN entries — confirm the column has none.
samples = list(animet['Other name'])

In [None]:
# pip install -U transformers sentencepiece torch pykakasi --quiet
import re, torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pykakasi import kakasi

# Load the mBART-50 many-to-many translation model and its (slow) tokenizer,
# and put the model in evaluation mode.
MODEL = "facebook/mbart-large-50-many-to-many-mmt"
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
mt  = AutoModelForSeq2SeqLM.from_pretrained(MODEL)
mt.eval()

# pykakasi converter instance used by romaji_to_hira below.
kks = kakasi()

# (Reference) translate Japanese -> Korean
def ja2ko(text: str) -> str:
    """Translate Japanese ``text`` to Korean with the module-level mBART model.

    Generation uses beam search and forces Korean ("ko_KR") as the target
    language. A leading "한국어…:" / "번역…:" prefix in the output is removed.
    """
    tok.src_lang = "ja_XX"
    inputs = tok(text, return_tensors="pt")
    generation_args = dict(
        forced_bos_token_id=tok.convert_tokens_to_ids("ko_KR"),
        num_beams=5,
        length_penalty=1.1,
        max_new_tokens=64,
        early_stopping=True,
    )
    with torch.inference_mode():
        generated = mt.generate(**inputs, **generation_args)
    decoded = tok.batch_decode(generated, skip_special_tokens=True)[0].strip()
    # Strip an unwanted leading label such as "번역:" if present.
    return re.sub(r"^(한국어|번역).*?:", "", decoded).strip()

# Script-detection helpers
def has_kana(s):
    """True if ``s`` contains a character in U+3040..U+30FF (hiragana/katakana)."""
    for ch in s:
        if '\u3040' <= ch <= '\u30ff':
            return True
    return False

def has_kanji(s):
    """True if ``s`` contains a character in U+4E00..U+9FFF (CJK ideographs)."""
    for ch in s:
        if '\u4e00' <= ch <= '\u9fff':
            return True
    return False

def has_hangul(s):
    """True if ``s`` contains a character in U+AC00..U+D7A3 (Hangul syllables)."""
    for ch in s:
        if '\uac00' <= ch <= '\ud7a3':
            return True
    return False

# Romaji -> kana conversion
def romaji_to_hira(s: str) -> str:
    """Join the "hira" field of every segment pykakasi returns for ``s``.

    NOTE(review): whether pykakasi converts Latin-script romaji into kana, or
    passes it through unchanged, is not visible here — confirm.
    """
    segments = kks.convert(s)
    hira_parts = [segment["hira"] for segment in segments]
    return "".join(hira_parts).strip()

def to_korean_title(t: str) -> str:
    """Return a Korean rendering of the title ``t``.

    Routing, in order:
      1. text that already contains Hangul is returned unchanged;
      2. text containing kana or kanji is translated as Japanese;
      3. Latin text whose pykakasi conversion actually contains kana is
         translated as Japanese via that kana form;
      4. anything else is translated as English.

    An empty or whitespace-only title yields "".
    """
    t = t.strip()
    if not t:
        return ""

    # 1) Already Korean -> keep as is.
    if has_hangul(t):
        return t

    # 2) Japanese script (kana or kanji) -> ja -> ko.
    if has_kana(t) or has_kanji(t):
        return ja2ko(t)

    # 3) Romanised Japanese -> kana -> ja -> ko.
    # Only take this path when the conversion really produced kana. Checking
    # merely for a non-empty result sent every non-empty title down this
    # branch, so the English branch below was never reached.
    # NOTE(review): presumably pykakasi passes plain Latin text through
    # unchanged, in which case this branch is rarely taken — confirm.
    hira = romaji_to_hira(t)
    if hira and has_kana(hira):
        return ja2ko(hira)

    # 4) Everything else is treated as English -> en -> ko.
    tok.src_lang = "en_XX"
    enc = tok(t, return_tensors="pt")
    with torch.inference_mode():
        out = mt.generate(
            **enc,
            forced_bos_token_id=tok.convert_tokens_to_ids("ko_KR"),
            num_beams=5, length_penalty=1.1, max_new_tokens=64
        )
    return tok.batch_decode(out, skip_special_tokens=True)[0].strip()



# Example usage: translate every entry of `samples`, one model call per title.
for s in samples:
    print(s, " → ", to_korean_title(s))

In [None]:
# pip install -U transformers sentencepiece torch langdetect --quiet
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langdetect import detect

# Loads the same mBART checkpoint again, rebinding `tok` and `mt`.
MODEL = "facebook/mbart-large-50-many-to-many-mmt"

# Key point: use_fast=False loads the slow tokenizer, so protobuf is not required.
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
mt  = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL,
    use_safetensors=True,
    low_cpu_mem_usage=False,
    dtype=torch.float32,
)
mt.eval()

def mbart_to_ko(text: str) -> str:
    """Translate a title to Korean with mBART, guessing the source language.

    Text that already contains Hangul is returned unchanged and empty input
    yields "". The source language is "en_XX" when langdetect reports English
    and the text has no kana; every other case is treated as Japanese ("ja_XX").
    """
    t = text.strip()
    if not t:
        return ""
    # Already Korean: return before paying for language detection.
    if any('\uac00' <= ch <= '\ud7a3' for ch in t):
        return t

    # Rough language guess; very short titles can make the detector fail, in
    # which case English is assumed. Catch Exception rather than using a bare
    # `except`, so KeyboardInterrupt / SystemExit are not swallowed.
    try:
        lang = detect(t)
    except Exception:
        lang = "en"
    if any('\u3040' <= ch <= '\u30ff' for ch in t):  # hiragana / katakana present
        lang = "ja"

    # mBART-50 language codes
    src = "en_XX" if lang.startswith("en") else "ja_XX"
    tok.src_lang = src

    enc = tok(t, return_tensors="pt")
    with torch.inference_mode():
        out_ids = mt.generate(
            **enc,
            forced_bos_token_id=tok.convert_tokens_to_ids("ko_KR"),
            max_new_tokens=64,
        )
    return tok.batch_decode(out_ids, skip_special_tokens=True)[0].strip()

# Test
# samples = [
#     "Attack on Titan",
#     "Demon Slayer: Kimetsu no Yaiba",
#     "ジョジョの奇妙な冒険",
#     "SPY×FAMILY",
# ]
# Translate every title in `anime_pairs` (rebinds `samples`); one model call per title.
samples = list(t[1] for t in list(anime_pairs))
for s in samples:
    print(s, "→", mbart_to_ko(s))

In [None]:
# 필요한 패키지 (한 번만 설치)
# pip install -U pykakasi transformers sentencepiece torch langdetect --quiet

import re
import torch
from langdetect import detect
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pykakasi import kakasi

MODEL = "facebook/mbart-large-50-many-to-many-mmt"

# Load mBART (same checkpoint as the earlier cells; rebinds `tok` and `mt`).
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
mt  = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL,
    use_safetensors=True,
    low_cpu_mem_usage=False,
    dtype=torch.float32,
)
mt.eval()

# pykakasi converter used by romaji_to_hira (romaji -> kana).
kks = kakasi()

# Official / customary Korean titles; these take priority over machine translation.
OVERRIDE = {
    # English/Japanese original title : title commonly used in Korea
    "Attack on Titan": "진격의 거인",
    "Demon Slayer: Kimetsu no Yaiba": "귀멸의 칼날",
    "SPY×FAMILY": "SPY×FAMILY",  # usually written as is
    "SPY X FAMILY": "스파이 패밀리",
    "SPYXFAMILY": "스파이 패밀리",
    "JoJo's Bizarre Adventure": "죠죠의 기묘한 모험",
    "Great Mazinger": "그레이트 마징가",
    "Cop Craft": "코프 크래프트",
    "Penguin Highway": "펭귄 하이웨이",
    # extend as needed
}

# Unicode-range checks
def has_kana(s: str) -> bool:
    """True if ``s`` contains a character in U+3040..U+30FF (hiragana/katakana)."""
    for ch in s:
        if '\u3040' <= ch <= '\u30ff':
            return True
    return False

def has_hangul(s: str) -> bool:
    """True if ``s`` contains a character in U+AC00..U+D7A3 (Hangul syllables)."""
    for ch in s:
        if '\uac00' <= ch <= '\ud7a3':
            return True
    return False

def is_romaji_like_japanese(s: str) -> bool:
    """Rough heuristic: does ``s`` look like Japanese written in Latin letters?

    Returns False when ``s`` contains Hangul, kana or CJK ideographs, or no
    ASCII letters at all. Otherwise returns True only if at least one common
    romaji particle/affix/pattern occurs AND more than 30% of the alphabetic
    characters are vowels.
    """
    if has_hangul(s) or has_kana(s) or any('\u4e00' <= ch <= '\u9fff' for ch in s):
        return False

    low = s.lower()
    if not re.findall(r"[a-zA-Z]+", low):
        return False

    # Common romanised-Japanese particles, affixes and word fragments.
    jp_markers = (
        " no ", " wa ", " ga ", " wo ", " to ", " mo ", " de ", " ni ",
        "kara ", " made", " shou", " chou", " jou", " kyo", " kyou",
        " gei", " geki", " gekijou", " gekijō", " shoku", " pan", " shokupan",
        " kanojo", " sekai", " utsukushii", " minikuku", " nagerareta",
        " wakaokami", " shougakusei", " kandou", " e.", " reso", " nantoka",
    )
    # Pad with spaces so markers anchored on a space also match at the ends.
    padded = f" {low} "
    marker_hits = 0
    for marker in jp_markers:
        if marker in padded:
            marker_hits += 1

    # Share of vowels among the alphabetic characters (max(1, ...) avoids a zero denominator).
    letter_count = sum(ch.isalpha() for ch in low)
    vowel_count = sum(ch in "aeiou" for ch in low)
    vowel_ratio = vowel_count / max(1, letter_count)
    return vowel_ratio > 0.30 and marker_hits >= 1

def en_like(s: str) -> bool:
    """True when ``s`` contains no Hangul, kana or CJK ideographs.

    Such text is treated as English-like (letters, digits, symbols).
    """
    contains_ideograph = any('\u4e00' <= ch <= '\u9fff' for ch in s)
    return not (has_hangul(s) or has_kana(s) or contains_ideograph)

def mbart_translate(text: str, src_lang: str, tgt_lang: str = "ko_KR",
                    prompt_hint: str = "", beams: int = 5) -> str:
    """Translate ``text`` with the module-level mBART model.

    Args:
        text: text to translate.
        src_lang: mBART source language code (e.g. "ja_XX", "en_XX").
        tgt_lang: mBART target language code, forced as the first generated token.
        prompt_hint: optional prefix prepended to ``text`` before tokenising.
        beams: beam-search width.

    Returns:
        The decoded translation with special tokens removed, stripped.
    """
    tok.src_lang = src_lang
    source = f"{prompt_hint}{text}" if prompt_hint else text
    encoded = tok(source, return_tensors="pt")
    generation_args = dict(
        forced_bos_token_id=tok.convert_tokens_to_ids(tgt_lang),
        num_beams=beams,
        length_penalty=1.1,
        no_repeat_ngram_size=2,
        max_new_tokens=64,
        early_stopping=True,
        do_sample=False,
    )
    with torch.inference_mode():
        generated = mt.generate(**encoded, **generation_args)
    return tok.batch_decode(generated, skip_special_tokens=True)[0].strip()

def romaji_to_hira(s: str) -> str:
    """Join the "hira" field of every segment pykakasi returns for ``s``.

    NOTE(review): whether pykakasi converts Latin-script romaji into kana, or
    passes it through unchanged, is not visible here — confirm.
    """
    segments = kks.convert(s)
    hira_parts = [segment["hira"] for segment in segments]
    return "".join(hira_parts).strip()

def mbart_to_ko(text: str) -> str:
    """Return a Korean title for ``text`` using overrides, then mBART.

    Routing, in order: exact OVERRIDE hit; already-Hangul text (returned as
    is); kana/kanji text (ja -> ko); romaji-looking text (kana conversion,
    then ja -> ko); English-like text (en -> ko). When a translation merely
    copies its input, it is retried once with a Korean prompt hint.
    Empty input yields "".
    """
    def squash(value: str) -> str:
        # Compare texts while ignoring case and every non-word character.
        return re.sub(r"\W+", "", value).lower()

    t = text.strip()
    if not t:
        return ""

    # 0) Curated overrides win over everything else.
    if t in OVERRIDE:
        return OVERRIDE[t]

    # 1) Already Korean.
    if has_hangul(t):
        return t

    # 2) Kana or kanji -> treat as Japanese.
    contains_ideograph = any('\u4e00' <= ch <= '\u9fff' for ch in t)
    if has_kana(t) or contains_ideograph:
        return mbart_translate(t, src_lang="ja_XX")

    # 3) Looks like romanised Japanese -> convert to kana -> ja -> ko.
    if is_romaji_like_japanese(t):
        hira = romaji_to_hira(t)
        if hira:
            out = mbart_translate(hira, src_lang="ja_XX")
            if squash(t) == squash(out):
                # Output is essentially a copy of the input: retry with a hint.
                out = mbart_translate(hira, src_lang="ja_XX", prompt_hint="한국어 제목으로 번역: ")
            return out

    # 5) Fallback: return the text untouched. en_like() is False only for
    # Hangul/kana/kanji text, which has already returned above.
    if not en_like(t):
        return t

    # 4) Everything else (mostly English): en -> ko.
    out = mbart_translate(t, src_lang="en_XX", prompt_hint="Translate to Korean title: ")
    if squash(t) == squash(out):
        out = mbart_translate(t, src_lang="en_XX", prompt_hint="한국어 제목: ")
    # Last resort: look the title-cased form up in OVERRIDE.
    if out == t and t.title() in OVERRIDE:
        return OVERRIDE[t.title()]
    return out

# -------------------------------
# Example usage, in the same form as before: translate every title in
# `anime_pairs` (rebinds `samples`); one or more model calls per title.
samples = list(t[1] for t in list(anime_pairs))
for s in samples:
    print(s, "→", mbart_to_ko(s))
# -------------------------------

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Loads the same mBART checkpoint again, rebinding `tok` and `mt`
# (this cell does not call mt.eval()).
MODEL = "facebook/mbart-large-50-many-to-many-mmt"
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
mt  = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

def mbart_en2ko(title: str) -> str:
    """Translate an English title to Korean with the module-level mBART model.

    The source language is fixed to English and the input is prefixed with
    "Translate to Korean: " as a hint against the model copying its input.
    Blank input yields "".
    """
    if not title.strip():
        return ""
    tok.src_lang = "en_XX"  # source language is fixed
    encoded = tok("Translate to Korean: " + title, return_tensors="pt")
    generation_args = dict(
        forced_bos_token_id=tok.convert_tokens_to_ids("ko_KR"),
        num_beams=5,              # wider search
        length_penalty=1.1,       # discourage stopping too early
        no_repeat_ngram_size=2,
        max_new_tokens=64,
        early_stopping=True,
        do_sample=False,
    )
    with torch.inference_mode():
        generated = mt.generate(**encoded, **generation_args)
    return tok.batch_decode(generated, skip_special_tokens=True)[0].strip()

# Quick check on three well-known titles.
for s in [
    "Attack on Titan",
    "Demon Slayer: Kimetsu no Yaiba",
    "SPY×FAMILY",
]:
    print(s, "→", mbart_en2ko(s))

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Loads the tokenizer with default settings (no use_fast=False, unlike the other cells); rebinds `tok`.
tok = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langdetect import detect

# Loads the same mBART checkpoint again, rebinding `tok` and `mt`.
MODEL = "facebook/mbart-large-50-many-to-many-mmt"

tok = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
mt  = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL,
    use_safetensors=True,
    low_cpu_mem_usage=False,
    dtype=torch.float32,
)
mt.eval()
print("model device:", next(mt.parameters()).device)  # expected to be cpu

def mbart_to_ko(text: str) -> str:
    """Translate a title to Korean with mBART, choosing the source language by script.

    Text that already contains Hangul is returned unchanged and empty input
    yields "". Text containing kana or CJK ideographs is translated as
    Japanese ("ja_XX"); everything else is translated as English ("en_XX").
    """
    t = text.strip()
    if not t:
        return ""
    if any('\uac00' <= ch <= '\ud7a3' for ch in t):  # already Korean
        return t

    # Japanese if any kana OR kanji is present. Checking kana alone sent
    # kanji-only titles (e.g. "銀魂") through the English path.
    is_japanese = any(
        '\u3040' <= ch <= '\u30ff' or '\u4e00' <= ch <= '\u9fff' for ch in t
    )
    tok.src_lang = "ja_XX" if is_japanese else "en_XX"

    enc = tok(t, return_tensors="pt")
    with torch.inference_mode():
        out_ids = mt.generate(
            **enc,
            forced_bos_token_id=tok.convert_tokens_to_ids("ko_KR"),
            max_new_tokens=64,
        )
    return tok.batch_decode(out_ids, skip_special_tokens=True)[0].strip()

# Test on a small fixed list (rebinds `samples`).
samples = ["Attack on Titan", "Demon Slayer: Kimetsu no Yaiba", "ジョジョの奇妙な冒険", "SPY×FAMILY"]
for s in samples:
    print(s, "→", mbart_to_ko(s))

In [None]:
# %% 설치 (한 번만)
# %pip install -U transformers datasets peft accelerate sentencepiece torch --quiet

# %% 환경 체크: MPS
import os, torch, random
# Sets PyTorch's MPS-fallback environment variable before the device is chosen.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# Device preference: MPS, then CUDA, then CPU. Rebinds the global `device`.
device = "mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu")
print("[device]", device)

# %% Build a mini dataset (in memory)
# Each entry is (Japanese title, Korean title).
pairs = [
    ("カウボーイビバップ", "카우보이 비밥"),
    ("カウボーイビバップ 天国の扉", "카우보이 비밥 천국의 문"),
    ("トライガン", "트라이건"),
    ("新世紀エヴァンゲリオン", "신세기 에반게리온"),
    ("ナルト", "나루토"),
    ("ONE PIECE", "원피스"),
    ("テニスの王子様", "테니스의 왕자"),
    ("スクールランブル", "스쿨 럼블"),
    ("頭文字〈イニシャル〉D", "이니셜 D"),
    ("頭文字〈イニシャル〉D FOURTH STAGE", "이니셜 D 포스 스테이지"),
    ("ハングリーハート", "헝그리 하트"),
    ("ハングリーハート Wild Striker", "헝그리 하트 와일드 스트라이커"),
    ("ハチミツとクローバー", "허니와 클로버"),
    ("モンスター", "몬스터"),
    ("冒険王ビィト", "모험왕 비트"),
    ("アイシールド21", "아이실드 21"),
    ("機動戦士ガンダム", "기동전사 건담"),
    ("コードギアス 反逆のルルーシュ", "코드 기아스 반역의 를르슈"),
    ("魔法少女まどか☆マギカ", "마법소녀 마도카☆마기카"),
    ("ジパング", "지팡"),
    ("進撃の巨人", "진격의 거인"),
    ("鬼滅の刃", "귀멸의 칼날"),
    ("SPY×FAMILY", "스파이 패밀리"),
    ("ジョジョの奇妙な冒険", "죠죠의 기묘한 모험"),
    ("銀魂", "은혼"),
    ("鋼の錬金術師", "강철의 연금술사"),
    ("デスノート", "데스노트"),
    ("ソードアート・オンライン", "소드 아트 온라인"),
    ("Re:ゼロから始める異世界生活", "Re:제로부터 시작하는 이세계 생활"),
    ("この素晴らしい世界に祝福を！", "이 멋진 세계에 축복을!"),
    ("ノーゲーム・ノーライフ", "노 게임 노 라이프"),
    ("涼宮ハルヒの憂鬱", "스즈미야 하루히의 우울"),
    ("らき☆すた", "러키☆스타"),
    ("けいおん！", "케이온!"),
    ("シュタインズ・ゲート", "슈타인즈 게이트"),
    ("攻殻機動隊", "공각기동대"),
    ("サイコパス", "사이코패스"),
    ("プラスティック・メモリーズ", "플라스틱 메모리즈"),
    ("ヴァイオレット・エヴァーガーデン", "바이올렛 에버가든"),
    ("四月は君の嘘", "4월은 너의 거짓말"),
    ("化物語", "바케모노가타리"),
    ("とある科学の超電磁砲", "어떤 과학의 초전자포"),
    ("とある魔術の禁書目録", "어떤 마법의 금서목록"),
    ("五等分の花嫁", "5등분의 신부"),
]

# Seeded shuffle (in place), then an 80/20 train/validation split of the pairs.
random.seed(42)
random.shuffle(pairs)
split = int(len(pairs)*0.8)
train_pairs, valid_pairs = pairs[:split], pairs[split:]

# NOTE: this import rebinds the name `Dataset`, which an earlier cell imported
# from torch.utils.data; re-running the RatingsDataset cell after this one would
# pick up this class instead.
from datasets import Dataset
# Column-oriented datasets with a "ja" (source) and a "ko" (target) column.
train_ds = Dataset.from_dict({"ja":[j for j,_ in train_pairs], "ko":[k for _,k in train_pairs]})
valid_ds = Dataset.from_dict({"ja":[j for j,_ in valid_pairs], "ko":[k for _,k in valid_pairs]})

print("train/valid:", len(train_ds), len(valid_ds))

# %% Load model / tokenizer (mBART-50)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
MODEL = "facebook/mbart-large-50-many-to-many-mmt"
tok = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
base = AutoModelForSeq2SeqLM.from_pretrained(MODEL)
base.to(device)

# %% Attach LoRA adapters
from peft import LoraConfig, get_peft_model
peft_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","out_proj","fc1","fc2"],  # core modules chosen to match the mBART architecture
)
# NOTE: rebinds the global `model` (earlier cells bound it to other models).
model = get_peft_model(base, peft_cfg)
model.print_trainable_parameters()

# %% Preprocessing
max_src, max_tgt = 64, 64

def preprocess(batch):
    """Tokenise a batch of title pairs for seq2seq training.

    Reads the "ja" column as source text and the "ko" column as target text,
    truncating each to max_src / max_tgt tokens.

    Returns:
        The tokenised source inputs, with the target token ids stored under "labels".
    """
    # 1) Tell the tokenizer which language codes apply to source and target.
    tok.src_lang, tok.tgt_lang = "ja_XX", "ko_KR"

    # 2) Tokenise the inputs and the labels.
    encoded = tok(batch["ja"], max_length=max_src, truncation=True)

    # text_target= is the recommended replacement for as_target_tokenizer().
    target_ids = tok(text_target=batch["ko"], max_length=max_tgt, truncation=True)["input_ids"]

    encoded["labels"] = target_ids
    return encoded

# Tokenise both splits in batches and drop the raw text columns, leaving only the tokenised fields.
train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
valid_tok = valid_ds.map(preprocess, batched=True, remove_columns=valid_ds.column_names)

# %% Training setup (MPS-friendly)
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Collator that builds seq2seq batches for this tokenizer and model.
data_collator = DataCollatorForSeq2Seq(tok, model=model)

args = Seq2SeqTrainingArguments(
    output_dir="mbart_ja2ko_title_lora_mps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    num_train_epochs=8,
    logging_steps=5,
    save_strategy="epoch",   # checkpoint once per epoch ...
    save_total_limit=2,      # ... keeping at most two checkpoints
    predict_with_generate=True,
    generation_max_length=64,
    report_to=[],            # no external experiment loggers
)

# NOTE(review): recent transformers releases deprecate the `tokenizer=` argument
# of Trainer in favour of `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    data_collator=data_collator,
    tokenizer=tok,
)

trainer.train()

# %% Save only the adapter
model.save_pretrained("mbart_ja2ko_title_lora_mps/adapter")

# %% Simple inference helper
import re
def infer(title: str) -> str:
    """Translate a Japanese title to Korean with the fine-tuned model.

    The input is tokenised as Japanese, moved to ``device`` and decoded with
    beam search, forcing Korean ("ko_KR") as the target language. A leading
    "한국어…:" / "번역…:" prefix in the output is removed.
    """
    tok.src_lang = "ja_XX"
    inputs = tok(title, return_tensors="pt").to(device)
    model.eval()
    generation_args = dict(
        forced_bos_token_id=tok.convert_tokens_to_ids("ko_KR"),
        num_beams=5,
        length_penalty=1.1,
        max_new_tokens=64,
        early_stopping=True,
    )
    with torch.inference_mode():
        generated = model.generate(**inputs, **generation_args)
    decoded = tok.batch_decode(generated, skip_special_tokens=True)[0].strip()
    # Strip a label prefix in case the model emits one.
    return re.sub(r"^(한국어|번역).*?:\s*", "", decoded).strip()

# Japanese titles used to spot-check the fine-tuned model.
tests = [
    "カウボーイビバップ",
    "カウボーイビバップ 天国の扉",
    "頭文字〈イニシャル〉D FOURTH STAGE",
    "ハチミツとクローバー",
    "鬼滅の刃",
    "ジョジョの奇妙な冒険",
]
# Iterate over `tests` (defined just above). The loop previously read the
# unrelated global `samples` left over from an earlier cell, so this list was
# never used.
for t in tests:
    print(t, "→", infer(t))

In [None]:

import torch, accelerate, transformers, datasets, peft
# Environment report: library versions and MPS availability, for reproducibility.
print("torch:", torch.__version__)
print("accelerate:", accelerate.__version__)
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("peft:", peft.__version__)
print("MPS available:", torch.backends.mps.is_available())