In [1]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "x2bee/KoModernBERT-base-mlm-v02" checkpoint
# (fetched by transformers; presumably cached locally after the first call).
tokenizer = AutoTokenizer.from_pretrained("x2bee/KoModernBERT-base-mlm-v02")


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Split a Korean sentence into sub-word tokens. The tokens are displayed as
# byte-level symbols rather than readable Hangul; that is how this tokenizer
# prints them, and decoding the ids restores the original text.
result = tokenizer.tokenize("문장을 아무거나 한국어 처리가 가능한지")
result

['ë¬¸',
 'ìŀ¥',
 'ìĿĦ',
 'ĠìķĦë¬´',
 'ê±°',
 'ëĤ',
 'ĺ',
 'Ġ',
 'íķľêµŃìĸ´',
 'Ġì²ĺë¦¬',
 'ê°Ģ',
 'Ġê°ĢëĬ¥',
 'íķľì§Ģ']

In [9]:
# Map each token string from `result` to its integer vocabulary id.
token_idxs = tokenizer.convert_tokens_to_ids(result)

In [10]:
# Round-trip check: decoding the ids should reproduce the original sentence.
tokenizer.decode(token_idxs)

'문장을 아무거나 한국어 처리가 가능한지'

In [None]:
# Byte-level BPE round trip: text -> tokens -> ids -> text.
# Each step feeds the next one through a variable so the cell cannot drift out
# of sync with hand-copied literals; the outputs recorded from an earlier run
# are kept as comments for reference.
bbpe_tokens = tokenizer.tokenize("BBPE 토크나이저는 토큰을 Byte 형식으로 나눕니다.")
# Result
# ['BB', 'PE', 'ĠíĨłíģ¬', 'ëĤ', 'ĺ', 'ìĿ´ìłĢ', 'ëĬĶ', 'ĠíĨłíģ°', 'ìĿĦ', 'ĠByte', 'ĠíĺķìĭĿ', 'ìľ¼', 'ë¡ľ', 'ĠëĤĺ', 'ëĪķ', 'ëĭĪëĭ¤', '.']

# The method is `convert_tokens_to_ids`; the previous spelling `covert_tokens_to_ids`
# raised AttributeError.
bbpe_token_ids = tokenizer.convert_tokens_to_ids(bbpe_tokens)
# Result
# [10172, 3246, 58998, 44028, 235, 98284, 24169, 59366, 28736, 24128, 54116, 51745, 35296, 50641, 76029, 31912, 15]

# Last expression of the cell, so the decoded text is displayed.
tokenizer.decode(bbpe_token_ids)
# Result
# 'BBPE 토크나이저는 토큰을 Byte 형식으로 나눕니다.'

In [15]:
import torch
from torch import nn

class Pooler(nn.Module):
    """
    Parameter-free poolers to get the sentence embedding.

    'cls': hidden state of the first token of the last layer. No MLP pooler is
        applied inside this module, so the result is the same tensor that
        'cls_before_pooler' returns.
    'cls_before_pooler': hidden state of the first token of the last layer.
    'avg': attention-mask-weighted average of the last layer's hidden states.
    'avg_top2': masked average of the mean of the last two entries of
        ``outputs.hidden_states``.
    'avg_first_last': masked average of the mean of ``outputs.hidden_states[1]``
        and ``outputs.hidden_states[-1]``.

    Tensors are assumed to be laid out as (batch, sequence, hidden) for hidden
    states and (batch, sequence) for the attention mask -- TODO confirm against
    the model in use.
    """

    # Pooling modes that need the per-layer hidden states, not just the last one.
    _LAYERWISE_TYPES = ("avg_top2", "avg_first_last")

    def __init__(self, pooler_type):
        """Store the pooling mode; an unknown mode fails the assertion below."""
        super().__init__()
        self.pooler_type = pooler_type
        assert self.pooler_type in ["cls", "cls_before_pooler", "avg", "avg_top2", "avg_first_last"], "unrecognized pooling type %s" % self.pooler_type

    @staticmethod
    def _masked_mean(hidden, attention_mask):
        """Average `hidden` over the sequence axis, ignoring positions whose mask is 0."""
        mask = attention_mask.unsqueeze(-1)
        # A fully padded row has a token count of 0; clamping to 1 turns the
        # resulting 0/0 (NaN) into a zero vector and leaves every other row unchanged.
        token_count = attention_mask.sum(-1).unsqueeze(-1).clamp(min=1)
        return (hidden * mask).sum(1) / token_count

    def forward(self, attention_mask, outputs):
        """
        Pool the model outputs into one vector per input sequence.

        Args:
            attention_mask: mask with 1 for real tokens and 0 for padding.
            outputs: model output exposing ``last_hidden_state`` and, for the
                'avg_top2' / 'avg_first_last' modes, ``hidden_states``.

        Returns:
            Tensor with the sequence axis reduced away.

        Raises:
            ValueError: a layer-wise mode is selected but ``outputs.hidden_states``
                is missing or None (the model was not asked to return them).
            NotImplementedError: ``pooler_type`` was changed to an unsupported
                value after construction.
        """
        if self.pooler_type in ['cls_before_pooler', 'cls']:
            return outputs.last_hidden_state[:, 0]

        if self.pooler_type == "avg":
            return self._masked_mean(outputs.last_hidden_state, attention_mask)

        if self.pooler_type in self._LAYERWISE_TYPES:
            hidden_states = getattr(outputs, "hidden_states", None)
            if hidden_states is None:
                raise ValueError(
                    "pooler_type %r needs outputs.hidden_states; call the model with "
                    "output_hidden_states=True" % self.pooler_type
                )
            if self.pooler_type == "avg_first_last":
                # Index 1, not 0: index 0 is presumably the embedding output
                # (Hugging Face convention) -- TODO confirm for this model.
                paired_layer = hidden_states[1]
            else:
                paired_layer = hidden_states[-2]
            return self._masked_mean((paired_layer + hidden_states[-1]) / 2.0, attention_mask)

        raise NotImplementedError

class ModernBERTEmbedding:
    """Wrap a model, its tokenizer and a `Pooler` to produce sentence embeddings."""

    def __init__(self, model, tokenizer, pooler_type="avg"):
        """
        Args:
            model: callable model that accepts the tokenized inputs as keyword
                arguments and supports ``.to(device)``.
            tokenizer: tokenizer paired with `model`; stored but not used by `encode`.
            pooler_type: pooling mode forwarded to `Pooler` (default "avg").
        """
        self.tokenizer = tokenizer
        self.model = model
        self.pooler = Pooler(pooler_type)

    def encode(self, inputs, device="cpu"):
        """
        Run the model on already-tokenized inputs and pool the result.

        The original note describes the result as a 768-dimensional embedding
        vector; the actual size presumably follows the model's hidden size.

        Args:
            inputs: mapping of tensors (must contain "attention_mask"), e.g. the
                output of the tokenizer called with ``return_tensors="pt"``.
            device: device the model and the inputs are moved to before the forward pass.

        Returns:
            The pooled embeddings, located on `device`.
        """
        self.model.to(device)

        # Move every input tensor to the same device as the model.
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # The layer-wise poolers read outputs.hidden_states, which the model only
        # returns when asked; without this flag those pooling modes always failed.
        extra_kwargs = {}
        if self.pooler.pooler_type in ("avg_top2", "avg_first_last"):
            extra_kwargs["output_hidden_states"] = True

        # Inference only: no gradients are needed for embedding extraction.
        with torch.no_grad():
            outputs = self.model(**inputs, **extra_kwargs)

        # Reduce token-level states to one vector per sequence.
        embeddings = self.pooler(inputs["attention_mask"], outputs)
        return embeddings


In [None]:
import torch
from scipy.spatial.distance import cosine
from transformers import ModernBertModel, AutoTokenizer

# Load the tokenizer and model; transformers downloads the checkpoint automatically.
tokenizer = AutoTokenizer.from_pretrained("CocoRoF/KMB_SimCSE_test")
model = ModernBertModel.from_pretrained("CocoRoF/KMB_SimCSE_test")

embeddings_model = ModernBERTEmbedding(model, tokenizer)
# Tokenize input texts
texts = [
    "아니 이거 잘되야 하는데 .. 그렇지?",
    "잘되야 되는건 뻔한 말이고.",
    "잘 될거야."
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Use the GPU when one is present, otherwise fall back to CPU so the cell still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Get the embeddings (`inputs` is a dict-like mapping of tensors). `encode` already
# disables gradient tracking. scipy needs NumPy-convertible host data, so the
# result is moved off the GPU and converted before computing distances.
embeddings = embeddings_model.encode(inputs, device=device).cpu().numpy()

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))