In [3]:
import torch
import torch.nn.functional as F
from transformers import XLMRobertaModel, XLMRobertaTokenizer

# 1. Load the model and tokenizer
# Both come from the same checkpoint name so they stay in sync.
model_name = "microsoft/infoxlm-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

# Prefer the GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the encoder weights and move them onto the chosen device.
model = XLMRobertaModel.from_pretrained(model_name)
model.to(device)

# 2. 토크나이징 및 벡터 추출 함수
def tokenize_and_embed(texts, max_length=512):
    """Encode a batch of texts and return one embedding vector per text.

    Args:
        texts: List of strings to encode. They are tokenized together as one
            batch and padded to a common length.
        max_length: Upper bound on the number of tokens per text; longer
            inputs are truncated to this length. The default of 512 is the
            usual window for XLM-R style encoders -- TODO confirm against
            ``model.config.max_position_embeddings`` if the checkpoint changes.

    Returns:
        Tensor with one row per input text: the final-layer hidden state at
        sequence position 0 (the slot this notebook treats as the [CLS]
        vector). It lives on the same device the inputs were moved to.
    """
    # truncation=True alone does nothing when no limit is known: the tokenizer
    # warns "no maximum length is provided ... Default to no truncation", so
    # over-long inputs would reach the model untruncated. Pass the limit
    # explicitly.
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]

# 3. 유사도 계산 함수
def calculate_similarity(query, candidates):
    """Score each candidate text against a single query by cosine similarity.

    Args:
        query: One text; it is wrapped in a one-element list before embedding.
        candidates: Texts handed to ``tokenize_and_embed`` as a single batch.

    Returns:
        NumPy array of cosine similarities computed along the last
        (embedding) dimension between the query and the candidates.
    """
    # Embed the query first, then the candidates (same order as a plain
    # two-statement version; tuple elements evaluate left to right).
    embedded_query, embedded_candidates = (
        tokenize_and_embed([query]),
        tokenize_and_embed(candidates),
    )
    # The single query row broadcasts against every candidate row; the result
    # is moved to host memory before the NumPy conversion.
    return F.cosine_similarity(embedded_query, embedded_candidates, dim=-1).cpu().numpy()

# 4. Run inference
# One English source sentence is scored against several Korean candidates.
inference_data = {
    "source": "The weather is nice today.",
    "candidates": [
        "오늘 날씨가 좋다.",
        "오늘은 비가 온다.",
        "날씨가 흐리다.",
    ],
}

# Pull the query and its candidates out of the payload, then score them.
source, candidates = inference_data["source"], inference_data["candidates"]
similarities = calculate_similarity(source, candidates)

# Print the source followed by one line per candidate (1-based numbering).
print(f"Source: {source}")
for i, candidate in enumerate(candidates):
    print(f"Candidate {i+1}: '{candidate}' -> Similarity: {similarities[i]:.4f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Source: The weather is nice today.
Candidate 1: '오늘 날씨가 좋다.' -> Similarity: 0.9465
Candidate 2: '오늘은 비가 온다.' -> Similarity: 0.8983
Candidate 3: '날씨가 흐리다.' -> Similarity: 0.9103


In [4]:
# English sentences that will be compared with one another.
english_sentences = [
    "The weather is nice today.",
    "It is raining heavily outside.",
    "I love programming.",
    "The cat is sleeping on the couch.",
    "The sun is shining brightly."
]

# Build sentence pairs: every sentence is paired with every *other* sentence.
# Pairs are ordered, so both (a, b) and (b, a) appear; a sentence is never
# paired with its own position.
sentence_pairs = [
    (left, right)
    for left_idx, left in enumerate(english_sentences)
    for right in english_sentences[:left_idx] + english_sentences[left_idx + 1:]
]

import torch
import torch.nn.functional as F
from transformers import XLMRobertaModel, XLMRobertaTokenizer

# 1. Load the model and tokenizer
# NOTE(review): this repeats the load already done in the previous cell; on a
# top-to-bottom run the same objects are simply rebuilt here.
model_name = "microsoft/infoxlm-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

# Prefer the GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the encoder weights and move them onto the chosen device.
model = XLMRobertaModel.from_pretrained(model_name)
model.to(device)

# 2. 토크나이징 및 벡터 추출 
def tokenize_and_embed(texts, max_length=512):
    """Encode a batch of texts and return one embedding vector per text.

    Args:
        texts: List of strings to encode. They are tokenized together as one
            batch and padded to a common length.
        max_length: Upper bound on the number of tokens per text; longer
            inputs are truncated to this length. The default of 512 is the
            usual window for XLM-R style encoders -- TODO confirm against
            ``model.config.max_position_embeddings`` if the checkpoint changes.

    Returns:
        Tensor with one row per input text: the final-layer hidden state at
        sequence position 0 (the slot this notebook treats as the [CLS]
        vector). It lives on the same device the inputs were moved to.
    """
    # truncation=True alone does nothing when no limit is known: the tokenizer
    # warns "no maximum length is provided ... Default to no truncation", so
    # over-long inputs would reach the model untruncated. Pass the limit
    # explicitly.
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]

# 3. 유사도 계산 
def calculate_similarity(sentence_pairs):
    """Compute the cosine similarity for every sentence pair.

    NOTE(review): an earlier cell defines ``calculate_similarity(query,
    candidates)`` with a different signature; this definition replaces it
    once this cell has run.

    Args:
        sentence_pairs: Iterable of ``(s1, s2)`` string pairs.

    Returns:
        List of ``(s1, s2, similarity)`` tuples in input order, where
        ``similarity`` is a Python float.
    """
    # The same sentence usually appears in many pairs. Embed each distinct
    # sentence once (still as a batch of one, so the vector is the same as
    # embedding it per pair) instead of running two forward passes per pair.
    embedding_cache = {}

    def _embed_once(sentence):
        # Return the cached embedding, computing it on first use.
        if sentence not in embedding_cache:
            embedding_cache[sentence] = tokenize_and_embed([sentence])
        return embedding_cache[sentence]

    similarities = []
    for s1, s2 in sentence_pairs:
        # Each embedding has a single row, so the result has one element and
        # .item() extracts it as a plain float.
        similarity = F.cosine_similarity(_embed_once(s1), _embed_once(s2), dim=-1).item()
        similarities.append((s1, s2, similarity))
    return similarities
# Score every pair, then print a header followed by one line per pair.
similarities = calculate_similarity(sentence_pairs)

print("Sentence Pair Similarities:")
for scored_pair in similarities:
    # Each entry is (first sentence, second sentence, similarity score).
    s1, s2, sim = scored_pair
    print(f"'{s1}' <-> '{s2}' -> Similarity: {sim:.4f}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence Pair Similarities:
'The weather is nice today.' <-> 'It is raining heavily outside.' -> Similarity: 0.8854
'The weather is nice today.' <-> 'I love programming.' -> Similarity: 0.8408
'The weather is nice today.' <-> 'The cat is sleeping on the couch.' -> Similarity: 0.8127
'The weather is nice today.' <-> 'The sun is shining brightly.' -> Similarity: 0.8779
'It is raining heavily outside.' <-> 'The weather is nice today.' -> Similarity: 0.8854
'It is raining heavily outside.' <-> 'I love programming.' -> Similarity: 0.8174
'It is raining heavily outside.' <-> 'The cat is sleeping on the couch.' -> Similarity: 0.8058
'It is raining heavily outside.' <-> 'The sun is shining brightly.' -> Similarity: 0.8751
'I love programming.' <-> 'The weather is nice today.' -> Similarity: 0.8408
'I love programming.' <-> 'It is raining heavily outside.' -> Similarity: 0.8174
'I love programming.' <-> 'The cat is sleeping on the couch.' -> Similarity: 0.8612
'I love programming.' <-> 'The sun

1. Idiom뿐만 아니라 그냥 general dataset 혹은 pretraining dataset 가지고 contrastive learning 적용하기
2. 다른 방법..? 
3. 그리고 ranking model에 reference는 그냥 hypothesis로?