# METEOR

In [6]:
import re
from collections import Counter

def normalize_text(text):
    """
    Normalize the text by lowercasing and removing punctuation.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

def get_word_matches(reference, hypothesis):
    """
    Compute word matches between the reference and hypothesis.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    ref_counts = Counter(ref_words)
    hyp_counts = Counter(hyp_words)

    # Find intersection of words
    matches = sum((ref_counts & hyp_counts).values())
    return matches

def compute_meteor_score(reference, hypothesis):
    """
    Compute the METEOR score for a single pair of reference and hypothesis.
    """
    # Normalize texts
    reference = normalize_text(reference)
    hypothesis = normalize_text(hypothesis)

    # Calculate precision and recall
    matches = get_word_matches(reference, hypothesis)
    precision = matches / len(hypothesis.split()) if hypothesis.split() else 0
    recall = matches / len(reference.split()) if reference.split() else 0

    # Calculate F-measure
    if precision + recall > 0:
        f1_score = (10 * precision * recall) / (9 * precision + recall)
    else:
        f1_score = 0

    # Apply penalty for word order mismatch
    hyp_words = hypothesis.split()
    ref_words = reference.split()
    chunks = 0
    i = 0
    while i < len(hyp_words):
        if hyp_words[i] in ref_words:
            start_index = ref_words.index(hyp_words[i])
            while i < len(hyp_words) and start_index < len(ref_words) and hyp_words[i] == ref_words[start_index]:
                i += 1
                start_index += 1
            chunks += 1
        else:
            i += 1

    penalty = 0.5 * (chunks / matches) if matches > 0 else 1
    meteor_score = f1_score * (1 - penalty)

    # Return the detailed metrics
    return {
        "recall": recall,
        "precision": precision,
        "f1_score": f1_score
    }

# Example usage
reference_text = "The quick brown fox jumps over the lazy dog"
hypothesis_text = "A quick brown dog jumps over the lazy fox"

meteor_result = compute_meteor_score(reference_text, hypothesis_text)
print("METEOR Result:", meteor_result)


METEOR Result: {'recall': 0.2222222222222222, 'precision': 0.5, 'f1_score': 0.23529411764705882}


# ROUGE-N
n-gram(1-gram, 2-gram 등)을 생성하여 참조(reference)와 가설(hypothesis)의 겹침을 측정.
Recall, Precision, F1-score를 계산:
Recall: 겹치는 n-gram / 참조 n-gram
Precision: 겹치는 n-gram / 가설 n-gram
F1: Recall과 Precision의 조화 평균.

In [2]:
from collections import Counter
from itertools import islice

def get_ngrams(text, n):
    """
    Generate n-grams from text.
    """
    tokens = text.split()
    return list(zip(*[tokens[i:] for i in range(n)]))

def rouge_n(reference, hypothesis, n=1):
    """
    Compute ROUGE-N score.
    """
    # Generate n-grams for reference and hypothesis
    ref_ngrams = Counter(get_ngrams(reference, n))
    hyp_ngrams = Counter(get_ngrams(hypothesis, n))

    # Count overlapping n-grams
    overlap = sum((ref_ngrams & hyp_ngrams).values())
    total_ref_ngrams = sum(ref_ngrams.values())
    total_hyp_ngrams = sum(hyp_ngrams.values())

    # Calculate Recall, Precision, and F1-score
    recall = overlap / total_ref_ngrams if total_ref_ngrams > 0 else 0
    precision = overlap / total_hyp_ngrams if total_hyp_ngrams > 0 else 0
    f1_score = (2 * recall * precision / (recall + precision)) if (recall + precision) > 0 else 0

    return {"recall": recall, "precision": precision, "f1_score": f1_score}

# Example usage
reference = "the cat sat on the mat"
hypothesis = "the cat is on the mat"

rouge_1 = rouge_n(reference, hypothesis, n=1)
rouge_2 = rouge_n(reference, hypothesis, n=2)

print("ROUGE-1:", rouge_1)
print("ROUGE-2:", rouge_2)


ROUGE-1: {'recall': 0.8333333333333334, 'precision': 0.8333333333333334, 'f1_score': 0.8333333333333334}
ROUGE-2: {'recall': 0.6, 'precision': 0.6, 'f1_score': 0.6}


# ROUGE-L

LCS(Longest Common Subsequence)를 사용해 두 텍스트 간의 겹침을 측정.
LCS 길이를 기반으로 Recall, Precision, F1-score 계산:
Recall: LCS 길이 / 참조 단어 수
Precision: LCS 길이 / 가설 단어 수.

In [3]:
def lcs_length(x, y):
    """
    Compute the length of the Longest Common Subsequence (LCS).
    """
    m, n = len(x), len(y)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if x[i - 1] == y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    return dp[m][n]

def rouge_l(reference, hypothesis):
    """
    Compute ROUGE-L score.
    """
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()

    # Length of LCS
    lcs = lcs_length(ref_tokens, hyp_tokens)

    # Calculate Recall, Precision, and F1-score
    recall = lcs / len(ref_tokens) if len(ref_tokens) > 0 else 0
    precision = lcs / len(hyp_tokens) if len(hyp_tokens) > 0 else 0
    f1_score = (2 * recall * precision / (recall + precision)) if (recall + precision) > 0 else 0

    return {"recall": recall, "precision": precision, "f1_score": f1_score}

# Example usage
reference = "the cat sat on the mat"
hypothesis = "the cat is on the mat"

rouge_l_score = rouge_l(reference, hypothesis)

print("ROUGE-L:", rouge_l_score)


ROUGE-L: {'recall': 0.8333333333333334, 'precision': 0.8333333333333334, 'f1_score': 0.8333333333333334}


#  BERTScore 예제

In [11]:
from bert_score import score

# 참조 텍스트와 생성 텍스트
references = ["빠른 갈색 여우가 게으른 개를 뛰어넘었다."]
hypotheses = ["빠른 갈색 여우가 게으른 개를 뛰어 넘었다."]

# BERTScore 계산
P, R, F1 = score(hypotheses, references, lang="ko", verbose=True)

print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1 Score: {F1.mean().item():.4f}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 13.23it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 394.20it/s]

done in 0.08 seconds, 12.02 sentences/sec
Precision: 0.9588
Recall: 0.9588
F1 Score: 0.9588





# GPTScore (Perplexity 기반 평가)

In [None]:
!pip install transformers

In [14]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

def calculate_perplexity(model, tokenizer, text):
    """
    Perplexity 계산 함수
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss
    perplexity = torch.exp(loss)
    return perplexity.item()

# GPT-2 모델 로드
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# 텍스트 Perplexity 계산
text = "빠른 갈색 여우가 게으른 개를 뛰어넘었다."
perplexity = calculate_perplexity(model, tokenizer, text)
print(f"Perplexity: {perplexity:.4f}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Perplexity: 9.4079


# 5. Sentence Similarity with Cosine Similarity

In [None]:
# !pip install sentence-transformers


In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 모델 로드
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# 텍스트 정의
reference = "빠른 갈색 여우가 게으른 개를 뛰어넘었다."
hypothesis = "빠른 갈색 여우가 게으른 개를 뛰어 넘었다."

# 임베딩 생성
ref_embedding = model.encode([reference])
hyp_embedding = model.encode([hypothesis])

# 코사인 유사도 계산
similarity = cosine_similarity(ref_embedding, hyp_embedding)
print(f"Cosine Similarity: {similarity[0][0]:.4f}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Cosine Similarity: 0.9863


# G-EVAL

In [27]:
!pip install openai


Collecting openai
  Downloading openai-1.55.3-py3-none-any.whl.metadata (24 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.0-cp312-none-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.2-py3-none-any.whl.metadata (170 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.27.1 (from pydantic<3,>=1.9.0->openai)
  Using cached pydantic_core-2.27.1-cp312-none-win_amd64.whl.metadata (6.7 kB)
Downloading openai-1.55.3-py3-none-any.whl (389 kB)
Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.8.0-cp312-none-win_amd64.whl (206 kB)
Downloading pydantic-2.10.2-py3-none-any.whl (456 kB)
Using cached pydantic_core-2.27.1-cp312-none-win_amd64.whl (2.0 MB)
Using cached annotated_types

In [34]:
from openai import OpenAI
# OpenAI API 키 설정
import os

os.environ['OPENAI_API_KEY'] = ""


client = OpenAI()
def g_eval(reference, hypothesis):
    """
    G-EVAL: GPT 기반 텍스트 평가
    - reference: 기준 텍스트
    - hypothesis: 평가할 텍스트
    """
    prompt = f"""
    You are an expert evaluator for language models. Please evaluate the following two texts:

    Reference Text: "{reference}"
    Hypothesis Text: "{hypothesis}"

    Provide a similarity score between 0 and 100, where:
    - 0 means the texts are completely different.
    - 100 means the texts are identical in meaning and language quality.

    Please briefly explain in Korean the reasoning behind your score.
    """
    # 새로운 Chat API 호출
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # 모델 선택 (gpt-4 또는 gpt-3.5-turbo)
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )

    # GPT의 응답에서 메시지 내용 추출
    gpt_response = completion.choices[0].message.content
    return gpt_response

# 테스트 데이터
reference_text = "빠른 갈색 여우가 게으른 개를 뛰어넘었다."
hypothesis_text = "게으른 늙은 여우가 게으른 개를 뛰어 넘었다."

# G-EVAL 결과
result = g_eval(reference_text, hypothesis_text)
print("G-EVAL 결과:")
print(result)



G-EVAL 결과:
Similarity Score: 30

두 텍스트 간의 유사성은 낮습니다. 두 문장 모두 비슷한 구조를 가지고 있지만, 내용에 중요한 차이가 있습니다. 

1. **주어의 차이**: Reference Text에서는 "빠른 갈색 여우"라는 주어가 사용되었고, Hypothesis Text에서는 "게으른 늙은 여우"라는 다른 주어가 사용되었습니다. 이로 인해 주어의 성격이 완전히 달라집니다.
2. **형용사의 차이**: 두 텍스트에서 여우에 대한 형용사가 다르며, 이는 문장의 의미에 영향을 미칩니다. "빠른"과 "게으른"은 상반되는 의미를 전달합니다.
3. ** 행동은 유사하지만**: 두 문장 모두 "뛰어넘었다"는 행동을 포함하지만, 주어의 차이로 인해 상황의 맥락이 다릅니다.

따라서 두 문장은 비슷한 구조를 가지지만, 의미적으로는 상당한 차이가 있으므로 30점으로 평가했습니다.
