In [None]:
!pip install bart_score

[31mERROR: Could not find a version that satisfies the requirement bart_score (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bart_score[0m[31m
[0m

In [None]:
# %%
import torch
import torch.nn as nn
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration
from typing import List
import numpy as np


class BARTScorer:
    """Score target texts against source texts with a BART seq2seq model.

    The score of a (source, target) pair is the negated token-level NLL of
    the target under the model conditioned on the source, summed over the
    non-padding target tokens and divided by the target's attention-mask
    length. Scores are therefore <= 0, and closer to 0 means more likely.
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load tokenizer and model from ``checkpoint`` and move the model to ``device``.

        Args:
            device: torch device string the model and input tensors are placed on.
            max_length: maximum token length; longer inputs are truncated.
            checkpoint: name or path handed to ``from_pretrained``.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) so it can be re-aggregated
        # per example; padding positions contribute zero loss.
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """ Load model from paraphrase finetuning

        Args:
            path: state-dict file to load; defaults to ``'models/bart.pth'``.
        """
        if path is None:
            path = 'models/bart.pth'
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """ Score a batch of examples

        Args:
            srcs: sequence of source strings.
            tgts: sequence of target strings, aligned one-to-one with ``srcs``.
            batch_size: number of pairs sent through the model at once.

        Returns:
            A list with one float per pair, in input order.

        Raises:
            ValueError: if ``srcs`` and ``tgts`` differ in length or
                ``batch_size`` is smaller than 1.
            RuntimeError: re-raised from the tokenizer/model after the
                offending batch has been printed.
        """
        if len(srcs) != len(tgts):
            raise ValueError(
                f'srcs and tgts must have the same length, got {len(srcs)} and {len(tgts)}'
            )
        if batch_size < 1:
            raise ValueError(f'batch_size must be >= 1, got {batch_size}')

        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of attended (non-padding) target tokens per example.
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * seq, vocab) so LogSoftmax(dim=1)
                    # normalises over the vocabulary.
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    # Back to (batch, seq), then length-normalise per example.
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    score_list += [-x.item() for x in loss]

            except RuntimeError:
                # Show which batch failed, then propagate. Exiting here with
                # status 0 would kill the kernel/process and report success.
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

    def multi_ref_score(self, srcs, tgts: List[List[str]], agg="mean", batch_size=4):
        """Score each source against several references and aggregate per source.

        Args:
            srcs: sequence of source strings.
            tgts: one list of reference strings per source; every inner list
                must have the same length.
            agg: ``"mean"`` or ``"max"`` over the references of each source.
            batch_size: forwarded to :meth:`score`.

        Returns:
            A list with one aggregated score per source (empty if ``tgts`` is empty).

        Raises:
            ValueError: if the samples have differing numbers of references,
                or have no references at all.
            NotImplementedError: if ``agg`` is not ``"mean"`` or ``"max"``.
        """
        # Assert we have the same number of references
        ref_nums = [len(x) for x in tgts]
        if len(set(ref_nums)) > 1:
            raise ValueError("You have different number of references per test sample.")

        # Reject an unsupported aggregation before doing any model work.
        aggregators = {"mean": np.mean, "max": np.max}
        if agg not in aggregators:
            raise NotImplementedError(f'Unsupported agg: {agg!r}')

        if not tgts:
            return []

        ref_num = ref_nums[0]
        if ref_num == 0:
            raise ValueError("Each test sample needs at least one reference.")

        # Row r holds the scores of every source against its r-th reference.
        score_matrix = [
            self.score(srcs, [x[r] for x in tgts], batch_size)
            for r in range(ref_num)
        ]
        return list(aggregators[agg](score_matrix, axis=0))

    def test(self, batch_size=3):
        """ Test: print the scores of three fixed (source, target) pairs. """
        src_list = [
            'This is a very good idea. Although simple, but very insightful.',
            'Can I take a look?',
            'Do not trust him, he is a liar.'
        ]

        tgt_list = [
            "That's stupid.",
            "What's the problem?",
            'He is trustworthy.'
        ]

        print(self.score(src_list, tgt_list, batch_size))

In [None]:
# Build the scorer on the first CUDA device using the 'facebook/bart-large-cnn' checkpoint.
bart_scorer = BARTScorer(device='cuda:0', checkpoint='facebook/bart-large-cnn')
# Score a single pair: the first list holds the source (the writing prompt),
# the second list holds the target (the story text). As the last expression
# of the cell, the returned list of scores is displayed as the cell output.
bart_scorer.score(['Craft a heartbreaking tale centered around siblings Eliza and Ethan in the town of Willow Creek. When Eliza becomes gravely ill, Ethan strikes a bargain with a shadowy figure to restore her health, oblivious to the dire consequences it will entail.'],

["""In the serene town of Willow Creek, where the gentle babbling of brooks intertwined with the rustling leaves of ancient oaks, lived siblings Eliza and Ethan. The twins shared an unbreakable bond, their lives entwined like the ivy that clung to their family home. Eliza's laughter was the sunlight in Ethan's world, and his strength was her shield.

Their peaceful life took a dark turn one autumn when Eliza fell gravely ill. The once vibrant girl, known for her infectious smile and boundless energy, was now confined to her bed, her health deteriorating rapidly. No doctor in Willow Creek or the neighboring towns could diagnose her mysterious ailment, and hope began to fade as winter's chill set in.

Desperate and unwilling to watch his sister slip away, Ethan ventured into the woods one moonless night. He had heard whispers of a shadowy figure who could perform miracles, a being shrouded in mystery and fear. Ethan's heart pounded with both dread and determination as he followed the cryptic directions given by an old man in the village square.

Deep in the forest, under a canopy of twisted branches, Ethan found the shadowy figure. Cloaked in darkness, the figure's eyes glowed with an eerie light. "What brings you here, young one?" the figure's voice was a soft hiss, like the wind through dead leaves.

"My sister, Eliza, is dying," Ethan replied, his voice trembling. "I'll do anything to save her."

The figure smiled, a chilling sight. "Anything, you say? Very well. I can restore your sister's health, but such a gift requires a sacrifice."

Without hesitation, Ethan nodded. "Take whatever you need. Just save her."

The figure extended a hand, and as Ethan grasped it, a cold pain shot through his body. "The bargain is struck," the figure whispered, vanishing into the night.

The next morning, Eliza awoke with a newfound vigor. Her cheeks were flushed with color, and her laughter filled the house once more. The town marveled at her miraculous recovery, and Ethan's heart soared with joy. But as days turned into weeks, Ethan began to feel the effects of his bargain.

A shadow seemed to hang over him, draining his energy and clouding his mind. He grew weaker with each passing day, his once strong body now frail and exhausted. The cost of the miracle became painfully clear—Eliza's life had been restored at the expense of his own.

Ethan tried to hide his decline, but Eliza noticed the change in her brother. One evening, as they sat by the fireplace, she confronted him. "Ethan, what's happening to you? You look so tired and sick."

Tears welled in Ethan's eyes. "I made a deal to save you, Eliza. A deal with a shadowy figure in the woods. This is the price I have to pay."

Horrified, Eliza grasped his hand. "No, Ethan. We have to find a way to reverse it. I can't lose you because of me."

But Ethan knew it was too late. The bargain was unbreakable, and his time was running out. He spent his remaining days cherishing every moment with Eliza, her laughter now tinged with sorrow.

On a cold winter's night, as snowflakes danced outside their window, Ethan took his final breath. Eliza held him close, her heart shattered by the loss of her beloved brother. In his last moments, Ethan whispered, "Live for both of us, Eliza."

In the aftermath of Ethan's death, Willow Creek mourned the loss of a brave young man. Eliza, though heartbroken, vowed to honor Ethan's sacrifice by living a life full of love and kindness. She carried his memory with her, a beacon of hope and resilience.

And in the quiet moments, when the wind whispered through the trees, Eliza could almost hear Ethan's voice, a reminder of the unbreakable bond they shared—a bond that even death could not sever."""], batch_size=4) # score of the story (target) given the prompt (source)

[-3.5862600803375244]

In [None]:
# LR-n: repetition-based measure of wording diversity
def compute_lr_n(text, n):
    """Return the share of n-grams in ``text`` that belong to a repeated type.

    The text is word-tokenized with NLTK. The result is the number of
    *distinct* n-grams that occur more than once, divided by the total number
    of n-gram occurrences.

    Args:
        text: input string.
        n: n-gram order; must be >= 1.

    Returns:
        A float, ``0.0`` when the text yields no n-grams at all.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    # Imports stay function-local so the cell remains self-contained.
    from collections import Counter
    from nltk import ngrams
    import nltk

    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n!r}')

    # Tokenize the text into words. Tokenizer data is fetched only when it is
    # actually missing, instead of hitting the downloader on every call.
    # Newer NLTK releases look for 'punkt_tab' rather than 'punkt', so both
    # are requested.
    try:
        words = nltk.word_tokenize(text)
    except LookupError:
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        words = nltk.word_tokenize(text)

    # Count occurrences of each n-gram
    n_gram_counts = Counter(ngrams(words, n))

    # Total number of n-gram occurrences
    total_n_grams = sum(n_gram_counts.values())
    if total_n_grams == 0:
        return 0.0

    # Number of distinct n-grams that appear more than once
    repeated_n_grams = sum(1 for count in n_gram_counts.values() if count > 1)

    return repeated_n_grams / total_n_grams

# Example usage: a text made of one repeated word (the literal starts with a newline).
text = """
Cat cat cat cat cat cat cat cat"""

n = 2  # Change this to any value of n for LR-n
# NOTE(review): the result is multiplied by 2 before printing, so the value
# shown under the "LR-n" label is twice what compute_lr_n returns — confirm
# whether this scaling is intended.
lr_n = compute_lr_n(text, n) * 2
print(f'LR-{n}: {lr_n}')

In [None]:
# D-3: another measure of wording diversity
def compute_d3(text):
    """Return the share of 3-grams in ``text`` that occur exactly once.

    The text is word-tokenized with NLTK. The result is the number of 3-grams
    whose count is exactly 1, divided by the total number of 3-gram
    occurrences.

    NOTE(review): distinct-n style metrics are often defined as the number of
    distinct n-gram types over the total; this function counts only types
    seen exactly once — confirm which definition is wanted.

    Args:
        text: input string.

    Returns:
        A float, ``0.0`` when the text yields no 3-grams at all.
    """
    # Imports stay function-local so the cell remains self-contained.
    from collections import Counter
    from nltk import ngrams
    import nltk

    # Tokenize the text into words. Tokenizer data is fetched only when it is
    # actually missing, instead of hitting the downloader on every call.
    # Newer NLTK releases look for 'punkt_tab' rather than 'punkt', so both
    # are requested.
    try:
        words = nltk.word_tokenize(text)
    except LookupError:
        nltk.download('punkt', quiet=True)
        nltk.download('punkt_tab', quiet=True)
        words = nltk.word_tokenize(text)

    # Count occurrences of each 3-gram
    three_gram_counts = Counter(ngrams(words, 3))

    # Total number of 3-gram occurrences
    total_three_grams = sum(three_gram_counts.values())
    if total_three_grams == 0:
        return 0.0

    # Number of 3-grams that appear exactly once
    unique_three_grams = sum(1 for count in three_gram_counts.values() if count == 1)

    return unique_three_grams / total_three_grams

# Example usage: five different words, so every 3-gram occurs once.
# Note that this rebinds the notebook-level name `text` used by the LR-n example.
text = """cat mat hat pat sat"""

d3 = compute_d3(text)
print(f'D-3: {d3}')