In [7]:
import unittest
from saga_llm_evaluation_ml.helpers.llm_metrics import GEval, GPTScore, SelfCheckGPT
from saga_llm_evaluation_ml.score import LLMScorer
from saga_llm_evaluation_ml.helpers.utils import get_llama_model

In [8]:
# Obtain the quantised Llama-2 chat model once; every test class in this
# notebook passes this same object to the metric under test.
LLAMA_MODEL = get_llama_model(
    filename="llama-2-7b-chat.Q3_K_L.gguf",
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Llama-2-7b-Chat-GGUF/snapshots/191239b3e26b2882fb562ffccdd1cf0f65402adb/llama-2-7b-chat.Q3_K_L.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q3_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q5_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q3_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q3_K     [  4096,  4096,    

In [3]:
class TestGEval(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.geval = GEval(model=LLAMA_MODEL)

    def test_init(self):
        with self.assertRaises(AssertionError):
            GEval(LLAMA_MODEL, "1", 1)
            GEval(LLAMA_MODEL, 1, "1")

    def test_bad_arguments(self):
        source = "Hi how are you"
        pred = "Im ok"
        task = "diag"
        aspect = "ENG"

        with self.assertRaises(AssertionError):
            self.geval.compute([source], pred, task, aspect)
            self.geval.compute(source, [pred], task, aspect)
            self.geval.compute(source, pred, 1, aspect)
            self.geval.compute(source, pred, task, 1)
            self.geval.compute(source, pred, task, "notvalid")
            self.geval.compute(source, pred, "notvalid", aspect)
            self.geval.compute(source, pred, task, aspect=None)
            self.geval.compute(source, pred, task=None, aspect=aspect)

    def test_compute(self):
        source = "Hi how are you?"
        preds = ["Shut up creep!!!", "I am very good, thank you! And you?"]
        task = "diag"
        aspect = "POL"

        scores = {key: 0 for key in preds}
        for pred in preds:
            score = self.geval.compute(source, pred, task, aspect)
            self.assertTrue(isinstance(score, float))
            self.assertGreaterEqual(score, 0.0)
            scores[pred] = score

        self.assertGreaterEqual(
            scores["I am very good, thank you! And you?"], scores["Shut up creep!!!"]
        )


class TestSelfCheckGPT(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.selfcheckgpt = SelfCheckGPT(model=LLAMA_MODEL, eval_model=LLAMA_MODEL)

    def test_init(self):
        with self.assertRaises(AssertionError):
            SelfCheckGPT(
                model=LLAMA_MODEL,
                eval_model=LLAMA_MODEL,
                eval_model_name_or_path=1,
                eval_model_basename=1,
            )
            SelfCheckGPT(
                model=LLAMA_MODEL,
                eval_model=LLAMA_MODEL,
                eval_model_name_or_path=1,
                eval_model_basename="1",
            )
            SelfCheckGPT(
                model=LLAMA_MODEL,
                eval_model=LLAMA_MODEL,
                eval_model_name_or_path="1",
                eval_model_basename=1,
            )

    def test_bad_arguments(self):
        question = "What is the capital of France?"
        pred = "Paris"
        n_samples = 1

        with self.assertRaises(AssertionError):
            self.selfcheckgpt.compute([question], pred, n_samples)
            self.selfcheckgpt.compute(question, [pred], n_samples)
            self.selfcheckgpt.compute(question, pred, "1")
            self.selfcheckgpt.compute(question, pred, 1.0)
            self.selfcheckgpt.compute(question, pred, -1)
            self.selfcheckgpt.compute(question=question, pred=None, n_samples=5)
            self.selfcheckgpt.compute(question=None, pred=pred, n_samples=5)

    def test_compute(self):
        question = "What is the capital of France?"
        preds = ["Paris", "sandwich"]
        n_samples = 10

        scores = {key: 0 for key in preds}
        for pred in preds:
            score = self.selfcheckgpt.compute(question, pred, n_samples)
            self.assertTrue(isinstance(score, float))
            self.assertGreaterEqual(score, 0.0)
            self.assertLessEqual(score, 1.0)
            scores[pred] = score

        self.assertGreaterEqual(scores["Paris"], scores["sandwich"])


class TestGPTScore(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.gptscore = GPTScore(model=LLAMA_MODEL)

    def test_init(self):
        with self.assertRaises(AssertionError):
            GPTScore(model=LLAMA_MODEL, model_basename=1, model_name_or_path=1)
            GPTScore(model=LLAMA_MODEL, model_basename="1", model_name_or_path=1)
            GPTScore(model=LLAMA_MODEL, model_basename=1, model_name_or_path="1")

    def test_bad_arguments(self):

        with self.assertRaises(AssertionError):
            self.gptscore.compute(
                ["The cat sat on the mat."], ["The dog sat on the log."]
            )
            self.gptscore.compute(
                "The cat sat on the mat.", ["The dog sat on the log."]
            )
            self.gptscore.compute("The cat sat on the mat.", "The dog sat on the log.")
            self.gptscore.compute(
                "The cat sat on the mat.", "The dog sat on the log.", prompt=2
            )
            self.gptscore.compute(
                "The cat sat on the mat.",
                "The dog sat on the log.",
                prompt="2",
                aspect="COV",
                task="diag",
            )
            self.gptscore.compute(
                "The cat sat on the mat.",
                "The dog sat on the log.",
                aspect=2,
                task="diag",
            )
            self.gptscore.compute(
                "The cat sat on the mat.",
                "The dog sat on the log.",
                aspect="COV",
                task=2,
            )
            self.gptscore.compute(
                "The cat sat on the mat.",
                "The dog sat on the log.",
                aspect="COV",
                task="notvalid",
            )
            self.gptscore.compute(
                "The cat sat on the mat.",
                "The dog sat on the log.",
                aspect="notvalid",
                task="diag",
            )
            self.gptscore.compute(
                "The cat sat on the mat.", "The dog sat on the log.", aspect="COV"
            )
            self.gptscore.compute(
                "The cat sat on the mat.", "The dog sat on the log.", task="diag"
            )

    def test_compute(self):
        source = "Hi how are you?"
        preds = [
            "I am very fine. Thanks! What about you?",
            "Shut up creep I don't want to talk to you!!!",
        ]
        # prompt = "Task: evaluate how polite this dialog is."
        aspect = "LIK"
        task = "diag"

        scores = {key: 0 for key in preds}
        for target in preds:
            score = self.gptscore.compute(source, target, aspect=aspect, task=task)
            scores[target] = score
            self.assertTrue(isinstance(score, float))
            self.assertGreaterEqual(score, 0.0)

        self.assertGreaterEqual(
            scores["I am very fine. Thanks! What about you?"],
            scores["Shut up creep I don't want to talk to you!!!"],
        )


In [5]:
# argv=[""] keeps unittest from parsing the kernel's own command line, and
# exit=False stops it from calling sys.exit() once the run has finished.
# The trailing semicolon hides the TestProgram repr the cell would otherwise show.
unittest.main(argv=[""], verbosity=2, exit=False);

test_bad_arguments (__main__.TestGEval) ... ok
test_compute (__main__.TestGEval) ... 
llama_print_timings:        load time =  5785.98 ms
llama_print_timings:      sample time =    97.04 ms /   197 runs   (    0.49 ms per token,  2030.11 tokens per second)
llama_print_timings: prompt eval time =  5785.84 ms /   198 tokens (   29.22 ms per token,    34.22 tokens per second)
llama_print_timings:        eval time = 19715.53 ms /   196 runs   (  100.59 ms per token,     9.94 tokens per second)
llama_print_timings:       total time = 26278.65 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  5785.98 ms
llama_print_timings:      sample time =     0.93 ms /     2 runs   (    0.46 ms per token,  2152.85 tokens per second)
llama_print_timings: prompt eval time = 13162.97 ms /   433 tokens (   30.40 ms per token,    32.90 tokens per second)
llama_print_timings:        eval time =   101.63 ms /     1 runs   (  101.63 ms per token,     9.84 tokens per second)
llama_prin

<unittest.main.TestProgram at 0x7f823d152710>

In [9]:
class TestLLMScorer(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.scorer = LLMScorer(model=LLAMA_MODEL)

    def test_init(self):
        false = False
        with self.assertRaises(AssertionError):

            LLMScorer(model=LLAMA_MODEL, lan=false)
            LLMScorer(model=LLAMA_MODEL, bleurt_model=false)
            LLMScorer(model=LLAMA_MODEL, mauve_model=false)
            LLMScorer(model=LLAMA_MODEL, selfcheckgpt_eval_model_name_or_path=false)
            LLMScorer(model=LLAMA_MODEL, selfcheckgpt_eval_model_basename=false)
            LLMScorer(model=LLAMA_MODEL, geval_model_name_or_path=false)
            LLMScorer(model=LLAMA_MODEL, geval_model_basename=false)
            LLMScorer(model=LLAMA_MODEL, gptscore_model_name_or_path=false)
            LLMScorer(model=LLAMA_MODEL, gptscore_model_basename=false)

    def test_score_bad_arguments(self):
        llm_input = "I am a dog."
        prompt = f"System: You are a cat. You don't like dogs. User: {llm_input}"
        context = "System: You are a cat. You don't like dogs."
        prediction = "I am a cat, I don't like dogs."
        reference = "I am a cat, I don't like dogs, miau."

        with self.assertRaises(AssertionError):
            self.scorer.score(False, prompt, context, prediction, reference)
            self.scorer.score(llm_input, False, context, prediction, reference)
            self.scorer.score(llm_input, prompt, False, prediction, reference)
            self.scorer.score(llm_input, prompt, context, False, reference)
            self.scorer.score(llm_input, prompt, context, prediction, False)
            self.scorer.score(
                llm_input, prompt, context, prediction, reference, n_samples=False
            )
            self.scorer.score(
                llm_input, prompt, context, prediction, reference, task=False
            )
            self.scorer.score(
                llm_input, prompt, context, prediction, reference, aspects=False
            )
            self.scorer.score(
                llm_input, prompt, context, prediction, reference, custom_prompt=False
            )
            self.scorer.score(
                llm_input, prompt, context, prediction, reference, custom_prompt=False
            )

    def test_score(self):
        model_name_or_path = "TheBloke/Llama-2-7b-Chat-GGUF"
        model_basename = "llama-2-7b-chat.Q2_K.gguf"  # the model is in bin format

        scorer = LLMScorer(
            model=LLAMA_MODEL,
            selfcheckgpt_eval_model_name_or_path=model_name_or_path,
            selfcheckgpt_eval_model_basename=model_basename,
            geval_model_name_or_path=model_name_or_path,
            geval_model_basename=model_basename,
            gptscore_model_name_or_path=model_name_or_path,
            gptscore_model_basename=model_basename,
        )

        llm_input = "I am a dog."
        prompt = f"System: You are a cat. You don't like dogs. User: {llm_input}"
        context = "Examples: Eww, I hate dogs."
        prediction = "I am a cat, I don't like dogs."
        reference = "I am a cat, I don't like dogs, miau."
        task = "diag"
        aspect = ["CON"]
        custom_prompt = {
            "name": "Fluency",
            "task": "Dialog",
            "aspect": "Evaluate the fluency of the following dialog.",
        }

        scores = scorer.score(llm_input, prompt, context, prediction, reference)
        self.assertTrue(isinstance(scores, dict))
        self.assertTrue("metrics" in scores)
        self.assertTrue("metadata" in scores)

        # All default
        print("All default")
        scores = scorer.score(llm_input, prompt, prediction, n_samples=2)
        self.assertTrue(isinstance(scores, dict))
        self.assertTrue("metrics" in scores)
        self.assertTrue("metadata" in scores)

        # All default, but with context
        print("All default, but with context")
        scores = scorer.score(
            llm_input,
            prompt,
            prediction,
            context=context,
            n_samples=2,
        )
        self.assertTrue(isinstance(scores, dict))
        self.assertTrue("metrics" in scores)
        self.assertTrue("metadata" in scores)

        # All default, but with reference
        print("All default, but with reference")
        scores = scorer.score(
            llm_input,
            prompt,
            prediction,
            reference=reference,
            n_samples=2,
        )
        self.assertTrue(isinstance(scores, dict))
        self.assertTrue("metrics" in scores)
        self.assertTrue("metadata" in scores)

        # Precise task and aspect
        print("Precise task and aspect")
        scores = scorer.score(
            llm_input,
            prompt,
            prediction,
            task=task,
            aspects=aspect,
            n_samples=2,
        )
        self.assertTrue(isinstance(scores, dict))
        self.assertTrue("metrics" in scores)
        self.assertTrue("metadata" in scores)

        # Precise custom prompt
        print("Precise custom prompt")
        scores = scorer.score(
            llm_input,
            prompt,
            prediction,
            custom_prompt=custom_prompt,
            n_samples=2,
        )
        self.assertTrue(isinstance(scores, dict))
        self.assertTrue("metrics" in scores)
        self.assertTrue("metadata" in scores)


In [10]:
# argv=[""] keeps unittest from parsing the kernel's own command line, and
# exit=False stops it from calling sys.exit() once the run has finished.
# With no test names given, every TestCase currently defined in the notebook's
# __main__ namespace is run, not only the class from the preceding cell.
# The trailing semicolon hides the TestProgram repr the cell would otherwise show.
unittest.main(argv=[""], verbosity=2, exit=False);

Downloading builder script: 100%|██████████| 7.95k/7.95k [00:00<00:00, 10.4MB/s]
Downloading builder script: 100%|██████████| 7.24k/7.24k [00:00<00:00, 10.9MB/s]
Downloading builder script: 100%|██████████| 5.20k/5.20k [00:00<00:00, 6.52MB/s]
Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: evaluate.load('bleurt', 'bleurt-large-512').
Downloading data: 100%|██████████| 405M/405M [00:30<00:00, 13.4MB/s]   


INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
(…)ad-v2/resolve/main/tokenizer_config.json: 100%|██████████| 58.0/58.0 [00:00<00:00, 169kB/s]
(…)rge-v2-squad-v2/resolve/main/config.json: 100%|██████████| 717/717 [00:00<00:00, 2.48MB/s]
(…)ge-v2-squad-v2/resolve/main/spiece.model: 100%|██████████| 760k/760k [00:00<00:00, 937kB/s]
(…)-squad-v2/resolve/main/added_tokens.json: 100%|██████████| 2.00/2.00 [00:00<00:00, 9.97kB/s]
(…)-v2/resolve/main/special_tokens_map.json: 100%|██████████| 156/156 [00:00<00:00, 382kB/s]
pytorch_model.bin: 100%|██████████| 235M/235M [00:17<00:00, 13.8MB/s] 
Some weights of the model checkpoint at ktrapeznikov/albert-xlarge-v2-squad-v2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.weight', 'albert.pooler.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraini

INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.


INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
Some weights of the model checkpoint at ktrapeznikov/albert-xlarge-v2-squad-v2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.weight', 'albert.pooler.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.

INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.


INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
Some weights of the model checkpoint at ktrapeznikov/albert-xlarge-v2-squad-v2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.weight', 'albert.pooler.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.

INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.


INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Will load checkpoint bert_custom


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... name:bert_custom


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... vocab_file:vocab.txt


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... do_lower_case:True


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:... max_seq_length:128


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:Creating WordPiece tokenizer.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:WordPiece tokenizer instantiated.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.
Some weights of the model checkpoint at ktrapeznikov/albert-xlarge-v2-squad-v2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.weight', 'albert.pooler.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.

Loading tokenizer


(…)ingface.co/gpt2/resolve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 3.59MB/s]
(…)gingface.co/gpt2/resolve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 3.55MB/s]
(…)gingface.co/gpt2/resolve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.53MB/s]
(…)face.co/gpt2/resolve/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 4.67MB/s]


Tokenizing text...
Loading tokenizer
Loading model


model.safetensors: 100%|██████████| 548M/548M [00:37<00:00, 14.7MB/s] 


Featurizing tokens


Featurizing p: 100%|██████████| 1/1 [00:00<00:00, 44.77it/s]


Tokenizing text...
Featurizing tokens


Featurizing q: 100%|██████████| 1/1 [00:00<00:00, 39.00it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds



Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    22.38 ms /    45 runs   (    0.50 ms per token,  2010.99 tokens per second)
llama_print_timings: prompt eval time =  1141.50 ms /    40 tokens (   28.54 ms per token,    35.04 tokens per second)
llama_print_timings:        eval time =  4289.00 ms /    44 runs   (   97.48 ms per token,    10.26 tokens per second)
llama_print_timings:       total time =  5595.04 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    21.53 ms /    44 runs   (    0.49 ms per token,  2044.04 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  4295.02 ms /    44 runs   (   97.61 ms per token,    10.24 tokens per second)
llama_print_timings:       total time =  4394.16 ms
Llama.gen

All default


Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    15.26 ms /    32 runs   (    0.48 ms per token,  2096.99 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  3059.18 ms /    32 runs   (   95.60 ms per token,    10.46 tokens per second)
llama_print_timings:       total time =  3128.69 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    30.78 ms /    62 runs   (    0.50 ms per token,  2014.56 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  6059.56 ms /    62 runs   (   97.73 ms per token,    10.23 tokens per second)
llama_print_timings:       total time =  6199.91 ms
Llama.gene

All default, but with context


Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    25.07 ms /    52 runs   (    0.48 ms per token,  2074.44 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  4974.52 ms /    52 runs   (   95.66 ms per token,    10.45 tokens per second)
llama_print_timings:       total time =  5091.69 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    27.49 ms /    54 runs   (    0.51 ms per token,  1964.21 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  5328.76 ms /    54 runs   (   98.68 ms per token,    10.13 tokens per second)
llama_print_timings:       total time =  5453.43 ms
Llama.gene

All default, but with reference
Tokenizing text...
Featurizing tokens


Featurizing p: 100%|██████████| 1/1 [00:00<00:00, 33.40it/s]


Tokenizing text...
Featurizing tokens


Featurizing q: 100%|██████████| 1/1 [00:00<00:00, 35.93it/s]


seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds


Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    17.61 ms /    36 runs   (    0.49 ms per token,  2044.64 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  3562.06 ms /    36 runs   (   98.95 ms per token,    10.11 tokens per second)
llama_print_timings:       total time =  3645.64 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    25.82 ms /    53 runs   (    0.49 ms per token,  2052.83 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  5136.73 ms /    53 runs   (   96.92 ms per token,    10.32 tokens per second)
llama_print_timings:       total time =  5258.20 ms
Llama.gene

Precise task and aspect



llama_print_timings:        load time =  3488.56 ms
llama_print_timings:      sample time =    29.05 ms /    56 runs   (    0.52 ms per token,  1927.84 tokens per second)
llama_print_timings: prompt eval time =  3798.19 ms /   160 tokens (   23.74 ms per token,    42.13 tokens per second)
llama_print_timings:        eval time =  5028.43 ms /    55 runs   (   91.43 ms per token,    10.94 tokens per second)
llama_print_timings:       total time =  9227.19 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  3488.56 ms
llama_print_timings:      sample time =     3.30 ms /     7 runs   (    0.47 ms per token,  2119.93 tokens per second)
llama_print_timings: prompt eval time =  6852.10 ms /   277 tokens (   24.74 ms per token,    40.43 tokens per second)
llama_print_timings:        eval time =   598.94 ms /     6 runs   (   99.82 ms per token,    10.02 tokens per second)
llama_print_timings:       total time =  7935.51 ms
Llama.generate: prefix-match hit

llama_pri

Precise custom prompt


Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    29.37 ms /    59 runs   (    0.50 ms per token,  2009.06 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  5675.74 ms /    59 runs   (   96.20 ms per token,    10.40 tokens per second)
llama_print_timings:       total time =  5817.53 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  5726.79 ms
llama_print_timings:      sample time =    20.14 ms /    39 runs   (    0.52 ms per token,  1935.96 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  3734.80 ms /    39 runs   (   95.76 ms per token,    10.44 tokens per second)
llama_print_timings:       total time =  3832.02 ms
Llama.gene

<unittest.main.TestProgram at 0x7f823f52c520>