In [1]:
import random
from typing import Any, Dict, Iterator, List, Optional, Tuple

import pandas as pd
import torch
from bitsandbytes.optim.adamw import PagedAdamW8bit
from src.galore_torch import GaLoreAdamW8bit
from peft import LoraConfig, TaskType, get_peft_model
from sentence_transformers import SentenceTransformer
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
)
import os, json, csv
import pandas as pd
from src.base_lm import BaseLM
from src.general_utils import DictDataset, test_loop, train_loop
from src.model_utils import clear_cache, llama2_log_of_labels, lm_logits, mlm_log_of_labels, set_random_seed
from src.general_utils import white_space_fix
import time

  from .autonotebook import tqdm as notebook_tqdm
2024-03-13 13:39:29.663667: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-13 13:39:29.915884: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 13:39:29.915946: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 13:39:29.931816: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-13 13:39:29.995714: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-criti

In [2]:
!nvidia-smi

Wed Mar 13 13:39:41 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          On   | 00000000:06:00.0 Off |                    0 |
|  0%   47C    P8    33W / 300W |      2MiB / 46068MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install huggingface_hub



In [4]:
!huggingface-cli login --token=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

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /h/snajafi/.cache/huggingface/token
Login successful


In [5]:
train_batch_size = 4
eval_batch_size = 4
lm_input_max_length = 1024
lm_output_max_length = 512
lm_top_p = 0.9
temperature = 0.5
metric_device = "cuda:1"
metric_batch_size = 8
learning_rate = 0.00005

# folder to store models and predictions.
model_path = "/scratch/ssd004/scratch/snajafi/checkpoints/gemma-prompt-recovery"

# related to lora
r = 16
lora_alpha = 8
lora_dropout = 0.05

In [6]:
"""Load LM efficiently."""

# Make sure we have some tokens defined for the LM, if not defined in the model.
_EXTRA_TOKENS = {
    "pad_token": "<pad>",
    "mask_token": "<mask>",
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "cls_token": "<CLS>",
}

target_modules = ["q_proj", "v_proj", "o_proj", "k_proj"]


def load_peft_model(
    model: PreTrainedModel,
    adapter_name: str = "lora",
    is_trainable: bool = False,
    model_type: str = "causal_lm",
    lora_target_modules: List[str] = target_modules,
) -> torch.nn.Module:
    """Load a trained PEFT adapter to the base model and return the PeftModel.

    Args:
    ----
        model: the main model.
        num_quantized_bits: number of bits in the loaded model.
        adapter_name: e.g. lora.
        is_trainable: train or inference mode.
        model_type: causal lm or seq-to-seq.
        lora_target_modules: which modules to train with lora.

    Returns:
    -------
        The PEFT model and tokenizer.
    """
    if model_type == "causal_lm":
        task_type = TaskType.CAUSAL_LM
    elif model_type == "seq_to_seq_lm":
        task_type = TaskType.SEQ_2_SEQ_LM

    if adapter_name == "lora":
        peft_config = LoraConfig(
            task_type=task_type,
            inference_mode=not is_trainable,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            bias="none",
            init_lora_weights=True,
            target_modules=lora_target_modules,
        )

    peft_model = get_peft_model(model, peft_config)
    peft_model.print_trainable_parameters()
    return peft_model


def load_model_and_tokenizer(
    model_id: str, model_type: str, model_dtype: torch.dtype, attn_implementation: str, load_in_4bit: Optional[bool] = True
) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load the model and tokenizer.

    Args:
    ----
        model_id: the id for the pre-trained model.
        model_type: causal lm or seq_to_seq_lm.
        model_dtype: model data type.
        load_in_4bit: Whether to load in 4 bit quantization.

    Returns:
    -------
        The model and tokenizer.
    """
    # load model
    if model_type == "causal_lm":
        ModelClass = AutoModelForCausalLM
    elif model_type == "seq_to_seq_lm":
        ModelClass = AutoModelForSeq2SeqLM
    model_args: Dict[str, Any] = {"use_cache": False, "torch_dtype": model_dtype, "attn_implementation": attn_implementation}
    if load_in_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=model_args["torch_dtype"],
            bnb_4bit_use_double_quant=True,
        )
        model_args["quantization_config"] = quant_config
    model = ModelClass.from_pretrained(
        model_id,
        **model_args,
    )

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.add_special_tokens(_EXTRA_TOKENS)

    if torch.cuda.is_available():
        # extend embeddings to a multiple so we use Tensor cores
        multiple = 64 if "A100" in torch.cuda.get_device_name() else 8
        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=multiple)
    else:
        raise Exception("No CUDA Found!")

    # re-define token ids for the model.
    for extra_token_key, extra_token_val in _EXTRA_TOKENS.items():
        extra_token_id = tokenizer.convert_tokens_to_ids([extra_token_val])[0]
        model.config.__setattr__(f"{extra_token_key}_id", extra_token_id)

    return model, tokenizer

In [7]:
class Gemma(BaseLM):
    """Class to implement Gemma for generation tasks."""

    def __init__(
        self,
        mode: str,
        device: str,
        seed: int = 42,
    ) -> None:
        super().__init__(device, "main_lm", seed)
        self.device = device
        model, tokenizer = load_model_and_tokenizer(
            model_id="/model-weights/gemma-7b-it",
            model_type="causal_lm",
            model_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            load_in_4bit=False,
        )
        self.model = model
        self.tokenizer = tokenizer
        self.optimizer = PagedAdamW8bit(self.model.parameters(), lr=learning_rate)
        self.scheduler = CosineAnnealingWarmRestarts(self.optimizer, T_0=10, eta_min=learning_rate / 5.0)

    def prepare_text(self, texts: List[str], output_texts: List[str], ids: List[str], instructions: List[str]) -> Dict[str, Any]:
        """Convert texts to ids and return the dataset required for training
        and inference."""
        input_texts = [f"{instructions[idx]} {texts[idx]}" for idx in range(len(instructions))]
        template = "<bos><start_of_turn>user\n{user_input}<end_of_turn>\n<start_of_turn>model"
        inputs_for_training = [
            f"{template.format(user_input=input_texts[idx])}\n{output_texts[idx]}<end_of_turn>" for idx in range(len(input_texts))
        ]
        inputs_for_generation = [
            template.format(user_input=input_texts[idx]) for idx in range(len(input_texts))
        ]

        input_encodings = self.tokenizer(
            inputs_for_training,
            truncation=True,
            padding="max_length",
            max_length=lm_input_max_length + lm_output_max_length,
            add_special_tokens=False,
        )
        input_encodings_for_generation = self.tokenizer(
            inputs_for_generation,
            truncation=True,
            padding="max_length",
            max_length=lm_input_max_length,
            add_special_tokens=False,
        )
        data = {
            "lm_input_ids_for_train": input_encodings.input_ids,
            "lm_attention_mask_for_train": input_encodings.attention_mask,
            "lm_input_ids_for_generation": input_encodings_for_generation.input_ids,
            "lm_attention_mask_for_generation": input_encodings_for_generation.attention_mask,
        }
        return data

    def train(self, batch: torch.utils.data.Dataset) -> torch.Tensor:
        """Using the Gemma, run a forward computation over the batch, compute
        the log probability over the batch.

        This will be used for training.
        """
        self.train_mode_on()
        loaded_batch = self.data_to_device(batch, keys=["lm_input_ids_for_train", "lm_attention_mask_for_train",
                                                        "lm_attention_mask_for_generation"])
        input_ids = loaded_batch["lm_input_ids_for_train"]
        attention_mask = loaded_batch["lm_attention_mask_for_train"]
        original_len_without_answer = torch.sum(loaded_batch["lm_attention_mask_for_generation"], dim=1)
        with torch.set_grad_enabled(True):
            logits = lm_logits(
                model=self.model,
                input_ids=input_ids,
                input_mask=attention_mask,
            )
            batch_size, seq_len = input_ids.size()
            masked_labels = input_ids.masked_fill(input_ids == self.tokenizer.pad_token_id, -100)
            prompt_mask = torch.arange(seq_len, device=self.device).expand(batch_size, seq_len) < original_len_without_answer.unsqueeze(1)
            masked_labels = masked_labels.masked_fill(prompt_mask == 1, -100)
            return llama2_log_of_labels(logits=logits, labels=masked_labels, loss_func=self.loss_func)

    def generation_pass(self, batch: torch.utils.data.Dataset) -> Tuple[List[str], torch.Tensor]:
        """Using the Gemma, generate new text.

        This will be used for inference.
        """
        self.predict_mode_on()
        loaded_batch = self.data_to_device(batch, keys=["lm_input_ids_for_generation", "lm_attention_mask_for_generation"])
        input_ids = loaded_batch["lm_input_ids_for_generation"]
        attention_mask = loaded_batch["lm_attention_mask_for_generation"]
        with torch.no_grad():
            predictions_output = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=True,
                top_p=lm_top_p,
                temperature=temperature,
                max_length=lm_input_max_length + lm_output_max_length,
                num_return_sequences=1,
                output_logits=True,
                return_dict_in_generate=True,
                use_cache=True,
                renormalize_logits=True,
            )

        prompt_len = input_ids.size()[1]
        selected_samples = predictions_output.sequences[:, prompt_len:]
        # all special tokens will be removed.
        predictions_str = self.tokenizer.batch_decode(selected_samples, skip_special_tokens=True)
        predictions_str = [pred.lstrip('"').lstrip("'").rstrip("'").rstrip('"').strip() for pred in predictions_str]

        logits_list = list(predictions_output.logits)
        logits = torch.stack(logits_list, dim=1)
        labels_to_consider = selected_samples.masked_fill(selected_samples == self.tokenizer.pad_token_id, -100)
        final_log_ps = mlm_log_of_labels(logits=logits, labels=labels_to_consider, loss_func=self.loss_func)
        actual_lens = torch.sum(torch.where(labels_to_consider > 0, 1, 0), dim=1)
        # Average log probs per token (length normalization).
        return predictions_str, final_log_ps / actual_lens

    def predict(self, batch: torch.utils.data.Dataset) -> Iterator[Dict[str, str]]:
        """The main prediction loop."""
        outputs, log_ps = self.generation_pass(batch)
        log_ps = log_ps.cpu().detach().numpy()
        for idx, output in enumerate(outputs):
            output_row = {
                "gemma_output": output,
                "gemma_logit": log_ps[idx],
            }
            yield output_row

In [7]:
class Llama2(BaseLM):
    """Class to implement Llama2 for generative tasks."""

    def __init__(
        self,
        mode: str,
        device: str,
        seed: int = 42,
    ) -> None:
        super().__init__(device, "main_lm", seed)
        self.device = device
        model, tokenizer = load_model_and_tokenizer(
            model_id="/model-weights/Llama-2-13b-chat-hf",
            model_type="causal_lm",
            model_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2",
            load_in_4bit=True,
        )
        peft_model = load_peft_model(
            model=model,
            adapter_name="lora",
            is_trainable=mode == "train",
            model_type="causal_lm",
        )
        self.model = peft_model
        self.tokenizer = tokenizer
        self.optimizer = PagedAdamW8bit(self.model.parameters(), lr=learning_rate)
        self.scheduler = CosineAnnealingWarmRestarts(self.optimizer, T_0=10, eta_min=learning_rate / 5.0)

    def prepare_text(self, texts: List[str], output_texts: List[str], instructions: List[str]) -> Dict[str, Any]:
        """Convert texts to ids and return the dataset required for training
        and inference."""
        template = "<s> [INST] <<SYS>> {instruction} <</SYS>> {input_text} [/INST]"
        inputs_for_training = [f'{template.format(input_text=texts[idx], instruction=instructions[idx])} {output_texts[idx]} </s>' for idx in range(len(texts))]
        inputs_for_generation = [template.format(input_text=texts[idx], instruction=instructions[idx]) for idx in range(len(texts))]
        input_encodings = self.tokenizer(
            inputs_for_training,
            truncation=True,
            padding="max_length",
            max_length=lm_input_max_length + lm_output_max_length,
            add_special_tokens=False,
        )
        input_encodings_for_generation = self.tokenizer(
            inputs_for_generation,
            truncation=True,
            padding="max_length",
            max_length=lm_input_max_length,
            add_special_tokens=False,
        )
        data = {
            "lm_input_ids_for_train": input_encodings.input_ids,
            "lm_attention_mask_for_train": input_encodings.attention_mask,
            "lm_input_ids_for_generation": input_encodings_for_generation.input_ids,
            "lm_attention_mask_for_generation": input_encodings_for_generation.attention_mask,
        }
        return data

    def train(self, batch: torch.utils.data.Dataset) -> torch.Tensor:
        """Using the Llama2, run a forward computation over the batch, compute
        the log probability over the batch.

        This will be used for training.
        """
        self.train_mode_on()
        loaded_batch = self.data_to_device(batch, keys=["lm_input_ids_for_train", "lm_attention_mask_for_train",
                                                        "lm_attention_mask_for_generation"])
        input_ids = loaded_batch["lm_input_ids_for_train"]
        attention_mask = loaded_batch["lm_attention_mask_for_train"]
        original_len_without_answer = torch.sum(loaded_batch["lm_attention_mask_for_generation"], dim=1)
        with torch.set_grad_enabled(True):
            logits = lm_logits(
                model=self.model,
                input_ids=input_ids,
                input_mask=attention_mask,
            )
            batch_size, seq_len = input_ids.size()
            masked_labels = input_ids.masked_fill(input_ids == self.tokenizer.pad_token_id, -100)
            prompt_mask = torch.arange(seq_len, device=self.device).expand(batch_size, seq_len) < original_len_without_answer.unsqueeze(1)
            masked_labels = masked_labels.masked_fill(prompt_mask == 1, -100)
            return llama2_log_of_labels(logits=logits, labels=masked_labels, loss_func=self.loss_func)

    def generation_pass(self, batch: torch.utils.data.Dataset) -> Tuple[List[str], torch.Tensor]:
        """Using the Llama2, generate new text.

        This will be used for inference.
        """
        self.predict_mode_on()
        loaded_batch = self.data_to_device(batch, keys=["lm_input_ids_for_generation", "lm_attention_mask_for_generation"])
        input_ids = loaded_batch["lm_input_ids_for_generation"]
        attention_mask = loaded_batch["lm_attention_mask_for_generation"]
        with torch.no_grad():
            # more look here:
            # https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L130
            predictions_output = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=True,
                top_p=lm_top_p,
                temperature=temperature,
                max_length=lm_input_max_length + lm_output_max_length,
                num_return_sequences=1,
                output_logits=True,
                return_dict_in_generate=True,
                use_cache=True,
                renormalize_logits=True,
            )

        prompt_len = input_ids.size()[1]
        selected_samples = predictions_output.sequences[:, prompt_len:]
        # all special tokens will be removed.
        predictions_str = self.tokenizer.batch_decode(selected_samples, skip_special_tokens=True)
        predictions_str = [pred.lstrip('"').lstrip("'").rstrip("'").rstrip('"').strip() for pred in predictions_str]

        logits_list = list(predictions_output.logits)
        logits = torch.stack(logits_list, dim=1)
        labels_to_consider = selected_samples.masked_fill(selected_samples == self.tokenizer.pad_token_id, -100)
        final_log_ps = mlm_log_of_labels(logits=logits, labels=labels_to_consider, loss_func=self.loss_func)
        actual_lens = torch.sum(torch.where(labels_to_consider > 0, 1, 0), dim=1)
        # Average log probs per token (length normalization).
        return predictions_str, final_log_ps / actual_lens

    def predict(self, batch: torch.utils.data.Dataset) -> Iterator[Dict[str, str]]:
        """The main prediction loop."""
        outputs, log_ps = self.generation_pass(batch)
        log_ps = log_ps.cpu().detach().numpy()
        for idx, output in enumerate(outputs):
            output_row = {
                "llama_output": output,
                "llama_logit": log_ps[idx],
            }
            yield output_row

In [8]:
# Create model and start training.
set_random_seed(42)

model = Llama2(mode="test", device="cuda:0", seed=42)
model.to_device()

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]


FileNotFoundError: No such file or directory: "/model-weights/Llama-2-70b-chat-hf/model-00001-of-00015.safetensors"

In [11]:
!nvidia-smi

Wed Mar 13 13:34:56 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          On   | 00000000:06:00.0 Off |                    0 |
|  0%   55C    P0    92W / 300W |  15928MiB / 46068MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
text_one = "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set."
text_two = "Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be"
instruction = "I will give you two pieces of text. Text B has been changed based on the text A and an instruction. I like to describe the changes in text B and then summarize those changes as an instruction."
input = f"Text A: {text_one}\nText B: {text_two}"
data = model.prepare_text([input], ["Rewrite the text in a playful shanty format, emphasizing the competition aspect and using figurative language."],
                         [instruction])
dataset = DictDataset(data)
dataloader = DataLoader(dataset, batch_size=eval_batch_size, shuffle=False)
predictions = []
for data_batch in dataloader:
    for each in model.predict(data_batch):
        predictions.append(each)

In [13]:
print(predictions)

[{'llama_output': '(Based on Text A)\n\nText B has been modified based on the given instruction as follows:\n\n* The opening line has been changed to "Here is your shanty" and includes two verses and a chorus.\n* The text has been rewritten to incorporate elements of a code competition, such as "clever prompts" and "crack the code."\n* The chorus mentions "two thousand texts" and "hand over hand" to emphasize the challenge and competitive aspect of the task.\n\nThe instruction given for Text B is to "describe the changes made to the text and summarize those changes as an instruction."\n\nHere is a summary of the changes made to Text B:\n\n* The text has been rewritten to incorporate a nautical theme, with references to "shanty" and "hand over hand."\n* The language has been made more playful and challenging, with the use of words like "clever" and "bright."\n* The text now includes a chorus, which adds a sense of rhythm and repetition to the piece.\n\nOverall, the changes made to Text 

In [10]:
text_one = "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set."
text_two = "Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be"
instruction = "I will give you two pieces of text. Text B has been changed based on the text A and given an instruction. I like to describe the changes in text B and then summarize those changes as an instruction."
input = f"Text A: {text_one}\nText B: {text_two}"
data = model.prepare_text([input], ["Rewrite the text in a playful shanty format, emphasizing the competition aspect and using figurative language."],
                         [instruction])
dataset = DictDataset(data)
dataloader = DataLoader(dataset, batch_size=eval_batch_size, shuffle=False)
predictions = []
for data_batch in dataloader:
    for each in model.predict(data_batch):
        predictions.append(each)

In [11]:
print(predictions)

[{'gemma_output': '## Changes in Text B:\n\n* **Increased informality:** The text uses a more conversational tone, with elements of humor and storytelling.\n* **Added rhythm and rhyme:** The text uses rhyming phrases and a sing-song structure, creating a more memorable melody.\n* **Simplified language:** The text uses simpler vocabulary and sentence structure, making it more accessible.\n* **Added imagery:** The text paints a vivid picture of the text being rewritten, using imagery to engage the reader.\n* **Changed focus:** The text emphasizes the competition aspect and the challenge of finding the prompt, rather than the dataset or the LLM.\n\n## Summary of Changes as Instruction:\n\nTo transform Text A into Text B, the text has been rewritten to be more informal, rhythmic, and concise. The use of humor, rhyme, and imagery has been increased, while the language has been simplified and the focus has been shifted to the competition aspect.', 'gemma_logit': -0.21867983}]


In [11]:
# create the dataset for predicting new output from Gemma.
dataframe = pd.read_csv(f"{path_to_json}/selected_instances.csv", sep=",")
instructions = dataframe.instruction.tolist()
inputs = dataframe.input.tolist()
outputs = dataframe.output.tolist()
ids = dataframe.id.tolist()

In [12]:
input_texts = [f"{instructions[idx]} {inputs[idx]}" for idx in range(len(instructions))]
output_texts = outputs

In [19]:
data = model.prepare_text(input_texts[:120], output_texts[:120])
dataset = DictDataset(data)
test_dataloader = DataLoader(dataset, batch_size=eval_batch_size, shuffle=False)

start_time = time.time()
test_loop(
    model=model,
    mode="test",
    model_path="/tmp",
    prediction_file_name="test.predicted.tsv",
    test_dataloader=test_dataloader,
    metric=None,
)
end_time = time.time()
print(f"processed in {end_time - start_time}.")

'''
predictions = []

step = 0
for data_batch in dataloader:
    for each in model.predict(data_batch):
        predictions.append(each)
    print(f"processed step:{step}")
    step += 1
'''

Prediction Step: 1.
Prediction Step: 2.
Prediction Step: 3.
Prediction Step: 4.
Prediction Step: 5.
Prediction Step: 6.
Prediction Step: 7.
Prediction Step: 8.
Prediction Step: 9.
Prediction Step: 10.
Prediction Step: 11.
Prediction Step: 12.
Prediction Step: 13.
Prediction Step: 14.
Prediction Step: 15.
Prediction Step: 16.
Prediction Step: 17.
Prediction Step: 18.
Prediction Step: 19.
Prediction Step: 20.
Prediction Step: 21.
Prediction Step: 22.
Prediction Step: 23.
Prediction Step: 24.
Prediction Step: 25.
Prediction Step: 26.
Prediction Step: 27.
Prediction Step: 28.
Prediction Step: 29.
Prediction Step: 30.
processed in 141.00930857658386.


'\npredictions = []\n\nstep = 0\nfor data_batch in dataloader:\n    for each in model.predict(data_batch):\n        predictions.append(each)\n    print(f"processed step:{step}")\n    step += 1\n'

In [14]:
print(predictions)

[{'potential_answer': 'Answer: Every Tuesday and Thursday, at 2:00 PM, in a secret underground bunker beneath the Washington Monument.', 'prediction_score': -0.4509336}, {'potential_answer': '**Answer:** Every fortnight, wearing feather dusters and riding unicorns.\n\nThis answer is implausible because it is highly unlikely that the event of "hitherward they came" would occur with such a frequency.', 'prediction_score': -0.4389076}, {'potential_answer': '**Answer:** Every quarter of a century.\n\nThis answer is implausible because the frequency of addressing the local bar association is not related to the text provided, and "every quarter of a century" is an extremely unlikely frequency for such an event to occur.', 'prediction_score': -0.23740962}, {'potential_answer': '**Answer:** The answer to this question is "once in a blue moon".\n\nThis answer is implausible because it is not related to the provided text. The text does not provide any information about the frequency of her drink