Clone the panza repository by running the following cell.

In [1]:
!git clone https://github.com/IST-DASLab/PanzaMail.git
%cd PanzaMail

Cloning into 'PanzaMail'...
remote: Enumerating objects: 1879, done.[K
remote: Counting objects: 100% (203/203), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 1879 (delta 159), reused 136 (delta 122), pack-reused 1676 (from 2)[K
Receiving objects: 100% (1879/1879), 1.71 MiB | 26.99 MiB/s, done.
Resolving deltas: 100% (1064/1064), done.
/content/PanzaMail


Install all the required packages

In [1]:
!pip install hydra-core langchain-community fastapi uvicorn pydantic python-dotenv gradio evaluate torchmetrics nltk accelerate mauve-text langdetect --quiet

In [2]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [3]:
from unsloth import FastLanguageModel
import json
import re
import sys
import hydra
import torch
import numpy as np
from datasets import load_dataset
from transformers import TextStreamer, AutoConfig, AutoTokenizer
import os
from typing import Dict
from omegaconf import OmegaConf
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import re
import pandas as pd

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `panza-preetika` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to 

Download your emails

In [11]:
from google.colab import drive
drive.mount('/content/drive')
%cp /content/drive/MyDrive/panza/Sent.mbox /content/PanzaMail/data/Sent.mbox

##Configuration

Now from the left panel, open the file PanzaMail/scripts/config.sh and configure the parameters according to this set of [instructions](https://github.com/IST-DASLab/PanzaMail?tab=readme-ov-file#step-1-environment-configuration). Additionally, you would want to edit your prompt preambles (under PanzaMail/prompt_preambles).


In [4]:
script_dir = os.path.dirname(os.path.abspath('/content/PanzaMail/scripts/prepare_data.py'))  # location of prepare_data.py
src_path = os.path.join(script_dir, '..', 'src')
sys.path.insert(0, os.path.abspath(src_path))

In [5]:
from panza import PanzaWriter  # The import also loads custom Hydra resolvers
from panza.entities import Document, Email, SummarizationInstruction, EmailInstruction
from panza.retriever import DocumentRetriever
from panza.data_preparation.extract_emails import extract_emails
from panza.data_preparation.prepare_raft_emails import prepare_raft_emails
from panza.data_preparation.rag import create_vector_store
from panza.interface.json import compute_rouge_scores, compute_bleu_scores, compute_mauve_score

     .    .        .                    .    .    .         
                   ...  .  .    .          . .       .    . 
                       .   .  .        . . =%[ :+.     .    
       .          .            .         .~@% +@(           
 .  ..     ~<:           .  .       .     >@^.@)         .  
      .  . :}:      .            .        *@@@@@{^=-.       
 .       ={@@~ .     .      .            .)@@@@@@@@@#= .    
       *%@@@@^     . .   ~^^             >@@@@@@@@@%(=.     
  (@%@@@@@@@@[       =#@@@@@@@@@@@#=    -}@@@@@@]..*=       
  ^@@@@@@@@@@%..  :<#@@@@@@@@@@@{=    ..:}@@@@@-)       ..  
  ~@@@@@@@@@@@-:[@@@@@@@@@@@@@@%+ .     =#@@@@@+* ..       .
  :@@@@@@#^:.^*>#@%#{@@@@@@@@@@@#  ..+-.^@@@@@@<--          
  .}@@@[+    -(  .  .<@@@@@@@@@@^   ^%[=)@@@@@@~<(          
 .  ..       +@*. *}@@@@@@@@@@@@()>+)@@+=%@@@@@+}{= .       
             {@@#%@@@@@@@@@@@@@@@%{@@@%+]@@@@@@+            
        .    =@@@@@@@@@@@@@@@@@@@@@{#{*.(@@@@@@*            
       .      -@][[#@@@@

In [6]:
from hydra import initialize, compose
from omegaconf import OmegaConf as om
from omegaconf import OmegaConf, open_dict
from hydra.core.hydra_config import HydraConfig

%cd /content/PanzaMail
sys.path.append(os.path.abspath(os.path.join('../src')))


config_dir = "./configs"

with initialize(version_base="1.1", config_path=config_dir):
    cfg = compose(config_name="panza_preparation.yaml")
    OmegaConf.set_struct(cfg, False)
    cfg.writer.llm.name = 'meta-llama/Llama-3.2-3B-Instruct'
    cfg.writer.llm.checkpoint  = 'meta-llama/Llama-3.2-3B-Instruct'
    cfg.panza_workspace = os.getcwd()
    om.resolve(cfg)
    print(cfg)

/content/PanzaMail




{'user': {'email_address': 'david_anonymous@xyz.com', 'username': 'david_anonymous', 'data_dir': '/content/PanzaMail/data', 'system_preamble_path': '/content/PanzaMail/prompt_preambles/system_preamble.txt', 'user_preamble_path': '/content/PanzaMail/prompt_preambles/user_preamble.txt', 'rag_preamble_path': '/content/PanzaMail/prompt_preambles/rag_preamble.txt', 'thread_preamble_path': '/content/PanzaMail/prompt_preambles/thread_preamble.txt'}, 'panza_workspace': '/content/PanzaMail', 'checkpoint_dir': '/content/PanzaMail/checkpoints', 'seed': 41, 'embedding_model': 'sentence-transformers/all-mpnet-base-v2', 'model_precision': 'bf16', 'writer': {'llm': {'sampling': {'do_sample': True, 'temperature': 0.7, 'top_k': 50, 'top_p': 0.7, 'max_new_tokens': 1024}, '_target_': 'panza.llm.TransformersLLM', 'name': 'meta-llama/Llama-3.2-3B-Instruct', 'checkpoint': 'meta-llama/Llama-3.2-3B-Instruct', 'device': 'cuda', 'dtype': 'bf16', 'load_in_4bit': False, 'remove_prompt_from_stream': False}, 'promp

In [7]:
import importlib.util

# Path to the file you want to import from
file_path = "/content/PanzaMail/scripts/prepare_data.py"

# Load the module
spec = importlib.util.spec_from_file_location("prepare_data", file_path)
prepare_data = importlib.util.module_from_spec(spec)
spec.loader.exec_module(prepare_data)

# Now you can access the function
rename_config_keys = prepare_data.rename_config_keys
load_documents = prepare_data.load_documents
generate_synthetic_instructions = prepare_data.generate_synthetic_instructions
check_if_file_exists = prepare_data.check_if_file_exists
split_and_write_data = prepare_data.split_and_write_data

In [8]:
rename_config_keys(cfg)
if not check_if_file_exists(cfg):
  extract_emails(cfg.email_dump_path, cfg.cleaned_emails_path,[cfg.user.email_address], cfg.discarded_emails_dir,)



In [9]:
import hydra
from panza import PanzaWriter

writer: PanzaWriter = hydra.utils.instantiate(cfg.writer)
assert isinstance(writer, PanzaWriter), "Failed to instantiate PanzaWriter"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Patch for float32 serialization
def json_fallback(obj):
    if isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    if isinstance(obj, (np.integer, np.int32, np.int64)):
        return int(obj)
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

json._default_encoder = json.JSONEncoder(default=json_fallback)

In [11]:
documents = load_documents(cfg.cleaned_emails_path)
generate_synthetic_instructions(documents=documents, writer=writer, batch_size=cfg.batch_size, output_path=cfg.summarized_emails_path,)
split_and_write_data(cfg)
create_vector_store(
        os.path.join(cfg.user.data_dir, "train.jsonl"),
        cfg.rag_embedding_chunk_size,
        cfg.rag_embedding_chunk_overlap,
        cfg.rag_db_dir,
        cfg.user.username,
        cfg.rag_embedding_model,
    )

if cfg.number_rag_emails_to_cache_with_train_data > 0:
        prepare_raft_emails(
            os.path.join(cfg.user.data_dir, "train.jsonl"),
            cfg.rag_embedding_model,
            cfg.rag_db_dir,
            cfg.user.username,
            cfg.number_rag_emails_to_cache_with_train_data,
            write_back_to_same_loc=True,
        )

--> # emails = 166


  0%|          | 0/21 [00:00<?, ?it/s]

--> Processing batch 1/21


  5%|▍         | 1/21 [00:23<07:44, 23.22s/it]

--> Processing batch 2/21


 10%|▉         | 2/21 [00:45<07:07, 22.49s/it]

--> Processing batch 3/21


 14%|█▍        | 3/21 [01:02<06:05, 20.33s/it]

--> Processing batch 4/21


 19%|█▉        | 4/21 [01:18<05:13, 18.43s/it]

--> Processing batch 5/21


 24%|██▍       | 5/21 [01:34<04:38, 17.43s/it]

--> Processing batch 6/21


 29%|██▊       | 6/21 [01:51<04:21, 17.41s/it]

--> Processing batch 7/21


 33%|███▎      | 7/21 [02:08<04:02, 17.33s/it]

--> Processing batch 8/21


 38%|███▊      | 8/21 [02:23<03:35, 16.56s/it]

--> Processing batch 9/21


 43%|████▎     | 9/21 [02:38<03:12, 16.01s/it]

--> Processing batch 10/21


 48%|████▊     | 10/21 [03:00<03:16, 17.82s/it]

--> Processing batch 11/21


 52%|█████▏    | 11/21 [03:22<03:10, 19.09s/it]

--> Processing batch 12/21


 57%|█████▋    | 12/21 [03:36<02:38, 17.59s/it]

--> Processing batch 13/21


 62%|██████▏   | 13/21 [03:51<02:13, 16.71s/it]

--> Processing batch 14/21


 67%|██████▋   | 14/21 [04:12<02:07, 18.22s/it]

--> Processing batch 15/21


 71%|███████▏  | 15/21 [04:27<01:43, 17.19s/it]

--> Processing batch 16/21


 76%|███████▌  | 16/21 [04:43<01:23, 16.75s/it]

--> Processing batch 17/21


 81%|████████  | 17/21 [05:01<01:09, 17.30s/it]

--> Processing batch 18/21


 86%|████████▌ | 18/21 [05:22<00:55, 18.41s/it]

--> Processing batch 19/21


 90%|█████████ | 19/21 [05:37<00:34, 17.34s/it]

--> Processing batch 20/21


 95%|█████████▌| 20/21 [05:51<00:16, 16.26s/it]

--> Processing batch 21/21


100%|██████████| 21/21 [06:07<00:00, 17.49s/it]
  embeddings_model = HuggingFaceEmbeddings(


Loaded 166 emails.
Obtained 166 text chunks.
Creating vector DB...
Vector DB created in 27.67449402809143 seconds.
Vector DB index david_anonymous saved to /content/PanzaMail/data.
--> Reading emails from: /content/PanzaMail/data/train.jsonl
--> # emails = 166
Faiss index loaded 


  0%|          | 0/42 [00:00<?, ?it/s]

--> Processing batch 0/166


  2%|▏         | 1/42 [00:00<00:28,  1.44it/s]

--> Processing batch 4/166


  5%|▍         | 2/42 [00:01<00:29,  1.37it/s]

--> Processing batch 8/166


  7%|▋         | 3/42 [00:02<00:25,  1.51it/s]

--> Processing batch 12/166


 10%|▉         | 4/42 [00:02<00:25,  1.48it/s]

--> Processing batch 16/166


 12%|█▏        | 5/42 [00:03<00:23,  1.58it/s]

--> Processing batch 20/166


 14%|█▍        | 6/42 [00:03<00:21,  1.64it/s]

--> Processing batch 24/166


 17%|█▋        | 7/42 [00:04<00:21,  1.67it/s]

--> Processing batch 28/166


 19%|█▉        | 8/42 [00:05<00:21,  1.58it/s]

--> Processing batch 32/166


 21%|██▏       | 9/42 [00:05<00:21,  1.56it/s]

--> Processing batch 36/166


 24%|██▍       | 10/42 [00:06<00:20,  1.57it/s]

--> Processing batch 40/166


 26%|██▌       | 11/42 [00:07<00:21,  1.45it/s]

--> Processing batch 44/166


 29%|██▊       | 12/42 [00:07<00:20,  1.49it/s]

--> Processing batch 48/166


 31%|███       | 13/42 [00:08<00:19,  1.52it/s]

--> Processing batch 52/166


 33%|███▎      | 14/42 [00:09<00:17,  1.61it/s]

--> Processing batch 56/166


 36%|███▌      | 15/42 [00:09<00:16,  1.66it/s]

--> Processing batch 60/166


 38%|███▊      | 16/42 [00:10<00:15,  1.71it/s]

--> Processing batch 64/166


 40%|████      | 17/42 [00:10<00:13,  1.80it/s]

--> Processing batch 68/166


 43%|████▎     | 18/42 [00:11<00:12,  1.89it/s]

--> Processing batch 72/166


 45%|████▌     | 19/42 [00:11<00:12,  1.78it/s]

--> Processing batch 76/166


 48%|████▊     | 20/42 [00:12<00:12,  1.70it/s]

--> Processing batch 80/166


 50%|█████     | 21/42 [00:13<00:12,  1.63it/s]

--> Processing batch 84/166


 52%|█████▏    | 22/42 [00:14<00:14,  1.38it/s]

--> Processing batch 88/166


 55%|█████▍    | 23/42 [00:14<00:12,  1.50it/s]

--> Processing batch 92/166


 57%|█████▋    | 24/42 [00:15<00:11,  1.58it/s]

--> Processing batch 96/166


 60%|█████▉    | 25/42 [00:15<00:10,  1.64it/s]

--> Processing batch 100/166


 62%|██████▏   | 26/42 [00:16<00:09,  1.72it/s]

--> Processing batch 104/166


 64%|██████▍   | 27/42 [00:16<00:08,  1.72it/s]

--> Processing batch 108/166


 67%|██████▋   | 28/42 [00:17<00:09,  1.45it/s]

--> Processing batch 112/166


 69%|██████▉   | 29/42 [00:18<00:08,  1.47it/s]

--> Processing batch 116/166


 71%|███████▏  | 30/42 [00:19<00:08,  1.45it/s]

--> Processing batch 120/166


 74%|███████▍  | 31/42 [00:19<00:08,  1.34it/s]

--> Processing batch 124/166


 76%|███████▌  | 32/42 [00:20<00:07,  1.39it/s]

--> Processing batch 128/166


 79%|███████▊  | 33/42 [00:21<00:06,  1.38it/s]

--> Processing batch 132/166


 81%|████████  | 34/42 [00:21<00:05,  1.51it/s]

--> Processing batch 136/166


 83%|████████▎ | 35/42 [00:22<00:04,  1.53it/s]

--> Processing batch 140/166


 86%|████████▌ | 36/42 [00:22<00:03,  1.67it/s]

--> Processing batch 144/166


 88%|████████▊ | 37/42 [00:23<00:03,  1.62it/s]

--> Processing batch 148/166


 90%|█████████ | 38/42 [00:24<00:02,  1.73it/s]

--> Processing batch 152/166


 93%|█████████▎| 39/42 [00:24<00:01,  1.80it/s]

--> Processing batch 156/166


 95%|█████████▌| 40/42 [00:25<00:01,  1.84it/s]

--> Processing batch 160/166


 98%|█████████▊| 41/42 [00:25<00:00,  1.83it/s]

--> Processing batch 164/166


100%|██████████| 42/42 [00:25<00:00,  1.62it/s]

25.92 seconds to process 166 emails.





In [12]:
from hydra import initialize, compose
from omegaconf import OmegaConf as om
from omegaconf import OmegaConf, open_dict
from hydra.core.hydra_config import HydraConfig
sys.path.append(os.path.abspath(os.path.join('../src')))

config_dir = "./configs"

with initialize(version_base="1.1", config_path=config_dir):
    cfg = compose(config_name="panza_finetuning.yaml")
    OmegaConf.set_struct(cfg, False)
    cfg.panza_workspace = os.getcwd()
    om.resolve(cfg)
    cfg.preprocessing.model = cfg.finetuning.model_name_or_path
    prompting_config = cfg.preprocessing.prompting

In [13]:
max_seq_length = 512
dtype = None
load_in_4bit = True
load_in_8bit = False

In [14]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    full_finetuning = False,
    load_in_4bit = load_in_4bit,
    load_in_8bit = load_in_8bit,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # rank stabilized LoRA
    loftq_config = None, # LoftQ
)

prompt_builder = hydra.utils.instantiate(prompting_config)

==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.5.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [15]:
def panza_preprocessing_function(inputs):
    try:
        prompt_raw = inputs["summary"].split("\n\nInstruction: ")[-1]
        instruction = EmailInstruction(instruction=prompt_raw, thread=inputs.get("thread", []))
        prompt = prompt_builder.build_prompt(instruction)
        conversation = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": inputs["email"]},
        ]
        chat_prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
        response_begin_index = chat_prompt.index(inputs["email"].strip())

        prompt = chat_prompt[:response_begin_index]
        response = chat_prompt[response_begin_index:]

        return {
            "prompt": prompt,
            "response": response,
            "text": prompt + response,
        }
    except Exception as e:
        raise ValueError(f"Unable to extract prompt/response from {inputs}") from e


In [16]:
def get_dataset(filename):
    data = load_dataset(
        path="json",
        data_files={"train":filename},
        split="train",
    )
    data = data.map(panza_preprocessing_function, remove_columns=data.column_names,batched = False)
    return data

train_dataset = get_dataset("./data/train.jsonl")
test_dataset = get_dataset("./data/test.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

In [17]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 7,
        learning_rate = 1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_torch",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/166 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 166 | Num Epochs = 7 | Total steps = 140
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.6427
2,3.5588
3,3.5323
4,3.3694
5,3.2137
6,2.9944
7,2.5492
8,2.3371
9,2.3073
10,2.1236


In [18]:
FastLanguageModel.for_inference(model)
streamer = TextStreamer(tokenizer, skip_prompt=True)
rouge_scores = []
mauve_scores = []
bleu_scores = []
golden_responses = []
panza_responses = []
count = 0
dataset = test_dataset

In [19]:
def do_compute_metrics(all_responses):
    for response in all_responses:
        response["scores"] = {}
        response["scores"]["BLEU"] = compute_bleu_scores(
            response["panza_responses"], response["golden_responses"]
        )
        response["scores"]["ROUGE"] = compute_rouge_scores(
            response["panza_responses"], response["golden_responses"]
        )
    rouge_categories = all_responses[0]["scores"]["ROUGE"][0].keys()
    aggregate_metrics = {
        "BLEU": np.mean([s for r in all_responses for s in r["scores"]["BLEU"]]),
        "ROUGE": {
                cat: np.mean([s[cat] for r in all_responses for s in r["scores"]["ROUGE"]])
                for cat in rouge_categories
        },
        "MAUVE": compute_mauve_score(
                [r["panza_responses"] for r in all_responses],
                [r["golden_responses"] for r in all_responses],
        ).mauve,
    }
    print("########## Aggregated quality metrics ##########\n")
    print(json.dumps(aggregate_metrics, indent=2))
    return {"responses": all_responses, "aggregate_metrics": aggregate_metrics}


In [20]:
for x in dataset:
    inputs = tokenizer(x['prompt'], return_tensors="pt",)
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)
    output = model.generate(input_ids, max_new_tokens = 256, attention_mask=attention_mask)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    panza_responses = [re.sub(r"^Subject:.*\n", "", resp, flags=re.IGNORECASE) for resp in panza_responses]
    golden_responses.append(x['response'].rstrip().removesuffix("<|eot_id|>"))
    panza_responses.append(output_text.split("assistant\n")[-1].strip())

panza_responses = [resp.rstrip().removesuffix("<|eotid|>") for resp in panza_responses]
golden_responses = [resp.split("assistant")[-1].strip() for resp in golden_responses]
all_responses = [
    {"panza_responses": [pred], "golden_responses": [gold]}
    for pred, gold in zip(panza_responses, golden_responses)
]
data = {
    "panza_response": panza_responses,
    "golden_response": golden_responses
}

df = pd.DataFrame(data)
do_compute_metrics(all_responses)

Loading tokenizer
Tokenizing text...
Loading tokenizer
Loading model
Featurizing tokens


Featurizing p:   0%|          | 0/166 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/166 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 42
kmeans time: 0.25 s
total discretization time: 0.44 seconds
########## Aggregated quality metrics ##########

{
  "BLEU": 0.416107937918011,
  "ROUGE": {
    "rouge1_fmeasure": 0.6171970923981035,
    "rouge1_precision": 0.6396843248821167,
    "rouge1_recall": 0.6176533339073859,
    "rouge2_fmeasure": 0.40317841349955064,
    "rouge2_precision": 0.417258518534791,
    "rouge2_recall": 0.40222867609686164,
    "rougeL_fmeasure": 0.5640072697796017,
    "rougeL_precision": 0.5833571008350475,
    "rougeL_recall": 0.5642776719956513,
    "rougeLsum_fmeasure": 0.5640072697796017,
    "rougeLsum_precision": 0.5833571008350475,
    "rougeLsum_recall": 0.5642776719956513
  },
  "MAUVE": 0.9806815434374475
}


{'responses': [{'panza_responses': ['Dear Prof. Aman,\r\n\r\nIt was great to have you at the event!\r\n\r\nCould you please share me with the contact of the person responsible for\r\nthe technical aspects of your cluster?\r\n\r\nI would be very happy to host you at our company in the future.\r\n\r\nBest regards,\r\nDavid'],
   'golden_responses': ["Dear Prof. Aman,\r\n\r\nIt was great talking to you at the event last week!\r\n\r\nI wanted to ask if you could please share the contact of the person who\r\ndealt with the technical details of your cluster. As I mentioned, we\r\nare  looking to make a similar investment, and it would be really great to\r\nbe able to talk to someone who has already dealt with some of the technical\r\npitfalls.\r\n\r\nSecondly, I would be very happy to host you at our company in the future. Please let\r\nme know if that's the case.\r\n\r\nBest regards,\r\nDavid"],
   'scores': {'BLEU': [0.2697422206401825],
    'ROUGE': [{'rouge1_fmeasure': 0.6068965792655945