In [1]:
%pip install evaluate --quiet
%pip install prettytable --quiet
%pip install rouge-score bert-score --quiet
%pip install -U transformers datasets evaluate trl --quiet
%pip install -U unsloth --quiet
%pip install -U python-dotenv --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader

from datasets import Dataset
from unsloth import FastLanguageModel
from huggingface_hub import hf_hub_download

from tqdm import tqdm

import evaluate
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
from prettytable import PrettyTable

from dotenv import load_dotenv
import os


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [3]:
load_dotenv("./src/.env")
HF_TOKEN = os.environ["HF_TOKEN"]

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "RichardNooooh/Qwen2.5-1.5B-AF-Augmented", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 2048,
    load_in_4bit = False,
    token=HF_TOKEN
)
tokenizer.padding_side = "left" # for batch processing
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

==((====))==  Unsloth 2024.12.2: Fast Qwen2 patching. Transformers:4.46.3.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.951 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/295M [00:00<?, ?B/s]

Unsloth 2024.12.2 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536, padding_idx=151665)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(

In [5]:
def load_tsv_dataset(file_path):
    """
    Load the TSV file containing reviews and responses.
    """
    df = pd.read_csv(
        hf_hub_download(repo_id="RichardNooooh/AmazonFeedback",
                        filename=file_path,
                        repo_type="dataset"),
        sep="\t"
    )
    df = df.drop(columns=["ASIN"]) # ignore ID
    return Dataset.from_pandas(df)

SYSTEM_PROMPT = (
    "You are a helpful assistant for a business. "
    + "You are given a set of Amazon reviews for a given item, one for each rating out of 5, "
    + "and tasked with providing actionable feedback to help improve this item. "
    + "Please format your response into concise sentences, one for each actionable feedback. "
    + "Place each feedback on a bulletpoint."
)
def preprocess_function(examples):
    texts, labels = examples["text"], examples["labels"]
    # BROKEN - apply_chat_template doesn't add generation prompt.
    formatted_data = []
    for text in texts:
      conversation = [
          {
              "role": "system",
              "content": SYSTEM_PROMPT
          },
          {
              "role": "user",
              "content": text
          }
      ]
      formatted_text = tokenizer.apply_chat_template(conversation, tokenize=False)

      formatted_data.append(formatted_text)
    return {"text": formatted_data, "labels": labels}


In [6]:
dataset = load_tsv_dataset("test/augmented_full.tsv")
dataset = dataset.map(preprocess_function, batched=True)

test_dataloader = DataLoader(dataset, shuffle=False, batch_size=8)

test/augmented_full.tsv:   0%|          | 0.00/7.81M [00:00<?, ?B/s]

Map:   0%|          | 0/1929 [00:00<?, ? examples/s]

In [7]:
def generate_predictions(test_dataloader, model, tokenizer):
    predictions, references = [], []
    for batch in tqdm(test_dataloader):
        batch_encoding = tokenizer(batch["text"], return_tensors="pt", padding=True).to("cuda")
        input_len = batch_encoding["input_ids"].shape[-1]

        with torch.no_grad():
            output = model.generate(**batch_encoding,
                                    max_new_tokens=256)
        generated_texts = tokenizer.batch_decode(output[:, input_len:], skip_special_tokens=True)

        cleaned_texts = [text.removeprefix("assistant:\n") for text in generated_texts] # remove "Assistant:\n"
        predictions.extend(cleaned_texts)
        references.extend(batch['labels'])

    return predictions, references

In [8]:
predictions, references = generate_predictions(test_dataloader, model, tokenizer)
df = pd.DataFrame({"references": references, "predictions": predictions})
df.to_csv("./augmented_full_eval.tsv", sep="\t", index=False)

100%|██████████| 242/242 [34:12<00:00,  8.48s/it]


In [9]:
import gc

# del model
# del tokenizer

gc.collect()
torch.cuda.empty_cache()

In [10]:
def get_evaluations(predictions, references):
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")
    meteor_metric = evaluate.load("meteor")
    bertscore_metric = evaluate.load("bertscore")

    bleu_score = bleu_metric.compute(predictions=predictions, references=references)
    rouge_score = rouge_metric.compute(predictions=predictions, references=references)
    meteor_score = meteor_metric.compute(predictions=predictions, references=references)
    bert_score = bertscore_metric.compute(predictions=predictions, references=references, lang="en")

    print("BLEU Score:", bleu_score)
    print("ROUGE Score:", rouge_score)
    print("METEOR Score:", meteor_score)
    # print("BERT Score:", bert_score)

    df = pd.DataFrame(bert_score)
    print(df.describe())

    table = PrettyTable()
    table.field_names = ["Metric", "Score Details"]
    table.add_row(["BLEU", f"{bleu_score['bleu']:.4f}"])

    rouge_details = (
        f"ROUGE-1: {rouge_score['rouge1']:.4f}, "
        f"ROUGE-2: {rouge_score['rouge2']:.4f}, "
        f"ROUGE-L: {rouge_score['rougeL']:.4f}"
    )
    table.add_row(["ROUGE", rouge_details])
    table.add_row(["METEOR", f"{meteor_score['meteor']:.4f}"])

    print("Evaluation Metrics Report")
    print(table)

In [11]:
tokenizer.padding_side = "right"

In [12]:
get_evaluations(predictions, references)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BLEU Score: {'bleu': 0.06692793317476664, 'precisions': [0.4514777684684945, 0.14958863126402394, 0.055529617569013104, 0.021479214242159757], 'brevity_penalty': 0.7064605351642042, 'length_ratio': 0.7421216708189861, 'translation_length': 173065, 'reference_length': 233203}
ROUGE Score: {'rouge1': 0.29670443144183334, 'rouge2': 0.07671976975886358, 'rougeL': 0.16590984060482314, 'rougeLsum': 0.2646084557113352}
METEOR Score: {'meteor': 0.2682271765045328}
         precision       recall           f1
count  1929.000000  1929.000000  1929.000000
mean      0.838060     0.869135     0.852175
std       0.100422     0.045827     0.075108
min       0.648471     0.769162     0.708222
25%       0.682540     0.811943     0.737199
50%       0.891986     0.890559     0.891511
75%       0.900102     0.900129     0.899464
max       0.930810     0.938143     0.929782
Evaluation Metrics Report
+--------+---------------------------------------------------+
| Metric |                   Score Details   

In [13]:
text="""Craig Compact DVD/JPEG/CD-R/CD-RW/CD Player with Remote (CVD512a), Single
Average Rating: 4.1

1-star review:
- Review Title: Total Garbage - DO NOT BUY
- Review Text: Total piece of sh!#! We got this for back up when our Internet is slow. Set it up, watched 1 movie no issues. Weeks later watched another, halfway thru it glitched, I figured it was something with the disk.  Checked the disk- it looked new, tried a different BRAND NEW DVD and player still won't read it. Tried more dvds and still just get a ""No disk"" reading with any disk I tried. Total garbage. And now the return window has closed.  DO NOT BUY!
- Keywords: Total garbage,glitched,No disk,not read,return window closed
- Sentiment scores: satisfaction: 0.0, gratitude: 0.0, trust: 0.0, disappointment: 1.0, frustration: 1.0, confusion: 0.5, fear: 0.0, regret: 1.0, indifference: 0.0, informative: 0.5

2-star review:
- Review Title: Bad
- Review Text: I've only ever had 3 bad times out of thousands on Amazon and this is one. Long story short, its Chinese crap. Spend the extra $5's to get something much better. It skips sections in movies on occasion, the remote is odd, there are buttons that don't work, and its remote range is 3 feet. Also is oddly 'eats discs' like when it closes it was made wrong so it shoves the disc out of the player and 'crunches it'. Basically it slowly breaks discs.
- Keywords: Chinese crap,skips sections in movies,odd remote,buttons don't work,remote range 3 feet
- Sentiment scores: satisfaction: 0.0, gratitude: 0.0, trust: 0.0, disappointment: 1.0, frustration: 1.0, confusion: 0.0, fear: 0.0, regret: 1.0, indifference: 0.0, informative: 0.5

3-star review:
- Review Title: Cool
- Review Text: Ok
- Keywords: nan
- Sentiment scores: satisfaction: 0.5, gratitude: 0.0, trust: 0.5, disappointment: 0.0, frustration: 0.0, confusion: 0.0, fear: 0.0, regret: 0.0, indifference: 0.0, informative: 0.0

4-star review:
- Review Title: Updated review. #2
- Review Text: It is compact and easy to set up. The remote has a lot of unnecessary buttons but it did come with batteries. Updated 2/24. Some of the settings seem to have stayed. And if you press pause and then stop once, there's a 50/50 chance it will start where you left off. Additionally, I haven't had any issues with it playing discs or any of the other problems others have had. Update 1/24. If you're watching a series, plan on finishing the whole disc or watch one episode at a time. Whether you stop or pause, once ...
- Keywords: compact,easy to set up,remote with unnecessary buttons,batteries included,settings memory
- Sentiment scores: satisfaction: 0.6, gratitude: 0.2, trust: 0.7, disappointment: 0.3, frustration: 0.4, confusion: 0.5, fear: 0.1, regret: 0.2, indifference: 0.3, informative: 0.5

5-star review:
- Review Title: I bought to use as a CD Player, works great!
- Review Text: Just plug in to receiver and its done. I was worried there were no play or stop buttons on the player itself but when you insert a disk it starts right away. if you need to stop pause or skip all that functionality is on the remote
- Keywords: CD Player,works great,plug in to receiver,no play or stop buttons,insert disk
- Sentiment scores: satisfaction: 0.9, gratitude: 0.0, trust: 0.8, disappointment: 0.0, frustration: 0.1, confusion: 0.1, fear: 0.0, regret: 0.0, indifference: 0.0, informative: 0.7"	"- Investigate and resolve the recurring ""No disk"" error experienced by users, ensuring reliability across various disc formats.
"""
processed_text = preprocess_function({"text": text, "labels": ""})["text"]
test_in = tokenizer(processed_text, return_tensors="pt").to("cuda")

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
test_out = tokenizer.decode(model.generate(**test_in, max_new_tokens=200)[0][test_in["input_ids"].shape[-1]:])

In [None]:
references[0]

In [None]:
predictions[10]

In [None]:
get_evaluations(predictions_cleaned, references)