In [None]:
from datasets import load_dataset

# Load the "val" split of the HuggingFaceM4/ChartQA dataset.
val_ds = load_dataset("HuggingFaceM4/ChartQA", split="val")
# Last expression of the cell: displays the dataset summary.
val_ds

In [None]:
# Display the metadata record attached to the validation split.
val_ds.info

In [None]:
def show_sample(ds, index: int = 0):
    """Print one sample's question and label, then show its prediction or image.

    Args:
        ds: Indexable dataset exposing ``column_names``; rows must contain
            ``"query"`` and ``"label"``, plus either ``"results"`` or ``"image"``.
        index: Position of the row to show. Defaults to the first row.

    Returns:
        The row's ``"image"`` value when the dataset has no ``"results"``
        column; otherwise ``None`` (the prediction is printed instead).
    """
    # Fetch the row a single time. Every ``ds[index]`` lookup re-reads the
    # whole row (on a Hugging Face dataset this typically includes decoding
    # the image), so repeating it per field is wasted work.
    sample = ds[index]
    print(sample["query"])
    print(sample["label"])
    if "results" in ds.column_names:
        print(sample["results"])
        return None
    return sample["image"]
# Show the first validation sample (index defaults to 0); the returned image becomes the cell output.
show_sample(val_ds)

In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# The processor and the model are loaded from the same checkpoint id.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
# torch_dtype="auto" defers the precision choice to the checkpoint rather than
# forcing one here (NOTE(review): presumably resolves to a half-precision dtype
# for this checkpoint — confirm). device_map="auto" lets the library decide
# where to place the weights on the available device(s).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
# Last expression of the cell: displays the model's repr.
model

In [None]:
def chat(model, processor, conversation, image, max_new_tokens: int = 128, verbose: bool = True):
    """Generate the model's reply to a single image-plus-text conversation.

    Args:
        model: Generation model; must expose ``device`` and ``generate``.
        processor: Processor providing ``apply_chat_template``, a call
            interface that builds model inputs, and ``batch_decode``.
        conversation: Chat messages in the structure accepted by
            ``processor.apply_chat_template``.
        image: Image handed to the processor together with the text prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 128 so the function can be called without an explicit
            budget.
        verbose: When true, print the rendered prompt and the shape of every
            input tensor.

    Returns:
        The decoded text of the newly generated tokens (prompt excluded).
    """
    # Render the chat messages into the prompt string, ending with the
    # generation prompt so the model continues as the assistant.
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    if verbose:
        print(text_prompt)
    inputs = processor(
        text=[text_prompt], images=[image], padding=True, return_tensors="pt"
    )
    inputs = inputs.to(model.device)
    if verbose:
        print([f"{key}: {value.shape}" for key, value in inputs.items()])

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Each returned sequence starts with the prompt tokens; keep only what
    # follows them. Distinct loop names avoid shadowing ``output_ids``.
    generated_ids = [
        full_sequence[len(prompt_ids):]
        for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    # A single prompt was submitted, so there is exactly one decoded reply.
    return output_text[0]

In [None]:
def make_conversation(sample):
    """Attach a single-turn chat prompt to a dataset row.

    The row is modified in place: a ``"conversation"`` entry is added that
    holds one user message made of an image placeholder followed by the row's
    ``"query"`` text. The same row object is returned.
    """
    image_part = {"type": "image"}
    question_part = {"type": "text", "text": sample["query"]}
    user_turn = {"role": "user", "content": [image_part, question_part]}
    # No assistant turn is added: the reference answer (sample["label"][0])
    # stays out of the prompt.
    sample["conversation"] = [user_turn]
    return sample
# Apply make_conversation to every row using 16 worker processes; val_ds is
# rebound to the mapped dataset.
val_ds = val_ds.map(make_conversation, num_proc=16)
# Last expression of the cell: displays the dataset summary.
val_ds

In [None]:
index = 0
# Single-sample smoke test with an explicit cap on the reply length.
result = chat(
    model,
    processor,
    val_ds[index]["conversation"],
    val_ds[index]["image"],
    max_new_tokens=128,
)
print(result)

In [None]:
ACTUAL_COL = "results"


def eval(model, processor, ds, max_new_tokens, result_path):
    """Run the model over every row of ``ds`` and save its answers.

    NOTE: this function's name shadows Python's built-in ``eval`` for the rest
    of the notebook.

    Args:
        model: Generation model; switched to evaluation mode before the loop.
        processor: Processor forwarded to ``chat``.
        ds: Dataset whose rows provide ``"conversation"`` and ``"image"``.
        max_new_tokens: Generation budget forwarded to ``chat``.
        result_path: File the resulting dataset is written to via ``to_json``.

    Returns:
        ``ds`` with the answers added under ``ACTUAL_COL`` and the ``"image"``
        column removed.
    """
    model.eval()
    answers = []
    for step, row in enumerate(ds):
        answer = chat(
            model,
            processor,
            row["conversation"],
            row["image"],
            max_new_tokens=max_new_tokens,
            verbose=False,
        )
        answers.append(answer)
        print(f"Iteration {step}\n{answer}\n\n{'#'*120}")
    # The image column is dropped before export (presumably to keep the JSON
    # output text-only — confirm).
    ds = ds.add_column(ACTUAL_COL, answers).remove_columns("image")
    # TODO potentially push to hub instead of using local storage
    with open(result_path, "wb") as sink:
        ds.to_json(sink)
    return ds
# Trial run on the first two validation rows. Both trailing arguments are
# passed by keyword so the output path cannot land in the max_new_tokens slot.
eval_ds = eval(
    model,
    processor,
    val_ds.select(range(2)),
    max_new_tokens=128,
    result_path="./eval_result.jsonl",
)

In [None]:
from pydantic import BaseModel
import yaml

# Read the evaluation settings from ./config.yaml (resolved against the
# current working directory).
with open("./config.yaml") as f:
    yaml_dict = yaml.safe_load(f)
# NOTE(review): this is not the last expression of the cell, so it displays nothing.
yaml_dict

class EvalConfig(BaseModel):
    """Typed container for the settings read from the YAML config.

    All eight fields are required; each is declared as ``str`` or ``int``.
    NOTE(review): none of the fields is consumed by the code visible in this
    notebook, so the per-field notes below are assumptions to confirm.
    """

    # presumably the dataset identifier given to load_dataset — TODO confirm
    dataset: str
    # presumably the dataset split name — TODO confirm
    split: str
    # presumably the checkpoint id given to from_pretrained — TODO confirm
    model: str
    # presumably the worker count for Dataset.map — TODO confirm
    num_proc: int
    # presumably where evaluation results are written — TODO confirm
    output_path: str
    # presumably forwarded to from_pretrained(torch_dtype=...) — TODO confirm
    torch_dtype:str
    # presumably forwarded to from_pretrained(device_map=...) — TODO confirm
    device_map:str
    # presumably the generation budget per answer — TODO confirm
    max_new_tokens: int

# Build the config from the parsed YAML mapping; each top-level key is passed
# as a keyword argument to EvalConfig.
eval_config = EvalConfig(**yaml_dict)
# Last expression of the cell: displays the config.
eval_config

In [None]:
from datasets import load_dataset

# Reload the saved predictions from ./eval_result.jsonl, the relative path the
# evaluation run writes to, instead of a machine-specific absolute directory.
# The "json" builder exposes a single data file as the "train" split.
ds = load_dataset("json", data_files="./eval_result.jsonl", split="train")
ds

Quick summary of the ROUGE metric: https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499

In [None]:
# Column holding the reference answers.
LABEL_COL = "label"
# Column holding the model outputs; re-declared here with the same value as in
# the evaluation cell.
ACTUAL_COL = "results"

from torchmetrics.text import ROUGEScore
def compare(ds, actual_col: str = ACTUAL_COL, label_col: str = LABEL_COL):
    """Score every row's model output against its first reference label.

    Two fields are added to each row:
      * ``"is_label_contained"`` — whether the first label occurs as a
        substring of the model output (``in`` on the two values).
      * ``"rouge"`` — the ``rouge1_fmeasure`` entry returned by ``ROUGEScore``
        for that output/label pair.

    Args:
        ds: Dataset providing the ``actual_col`` and ``label_col`` columns.
        actual_col: Name of the column holding the model output.
        label_col: Name of the column holding the list of reference labels.

    Returns:
        The mapped dataset with both fields added.
    """
    scorer = ROUGEScore()

    def _score_row(row):
        # Only the first entry of the label list is used as the reference.
        reference = row[label_col][0]
        prediction = row[actual_col]
        row["is_label_contained"] = reference in prediction
        rouge_scores = scorer(preds=prediction, target=reference)
        row["rouge"] = rouge_scores['rouge1_fmeasure']
        return row

    return ds.map(_score_row, num_proc=1)
# Rebind ds to the scored dataset, which gains "is_label_contained" and "rouge".
ds = compare(ds)
# Last expression of the cell: displays the dataset summary.
ds

In [None]:
# import seaborn as sns
# sns.histplot(ds["is_label_contained"])
# sns.histplot(ds["rouge"])

In [None]:
def compute_metrics(ds):
    """Aggregate the per-row scores into dataset-level metrics.

    Args:
        ds: Sized dataset with an ``"is_label_contained"`` column (booleans)
            and a ``"rouge"`` column (numbers).

    Returns:
        Dict with ``"accuracy"`` (mean of ``is_label_contained``) and
        ``"avg_rouge"`` (mean of ``rouge``).

    Raises:
        ValueError: If ``ds`` has no rows, since the means are undefined.
    """
    num_rows = len(ds)
    if num_rows == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("cannot compute metrics on an empty dataset")
    return {
        "accuracy": sum(ds["is_label_contained"]) / num_rows,
        "avg_rouge": sum(ds["rouge"]) / num_rows,
    }
# Last expression of the cell: displays the aggregated metrics for the scored dataset.
compute_metrics(ds)