<a href="https://colab.research.google.com/github/PeterHJY628/MyOwnExample/blob/main/llama_inference_zeroshot_chartqa1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
!pip -q install "torch==2.4.0" tensorboard pillow torchvision accelerate huggingface_hub
!pip -q install  --upgrade \
  "transformers==4.45.1" \
  "datasets==3.0.1" \
  "accelerate==0.34.2" \
  "evaluate==0.4.3" \
  "bitsandbytes==0.44.0" \
  "trl==0.11.1" \
  "peft==0.13.0" \
  "qwen_vl_utils"

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.environ['HF_HOME'] = '/home/sa5u24/VQA'
hf_home = os.path.expanduser(
    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
)
print(hf_home)

from huggingface_hub import login

# Replace 'your-hf-token-here' with your actual Hugging Face token
login(token="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")

/home/sa5u24/VQA


In [20]:
system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""

def format_data(sample):
    return [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": system_message
                }
            ],
        },
        {
            "role": "user",
            "content": [
                # {
                #     "type": "image",
                #     "image": None,
                # },
                # {
                #     "type": "image",
                #     "image": sample["image"],
                # },
                {
                    "type": "text",
                    "text": sample['query'],
                }
            ],
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "text",
                    "text": sample["label"][0]
                }
            ],
        },
    ]



In [21]:
from datasets import load_dataset

dataset_id = "HuggingFaceM4/ChartQA"
train_dataset, eval_dataset, test_dataset = load_dataset(dataset_id, split=['train[:10%]', 'val[:10%]', 'test[:10%]'])

train_dataset = [format_data(sample) for sample in train_dataset]
eval_dataset = [format_data(sample) for sample in eval_dataset]
test_dataset = [format_data(sample) for sample in test_dataset]

train_dataset[200], len(train_dataset), len(eval_dataset), len(test_dataset)

([{'role': 'system',
   'content': [{'type': 'text',
     'text': 'You are a Vision Language Model specialized in interpreting visual data from chart images.\nYour task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.\nThe charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.\nFocus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary.'}]},
  {'role': 'user',
   'content': [{'type': 'text',
     'text': 'Is the rightmost value of light brown graph 58?'}]},
  {'role': 'assistant', 'content': [{'type': 'text', 'text': 'No'}]}],
 2830,
 192,
 250)

In [11]:
import gc
import time

def clear_memory():
    # Delete variables if they exist in the current global scope
    if 'inputs' in globals(): del globals()['inputs']
    if 'model' in globals(): del globals()['model']
    if 'processor' in globals(): del globals()['processor']
    if 'trainer' in globals(): del globals()['trainer']
    if 'peft_model' in globals(): del globals()['peft_model']
    if 'bnb_config' in globals(): del globals()['bnb_config']
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")



def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = processor.apply_chat_template(
        sample[1:2],  # Use the sample without the system message
        tokenize=False,
        add_generation_prompt=True
    )

    # Process the visual input from the sample
    # image_inputs, _ = process_vision_info(sample)
    #image_inputs = sample[1]['content'][0]['image'].convert("RGB")
    image_data = sample[1]['content'][0].get('image')

    if image_data is not None:
        image_inputs = image_data.convert("RGB")
    else:
        # 提供一个默认值或适当的处理逻辑
        image_inputs = None
        print("Warning: The 'image' field is None.")
    # Prepare the inputs for the model
    model_inputs = processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)  # Move inputs to the specified device

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the generated ids to remove the input ids
    trimmed_generated_ids = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the output text
    output_text = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text


In [None]:
clear_memory()

GPU allocated memory: 0.00 GB
GPU reserved memory: 0.00 GB


In [8]:
import evaluate
import torch
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
import requests
from torch import nn
from transformers import MllamaForConditionalGeneration, AutoProcessor, MllamaConfig, AutoModelForCausalLM
from typing import List, Optional, Tuple, Union
from PIL import Image
import matplotlib.pyplot as plt
from qwen_vl_utils import process_vision_info
from transformers import BitsAndBytesConfig



quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # low_cpu_mem_usage=True,
    # bnb_4bit_compute_dtype=torch.float16
)

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = MllamaForConditionalGeneration.from_pretrained(
            model_id, torch_dtype=torch.bfloat16, device_map="auto", low_cpu_mem_usage=True,
            quantization_config=quantization_config,
        )





The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [22]:
all_pred = []
all_ans = []
model.eval()
with torch.no_grad():
    for sample in test_dataset:
        output = generate_text_from_sample(model, processor, sample)
        ans = sample[2]['content'][0]['text']
        all_pred.append(output)
        all_ans.append(ans)



In [24]:
for i in range(len(all_pred)):
    print("pred:", all_pred[i])
    print("ans:", all_ans[i])

pred: I don't see a bar graph. Could you please provide the bar graph you are referring to? I'll be happy to help you count the number of food items shown in it.
ans: 14
pred: It's difficult to provide a specific value difference between lamb and corn without knowing the context, such as the specific types of lamb and corn being compared, their locations, and the time period considered. However, I can provide some general information about the value of lamb and corn.

Lamb:
- The global average price of lamb can vary greatly depending on factors such as breed, quality, and location. 
- In the United States, the average price of lamb per pound can range from $3 to $6 or more, depending on the cut and quality.

Corn:
- The global average price of corn can also vary depending on factors such as location, quality, and the time of year.
- In the United States, the average price of corn per bushel can range from $3 to $6 or more, depending on the season and location.

It's worth noting that 

In [30]:
!pip install rouge_score
rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions= all_pred, references=all_ans)
print(rouge_results)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=250f99c5e635c168bbdd73b5eb3c9dd128e5f1aef6b4ee0e256d0deef8585ba1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
{'rouge1': 0.001850197123898587, 'rouge2': 3.4482758620689657e-05, 'rougeL': 0.0016967842651780716, 'rougeLsum': 0.0018234165985669933}


In [31]:
bleu_score = corpus_bleu(all_ans, all_pred, weights=(1.0, 0.0, 0.0, 0.0))
print(bleu_score)

0.004142446152418395


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [32]:
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

m_score=0
for line in zip(all_ans, all_pred):
    ref = word_tokenize(line[0])
    hypo = word_tokenize(line[1])
    m_score += meteor_score([ref], hypo)
meteors = m_score/len(all_ans)
print(meteors)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


0.006648276364556513
