In [1]:
import io
import os
import time
import torch
import requests

from PIL import Image
import torch.distributed as dist
from transformers import AutoProcessor, AutoModel
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer


# NOTE: 
# For GPU memory optimization, before starting, go to the Hugging Face cache and set "max_dynamic_tiles" from 12 to 1 in the config.json and preprocessor_config.json file.

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [2]:
existing_processed_datasets = False

In [3]:
def ealge_format_multiview_data(sample):
    prompt_blocks = sample["prompt_blocks"]
    # Change the value of 'type' from 'image_url' to 'image' in dicts
    for block in prompt_blocks:
        if isinstance(block, dict) and block.get("type") == "image_url":
            block["type"] = "image"
            
    answer = sample["ground_truth_answer"]
    
    return {
        "messages": [
            {
                "role": "user",
                "content": prompt_blocks
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": answer}],
            },
        ],
    }

In [4]:
def ealge_format_data(sample):
    return {
        "images": [sample["image"]],
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": sample["image"],
                    },
                    {
                        "type": "text",
                        "text": sample["query"],
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": sample["label"][0]}],
            },
        ],
    }

In [5]:
def format_data(sample):
    return {
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": sample["image"],
                        # "image": "https://www.ilankelman.org/stopsigns/australia.jpg",
                    },
                    {
                        "type": "text",
                        "text": sample["query"],
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": sample["label"][0]}],
            },
        ],
    }

In [6]:
if existing_processed_datasets:
    train_dataset = Dataset.load_from_disk("./ChartQA_processed_datasets/train_dataset")
    eval_dataset = Dataset.load_from_disk("./ChartQA_processed_datasets/eval_dataset")
    test_dataset = Dataset.load_from_disk("./ChartQA_processed_datasets/test_dataset")

else:
    system_message = """You are a Vision Language Model specialized in interpreting visual data from chart images.
Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase.
The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text.
Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary."""

    dataset_id = "HuggingFaceM4/ChartQA"
    train_dataset, eval_dataset, test_dataset = load_dataset(dataset_id, split=["train[:10%]", "val[:10%]", "test[:10%]"])
    
    t0 = time.time()
    filtered_count = sum(1 for sample in train_dataset if sample['image'] is None)
    print(f"Filtered out {filtered_count} samples with None image from train_dataset")
    train_dataset = [format_data(sample) for sample in train_dataset if sample['image'] is not None]
    t1 = time.time()
    print("time taken (train_dataset to list) : ", t1 - t0)
    train_dataset = Dataset.from_list(train_dataset)
    t2 = time.time()
    print("train_dataset length: ", len(train_dataset))
    print("time taken (train_dataset to Dataset) : ", t2 - t1)

    eval_dataset = Dataset.from_list([format_data(sample) for sample in eval_dataset if sample['image'] is not None])
    t3 = time.time()
    print("eval_dataset length: ", len(eval_dataset))
    print("time taken (eval_dataset to Dataset) : ", t3 - t2)

    test_dataset = Dataset.from_list([format_data(sample) for sample in test_dataset if sample['image'] is not None])
    t4 = time.time()
    print("test_dataset length: ", len(test_dataset))
    print("time taken (test_dataset to Dataset) : ", t4 - t3)
    
    # train_dataset.save_to_disk("./ChartQA_processed_datasets/train_dataset")
    # eval_dataset.save_to_disk("./ChartQA_processed_datasets/eval_dataset")
    # test_dataset.save_to_disk("./ChartQA_processed_datasets/test_dataset")

FileNotFoundError: No such files: '/home/compu/test_suchae/eagle2-2b-finetuning/ChartQA_processed_datasets/train_dataset/dataset_info.json', nor '/home/compu/test_suchae/eagle2-2b-finetuning/ChartQA_processed_datasets/train_dataset/state.json' found. Expected to load a `Dataset` object but provided path is not a `Dataset`.

In [7]:
# train_dataset = train_dataset.select(range(100))
# eval_dataset = eval_dataset.select(range(10))
# test_dataset = test_dataset.select(range(10))

In [8]:
model_id = "nvidia/Eagle2-2B"

model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True, 
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu"
)

processor = AutoProcessor.from_pretrained(
    "nvidia/Eagle2-2B", 
    trust_remote_code=True, 
    use_fast=True
)
processor.tokenizer.padding_side = "left"

`use_fast` is set to `True` but the image processor class does not have a fast version.  Falling back to the slow version.
Some kwargs in processor config are unused and will not have any effect: image_placeholder, tokens_per_tile, video_placeholder, image_start_token, image_end_token, auto_map. 


In [9]:
def generate_text_from_sample(model, processor, sample, max_new_tokens=1024, device="cuda"):
    # Prepare the text input by applying the chat template
    text_input = [processor.apply_chat_template(
        sample["messages"][:1], tokenize=False, add_generation_prompt=True
    )]
    
    image_inputs, video_inputs = processor.process_vision_info(sample["messages"][:1])
        
    # Prepare the inputs for the model
    model_inputs = processor(
        text=text_input,
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
        padding=True,
    ).to(
        device
    )  # Move inputs to the specified device
    
    model_inputs = model_inputs.to(device)
    model = model.to(device)

    # Generate text with the model
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Decode the output text
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]  # Return the first decoded output text

In [10]:
# output = generate_text_from_sample(model, processor, train_dataset[1])
# print(output)

In [11]:
import gc
import time


def clear_memory():
    # Delete variables if they exist in the current global scope
    if "inputs" in globals():
        del globals()["inputs"]
    if "model" in globals():
        del globals()["model"]
    if "processor" in globals():
        del globals()["processor"]
    if "trainer" in globals():
        del globals()["trainer"]
    if "peft_model" in globals():
        del globals()["peft_model"]
    if "bnb_config" in globals():
        del globals()["bnb_config"]
    time.sleep(2)

    # Garbage collection and clearing CUDA memory
    gc.collect()
    time.sleep(2)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(2)
    gc.collect()
    time.sleep(2)

    print(f"GPU allocated memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU reserved memory: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

In [12]:
# Load model and tokenizer
model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True, 
    torch_dtype=torch.bfloat16,
    device_map={'': torch.cuda.current_device()} if torch.cuda.is_available() else "cpu"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
processor.tokenizer.padding_side = "left"


Some kwargs in processor config are unused and will not have any effect: image_placeholder, tokens_per_tile, video_placeholder, image_start_token, image_end_token, auto_map. 


In [13]:
from peft import LoraConfig, get_peft_model

# Configure LoRA
peft_config = LoraConfig(
    r=32,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=["down_proj", "o_proj", "k_proj", "q_proj", "gate_proj", "up_proj", "v_proj"],
    use_dora=True,
    init_lora_weights="gaussian",
)

# Apply PEFT model adaptation
peft_model = get_peft_model(model, peft_config)

# Print trainable parameters
peft_model.print_trainable_parameters()

trainable params: 43,639,936 || all params: 2,024,966,720 || trainable%: 2.1551


In [None]:
from trl import SFTConfig, SFTTrainer
from eagle2_trl_sft_trainer import Eagle2TRLSFTTrainer
from eagle2_data_collator import Eagle2DataCollator

# Configure training arguments
training_args = SFTConfig(
    output_dir="eagle2-2b-trl-sft-ChartQA",  # Directory to save the model
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=1,  # Batch size for training
    per_device_eval_batch_size=1,  # Batch size for evaluation
    gradient_accumulation_steps=16,  # Steps to accumulate gradients
    gradient_checkpointing_kwargs={"use_reentrant": False},  # Options for gradient checkpointing
    label_names=["labels"],
    max_length=None,
    # Optimizer and scheduler settings
    optim="adamw_torch_fused",  # Optimizer type
    lr_scheduler_type="cosine",
    learning_rate=2e-4,  # Learning rate for training
    # Logging and evaluation
    logging_steps=10,  # Steps interval for logging
    eval_steps=10,  # Steps interval for evaluation
    eval_strategy="steps",  # Strategy for evaluation
    save_strategy="steps",  # Strategy for saving the model
    save_steps=200,  # Steps interval for saving
    # Mixed precision and gradient settings
    bf16=True,  # Use bfloat16 precision
    # max_grad_norm=0.3,  # Maximum norm for gradient clipping
    warmup_ratio=0.03,  # Ratio of total steps for warmup
    remove_unused_columns=False,  # Whether to remove unused columns
    # Hub and reporting
    push_to_hub=False,  # Whether to push model to Hugging Face Hub
    use_legacy_prediction_loop=True,
    report_to="wandb",  # Use Weights & Biases for logging
)

processor.tokenizer.pad_token = "<|endoftext|>"
processor.tokenizer.pad_token_id = 151643

eagle2_data_collator = Eagle2DataCollator(processor.tokenizer)


trainer = Eagle2TRLSFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    processing_class=processor,
    data_collator=eagle2_data_collator,
)

Map:   7%|▋         | 184/2830 [00:05<01:14, 35.49 examples/s]

Map: 100%|██████████| 2830/2830 [01:44<00:00, 27.01 examples/s]


processed_dataset columns: ['input_ids', 'attention_mask', 'pixel_values', 'image_sizes', 'image_flags', 'labels']


Map: 100%|██████████| 192/192 [00:02<00:00, 71.52 examples/s]

processed_dataset columns: ['input_ids', 'attention_mask', 'pixel_values', 'image_sizes', 'image_flags', 'labels']





In [15]:
# If distributed environment variables are not set, manually configure for single process
if 'RANK' not in os.environ:
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'  # Use any available port
    os.environ['RANK'] = '0'
    os.environ['WORLD_SIZE'] = '1'
    
    # Initialize backend ('nccl' if GPU is available, otherwise 'gloo')
    backend = 'nccl' if torch.cuda.is_available() else 'gloo'
    dist.init_process_group(backend=backend, init_method='env://')

In [16]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mschaeck[0m ([33mschaeck-dongguk-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


dynamic ViT batch size: 8, images per sample: 1.0, dynamic token length: 309
dynamic ViT batch size: 8, images per sample: 1.0, dynamic token length: 320
dynamic ViT batch size: 8, images per sample: 1.0, dynamic token length: 305
dynamic ViT batch size: 8, images per sample: 1.0, dynamic token length: 331
dynamic ViT batch size: 8, images per sample: 1.0, dynamic token length: 314
dynamic ViT batch size: 8, images per sample: 1.0, dynamic token length: 321
dynamic ViT batch size: 8, images per sample: 1.0, dynamic token length: 313


KeyboardInterrupt: 

In [None]:
trainer.save_model(training_args.output_dir)

## Testing

In [None]:
clear_memory()

GPU allocated memory: 0.06 GB
GPU reserved memory: 4.28 GB


In [7]:
# Load model and tokenizer
model_id = "Suchae/eagle2-2b-trl-sft-Multitask-Merged"
model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True, 
    torch_dtype=torch.bfloat16,
    device_map={'': torch.cuda.current_device()} if torch.cuda.is_available() else "cpu"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, use_fast=True)
processor.tokenizer.padding_side = "left"


AttributeError: Qwen2TokenizerFast has no attribute tokenizer

In [None]:
adapter_path = "/home/compu/test_suchae/EAGLE/test_suchae/eagle2-2b-trl-sft-ChartQA/checkpoint-260"
model.load_adapter("adapter_path")

OSError: adapter_path is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
output = generate_text_from_sample(model, processor, train_dataset[0])
output

# LoRA Merge

In [7]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Load the base model and tokenizer.
model_id = "nvidia/Eagle2-2B"

base_model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True, 
    torch_dtype=torch.bfloat16,
    device_map="cuda" if torch.cuda.is_available() else "cpu"
)

processor = AutoProcessor.from_pretrained(
    "nvidia/Eagle2-2B", 
    trust_remote_code=True, 
    use_fast=True
)
processor.tokenizer.padding_side = "left"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# 2. Load the trained LoRA adapter.
lora_adapter_path = "/home/compu/test_suchae/eagle2-2b-finetuning/eagle2-2b-trl-sft-Multitask/checkpoint-455"  # e.g., "./lora-adapters/my-lora-model"
model = PeftModel.from_pretrained(base_model, lora_adapter_path)

# 3. Merge the LoRA adapter into the base model.
merged_model = model.merge_and_unload()

# 4. Save the merged model.
merged_model.save_pretrained("/home/compu/test_suchae/eagle2-2b-finetuning/eagle2-2b-trl-sft-Multitask-Merged")
tokenizer.save_pretrained("/home/compu/test_suchae/eagle2-2b-finetuning/eagle2-2b-trl-sft-Multitask-Merged")

print("LoRA adapter has been successfully merged.")

Some kwargs in processor config are unused and will not have any effect: image_start_token, auto_map, tokens_per_tile, image_placeholder, video_placeholder, image_end_token. 


LoRA adapter has been successfully merged.


In [9]:
from huggingface_hub import HfApi

# Path where the merged model was saved in step 1
model_path = "/home/compu/test_suchae/eagle2-2b-finetuning/eagle2-2b-trl-sft-Multitask-Merged"
# Repository name to create on Hugging Face Hub
repo_id = "suchae/eagle2-2b-trl-sft-Multitask-Merged"

api = HfApi()

# Upload the model
api.upload_folder(
    folder_path=model_path,
    repo_id=repo_id,
    repo_type="model",
)

print(f"Model has been successfully uploaded to the Hugging Face Hub: {repo_id}")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            
[A
Processing Files (1 / 1)                :   0%|          | 11.4MB / 3.97GB,   ???B/s  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A
[A

Processing Files (1 / 2)                :   1%|          | 45.0MB / 3.97GB, 11.2MB/s  
[A

Processing Files (1 / 2)                :   2%|▏         | 95.3MB / 3.97GB, 26.2MB/s  
[A

Processing Files (1 / 2)                :   4%|▎         |  146MB / 3.97GB, 39.5MB/s  
[A

Processing Files (1 / 2)                :   5%|▍         |  196MB / 3.97GB, 51.3MB/s  
[A

Processing Files (1 / 2)                :   6%|▌         |  238MB / 3.97GB, 59.6MB/s  
[A

Processing Files (1 / 2)                :   7%|▋         |  288MB / 3.97GB, 69.2MB/s  
[A

Processing Files (1 / 2)                :   9%|▊         |  339MB / 3.97GB, 77.9MB/s  
[A

Processing Files (1 / 2)                :  10%|▉         |  389MB / 3.97GB, 85.8MB/s  
[A

Processing Files (1 /

모델이 Hugging Face 허브에 성공적으로 업로드되었습니다: suchae/eagle2-2b-trl-sft-Multitask-Merged
