In [1]:
import os
import json
import tqdm
import pandas as pd
from PIL import Image
from concurrent.futures import ThreadPoolExecutor

import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
from transformers import TrainingArguments, Trainer


In [2]:
DEVICE = "cuda:0"

processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    do_image_splitting=False
)

model = Idefics2ForConditionalGeneration.from_pretrained(
    "./models/trained/train/checkpoint-250",
    torch_dtype=torch.float16,
)

lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        use_dora=False,
        init_lora_weights="gaussian"
)

# model.add_adapter(lora_config)
model.enable_adapters()

Chat templates should be in a 'chat_template.json' file but found key='chat_template' in the processor's config. Make sure to move your template to its own file.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [12]:
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')
req = test_df['entity_name'].value_counts()
req = (req * (150000/len(test_df))).astype(int) - train_df['entity_name'][:20000].value_counts()
req = req + [682, 682, 682, 682, -7*682, 682, 682, 682]

sampled_df_list = []

for entity, count in req.items():
    # Sample from each entity
    entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
    sampled_entity_df = entity_df.sample(n=count, random_state=42)  # Use random_state for reproducibility
    sampled_df_list.append(sampled_entity_df)

# Concatenate all sampled dataframes
train_df = pd.concat(sampled_df_list)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]
  entity_df = train_df.iloc[20000:][train_df['entity_name'] == entity]


In [4]:
def load_image(image_path):
    try:
        return (image_path, Image.open(image_path).convert('RGB'))
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None

def process_images_concurrently(folder_path, max_workers=8):
    image_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(('.png', '.jpg', '.jpeg'))]
    
    images = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Wrap the executor.map with tqdm for progress bar
        for result in tqdm.tqdm(executor.map(load_image, image_paths), total=len(image_paths), desc="Loading images"):
            images[result[0]] = result[1]
    return images
    
images_dict = process_images_concurrently("./processed/train", max_workers=8)

Loading images: 100%|██████████| 255887/255887 [02:55<00:00, 1456.33it/s]


In [13]:
images_names = [os.path.basename(x) for x in train_df['image_link']]
correct_images = set(os.listdir("./processed/train"))
images_names = [f"./processed/train/{x}" if x in correct_images else None for x in images_names]
train_df['image'] = images_names
train_df = train_df.dropna()
data_dict = train_df.to_dict('list')
dataset = Dataset.from_dict(data_dict)
train_dataset = dataset.remove_columns(['group_id', 'image_link'])

In [14]:
train_dataset[10]

{'entity_name': 'voltage',
 'entity_value': '130.0 volt',
 'image': './processed/train/61dXfeoSP7L.jpg'}

In [15]:
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'maximum_weight_recommendation': {'gram',
        'kilogram',
        'microgram',
        'milligram',
        'ounce',
        'pound',
        'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre',
        'cubic foot',
        'cubic inch',
        'cup',
        'decilitre',
        'fluid ounce',
        'gallon',
        'imperial gallon',
        'litre',
        'microlitre',
        'millilitre',
        'pint',
        'quart'}
}

In [16]:

class MyDataCollator:
    def __init__(self, processor):
        self.processor = processor
        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")
        ]

    def __call__(self, examples):
        global images_dict
        texts = []
        images = []
        for example in examples:
            image = images_dict[example["image"]]
            question = f'What is the {example["entity_name"].replace("_", " ")} of the product?'
            answer = example['entity_value']
            system_prompt = f'''
            1. Report the value and unit exactly as they appear in the image.
            2. If the feature is not visible respond with an empty string.
            3. Provide your answer in the format: "value unit" (e.g., "500 gram" or "2.5 inch").
            4. Acceptable units: {entity_unit_map[example["entity_name"]]}
            '''
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": system_prompt},
                        {"type": "image"},
                        {"type": "text", "text": question}
                    ]
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer}
                    ]
                }
            ]
            text = processor.apply_chat_template(messages, add_generation_prompt=False)
            texts.append(text.strip())
            images.append([image])

        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

        labels = batch["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id
        batch["labels"] = labels

        return batch

data_collator = MyDataCollator(processor)


In [17]:

training_args = TrainingArguments(
    num_train_epochs=1, 
    per_device_train_batch_size=10,
    gradient_accumulation_steps=8,
    warmup_steps=500,
    learning_rate=2e-4,
    weight_decay=0.005,
    logging_steps=25,
    output_dir="./models/trained/train",
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,  # Keep only the latest checkpoint
    fp16=True,
    remove_unused_columns=False,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()


Step,Training Loss
25,0.0183
50,0.0174
75,0.0173
100,0.0179
125,0.0171
150,0.0172
175,0.0179
200,0.0171
225,0.0172
250,0.0169




TrainOutput(global_step=1625, training_loss=0.01889698184453524, metrics={'train_runtime': 36983.5859, 'train_samples_per_second': 3.515, 'train_steps_per_second': 0.044, 'total_flos': 1.3418624332161572e+18, 'train_loss': 0.01889698184453524, 'epoch': 1.0})

In [1]:
torch.cuda.empty_cache()

NameError: name 'torch' is not defined