In [None]:
from transformers import pipeline
from PIL import Image
import requests
from datasets import load_dataset
import torch
import uuid
from transformers import AutoProcessor, LlavaForConditionalGeneration, Trainer, TrainingArguments
from transformers import set_seed
from transformers import pipeline
import numpy as np

# Set a seed for reproducibility
set_seed(42)


PATH_TO_SAVE = "/content/drive/MyDrive/"

model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

processor = AutoProcessor.from_pretrained(model_id)

def preprocess_data(example):
    conversations = example['conversations']
    image_path =example['image'] # Generate a unique image path
    image = Image.open(image_path)

    inputs = [f'USER:<image>\n {example["conversations"][0]["value"]}']
    outputs = [f'\nASSISTANT:{ conversations[1]["value"]}']

    encoding = processor(inputs, image, padding=True, truncation=True, return_tensors="pt")

    for k, v in encoding.items():
        encoding[k] = v.squeeze()

    targets = [processor.tokenizer.encode(x, add_special_tokens=False) +
               [processor.tokenizer.eos_token_id] for x in outputs]

    encoding["labels"] = targets
    return encoding



dataset = load_dataset('json',data_files='/content/drive/MyDrive/output.json')
print(dataset)
processed_dataset = dataset.map(
    preprocess_data, batched=False, remove_columns=['image','conversations'])

# Quantization
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
quantized_model = quantized_model.to('cuda')

# Define training arguments
training_args = TrainingArguments(
    output_dir=PATH_TO_SAVE,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    save_steps=200,
    logging_steps=50,
    learning_rate=5e-5,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
)

# Trainer with quantized model
trainer = Trainer(
    model=quantized_model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=processor.tokenizer,
)

# Train the quantized model
trainer.train()

# Evaluate the quantized model (optional)
results = trainer.evaluate()
print(results)

# Save the quantized model
quantized_model.save_pretrained(PATH_TO_SAVE)
processor.save_pretrained(PATH_TO_SAVE)
print('model is saved')

# Optionally, push the quantized model to the Hugging Face Model Hub

# Example of generating responses using the quantized model
text_generator = pipeline(
    'text-generation', model=quantized_model, tokenizer=processor.tokenizer)
generated_response = text_generator(
    "USER: <image>\nSome input text\nASSISTANT:")
print("Generated Response:", generated_response[0]['generated_text'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/954 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'image', 'id'],
        num_rows: 111
    })
})


Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.37.1-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.37.1


In [7]:
processed_dataset['train']

Dataset({
    features: ['conversations', 'id', 'input_ids', 'attention_mask', 'pixel_values', 'labels'],
    num_rows: 111
})

In [59]:

a=[
  {
    "id": "56fc1e1e-e58d-4fe8-aa99-47e0aba3c959",
    "image": "part-000001/0.jpg",
    "conversations": [
      {
        "from": "human",
        "value": "<image>\nDoes the following image have a deceptive pattern?"
      },
      {
        "from": "gpt",
        "value": "\u201cUnsubscribe here\u201d is grayed out in front of a gray background, making it harder to see."
      }
    ]
  },
  {
    "id": "aeb0a77f-f6b4-4d5d-b63e-a883a1f08170",
    "image": "part-000001/1.jpg",
    "conversations": [
      {
        "from": "human",
        "value": "<image>\nDoes the following image have a deceptive pattern?"
      },
      {
        "from": "gpt",
        "value": "Clicking the \u201cX\u201d to close the ad clicks the \u201cdownload from the Google Play Store\u201d button, which was not the user\u2019s intention."
      }
    ]
  }
]

In [64]:
inputs = [f'USER: {conversations[0]["value"]}\nASSISTANT:{ conversations[1]["value"]}']

'<image>\nDoes the following image have a deceptive pattern?'

In [3]:
# Load the dataset
dataset = load_dataset('json', data_files='/content/drive/MyDrive/gandu/output.json')

# Print the first JSON element
first_element = dataset['train'][0]
print(first_element)


{'id': '56fc1e1e-e58d-4fe8-aa99-47e0aba3c959', 'image': 'part-000001/0.jpg', 'conversations': [{'from': 'human', 'value': '<image>\nDoes the following image have a deceptive pattern?'}, {'from': 'gpt', 'value': '“Unsubscribe here” is grayed out in front of a gray background, making it harder to see.'}]}
