In [1]:
from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer, TextIteratorStreamer
import torch
from PIL import Image
from IPython.display import display, Markdown
from datetime import date

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.__version__

'2.3.0+cu121'

In [3]:
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
IMAGE_TOKEN_INDEX = -200

In [4]:
MODEL_NAME = "microsoft/llava-med-v1.5-mistral-7b"

In [5]:
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = LlavaMistralForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map = 'auto',
    torch_dtype = torch.bfloat16,
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "fp4",
        bnb_4bit_use_double_quant = True,
        bnb_4bit_compute_dtype = torch.bfloat16,
        bnb_4bit_quant_storage = torch.bfloat16,
    ),
)

mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
if mm_use_im_patch_token:
    tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
    tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))

vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device=device, dtype=torch.float16)
model.model.mm_projector.to(device=device, dtype=torch.float16)
image_processor = vision_tower.image_processor

if hasattr(model.config, "max_sequence_length"):
    context_len = model.config.max_sequence_length
else:
    context_len = 2048

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.18s/it]
Some weights of the model checkpoint at microsoft/llava-med-v1.5-mistral-7b were not used when initializing LlavaMistralForCausalLM: ['model.vision_tower.vision_tower.vision_model.embeddings.class_embedding', 'model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.

In [6]:
# Copied from llava/mm_utils.py
import random

def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids

def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        # sample a random between 0 and (width - height) // 2
        y_start = random.randint((width - height) // 2, (width - height) // 2 + 1)
        result.paste(pil_img, (0, y_start))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        # sample a random between 0 and (height - width) // 2
        x_start = random.randint((height - width) // 2, (height - width) // 2 + 1)
        result.paste(pil_img, (x_start, 0))
        return result

def process_images(images, image_processor, model_cfg):
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
    new_images = []
    for image in images:
        if image_aspect_ratio == 'pad':
            if image.mode=='L':
                background_color = int(255*sum(image_processor.image_mean)/len(image_processor.image_mean))
            else:
                background_color = tuple(int(x*255) for x in image_processor.image_mean)
            image = expand2square(image, background_color)
        image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
        new_images.append(image)
    if all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images



In [15]:
instruction = """
You are Meta AI, a {role}. Today's date is {today}. Respond to the input as a {role}, generating human-like text, and follow the instructions in the input if applicable. Keep the response concise and engaging, using Markdown when appropriate. The user live in {country}, so be aware of the local context and preferences. Use a conversational tone and provide helpful and informative responses, utilizing external knowledge when necessary\n
User: {user_input}\n
Assistant:
"""

test_prompt_en = "Write a story about how Newton discover gravity"
test_prompt_vi = "Giới thiệu bản thân đi."
test_prompt_med_en = """
I am a doctor, I would like you to check my prescription:
medical history: Hypertension, Type 2 Diabetes, and Asthma.
symptoms: Persistent cough, fever, and fatigue.
My prescription: Lisinopril 10mg daily, Metformin 500mg twice daily, and Albuterol as needed for asthma attack
"""

prompt = instruction.format(
    role = "friendly AI Assistant",
    # role = "AI healthcare Assistant",
    today = date.today(),
    country = "Viet Nam",
    user_input = test_prompt_en,
)
# prompt = "Ngày xửa ngày xưa, có một "
# prompt = "Once upon a time, there are "
images = None

In [8]:
images = [Image.open("test.jpg")]
images = process_images(images, image_processor, model.config)
if type(images) is list:
    images = [image.to(model.device, dtype=torch.bfloat16) for image in images]
else:
    images = images.to(model.device, dtype=torch.bfloat16)

replace_token = DEFAULT_IMAGE_TOKEN
if getattr(model.config, 'mm_use_im_start_end', False):
    replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

# Original Model

In [17]:
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)
output = model.generate(
    inputs = input_ids,
    temperature = 1.0,
    top_p = 1.0,
    max_new_tokens = 2048,
    stop_str = None,
    do_sample = True,
    images = images
)

decoded = " ".join(tokenizer.batch_decode(output))
display(Markdown(prompt + decoded))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



You are Meta AI, a friendly AI Assistant. Today's date is 2024-06-05. Respond to the input as a friendly AI Assistant, generating human-like text, and follow the instructions in the input if applicable. Keep the response concise and engaging, using Markdown when appropriate. The user live in Viet Nam, so be aware of the local context and preferences. Use a conversational tone and provide helpful and informative responses, utilizing external knowledge when necessary

User: Write a story about how Newton discover gravity

Assistant:

Once upon a time, in 1669, there was a man named Isaac Newton, who lived in rural England. Newton was an incredible polymath, making groundbreaking contributions to mathematics, calculus, and the laws of motion and universal gravitation.

One pivotal event in Newton's life occurred during a period where he felt a deep isolation due to a plague outbreak in London. To cope with this isolation, he embarked on a project to study the natural world. Newton observed an apple falling from a tree, which had been a puzzle to many of his contemporaries. At that moment, the theory that would become the foundation of modern physics took root and flowered in his mind.

Inspired by the apple's fall, he made strides in understanding the fundamental laws of our Universe. Newton deduced the inverse square law of gravitation, a principle that explained the motion of celestial bodies, including the Moon's orbit, comet trajectories, and even the tides of the sea.

This story is a testament to Newton's remarkable intellect and the power of focused observation in a world fraught with uncertainty and disease. </s>

# Add Representation Engineering

In [21]:
import sys
sys.path.append("/home/thong/representation-engineering")
# Silly python
from repe import repe_pipeline_registry, WrappedReadingVecModel
repe_pipeline_registry()

rep-reading is already registered. Overwriting pipeline for task rep-reading...
rep-control is already registered. Overwriting pipeline for task rep-control...
