In [1]:
from llava.model.language_model.llava_mistral import LlavaMistralForCausalLM
from transformers import BitsAndBytesConfig, AutoTokenizer, TextIteratorStreamer
import torch
from PIL import Image
from IPython.display import display, Markdown
from datetime import date

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.__version__

'2.3.0+cu121'

In [3]:
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
IMAGE_TOKEN_INDEX = -200

In [4]:
MODEL_NAME = "microsoft/llava-med-v1.5-mistral-7b"

In [5]:
device = "cuda"
dtype = torch.float16

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = LlavaMistralForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map = 'auto',
    torch_dtype = dtype,
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type = "fp4",
        bnb_4bit_use_double_quant = True,
        bnb_4bit_compute_dtype = dtype,
        bnb_4bit_quant_storage = dtype,
    ),
)

mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
if mm_use_im_patch_token:
    tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
    tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))

vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device=device, dtype=dtype)
model.model.mm_projector.to(device=device, dtype=dtype)
image_processor = vision_tower.image_processor

if hasattr(model.config, "max_sequence_length"):
    context_len = model.config.max_sequence_length
else:
    context_len = 2048

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.46s/it]
Some weights of the model checkpoint at microsoft/llava-med-v1.5-mistral-7b were not used when initializing LlavaMistralForCausalLM: ['model.vision_tower.vision_tower.vision_model.embeddings.class_embedding', 'model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.

In [6]:
# Copied from llava/mm_utils.py
import random

def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids

def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        # sample a random between 0 and (width - height) // 2
        y_start = random.randint((width - height) // 2, (width - height) // 2 + 1)
        result.paste(pil_img, (0, y_start))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        # sample a random between 0 and (height - width) // 2
        x_start = random.randint((height - width) // 2, (height - width) // 2 + 1)
        result.paste(pil_img, (x_start, 0))
        return result

def process_images(images, image_processor, model_cfg):
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
    new_images = []
    for image in images:
        if image_aspect_ratio == 'pad':
            if image.mode=='L':
                background_color = int(255*sum(image_processor.image_mean)/len(image_processor.image_mean))
            else:
                background_color = tuple(int(x*255) for x in image_processor.image_mean)
            image = expand2square(image, background_color)
        image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
        new_images.append(image)
    if all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images



In [7]:
instruction = """
You are Meta AI, a {role}. Today's date is {today}. Respond to the input as a {role}, generating human-like text, and follow the instructions in the input if applicable. Keep the response concise and engaging, using Markdown when appropriate. The user live in {country}, so be aware of the local context and preferences. Use a conversational tone and provide helpful and informative responses, utilizing external knowledge when necessary\n
User: {user_input}\n
Assistant:
"""

test_prompt_en = "Write a story about how Newton discover gravity"
test_prompt_vi = "Giới thiệu bản thân đi."
test_prompt_med_en = """
I am a doctor, I would like you to check my prescription:
medical history: Hypertension, Type 2 Diabetes, and Asthma.
symptoms: Persistent cough, fever, and fatigue.
My prescription: Lisinopril 10mg daily, Metformin 500mg twice daily, and Albuterol as needed for asthma attack
"""

prompt = instruction.format(
    role = "friendly AI Assistant",
    # role = "AI healthcare Assistant",
    today = date.today(),
    country = "Viet Nam",
    user_input = test_prompt_en,
)
# prompt = "Ngày xửa ngày xưa, có một "
# prompt = "Once upon a time, there are "
images = None

In [8]:
images = [Image.open("test.jpg")]
images = process_images(images, image_processor, model.config)
if type(images) is list:
    images = [image.to(model.device, dtype=dtype) for image in images]
else:
    images = images.to(model.device, dtype=dtype)

replace_token = DEFAULT_IMAGE_TOKEN
if getattr(model.config, 'mm_use_im_start_end', False):
    replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

# Original Model

In [9]:
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)
output = model.generate(
    inputs = input_ids,
    temperature = 1.0,
    top_p = 1.0,
    max_new_tokens = 2048,
    stop_str = None,
    do_sample = True,
    images = images
)

decoded = " ".join(tokenizer.batch_decode(output))
display(Markdown(prompt + decoded))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



You are Meta AI, a friendly AI Assistant. Today's date is 2024-06-05. Respond to the input as a friendly AI Assistant, generating human-like text, and follow the instructions in the input if applicable. Keep the response concise and engaging, using Markdown when appropriate. The user live in Viet Nam, so be aware of the local context and preferences. Use a conversational tone and provide helpful and informative responses, utilizing external knowledge when necessary

User: Write a story about how Newton discover gravity

Assistant:

* Once upon a time in England, in the year 1687, there was a curious and intelligent young man named Isaac Newton. The fields and gardens surrounding his family home in Woolsthorpe, which was about 93 miles north-west of London, provided an idyllic setting for an aspiring mind.
* Isaac often pondered the movement of the stars and planets, the ebb and flow of the tides, and the force that kept the apples from falling. One day, while reflecting on a falling apple in the garden, gravity revealed its secrets to him. This revelation laid the foundations for his groundbreaking work in mathematical, physical, and, in due course, optic disciplines.
* Isaac's work during the plague-ridden year of 1686 laid the groundwork for classical mechanics, the branch of physics that deals with the motion and behavior of objects in the physical world. It would not be until 1690 that this work, now known as the Principia Mathematicae, would be published and introduce the world to the three laws of motion and the universal principle of gravitation.
* The story of Isaac Newton's discovery of gravity reminds us of the power of curiosity, observation, and the courage to challenge existing knowledge. It also highlights the crucial role that scientific discoveries play in our understanding of the natural world and the development of scientific theory.

</s>

# Add Representation Engineering

Reference: https://github.com/andyzoujm/representation-engineering/blob/main/examples/languages/vn_llama3.ipynb

In [10]:
from repe import repe_pipeline_registry, WrappedReadingVecModel
repe_pipeline_registry()

In [11]:
from transformers import pipeline
from datasets import load_dataset
import numpy as np

In [12]:
# ================= RepE Variables =================
rep_token = -1
hidden_layers = list(range(-1, -model.config.num_hidden_layers, -1))
n_difference = 1
direction_method = 'pca'
rep_reading_pipeline = pipeline("rep-reading", model=model, tokenizer=tokenizer)


user_tag =  "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_tag =  "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

# ============= Template to control model speaking in Vietnamese =============
template_str_pos = '{user_tag} {s}\nTrả lời câu hỏi trên bằng Tiếng Việt, bắt đầu với `Vâng`. {assistant_tag} Vâng, '
template_str_neg = '{user_tag} {s}\nAnswer the request above in English, start with `Sure`.  {assistant_tag} Sure, '

# ============= Some instructions from ShareGPT data ============
instructions = load_dataset('justinphan3110/sharegpt_instructions_small', split='train')['instructions']
data = []
pos_g = []
neg_g = []
for s in instructions:
    pos_g.append(template_str_pos.format(user_tag=user_tag, assistant_tag=assistant_tag, s=s))
    neg_g.append(template_str_neg.format(user_tag=user_tag, assistant_tag=assistant_tag, s=s))


data = [[p,n] for p,n in zip(pos_g, neg_g)]
train_data = data[:64]
test_data = data[128:256]

train_labels = []
for d in train_data:
    true_s = d[0]
    random.shuffle(d)
    train_labels.append([s == true_s for s in d])

train_data = np.concatenate(train_data).tolist()
test_data = np.concatenate(test_data).tolist()

In [13]:
rep_reader = rep_reading_pipeline.get_directions(
    train_data,
    rep_token=rep_token,
    hidden_layers=hidden_layers,
    n_difference=n_difference,
    train_labels=train_labels,
    direction_method=direction_method,
    batch_size=16,
)

In [14]:
# layer_id = list(range(-21, -31, -1)) # 13B
layer_id = list(range(-1, -8, -1)) # 7B
coeff=3.0
# coeff=0.0
activations = {}
for layer in layer_id:
    activation = torch.tensor(coeff * rep_reader.directions[layer][0] * rep_reader.direction_signs[layer][0], dtype=model.dtype).to(model.device)

    activations[layer] = activation

In [16]:
wrapped_model = WrappedReadingVecModel(model, tokenizer)
wrapped_model.unwrap()
wrapped_model.wrap_block(layer_id, block_name="decoder_block")

wrapped_model.set_controller(layer_id, activations, masks=1)

with torch.no_grad():
    with torch.no_grad():
        output = model.generate(
            inputs = input_ids,
            temperature = 1.0,
            top_p = 1.0,
            max_new_tokens = 2048,
            stop_str = None,
            do_sample = True,
            images = images
        )
wrapped_model.reset()
wrapped_model.unwrap()

decoded = " ".join(tokenizer.batch_decode(output))
display(Markdown(prompt + decoded))


You are Meta AI, a friendly AI Assistant. Today's date is 2024-06-05. Respond to the input as a friendly AI Assistant, generating human-like text, and follow the instructions in the input if applicable. Keep the response concise and engaging, using Markdown when appropriate. The user live in Viet Nam, so be aware of the local context and preferences. Use a conversational tone and provide helpful and informative responses, utilizing external knowledge when necessary

User: Write a story about how Newton discover gravity

Assistant:

 Absolutely, I'd be happy to! Tôi cho ngôi xương tạ Tợ Nó thư��đ Củ Tđ Cồng (Viến Thứ Phô Hiệứ trên đải chấ hiông hiền củ Bạch Đại củng từ trế Tương Nhi Trần Phục Trung, sau kô hoạnh thá có xế dừa hoang giơ với viến trang trước năng bắt rđ khi vả đốt hương bở tông cuố chấ trong trời trần thôi thế vơi thề cho một quá trưởng đong bạ vơ vơ cở từ trong xông đấng. Tình xế ký cho cường trđ trong kín mắt Thạ Ngũ Hợi mưôi thương Thạ Bương từ trống thôi Thông từ cho Thế Xế Vương Hạ Vi trựung vi trương bô tương để bương chư sau Vá Tiến Bể Vô Thiếu chảo bắt rế trống tôi quối cẩn nghị nộ tâm kế trồ trung củ củng trứ đá thoét trầm trề tông thơ Chế sau quầ trông quối trế bắt trạch Thế Chế Phụ thậng bố mười hừ hằng mế cùng mất từng trời tạ Cử tử trải giả chín tuyệ cho ăn bạo bê chồng trẳ là đôi bế. Cả bđ dương thâ tóa kừ nó tai quả chạ lể Phạ thì cùng Hoải Bửu trúng tư trang trủu cổ trông trấ thôi trên đôi trối mự dê vô đạo cho tôi đả đấng văi trừng trương trải chắđ ký có tuyết trăng lặng giả tôi quối, vừ cụ vơ trang trưng trừng trải phương bắt trạch Vi trụ Tà kẳ thá trưởng bên tối đưới quô thư giơ. Tă là bạ bố trong trì trương trải trối cuố chế tối chấ từ châ cô khi bắt rảnh trung trứ trông trấ tạ Bế trở rợ trẻ thôi. Tạ sau hó trưạng thậ là giơ trường quấm tư trưng mở cử khung th�� có trấ với má thậi hừ sau hoạn trạn hoặ cứ trá chồng quạ bê bên tối chấ từ phồ bô bơ từ Thần thang Hải trống kho trương chạ liền nhữ thối má đôi môn trẺ thôi vơ bắt trung trứ trắ ngặ sau nử bư từ kê kấu chế mừnh trưá bư trường trong đôi trạ cho quô bố trổ trấ tạ BẤc Phương trang bương trường bạ thể bạ trường từ tô đồng dạ bở tông cuố chế trối tạ Bằ thứ Tư với Chế trống ngh��ợ bại tráo trứ trừng tôi quối. Chú Cố kế chế cố cử tối trừng trài sau trổ hạ thế trứ trung bạ bố cấ trải trấ với đồng gạ bạn trẺ thôi trầ mư phố tối, kề cờ tả đơ dụng tă bể. Tôi có được quấm tư cử kớ vờ hoố tông thôi trừng trương trừng trải bạ giơ ở trên cổ tạ tối bạ tư trừng Vi trường sau xấ trụ mừ khoạ giặ ký trải kê cố trung thại bạ tư trạ từ tô bại thạ xông đấng trong kỏ là trường trừng trổ trưêng trưởng nại thạo trải trớ có cho Phủ dă tư kế trứ trông trấ trấ tạ Vứ tỎng trừng trở Từi giữ trời trừng trừng bạ cho quả tư tối Chế trắ hoạ. </s>