In [None]:
from torchao.quantization.prototype.qat import Int8DynActInt4WeightQATQuantizer

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local checkpoint directory of the text-only Qwen2.5 instruct model.
path = "/data/share/Qwen2.5-1.5B-Instruct"

# Target GPU and the dtype the weights are loaded in.
device = "cuda:7"
dtype = torch.bfloat16

tok = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, device_map=device, torch_dtype=dtype)
# Put the model in evaluation mode before running the forward passes below.
model = model.eval()

In [None]:
from datasets import load_dataset
from data_utils import CustomJsonDataset

# Read the calibration text from a local parquet file.
calibration_datasets = load_dataset("parquet", data_files="/data/shrelic/other/QwenSpinQuant/wikitext-raw-vl.parquet")

# Wrap the "train" split with the project's dataset helper, using the tokenizer
# loaded above. NOTE(review): the exact meaning of block_size is defined in
# data_utils.CustomJsonDataset -- confirm there.
data = CustomJsonDataset(
    calibration_datasets["train"],
    tok,
    block_size=500,
)

# Only the first sample is needed. next(iter(...)) fails right here when the
# dataset is empty, instead of leaving `d` unbound until the line below.
d = next(iter(data))

# Add a leading batch dimension and move the token ids to the model's device.
inp = {"input_ids": torch.tensor(d["input_ids"]).unsqueeze(0).to(device)}

In [None]:
# One forward pass with autograd disabled. output_hidden_states=True makes the
# result carry `hidden_states`, which the plotting cells index by layer.
with torch.no_grad():
    res = model(**inp, output_hidden_states=True)

In [None]:
import numpy as np
import draw

# Index into res.hidden_states of the activations to visualise.
layer = 4
# Symmetric clipping bound applied before plotting. Deliberately not called
# `max`, so the builtin max() stays usable in the rest of the kernel.
clip_value = 17

# Flatten the selected layer to 2-D using that layer's own last dimension
# (instead of a hard-coded hidden_states[10]) and drop the first row.
layer_hidden = res.hidden_states[layer]
data = layer_hidden.view(-1, layer_hidden.size(-1))[1:, :].to(device="cpu", dtype=torch.float32).numpy()
data = np.clip(data, -clip_value, clip_value)

# draw.plot_heat_map(data, 200, 100)
draw.plot_3d_bar_chart(data, show=False)
# draw.plot_3d_bar_chart_fast(data, show=True)

In [None]:
import numpy as np
import draw

# Index into res.hidden_states of the activations to visualise.
layer = 4

# Flatten the selected layer to 2-D using that layer's own last dimension
# (instead of a hard-coded hidden_states[10]) and drop the first row.
layer_hidden = res.hidden_states[layer]
X = layer_hidden.view(-1, layer_hidden.size(-1))[1:, :]

def heat_map_clip(X, max=17):
    """Draw a heat map of ``X`` with its values clamped to ``[-max, max]``.

    ``X`` is moved to the CPU and converted to a float32 NumPy array, clipped
    symmetrically, and handed to ``draw.plot_heat_map`` together with the
    fixed arguments ``200`` and ``100``. Nothing is returned.
    """
    values = X.to(device="cpu", dtype=torch.float32).numpy()
    clipped = np.clip(values, -max, max)
    draw.plot_heat_map(clipped, 200, 100)

# Heat map of the activations before any rotation is applied.
heat_map_clip(X)

In [None]:
from rotation_utils import get_orthogonal_matrix

# Request a matrix sized by the model's hidden dimension in "hadamard" mode,
# placed on the same device as the model.
R = get_orthogonal_matrix(model.config.hidden_size, mode="hadamard", device=device)

In [None]:
# Multiply in R's dtype, then cast the product back to the model dtype.
X_rot = (X.to(R.dtype) @ R).to(dtype)

# Heat map of the activations after right-multiplying by R.
heat_map_clip(X_rot)

In [None]:
# NOTE(review): torch.load unpickles the file, so only point this at rotation
# checkpoints from a trusted source.
# map_location="cpu" makes loading independent of the GPU the file was saved
# from; the matrix is moved to the working device explicitly below.
# The result is not named `bin`, so the builtin bin() is not shadowed.
rotation_ckpt = torch.load("../QwenSpinQuant/output_rotation/R.bin", map_location="cpu")
R_train = rotation_ckpt["R1"].to(device)

In [None]:
# Same projection as with R, but using the matrix loaded from the checkpoint:
# multiply in R_train's dtype, then cast back to the model dtype.
X_rot_train = (X.to(R_train.dtype) @ R_train).to(dtype)
heat_map_clip(X_rot_train)

In [None]:
import torch
from transformers import Qwen2ForCausalLM
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Local checkpoint directory of the vision-language model.
model_path = "/data/share/Qwen2-VL-2B-Instruct"

# Weight dtype and target GPU for this section of the notebook.
dtype = torch.bfloat16
device = "cuda:7"

# Alternative kept for reference: load without requesting flash attention.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     model_path, torch_dtype=dtype, device_map=device
# )

# flash_attention_2 is recommended for better speed and lower memory use,
# especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=dtype,
    attn_implementation="flash_attention_2",
    device_map=device,
)

# Alternative kept for reference: processor with default pixel limits.
# processor = AutoProcessor.from_pretrained(model_path)

# The default range for the number of visual tokens per image is 4-16384.
# min_pixels / max_pixels narrow that range (here to a token count of 256-1280)
# to balance speed and memory usage.
min_pixels = 256*28*28
max_pixels = 1280*28*28
processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels, use_fast=False)

# Single-turn chat: one user message containing a local image followed by a
# text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "./aniya.png",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference: render the chat template to a prompt string
# (not tokenized yet) that ends with the generation prompt.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Extract the image / video entries referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The processor returns CPU tensors while the model was loaded onto `device`;
# move the batch there so the generate() calls do not mix devices.
inputs = inputs.to(device)

# Inference: generate up to 128 new tokens with the unmodified model.
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Slice off the prompt from every sequence so only the newly generated tokens
# are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

In [None]:
import qwen_utils
import rotation_utils

# Apply the two project helpers to the model, in this order.
# NOTE(review): both are defined in qwen_utils and appear to modify the model
# in place (their return values are ignored here) -- confirm there.
qwen_utils.untie_word_embeddings(model)
qwen_utils.fuse_layer_norms(model)

# Inference: generate again with the same inputs so the printed text can be
# compared with the output from before the helpers were applied.
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Slice off the prompt from every sequence so only the newly generated tokens
# are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

In [None]:
# Matrix sizes: one for the full hidden dimension, one for a single attention
# head (hidden size divided evenly by the number of heads).
dim = model.config.hidden_size
num_heads = model.config.num_attention_heads
head_dim = dim // num_heads
R = rotation_utils.get_orthogonal_matrix(dim, mode="hadamard", device=device)
R_v = rotation_utils.get_orthogonal_matrix(head_dim, mode="hadamard", device=device)
# Alternative kept for reference: lists of matrices sized by the layer count
# instead of a single shared R / R_v.
# R = [rotation_utils.get_orthogonal_matrix(dim, mode="hadamard", device=device) for _ in range(2 * model.config.num_hidden_layers + 1)]
# R_v = [rotation_utils.get_orthogonal_matrix(head_dim, mode="hadamard", device=device) for _ in range(model.config.num_hidden_layers)]

# NOTE(review): rotate_model's return value is ignored, so it presumably edits
# the model in place; if so, re-running this cell would rotate it a second
# time -- confirm in qwen_utils.
qwen_utils.rotate_model(model, R, R_v)
# Generate with the same inputs so the printed text can be compared with the
# earlier outputs.
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Slice off the prompt so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text)

In [None]:
# Write the current in-memory model next to the original checkpoint, under the
# same path with a "-rotated" suffix.
model.save_pretrained(model_path + "-rotated")

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration

# Local checkpoint directory of the vision-language model.
model_path = "/data/share/Qwen2-VL-2B-Instruct"

# Weight dtype and target GPU.
dtype = torch.bfloat16
device = "cuda:7"

# Alternative kept for reference: load without requesting flash attention.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     model_path, torch_dtype=dtype, device_map=device
# )

# flash_attention_2 is recommended for better speed and lower memory use,
# especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=dtype,
    attn_implementation="flash_attention_2",
    device_map=device,
)


# A batch holding one sequence with a single token id. Build it directly as an
# int64 tensor on the target device; torch.Tensor(...) would first create a
# float32 tensor and only then cast it to long.
input_ids = torch.tensor([[8948]], dtype=torch.long, device=device)

# Forward pass with autograd disabled, keeping the hidden states.
with torch.no_grad():
    res = model(input_ids=input_ids, output_hidden_states=True)

hidden_states = res.hidden_states

In [None]:
import torch
from transformers import Qwen2ForCausalLM
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint to rotate; the Qwen2-VL path is kept as a commented alternative.
# model_path = "/data/share/Qwen2-VL-2B-Instruct"
model_path = "/data/share/ShowUI-2B"

# Weight dtype and target GPU.
dtype = torch.bfloat16
device = "cuda:7"

# Alternative kept for reference: load without requesting flash attention.
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     model_path, torch_dtype=dtype, device_map=device
# )

# flash_attention_2 is recommended for better speed and lower memory use,
# especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=dtype,
    attn_implementation="flash_attention_2",
    device_map=device,
)

# Show the lm_head weight before any modification, for comparison with the
# print after rotate_model further down.
print(model.lm_head.weight)

import qwen_utils
import rotation_utils

# Apply the two project helpers to the model, in this order.
# NOTE(review): both appear to modify the model in place (their return values
# are ignored here) -- confirm in qwen_utils.
qwen_utils.untie_word_embeddings(model)
qwen_utils.fuse_layer_norms(model)

# Matrix sizes: one for the full hidden dimension, one for a single attention
# head (hidden size divided evenly by the number of heads).
dim = model.config.hidden_size
num_heads = model.config.num_attention_heads
head_dim = dim // num_heads
R = rotation_utils.get_orthogonal_matrix(dim, mode="hadamard", device=device)
R_v = rotation_utils.get_orthogonal_matrix(head_dim, mode="hadamard", device=device)

qwen_utils.rotate_model(model, R, R_v)
# Show the lm_head weight again, after the rotation call.
print(model.lm_head.weight)

# Write the result next to the original checkpoint with a "-rotated" suffix.
model.save_pretrained(model_path + "-rotated")

In [None]:
import torch
from torch import nn
import hadamard_utils
import rotation_utils
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local checkpoint directory of the text-only Qwen2.5 instruct model.
path = "/data/share/Qwen2.5-1.5B-Instruct"

# Target GPU and the dtype the weights are loaded in.
device = "cuda:7"
dtype = torch.bfloat16

tok = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, device_map=device, torch_dtype=dtype)
# Put the model in evaluation mode before generating.
model = model.eval()

In [None]:
# Two-message conversation: a system prompt and one user question.
chat = [
    {
        "role": "system",
        "content": "You are a helpful assistant."
    },
    {
        "role": "user",
        "content": "What is the capital of France?"
    }
]

# Render the chat template to a prompt string (not tokenized yet) that ends
# with the generation prompt, then tokenize it into PyTorch tensors.
prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tok(
    prompt,
    padding=True,
    return_tensors="pt",
)
# Move the tokenized batch to the model's device before generating.
inputs = inputs.to(device=device)
output = model.generate(**inputs, max_new_tokens=128)
# Decode the full sequences (prompt included) and print the first one.
text = tok.batch_decode(output)
print(text[0])

In [None]:
# Three separately requested matrices sized by the MLP's intermediate
# dimension, one each for the up, gate and down projections.
intermediate_size = model.config.intermediate_size
hadamard_up = rotation_utils.get_orthogonal_matrix(intermediate_size, mode="hadamard", device=device)
hadamard_gate = rotation_utils.get_orthogonal_matrix(intermediate_size, mode="hadamard", device=device)
hadamard_down = rotation_utils.get_orthogonal_matrix(intermediate_size, mode="hadamard", device=device)

In [None]:
from transformers.models.qwen2.modeling_qwen2 import Qwen2MLP

class MLPWrapper(nn.Module):
    """Wrap a Qwen2 MLP whose projections have had rotation matrices applied.

    The constructor passes ``up_proj`` and ``gate_proj`` (and ``down_proj``
    when ``hadamard_down`` is given) of the supplied ``mlp`` to the
    ``rotation_utils`` helpers, and stores the matrices that ``forward``
    multiplies the activations with as buffers.

    NOTE(review): the helpers are called without using a return value, so they
    presumably rewrite the weights of ``mlp`` in place -- confirm in
    rotation_utils.

    Args:
        mlp: The MLP module to wrap; kept as ``self.mlp``.
        hadamard_up: Matrix handed to ``rotate_linear_output`` for ``up_proj``.
        hadamard_gate: Matrix handed to ``rotate_linear_output`` for
            ``gate_proj``.
        hadamard_down: Optional matrix for ``down_proj``. When ``None`` the
            down projection is left untouched and ``forward`` skips the extra
            multiplication.
    """

    def __init__(self, mlp: Qwen2MLP,
                 hadamard_up: torch.Tensor,
                 hadamard_gate: torch.Tensor,
                 hadamard_down: torch.Tensor = None):
        super().__init__()
        self.mlp = mlp

        # Up projection first, then gate projection.
        for projection, rotation in ((mlp.up_proj, hadamard_up),
                                     (mlp.gate_proj, hadamard_gate)):
            rotation_utils.rotate_linear_output([projection], rotation)

        # The up projection's weight is taken as the reference for device and
        # dtype; all weights are assumed to live on the same device.
        reference_weight = mlp.up_proj.weight
        self.device = reference_weight.device
        self.dtype = reference_weight.dtype

        # Transposed matrices that forward() multiplies the projection outputs by.
        self.register_buffer("hadamard_up_T", hadamard_up.T.to(self.device, dtype=self.dtype))
        self.register_buffer("hadamard_gate_T", hadamard_gate.T.to(self.device, dtype=self.dtype))

        self.rotate_down = hadamard_down is not None
        if self.rotate_down:
            rotation_utils.rotate_linear_input([mlp.down_proj], hadamard_down.T)
            self.register_buffer("hadamard_down", hadamard_down.to(self.device, dtype=self.dtype))

    def forward(self, x):
        """Run the gated MLP, multiplying by the stored matrices around the activation."""
        up_rotated = self.mlp.up_proj(x)
        gate_rotated = self.mlp.gate_proj(x)

        # Rotate both projection outputs back before the element-wise gating.
        up = up_rotated @ self.hadamard_up_T
        gate = gate_rotated @ self.hadamard_gate_T

        hidden = up * self.mlp.act_fn(gate)

        if self.rotate_down:
            # Rotate the gated activations before the down projection.
            hidden = hidden @ self.hadamard_down

        return self.mlp.down_proj(hidden)
    

# Install the wrapper on decoder layer 1 exactly once. Re-running the cell
# would otherwise hand the already-wrapped module back to MLPWrapper, which
# expects a bare MLP exposing up_proj / gate_proj / down_proj.
# The check compares class names rather than using isinstance because
# re-running the cell also re-creates the MLPWrapper class object.
if type(model.model.layers[1].mlp).__name__ == MLPWrapper.__name__:
    wrapped_mlp = model.model.layers[1].mlp
else:
    wrapped_mlp = MLPWrapper(model.model.layers[1].mlp, hadamard_up, hadamard_gate, hadamard_down)
    model.model.layers[1].mlp = wrapped_mlp

In [None]:
# Generate again with the same inputs after swapping in the wrapped MLP, so
# the printed text can be compared with the earlier output.
output = model.generate(**inputs, max_new_tokens=128)
text = tok.batch_decode(output)
print(text[0])

In [None]:
import torch
from torch import nn

class ToyModel(torch.nn.Module):
    """Minimal two-layer network: ``up`` linear -> ReLU -> ``down`` linear.

    Args:
        input_size: Number of input features of ``up``.
        output_size: Number of output features of ``down``.
        intermediate_size: Width between the two linear layers.
    """

    def __init__(self, input_size, output_size, intermediate_size):
        super().__init__()
        # Sub-modules are created in the order up, down, activation.
        self.up = torch.nn.Linear(input_size, intermediate_size)
        self.down = torch.nn.Linear(intermediate_size, output_size)
        self.activation = torch.nn.ReLU()

    def forward(self, x):
        """Return ``down(activation(up(x)))``."""
        return self.down(self.activation(self.up(x)))
    

def forward_hook(module, input, output):
    """Forward hook that prints the shapes flowing through a module.

    Prints the shape of the first element of ``input`` and the shape of
    ``output``, one per line. ``module`` is unused. Returns ``None``.
    """
    print(
        f"Input shape: {input[0].shape}",
        f"Output shape: {output.shape}",
        sep="\n",
    )

# Create a toy model. Note that this rebinds the global name `model`.
in_dim = 10
out_dim = 5
intermediate_dim = 20
model = ToyModel(in_dim, out_dim, intermediate_dim)
# Register the forward hook on the `up` layer; the returned handle is kept in
# `handler` and is not removed in this cell.
handler = model.up.register_forward_hook(forward_hook)

# Create a random input tensor with a batch size of 1
input_tensor = torch.randn(1, in_dim)
# Forward pass through the model; the hook prints the shapes seen by `up`
output = model(input_tensor)
print(output)

In [None]:
class ScaleLinearWrapper(torch.nn.Module):
    """Wrap a layer and multiply its output by a fixed factor.

    Args:
        linear_layer: The module whose output is scaled; kept as
            ``self.linear_layer``.
        scale: Multiplier applied to the wrapped layer's output; kept as
            ``self.scale``.
    """

    def __init__(self, linear_layer, scale):
        super().__init__()
        self.linear_layer = linear_layer
        self.scale = scale

    def forward(self, x):
        """Return the wrapped layer's output for ``x`` times ``self.scale``."""
        projected = self.linear_layer(x)
        return projected * self.scale

In [None]:
# Wrap the up layer exactly once. Without the guard, every re-run of this cell
# would wrap the previous wrapper again and apply the scale one more time.
# The check compares class names rather than using isinstance so it still
# recognises a wrapper created before the class definition cell was re-run.
scale = 0.5
if type(model.up).__name__ == ScaleLinearWrapper.__name__:
    wrapped_up = model.up
    # Keep the existing wrapper in sync if `scale` was edited before a re-run.
    wrapped_up.scale = scale
else:
    wrapped_up = ScaleLinearWrapper(model.up, scale)
    model.up = wrapped_up

output = model(input_tensor)
print(output)
print(output)