In [2]:
import os

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from sklearn.cluster import KMeans
import pandas as pd

from data_utils import dataset_local_load


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Directory of the local DeepSeek-V2-Lite checkpoint. Set the
# DEEPSEEK_V2_LITE_PATH environment variable to point elsewhere; otherwise the
# ModelScope cache under the current user's home directory is used, so the
# notebook no longer depends on one specific user's absolute path.
model_name = os.environ.get(
    "DEEPSEEK_V2_LITE_PATH",
    os.path.expanduser("~/.cache/modelscope/hub/deepseek-ai/DeepSeek-V2-Lite"),
)
# NOTE(review): trust_remote_code=True lets transformers run modeling code
# shipped with the checkpoint — only use with a checkpoint you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.34it/s]


In [4]:
# Load all locally stored datasets, then select the one used for calibration.
dataset_dir = "dataset"
dataset_name = "MathInstruct"
train_dataset_map, valid_dataset_map = dataset_local_load(dataset_dir)
train_dataset = train_dataset_map[dataset_name]
# Draw five rows with a fixed seed so the calibration sample is reproducible.
train_df = pd.DataFrame(train_dataset).sample(n=5, random_state=1, axis=0)


In [None]:
class PreTrainedDeepseekV2PrunerByDomain:
    """Select routed experts to prune, per MoE layer, from calibration data.

    Workflow:
      1. ``generate_unsupervised_map`` runs the calibration samples through the
         model, clusters the experts of every MoE layer with KMeans on their
         outputs, and records an accumulated router probability per expert.
      2. ``generate_pruned_map`` picks, inside every cluster holding more than
         one expert, the lowest-scored fraction of experts.

    Attributes:
        model: the inner decoder (``model.model``) of the causal-LM wrapper.
        tokenizer: tokenizer used to encode the calibration samples.
        unsupervised_method: KMeans instance, re-fitted for every MoE layer.
        calibration_data: mapping/DataFrame with "prompt" and "completion".
        unsupervised_map: ``{layer_idx: [(cluster_label, score), ...]}`` where
            the list position is the expert index; ``None`` until computed.
        pruned_map: ``{layer_idx: [expert_idx, ...]}``; ``None`` until computed.
    """

    def __init__(self, model, calibration_data, tokenizer=None):
        """Store the model, calibration data and clustering configuration.

        Args:
            model: causal-LM wrapper exposing ``.model`` and ``.config``
                (``num_experts_per_tok`` is used as the number of clusters).
            calibration_data: object indexable by "prompt" and "completion".
            tokenizer: optional tokenizer. When omitted, the module-level
                ``tokenizer`` is used, which is what earlier versions of this
                class did implicitly.

        Raises:
            ValueError: if no tokenizer is passed and none exists at module level.
        """
        if tokenizer is None:
            # Backward compatibility with callers that relied on the global.
            tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise ValueError("No tokenizer given and no module-level `tokenizer` defined.")

        self.model = model.model
        self.tokenizer = tokenizer
        self.unsupervised_method = KMeans(n_clusters=model.config.num_experts_per_tok, random_state=0)
        self.calibration_data = calibration_data
        # Initialised here so generate_pruned_map can detect "not computed yet".
        self.unsupervised_map = None
        self.pruned_map = None

    def generate_unsupervised_map(self):
        """Cluster the experts of every MoE layer and score them.

        Populates ``self.unsupervised_map`` with one list per MoE layer; entry
        ``i`` of that list is ``(cluster_label, score)`` for expert ``i``.
        Layers whose MLP type name contains "DeepseekV2MLP" are skipped.
        """
        print("unsupervising...")
        prompt = list(self.calibration_data["prompt"])
        completion = list(self.calibration_data["completion"])
        inputs = self.tokenizer(prompt, completion, return_tensors='pt', max_length=128, padding=True, truncation=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        unsupervised_map = {}
        # Pure inference: without no_grad every layer/expert call would keep
        # its autograd graph alive and inflate memory use.
        with torch.no_grad():
            # NOTE(review): `pre_ffn_hidden` is not a stock transformers kwarg;
            # presumably the checkpoint's modeling code was patched to return
            # `pre_ffn_hidden_states` — confirm.
            output = self.model(input_ids, attention_mask, output_hidden_states=True, pre_ffn_hidden=True)
            print("feed forward finish.")

            pre_ffn_hidden_states = output.pre_ffn_hidden_states
            assert len(pre_ffn_hidden_states) == len(self.model.layers)

            # (batch, seq, 1) mask so padded positions do not contribute to the
            # statistics. Assumes hidden states are (batch, seq, hidden), as
            # implied by the reductions below.
            token_mask = attention_mask.unsqueeze(-1)

            for idx, hidden_state in enumerate(pre_ffn_hidden_states):
                mlp = self.model.layers[idx].mlp
                if "DeepseekV2MLP" in str(type(mlp)):
                    continue

                mask = token_mask.to(device=hidden_state.device, dtype=torch.float32)

                # Router probabilities accumulated over all real tokens.
                score_weight = mlp.gate.weight
                logits = F.linear(hidden_state.type(torch.float32), score_weight.type(torch.float32), None)
                probs = logits.softmax(dim=-1, dtype=torch.float32)
                scores = (probs * mask).sum(dim=(0, 1)).tolist()
                print(f"layer {idx} feed forward finish.")

                # One feature vector per expert: its output summed over the
                # sequence, flattened across the batch. Kept in float32 because
                # float16 can overflow on summed activations.
                experts_output = []
                for expert in mlp.experts:
                    expert_out = expert(hidden_state).type(torch.float32) * mask
                    experts_output.append(expert_out.sum(1).flatten())
                print(f"layer {idx} expert forward finish.")

                features = torch.stack(experts_output).detach().cpu().numpy()
                kmeans_result = self.unsupervised_method.fit(features)
                cluster_label = kmeans_result.labels_.tolist()
                assert len(cluster_label) == self.model.config.n_routed_experts
                assert len(scores) == len(cluster_label)

                # A list indexed by expert, NOT a dict keyed by cluster label:
                # the latter collapses every cluster to one entry and cannot be
                # unpacked by generate_pruned_map.
                unsupervised_map[idx] = [
                    (int(label), float(score)) for label, score in zip(cluster_label, scores)
                ]

        self.unsupervised_map = unsupervised_map

    def generate_pruned_map(self, prune_rate=0.5):
        """Choose the experts to prune in every layer.

        Within each cluster containing more than one expert, the
        ``int(prune_rate * cluster_size)`` experts with the lowest score are
        selected. Single-expert clusters are never pruned.

        Args:
            prune_rate: fraction in [0, 1] of each cluster to prune.

        Raises:
            ValueError: if ``generate_unsupervised_map`` has not produced a map
                yet, or if ``prune_rate`` is outside [0, 1].
        """
        print("pruning...")
        # getattr keeps the guard meaningful even on instances built without __init__.
        unsupervised_map = getattr(self, "unsupervised_map", None)
        if unsupervised_map is None:
            raise ValueError("No existed unsupervised map.")
        if not 0 <= prune_rate <= 1:
            raise ValueError(f"prune_rate must be within [0, 1], got {prune_rate}.")

        pruned_map = {}
        for layer_idx, expert_infos in unsupervised_map.items():
            # Regroup as {cluster_label: {expert_idx: score}}.
            clusters = {}
            for expert_idx, (cluster_label, score) in enumerate(expert_infos):
                clusters.setdefault(cluster_label, {})[expert_idx] = score

            for members in clusters.values():
                cluster_length = len(members)
                if cluster_length > 1:
                    lowest = sorted(members.items(), key=lambda x: x[1])[:int(prune_rate * cluster_length)]
                    pruned_map.setdefault(layer_idx, []).extend(expert_idx for expert_idx, _ in lowest)

        self.pruned_map = pruned_map
                    

# Build the pruner on the sampled calibration rows, cluster/score the experts,
# then derive the per-layer prune list with the default 50% rate.
pruner = PreTrainedDeepseekV2PrunerByDomain(model=model, calibration_data=train_df)
pruner.generate_unsupervised_map()
pruner.generate_pruned_map(prune_rate=0.5)
        

unsupervising...
feed forward finish.
layer 1 feed forward finish.
layer 1 expert forward finish.
layer 2 feed forward finish.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


layer 2 expert forward finish.
layer 3 feed forward finish.
layer 3 expert forward finish.
layer 4 feed forward finish.


In [None]:
# NOTE(review): `input_ids` and `attention_mask` are only defined by the
# tokenizer-test cell further down, so this cell fails on a fresh
# Restart & Run All — run that cell first or move this one below it.
# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    output = model.model(input_ids, attention_mask)

In [4]:
# Tokenizer sanity check: encode up to ten prompt/completion pairs as sentence
# pairs, then decode the first sequence to inspect how the two parts are joined.
inputs = tokenizer(
    train_df["prompt"].tolist()[:10],
    train_df["completion"].tolist()[:10],
    return_tensors='pt',
    padding=True,
    truncation=True,
)
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)


"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nA train is moving at a speed of 90km/hr and its length is 500m. Find the time taken by it to pass a man standing near the railway line?\nAnswer Choices: (A) 30sec (B) 45sec (C) 36sec (D) 20sec (E) 52sec Let's write a program.\n\n### Response:# convert speed from km/hr to m/sec\nspeed = 90 * 1000 / 3600\n# calculate time taken to pass the man\ntime = 500 / speed\nprint(time)"

In [4]:
# Generation smoke test: continue a fixed prompt by up to 100 new tokens and
# show the decoded text as the cell output.
text = "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
result


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


'An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is a scalar, which is a number. The query and keys are vectors, and the values are vectors. The attention function is used to compute the output from the query and keys. The attention function is used in many different applications, such as natural language processing, computer vision, and recommendation systems.\nThe attention function is a mathematical function that is used to compute the output of a neural network. The function is used to compute the output of a neural network by taking into account the input data and the'