In [1]:
from llmfact import LayerOutputExtractor, FBNFeatureExtractor, GroupFBNFeatureExtractor, FBNExtractor, LLMFC
from llmfact.extractor import MutiLayerAnalysis, MutiLayerAnalysis2
from llmfact.extractor import SingleLayerAnalysis
from llmfact.mask import MaskedGPT2ForSequenceClassification, MaskedGPT2AmplifiedForSequenceClassification, MaskedGPT2LMModel, MaskedModel
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2ForSequenceClassification, Trainer, TrainingArguments
from transformers import GPT2Tokenizer
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForCausalLM, AutoModelForQuestionAnswering
from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
from torch.utils.data import DataLoader
# from rouge_score import rouge_scorer
from evaluate import load

from llmfact.utils import IoU, correlation_activation, thresholding, write_layer_txt, evaluate_iou
from llmfact.stat import  StatICA, StatDictionaryLearning
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA
import seaborn as sns
import pandas as pd
from tqdm.auto import tqdm
import os

# Process-wide runtime switches, applied in a single call:
#   * CUDA_VISIBLE_DEVICES limits which physical GPUs CUDA exposes to this process.
#     NOTE(review): this only has an effect if it is set before CUDA is first
#     initialised in this kernel - confirm nothing above touches the GPU.
#   * TOKENIZERS_PARALLELISM opts in to parallelism in the Hugging Face tokenizers.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0,5,6,7,8,9',
    "TOKENIZERS_PARALLELISM": "true",
})

In [2]:
# Hugging Face Hub id of the checkpoint loaded by the two calls below.
model_name = "lmsys/vicuna-7b-v1.5"
# The tokenizer is configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
# device_map="auto" leaves placement of the weights on the available devices to the library.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Qualified names of every sub-module whose name contains "mlp.act" or "mlp.up",
# in the order reported by model.named_modules().
include_layers = [
    module_name
    for module_name, _module in model.named_modules()
    if any(tag in module_name for tag in ("mlp.act", "mlp.up"))
]
include_layers

['model.layers.0.mlp.up_proj',
 'model.layers.0.mlp.act_fn',
 'model.layers.1.mlp.up_proj',
 'model.layers.1.mlp.act_fn',
 'model.layers.2.mlp.up_proj',
 'model.layers.2.mlp.act_fn',
 'model.layers.3.mlp.up_proj',
 'model.layers.3.mlp.act_fn',
 'model.layers.4.mlp.up_proj',
 'model.layers.4.mlp.act_fn',
 'model.layers.5.mlp.up_proj',
 'model.layers.5.mlp.act_fn',
 'model.layers.6.mlp.up_proj',
 'model.layers.6.mlp.act_fn',
 'model.layers.7.mlp.up_proj',
 'model.layers.7.mlp.act_fn',
 'model.layers.8.mlp.up_proj',
 'model.layers.8.mlp.act_fn',
 'model.layers.9.mlp.up_proj',
 'model.layers.9.mlp.act_fn',
 'model.layers.10.mlp.up_proj',
 'model.layers.10.mlp.act_fn',
 'model.layers.11.mlp.up_proj',
 'model.layers.11.mlp.act_fn',
 'model.layers.12.mlp.up_proj',
 'model.layers.12.mlp.act_fn',
 'model.layers.13.mlp.up_proj',
 'model.layers.13.mlp.act_fn',
 'model.layers.14.mlp.up_proj',
 'model.layers.14.mlp.act_fn',
 'model.layers.15.mlp.up_proj',
 'model.layers.15.mlp.act_fn',
 'model.laye

In [4]:
# Train split of the "Self-GRIT/wikitext-2-raw-v1-preprocessed" dataset; its
# 'text' column is the source of the ICA samples selected further down.
wiki_dataset = load_dataset("Self-GRIT/wikitext-2-raw-v1-preprocessed", split='train')
# Print the dataset summary (feature names and number of rows).
print(wiki_dataset)

Dataset({
    features: ['text'],
    num_rows: 15313
})


In [5]:
# Number of texts per ICA run and number of runs; the first
# sample_num * ica_num texts of the dataset are used.
sample_num = 40
ica_num = 1

# Read the 'text' column once instead of re-evaluating wiki_dataset['text'] on
# every loop iteration. Element-wise indexing is kept so that a dataset with
# too few rows still raises IndexError rather than being silently truncated.
wiki_texts = wiki_dataset['text']
ica_dataset = [wiki_texts[i] for i in range(sample_num * ica_num)]

In [6]:
from llmfact.decomposition.canica import CanICA
import torch 
import queue 
from concurrent.futures  import ThreadPoolExecutor 
from tqdm import tqdm 


def z_score_signals(signals):
    """Standardise ``signals`` to zero mean and unit standard deviation along dim 0.

    Mean and standard deviation are computed over ``dim=0``. Where the standard
    deviation is smaller than the machine epsilon of the input dtype, or is
    undefined (NaN, which ``torch.std`` returns for a single row because it uses
    the unbiased estimator by default), the divisor is replaced by 1 so the
    values are only centred instead of turning into NaN/inf.

    Args:
        signals: floating-point ``torch.Tensor``; statistics are taken over its
            first dimension.

    Returns:
        A new tensor with the same shape and dtype as ``signals``. The input
        tensor is not modified.

    Raises:
        TypeError: if ``signals`` is not a ``torch.Tensor``.
    """
    if not isinstance(signals, torch.Tensor):
        raise TypeError("Input signals must be a PyTorch tensor.")

    mean = torch.mean(signals, dim=0)
    std = torch.std(signals, dim=0)

    # Machine epsilon of the input dtype, used as the "effectively constant" threshold.
    eps = torch.finfo(signals.dtype).eps
    # NaN never compares below eps, so it has to be tested for explicitly.
    degenerate = (std < eps) | torch.isnan(std)
    std = torch.where(degenerate, torch.ones_like(std), std)

    # Centre, then scale.
    return (signals - mean) / std


import torch
import queue
from concurrent.futures  import ThreadPoolExecutor
from tqdm import tqdm
 
class SingleLayerAnalysis(LayerOutputExtractor):
    """Run a separate ICA decomposition on each layer's captured outputs.

    ``fit`` collects outputs through the inherited ``extract_layer_outputs``,
    splits the feature axis into ``total_layer_num`` equal slices, decomposes
    every slice independently and stores the per-slice normalised mixing
    matrices, concatenated along the feature axis, in ``normal_mixing_``.
    """

    def __init__(self, model, include_layers=["h.0.attn.c_attn"], test=False, device='cpu'):
        super().__init__(model, include_layers=include_layers, test=test, device=device)
        self.include_layers = include_layers
        # Result slots. Only normal_mixing_ is populated by fit(); the other two
        # are initialised here but never written by this class.
        self.mixing_ = None
        self.origin_mixing_ = None
        self.normal_mixing_ = None

    def fit(self, inputs, n_components=10, alpha=1.96, random_state=666,
            preprocessing=False, total_layer_num=None, method="fastica",
            max_iter=200, n_iter=5, norm=True):
        """Decompose every layer slice and store the result in ``normal_mixing_``.

        Args:
            inputs: one model input, or a list of inputs whose extracted outputs
                are concatenated along dim 0.
            n_components: number of independent components per layer slice.
            alpha, n_iter, norm: accepted for signature compatibility; not used
                by this implementation.
            random_state: seed forwarded to the ICA estimator.
            preprocessing: if true, z-score the extracted outputs (via
                ``z_score_signals``) and cast them to float64 before decomposing.
            total_layer_num: number of slices the feature axis is split into;
                defaults to ``len(self.include_layers)`` when falsy.
            method: ``"fastica"`` fits scikit-learn ``FastICA`` sequentially on
                the CPU; any other value fits ``CanICA`` on the visible GPUs.
            max_iter: iteration cap forwarded to the ICA estimator.

        Raises:
            RuntimeError: if ``method`` is not ``"fastica"`` and no GPU is visible.
        """
        # Resolve the number of slices.
        if not total_layer_num:
            total_layer_num = len(self.include_layers)

        # Extract layer outputs (a list of inputs is stacked along dim 0).
        if isinstance(inputs, list):
            layer_outputs = torch.cat([self.extract_layer_outputs(inp) for inp in inputs], dim=0)
        else:
            layer_outputs = self.extract_layer_outputs(inputs)

        if preprocessing:
            layer_outputs = z_score_signals(layer_outputs).to(torch.float64)

        token_num = layer_outputs.shape[0]
        layer_outputs = layer_outputs.reshape(token_num, total_layer_num, -1)

        if method == "fastica":
            # CPU path. Previously this branch was a no-op that returned None and
            # crashed in the result loop; it now mirrors the sequential FastICA
            # reference implementation.
            normal_mixing_list = [
                self._fit_fastica_layer(layer_outputs[:, i, :], n_components,
                                        random_state, max_iter)
                for i in tqdm(range(total_layer_num), desc="Processing Layers")
            ]
        else:
            normal_mixing_list = self._fit_canica_layers(layer_outputs, total_layer_num,
                                                         n_components, random_state, max_iter)

        self.normal_mixing_ = torch.cat(normal_mixing_list, dim=1)

    @staticmethod
    def _fit_fastica_layer(layer_signal, n_components, random_state, max_iter):
        """Fit FastICA on one layer slice and return its row-normalised mixing matrix."""
        signal = layer_signal.detach().cpu()
        if signal.dtype not in (torch.float32, torch.float64):
            # NumPy / scikit-learn cannot consume reduced-precision torch dtypes.
            signal = signal.to(torch.float32)

        ica = FastICA(n_components=n_components,
                      random_state=random_state,
                      max_iter=max_iter)
        ica.fit(signal.numpy())

        # One row per component after the transpose.
        mixing = torch.tensor(ica.mixing_.T)
        mean = torch.mean(mixing, dim=1, keepdim=True)
        std = torch.std(mixing, dim=1, keepdim=True)
        return (mixing - mean) / std

    @staticmethod
    def _fit_canica_layers(layer_outputs, total_layer_num, n_components,
                           random_state, max_iter):
        """Fit one CanICA per layer slice, spreading the work over the visible GPUs."""
        num_gpus = torch.cuda.device_count()
        if num_gpus == 0:
            raise RuntimeError("No GPU available")
        max_workers = min(num_gpus, 10)

        # Pool of free device ids: a worker takes one for the duration of a layer
        # and always hands it back, so at most one layer runs per GPU at a time.
        device_queue = queue.Queue()
        for device_id in range(max_workers):
            device_queue.put(device_id)

        def process_layer(i):
            device = device_queue.get()
            try:
                with torch.cuda.device(device):
                    data = layer_outputs[:, i, :].to(device)
                    ica = CanICA(n_components=n_components,
                                 random_state=random_state,
                                 device=device)
                    ica.fit(data, max_iter=max_iter)
                    return ica.normal_mixing_
            finally:
                device_queue.put(device)

        # Results are collected in submission order so slices stay in layer order.
        normal_mixing_list = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_layer, i) for i in range(total_layer_num)]
            for future in tqdm(futures, total=total_layer_num, desc="Processing Layers"):
                normal_mixing_list.append(future.result().detach().cpu())
        return normal_mixing_list

        
# class SingleLayerAnalysis(LayerOutputExtractor):
#     def __init__(self, model, include_layers=["h.0.attn.c_attn"], test=False, device='cpu'):
#         super().__init__(model, include_layers=include_layers, test=test, device=device)
#         self.include_layers = include_layers
#         self.mixing_ = None
#         self.origin_mixing_ = None
#         self.normal_mixing_ = None

#     def fit(self, inputs, n_components=10, alpha=1.96, random_state=666,
#             preprocessing=False, total_layer_num=None, method="fastica",
#             max_iter=200, n_iter=5, norm=True):
#         if total_layer_num:
#             total_layer_num = total_layer_num
#         else:
#             total_layer_num = len(self.include_layers)

#         if type(inputs) == list:
#             total_layer_outputs = []
#             for inp in inputs:
#                 layer_outputs = self.extract_layer_outputs(inp)
#                 total_layer_outputs.append(layer_outputs)
#             layer_outputs = torch.cat(total_layer_outputs, dim=0)
#         else:
#             layer_outputs = self.extract_layer_outputs(inputs)

#         if preprocessing:
#             layer_outputs = z_score_signals(layer_outputs)
#             layer_outputs = layer_outputs.to(torch.float64)

#         token_num = layer_outputs.shape[0]

#         layer_outputs = layer_outputs.reshape(token_num, total_layer_num, -1)

#         # mixing_list = []
#         # origin_mixing_list = []
#         normal_mixing_list = []
#         for i in trange(total_layer_num):
#             if method == "fastica":
#                 ica = FastICA(n_components=n_components,
#                               random_state=random_state,
#                               max_iter=max_iter)
#                 ica.fit(layer_outputs[:, i, :])
#                 mixing = torch.tensor(ica.mixing_.T)
#                 # origin_mixing_list.append(mixing)

#                 mean = torch.mean(mixing, dim=1, keepdim=True)
#                 std = torch.std(mixing, dim=1, keepdim=True)

#                 normalized_matrix = (mixing - mean) / std

#                 normal_mixing_list.append(normalized_matrix)

#             else:
#                 ica = CanICA(n_components=n_components, random_state=random_state, device=self.device)
#                 ica.fit(layer_outputs[:, i, :], max_iter=max_iter)

#                 normal_mixing_list.append(ica.normal_mixing_)
                
#         self.normal_mixing_ = torch.cat(normal_mixing_list, dim=1)

In [7]:
def MutiICA(model, include_layers, dataset, n_components=256, window_size=3, preprocessing=True, max_iter=500, n_iter=5, norm=True):
    """Fit a ``MutiLayerAnalysis2`` extractor on ``dataset`` and return it.

    Every text in ``dataset`` is tokenized with the module-level ``tokenizer``
    (PyTorch tensors, truncated to 1024 tokens). The extractor is created on
    ``model.device`` and fitted with ``method="canica"``, ``total_layer_num=32``
    and ``random_state=666``; the remaining options are forwarded from the
    arguments.
    """
    analysis = MutiLayerAnalysis2(model, include_layers=include_layers, device=model.device)

    tokenized_texts = []
    for text in dataset:
        tokenized_texts.append(
            tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
        )

    fit_options = dict(
        inputs=tokenized_texts,
        n_components=n_components,
        window_size=window_size,
        random_state=666,
        preprocessing=preprocessing,
        total_layer_num=32,
        method="canica",
        max_iter=max_iter,
        norm=norm,
        n_iter=n_iter,
    )
    analysis.fit(**fit_options)
    return analysis
    

def save_fbn(data, save_dir, data_type, model_name, n_components, alpha):
    """Save ``data`` with ``torch.save`` under a name built from the arguments.

    The target path is
    ``save_dir + data_type + "_" + model_name + "_" + n_components + "_" + alpha + ".pth"``;
    ``save_dir`` is used as a plain string prefix, so it should end with a path
    separator. The parent directory of the target is created if it does not
    exist, and the path is printed before saving.

    Args:
        data: any object accepted by ``torch.save`` (all types are handled the
            same way).
        save_dir: path prefix for the output file.
        data_type, model_name: name fragments.
        n_components, alpha: name fragments, converted with ``str()``.
    """
    save_path = save_dir + data_type + "_" + model_name + "_" + str(n_components) + "_" + str(alpha) + ".pth"

    # Make sure the destination directory exists so the save cannot fail on a
    # fresh checkout; an empty dirname means "current directory".
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"save at {save_path}")
    torch.save(data, save_path)

def SingleICA(model, include_layers, dataset, n_components=256, preprocessing=True, max_iter=500, norm=True, n_iter=5):
    """Fit a ``SingleLayerAnalysis`` extractor on ``dataset`` and return it.

    Every text in ``dataset`` is tokenized with the module-level ``tokenizer``
    (PyTorch tensors, truncated to 1024 tokens). The extractor is created on
    ``model.device`` and fitted with ``method="canica"``, ``total_layer_num=32``
    and ``random_state=327``.

    Args:
        model: model whose layer outputs are analysed.
        include_layers: names of the modules to capture.
        dataset: iterable of input texts.
        n_components: number of components per layer.
        preprocessing: forwarded to ``fit``. It used to be ignored (the call
            hard-coded ``True``); the default keeps the previous behaviour.
        max_iter, norm, n_iter: forwarded to ``fit``.

    Returns:
        The fitted ``SingleLayerAnalysis`` instance.
    """
    extractor = SingleLayerAnalysis(model, include_layers=include_layers, device=model.device)
    inputs_list = [tokenizer(inputs, return_tensors="pt", max_length=1024, truncation=True) for inputs in dataset]
    extractor.fit(inputs=inputs_list, n_components=n_components,
                  random_state=327, preprocessing=preprocessing,
                  total_layer_num=32, method="canica", max_iter=max_iter, norm=norm, n_iter=n_iter)
    return extractor

In [8]:
# Sanity check: number of texts selected for ICA (sample_num * ica_num entries).
len(ica_dataset)

40

In [9]:
%%time
from tqdm import trange
import warnings
warnings.filterwarnings("ignore")

# Run configuration. NOTE: model_name is rebound here from the checkpoint id to
# a label that is used below as part of the output file name.
model_name = "vicuna-7b-v1.5-muti-layer-wise"
n_components = 128
preprocessing = True
max_iter = 300
norm = False
n_iter = 4

# One ICA fit per chunk of sample_num texts; the per-chunk mixing matrices are
# stacked along dim 0.
normal_list = []
for i in trange(ica_num):
    # Pass the preprocessing flag through (it used to be a literal True here) so
    # the run always matches the "std_{preprocessing}" tag in the file name.
    extractor = SingleICA(model, include_layers, ica_dataset[sample_num*i:sample_num*(i+1)],
                          n_components=n_components, preprocessing=preprocessing,
                          max_iter=max_iter, norm=norm, n_iter=n_iter)
    normal = extractor.normal_mixing_
    normal_list.append(normal)
normal_components = torch.cat(normal_list, dim=0)
save_fbn(normal_components, "./data/FBN/", f"text{len(ica_dataset)}-mlp.act-CanICA-SingleICA-max_iter-{max_iter}", model_name, n_components, f"normal_mixing_std_{preprocessing}")
normal_components.shape

  0%|          | 0/1 [00:00<?, ?it/s]
Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s][A
Processing Layers:   3%|▎         | 1/32 [01:08<35:23, 68.51s/it][A
Processing Layers:   9%|▉         | 3/32 [01:08<08:38, 17.90s/it][A
Processing Layers:  12%|█▎        | 4/32 [01:09<05:38, 12.08s/it][A
Processing Layers:  16%|█▌        | 5/32 [01:10<03:39,  8.13s/it][A
Processing Layers:  19%|█▉        | 6/32 [01:11<02:31,  5.84s/it][A
Processing Layers:  22%|██▏       | 7/32 [02:01<08:14, 19.77s/it][A
Processing Layers:  25%|██▌       | 8/32 [02:05<05:55, 14.81s/it][A
Processing Layers:  28%|██▊       | 9/32 [02:06<04:05, 10.67s/it][A
Processing Layers:  31%|███▏      | 10/32 [02:06<02:45,  7.53s/it][A
Processing Layers:  34%|███▍      | 11/32 [02:08<01:59,  5.71s/it][A
Processing Layers:  38%|███▊      | 12/32 [02:09<01:25,  4.29s/it][A
Processing Layers:  41%|████      | 13/32 [02:58<05:41, 17.95s/it][A
Processing Layers:  44%|████▍     | 14/32 [03:03<04:11, 13.98s/it][A

save at ./data/FBN/text40-mlp.act-CanICA-SingleICA-max_iter-300_vicuna-7b-v1.5-muti-layer-wise_128_normal_mixing_std_True.pth
CPU times: user 30min 32s, sys: 11min 15s, total: 41min 48s
Wall time: 7min 2s


torch.Size([128, 704512])

In [10]:
# Shape of the stacked mixing matrices produced by the ICA run above.
normal_components.shape

torch.Size([128, 704512])

In [11]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is no longer using.
torch.cuda.empty_cache()

In [12]:
def cut_par_num(neuron_num_list, total_par=6738415616, hidden_size=4096,
                intermediate_size=11008, num_layers=32):
    """Print how many parameters are removed when MLP neurons are pruned.

    Each layer's MLP is counted as ``hidden_size * intermediate_size * 3``
    parameters (three projection matrices, no biases). A layer that keeps ``k``
    neurons retains ``k * hidden_size * 3`` of them. The defaults reproduce the
    values that were previously hard-coded in this function.

    Args:
        neuron_num_list: iterable with the number of neurons kept in each pruned
            layer (plain numbers or 0-dim tensors).
        total_par: total parameter count of the unpruned model.
        hidden_size: model hidden size.
        intermediate_size: unpruned MLP width.
        num_layers: number of layers used for the total MLP parameter count.

    Returns:
        None. The statistics are only printed.
    """
    print("total parameters:", total_par)

    per_layer_mlp = hidden_size * intermediate_size * 3
    total_mlp = num_layers * per_layer_mlp
    print("total mlp parameters:", total_mlp)

    total_cut = 0
    for kept in neuron_num_list:
        # Removed parameters in this layer = full MLP minus what the kept neurons need.
        total_cut += per_layer_mlp - (kept * hidden_size * 3)
    print("total cut parameters num:", total_cut)

    print(f"total cut mlp parameters: {total_cut / total_mlp:.4f}")
    print(f"total cut parameters: {total_cut / total_par:.4f}")
    print(f"parameters after cut: {total_par - total_cut:.4f}")

In [18]:
# Flag every feature column whose normalised mixing weight exceeds 3.46 in
# absolute value in at least one component (row) of normal_components.
any_mask = torch.abs(normal_components) > 3.46
any_mask = torch.any(any_mask, dim=0).reshape(1, -1)
print(any_mask.sum())

# NOTE(review): this reshape assumes the flattened feature axis is laid out as
# 32 layers x 2 captured modules per layer x neurons - confirm against the
# reshape in SingleLayerAnalysis.fit and the order of include_layers.
mask = any_mask.reshape(32, 2, -1)
# Start from "keep everything". Rows 0-2 and the last two rows are never
# overwritten by the loop below, so those layers keep all 11008 neurons.
mask_matrix = torch.ones((32, 11008))
for i in range(3, mask.shape[0] - 2):
    # A neuron is kept if it was flagged in either of the layer's two slices.
    mask_matrix[i] = torch.any(mask[i], dim=0)
print(mask_matrix.sum())
print(mask_matrix.sum(dim=1))
# Report how many parameters pruning to the per-layer kept counts would remove.
cut_par_num(mask_matrix.sum(dim=1))
# mask_matrix = np.repeat(mask_matrix, 2, axis=0)
mask_matrix.shape

tensor(297147)
tensor(246283.)
tensor([11008., 11008., 11008.,  5921.,  6619.,  6827.,  7164.,  7679.,  7961.,
         8265.,  8232.,  8168.,  8122.,  7995.,  8011.,  7877.,  7736.,  7457.,
         7231.,  6893.,  6777.,  6550.,  6562.,  6246.,  6292.,  5924.,  6004.,
         6228.,  6153.,  6349., 11008., 11008.])
total parameters: 6738415616
total mlp parameters: 4328521728
total cut parameters num: tensor(1.3022e+09)
total cut mlp parameters: 0.3008
total cut parameters: 0.1932
parameters after cut: 5436219392.0000


torch.Size([32, 11008])

In [19]:
import gc
def pruned_llama_mlp(model, mask):
    """Structurally prune the MLP of every layer in ``model.model.layers`` in place.

    For layer ``i`` only the intermediate neurons with a truthy entry in
    ``mask[i]`` are kept: the matching rows of ``up_proj`` / ``gate_proj`` (and
    of their biases, when present) and the matching columns of ``down_proj``.
    ``out_features`` / ``in_features`` and ``mlp.intermediate_size`` are updated
    to the new width.

    Args:
        model: object exposing ``model.layers``; each layer needs an ``mlp`` with
            ``up_proj``, ``gate_proj`` and ``down_proj`` linear modules.
        mask: tensor or array-like of shape ``(num_layers, intermediate_size)``;
            non-zero / True means "keep this neuron". It is not modified.

    Returns:
        The same ``model`` object, modified in place.
    """
    # as_tensor avoids the copy-construct warning that torch.tensor() emits when
    # it is handed an existing tensor, and still accepts lists and arrays.
    mask = torch.as_tensor(mask, dtype=torch.bool)

    for i, layer in enumerate(model.model.layers):
        mlp = layer.mlp
        keep_idx = torch.where(mask[i])[0]
        kept = keep_idx.numel()

        with torch.no_grad():
            # Row-wise pruning of the two input projections.
            for proj in (mlp.up_proj, mlp.gate_proj):
                rows = keep_idx.to(proj.weight.device)
                proj.weight.data = proj.weight.data[rows]
                if getattr(proj, "bias", None) is not None:
                    # Keep the bias consistent with the pruned output width.
                    proj.bias.data = proj.bias.data[rows]
                proj.out_features = kept

            # Column-wise pruning of the output projection; its bias (if any)
            # has the hidden size and is therefore left untouched.
            cols = keep_idx.to(mlp.down_proj.weight.device)
            mlp.down_proj.weight.data = mlp.down_proj.weight.data[:, cols]
            mlp.down_proj.in_features = kept

            mlp.intermediate_size = kept

            # Release the replaced weight storage before moving to the next layer.
            gc.collect()
            torch.cuda.empty_cache()

    return model

class PrunedLlamaModel:
    """Thin wrapper that prunes a model's MLPs and reports the parameter savings.

    ``fit`` counts the model parameters, applies ``pruned_llama_mlp`` with the
    stored mask, counts again, prints the before/after statistics and returns
    the pruned model.
    """

    def __init__(self, model, mask=None):
        self.model = model
        self.mask = mask

    def fit(self):
        """Prune ``self.model`` with ``self.mask``, print statistics, return the model."""
        params_before = sum(p.numel() for p in self.model.parameters())
        print(f"total parameters before pruned: {params_before}")

        self.model = pruned_llama_mlp(self.model, self.mask)

        params_after = sum(p.numel() for p in self.model.parameters())
        removed = params_before - params_after
        print(f"total parameters after pruned: {params_after}")
        print(f"total cut num: {removed}")
        print(f"pruned rate: {removed / params_before:.4f}")

        return self.model

In [20]:
# from llmfact.pruner.pruner import PrunedLlamaModel
# Prune the loaded model with the per-layer keep mask; fit() prints the
# parameter counts before and after. pruned_llama_mlp edits the weights in
# place, so `model` is the same (now smaller) object afterwards and this cell
# should not be re-run without reloading the model first.
pruner = PrunedLlamaModel(model, mask_matrix)
model = pruner.fit()

total parameters before pruned: 6738415616
total parameters after pruned: 5436219392
total cut num: 1302196224
pruned rate: 0.1932


In [21]:
from lm_eval import evaluator
import lm_eval
# Wrap the in-memory model object (pruned by the cell above) for the evaluation
# harness instead of letting the harness load a checkpoint by name.
wrapper_model = lm_eval.models.huggingface.HFLM(pretrained=model, trust_remote_code=True)

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


In [22]:
# Zero-shot evaluation of the wrapped model on the "wikitext" task, batch size 1.
# NOTE(review): `model` is an already-constructed wrapper here, so `model_args`
# is presumably not used to build a model - confirm against the lm-eval docs
# before relying on it.
results = evaluator.simple_evaluate( 
    model=wrapper_model,
    model_args="lmsys/vicuna-7b-v1.5",
    tasks=["wikitext"],
    num_fewshot=0,
    task_manager=lm_eval.tasks.TaskManager(),
    batch_size=1)
# Per-task metrics.
results['results']

[Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte
[Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. using default higher_is_better=False
Overwriting default num_fewshot of wikitext from None to 0
100%|██████████| 62/62 [00:00<00:00, 433.48it/s]
  0%|          | 0/62 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5943 > 4096). Running this sequence through t

{'wikitext': {'alias': 'wikitext',
  'word_perplexity,none': 18.94210997911784,
  'word_perplexity_stderr,none': 'N/A',
  'byte_perplexity,none': 1.7333468983928229,
  'byte_perplexity_stderr,none': 'N/A',
  'bits_per_byte,none': 0.7935604130138104,
  'bits_per_byte_stderr,none': 'N/A'}}