In [1]:
from llmfact import LayerOutputExtractor, FBNFeatureExtractor, GroupFBNFeatureExtractor, FBNExtractor, LLMFC
from llmfact.extractor import MutiLayerAnalysis, MutiLayerAnalysis2
from llmfact.extractor import SingleLayerAnalysis
from llmfact.mask import MaskedGPT2ForSequenceClassification, MaskedGPT2AmplifiedForSequenceClassification, MaskedGPT2LMModel, MaskedModel
from transformers import GPT2Model, GPT2Config, GPT2LMHeadModel, GPT2ForSequenceClassification, Trainer, TrainingArguments
from transformers import GPT2Tokenizer
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForCausalLM, AutoModelForQuestionAnswering
from datasets import load_dataset, get_dataset_config_names, get_dataset_split_names
from torch.utils.data import DataLoader
# from rouge_score import rouge_scorer
from evaluate import load

from llmfact.utils import IoU, correlation_activation, thresholding, write_layer_txt, evaluate_iou
from llmfact.stat import  StatICA, StatDictionaryLearning
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.decomposition import FastICA
import seaborn as sns
import pandas as pd
from tqdm.auto import tqdm
import os

# Restrict this process to GPUs 4-9.
# NOTE(review): this is set after `import torch` above; presumably CUDA has not
# been initialised yet at this point so the restriction still applies — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '4,5,6,7,8,9'
# Explicitly enable parallelism in the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"]  = "true"

In [2]:
# Hub id of the checkpoint under analysis.
# NOTE: `model_name` is rebound to a file-name tag in the ICA driver cell further down.
model_name = "lmsys/vicuna-7b-v1.5"
# Tokenizer configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
# device_map="auto" leaves weight placement across the visible devices to the loader.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Qualified names of every submodule whose name contains "mlp.act" or "mlp.up"
# (in the recorded output: each layer's `mlp.up_proj` and `mlp.act_fn`).
# These are the layers later handed to the ICA extractor.
include_layers = [
    module_name
    for module_name, _module in model.named_modules()
    if any(tag in module_name for tag in ("mlp.act", "mlp.up"))
]
include_layers

['model.layers.0.mlp.up_proj',
 'model.layers.0.mlp.act_fn',
 'model.layers.1.mlp.up_proj',
 'model.layers.1.mlp.act_fn',
 'model.layers.2.mlp.up_proj',
 'model.layers.2.mlp.act_fn',
 'model.layers.3.mlp.up_proj',
 'model.layers.3.mlp.act_fn',
 'model.layers.4.mlp.up_proj',
 'model.layers.4.mlp.act_fn',
 'model.layers.5.mlp.up_proj',
 'model.layers.5.mlp.act_fn',
 'model.layers.6.mlp.up_proj',
 'model.layers.6.mlp.act_fn',
 'model.layers.7.mlp.up_proj',
 'model.layers.7.mlp.act_fn',
 'model.layers.8.mlp.up_proj',
 'model.layers.8.mlp.act_fn',
 'model.layers.9.mlp.up_proj',
 'model.layers.9.mlp.act_fn',
 'model.layers.10.mlp.up_proj',
 'model.layers.10.mlp.act_fn',
 'model.layers.11.mlp.up_proj',
 'model.layers.11.mlp.act_fn',
 'model.layers.12.mlp.up_proj',
 'model.layers.12.mlp.act_fn',
 'model.layers.13.mlp.up_proj',
 'model.layers.13.mlp.act_fn',
 'model.layers.14.mlp.up_proj',
 'model.layers.14.mlp.act_fn',
 'model.layers.15.mlp.up_proj',
 'model.layers.15.mlp.act_fn',
 'model.laye

In [4]:
# Training split of the preprocessed WikiText-2 (raw) dataset; the texts used
# for the ICA runs are taken from it.
wiki_dataset = load_dataset("Self-GRIT/wikitext-2-raw-v1-preprocessed", split='train')
# Print the dataset summary (features and row count).
print(wiki_dataset)

Dataset({
    features: ['text'],
    num_rows: 15313
})


In [5]:
# Number of texts fed to one ICA run, and number of independent ICA runs.
sample_num = 40
ica_num = 1

# Take the first `sample_num * ica_num` texts with a single slice. Indexing
# `wiki_dataset['text'][i]` inside a loop re-reads the whole text column on
# every iteration.
ica_dataset = list(wiki_dataset[: sample_num * ica_num]["text"])

# A slice silently truncates if the dataset is too short, so check the count.
assert len(ica_dataset) == sample_num * ica_num, "dataset has fewer rows than requested"

In [9]:
from llmfact.decomposition.canica import CanICA

def z_score_signals(signals):
    """Standardise ``signals`` along ``dim=0`` to zero mean and unit std.

    The mean and (unbiased) standard deviation are computed over ``dim=0``
    and applied to every entry along that dimension. Positions whose standard
    deviation is smaller than the dtype's machine epsilon, or undefined (NaN,
    which is what ``torch.std`` yields when ``dim=0`` has a single entry), are
    only centred and not scaled, so the result never contains NaN/inf caused
    by a degenerate standard deviation.

    Args:
        signals: Floating-point ``torch.Tensor``.

    Returns:
        A new tensor with the same shape and dtype as ``signals``; the input
        tensor is not modified.

    Raises:
        TypeError: If ``signals`` is not a ``torch.Tensor``.
    """
    if not isinstance(signals, torch.Tensor):
        raise TypeError("Input signals must be a PyTorch tensor.")

    mean = torch.mean(signals, dim=0)
    std = torch.std(signals, dim=0)

    # Machine epsilon of the input dtype, used as the "effectively constant" cut-off.
    eps = torch.finfo(signals.dtype).eps
    # NaN compares False against eps, so it has to be tested for explicitly.
    degenerate = torch.isnan(std) | (std < eps)
    std = torch.where(degenerate, torch.ones_like(std), std)

    return (signals - mean) / std
    
class SingleLayerAnalysis(LayerOutputExtractor):
    """Run one ICA decomposition per layer on extracted layer outputs.

    NOTE: this definition rebinds the name ``SingleLayerAnalysis`` that is
    also imported from ``llmfact.extractor`` at the top of the notebook; code
    executed after this cell uses this class.

    After ``fit`` the per-layer normalised mixing matrices are available,
    concatenated along ``dim=1``, as ``normal_mixing_``.
    """

    def __init__(self, model, include_layers=None, test=False, device='cpu'):
        """Set up the extractor.

        Args:
            model: Model passed through to ``LayerOutputExtractor``.
            include_layers: Names of the layers to extract; defaults to
                ``["h.0.attn.c_attn"]``.
            test: Passed through to ``LayerOutputExtractor``.
            device: Passed through to ``LayerOutputExtractor``.
        """
        # Build the default per instance instead of sharing one mutable list
        # object across every instance created without `include_layers`.
        if include_layers is None:
            include_layers = ["h.0.attn.c_attn"]
        super().__init__(model, include_layers=include_layers, test=test, device=device)
        self.include_layers = include_layers
        self.mixing_ = None
        self.origin_mixing_ = None
        self.normal_mixing_ = None

    def fit(self, inputs, n_components=10, alpha=1.96, random_state=666,
            preprocessing=False, total_layer_num=None, method="fastica",
            max_iter=200, n_iter=5, norm=True):
        """Extract layer outputs for ``inputs`` and decompose each layer.

        Args:
            inputs: A single model input, or a list of them. For a list the
                extracted outputs are concatenated along ``dim=0``.
            n_components: Number of ICA components per layer.
            alpha: Accepted for interface compatibility; not used here.
            random_state: Seed handed to the ICA estimator.
            preprocessing: If true, z-score the extracted outputs with
                ``z_score_signals`` and cast them to ``float64`` first.
            total_layer_num: Number of groups the feature axis is split into;
                falls back to ``len(self.include_layers)`` when falsy.
            method: ``"fastica"`` uses scikit-learn's ``FastICA``; any other
                value uses ``CanICA``.
            max_iter: Iteration limit handed to the ICA estimator.
            n_iter: Accepted for interface compatibility; not used here.
            norm: Accepted for interface compatibility; not used here.

        Returns:
            ``self``, with ``normal_mixing_`` set.
        """
        # Imported locally so fit() does not depend on another notebook cell
        # having imported `trange` beforehand.
        from tqdm import trange

        if not total_layer_num:
            total_layer_num = len(self.include_layers)

        if isinstance(inputs, list):
            layer_outputs = torch.cat(
                [self.extract_layer_outputs(inp) for inp in inputs], dim=0)
        else:
            layer_outputs = self.extract_layer_outputs(inputs)

        if preprocessing:
            layer_outputs = z_score_signals(layer_outputs)
            layer_outputs = layer_outputs.to(torch.float64)

        # assumes extract_layer_outputs returns a 2-D (tokens, features)
        # tensor — TODO confirm against LayerOutputExtractor.
        token_num = layer_outputs.shape[0]
        layer_outputs = layer_outputs.reshape(token_num, total_layer_num, -1)

        normal_mixing_list = []
        for i in trange(total_layer_num):
            layer_signals = layer_outputs[:, i, :]
            if method == "fastica":
                ica = FastICA(n_components=n_components,
                              random_state=random_state,
                              max_iter=max_iter)
                ica.fit(layer_signals)
                mixing = torch.tensor(ica.mixing_.T)

                # Standardise every row of the transposed mixing matrix.
                mean = torch.mean(mixing, dim=1, keepdim=True)
                std = torch.std(mixing, dim=1, keepdim=True)
                normal_mixing_list.append((mixing - mean) / std)
            else:
                ica = CanICA(n_components=n_components, random_state=random_state, device=self.device)
                ica.fit(layer_signals, max_iter=max_iter)
                normal_mixing_list.append(ica.normal_mixing_)

        self.normal_mixing_ = torch.cat(normal_mixing_list, dim=1)
        return self

In [10]:
def MutiICA(model, include_layers, dataset, n_components=256, window_size=3, preprocessing=True, max_iter=500, n_iter=5, norm=True):
    """Fit a ``MutiLayerAnalysis2`` extractor on ``dataset`` and return it.

    Each text in ``dataset`` is tokenised on its own with the notebook-level
    ``tokenizer`` (truncated to 1024 tokens). The resulting encodings are
    handed to ``fit`` with the CanICA method, a fixed seed of 666 and a fixed
    ``total_layer_num`` of 32.
    """
    analysis = MutiLayerAnalysis2(model, include_layers=include_layers, device=model.device)

    encoded_texts = []
    for text in dataset:
        encoded_texts.append(
            tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
        )

    fit_options = dict(
        n_components=n_components,
        window_size=window_size,
        random_state=666,
        preprocessing=preprocessing,
        total_layer_num=32,
        method="canica",
        max_iter=max_iter,
        norm=norm,
        n_iter=n_iter,
    )
    analysis.fit(inputs=encoded_texts, **fit_options)
    return analysis
    

def save_fbn(data, save_dir, data_type, model_name, n_components, alpha):
    """Persist ``data`` under a name built from the run parameters.

    The file is written to
    ``{save_dir}{data_type}_{model_name}_{n_components}_{alpha}{ext}``.
    ``save_dir`` is used as a plain prefix, so it should end with a path
    separator.

    * ``list`` / ``dict`` data is written with ``torch.save`` (``.pth``).
    * anything else is written with ``np.save`` (``.npy``).

    The NumPy branch previously built a ``.pth`` path as well; ``np.save``
    appends ``.npy`` to names lacking it, so the file ended up as
    ``….pth.npy`` while the printed path said ``.pth``.

    Args:
        data: Object to store.
        save_dir: Directory prefix of the output file.
        data_type: Free-form tag placed first in the file name.
        model_name: Model tag placed second in the file name.
        n_components: Component count placed third in the file name.
        alpha: Last tag of the file name.
    """
    is_container = isinstance(data, (list, dict))
    extension = ".pth" if is_container else ".npy"
    save_path = f"{save_dir}{data_type}_{model_name}_{n_components}_{alpha}{extension}"

    # Create the target directory up front so a long computation is not lost
    # to a missing folder at save time.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"save at {save_path}")
    if is_container:
        torch.save(data, save_path)
    else:
        np.save(save_path, data)

def SingleICA(model, include_layers, dataset, n_components=256, preprocessing=True, max_iter=500, norm=True, n_iter=5):
    """Fit a ``SingleLayerAnalysis`` extractor on ``dataset`` and return it.

    Each text in ``dataset`` is tokenised on its own with the notebook-level
    ``tokenizer`` (truncated to 1024 tokens). The encodings are handed to
    ``fit`` with the CanICA method, a fixed seed of 327 and a fixed
    ``total_layer_num`` of 32.

    Args:
        model: Model whose layer outputs are analysed; its ``device`` is
            passed on to the extractor.
        include_layers: Names of the layers to extract.
        dataset: Iterable of raw text strings.
        n_components: Number of ICA components per layer.
        preprocessing: Whether the extractor z-scores the layer outputs
            before decomposition. It is now forwarded to ``fit``; before, it
            was ignored and ``True`` was always used.
        max_iter: Iteration limit for the ICA estimator.
        norm: Forwarded to ``fit``.
        n_iter: Forwarded to ``fit``.

    Returns:
        The fitted extractor.
    """
    extractor = SingleLayerAnalysis(model, include_layers=include_layers, device=model.device)
    inputs_list = [tokenizer(inputs, return_tensors="pt", max_length=1024, truncation=True) for inputs in dataset]
    extractor.fit(inputs=inputs_list, n_components=n_components,
                  random_state=327, preprocessing=preprocessing,
                  total_layer_num=32, method="canica", max_iter=max_iter, norm=norm, n_iter=n_iter)
    return extractor

In [11]:
# Sanity check: number of texts selected for the ICA runs.
len(ica_dataset)

40

In [8]:
%%time
from tqdm import trange
import warnings
warnings.filterwarnings("ignore")

# Tag used in the output file name (this rebinds the hub id assigned when the model was loaded).
model_name = "vicuna-7b-v1.5-muti-layer-wise"
n_components = 128
preprocessing = True
max_iter = 300
norm = False
n_iter = 4

# One ICA run per consecutive block of `sample_num` texts. A failing block is
# reported and skipped so that one bad block does not abort the whole run.
normal_list = []
for i in trange(ica_num):
    try:
        extractor = SingleICA(model, include_layers, ica_dataset[sample_num*i:sample_num*(i+1)],
                              n_components=n_components, preprocessing=preprocessing,
                              max_iter=max_iter, norm=norm, n_iter=n_iter)
        normal = extractor.normal_mixing_
        normal_list.append(normal)
    except Exception as exc:
        # `Exception` rather than a bare `except`, so Ctrl-C / kernel interrupt
        # still stops the loop; the error is also printed instead of being hidden.
        print(f"第{i}次算ICA出现问题，出现问题的样本在[{sample_num*i}:{sample_num*(i+1)}]")
        print(f"    {type(exc).__name__}: {exc}")

if not normal_list:
    raise RuntimeError("every ICA run failed; there is nothing to concatenate or save")

# Stack the components of all successful runs along axis 0 and persist them.
normal_components = np.concatenate(normal_list, axis=0)
save_fbn(normal_components, "./data/FBN/", f"text{len(ica_dataset)}-mlp.act-CanICA-SingleICA-max_iter-{max_iter}", model_name, n_components, f"normal_mixing_std_{preprocessing}")
normal_components.shape

  0%|          | 0/80 [00:00<?, ?it/s]

  3%|▎         | 1/32 [01:24<43:51, 84.87s/it][A
  6%|▋         | 2/32 [02:01<28:06, 56.20s/it][A
  9%|▉         | 3/32 [02:41<23:37, 48.89s/it][A
 12%|█▎        | 4/32 [03:26<22:05, 47.33s/it][A
 16%|█▌        | 5/32 [04:17<21:52, 48.62s/it][A
 19%|█▉        | 6/32 [05:20<23:12, 53.58s/it][A
 22%|██▏       | 7/32 [06:20<23:17, 55.92s/it][A
 25%|██▌       | 8/32 [07:08<21:15, 53.13s/it][A
 28%|██▊       | 9/32 [07:58<20:02, 52.29s/it][A
 31%|███▏      | 10/32 [08:42<18:14, 49.76s/it][A
 34%|███▍      | 11/32 [09:29<17:04, 48.77s/it][A
 38%|███▊      | 12/32 [10:17<16:10, 48.54s/it][A
 41%|████      | 13/32 [11:09<15:40, 49.53s/it][A
 44%|████▍     | 14/32 [12:04<15:25, 51.43s/it][A
 47%|████▋     | 15/32 [12:43<13:28, 47.53s/it][A
 50%|█████     | 16/32 [13:36<13:05, 49.11s/it][A
 53%|█████▎    | 17/32 [14:20<11:53, 47.58s/it][A
 56%|█████▋    | 18/32 [15:03<10:47, 46.22s/it][A
 59%|█████▉    | 19/32 [15:46<09:51, 45.48s/it][A


第11次算ICA出现问题，出现问题的样本在[440:480]


 15%|█▌        | 12/80 [7:00:28<36:12:58, 1917.33s/it]
  0%|          | 0/32 [00:00<?, ?it/s][A
  3%|▎         | 1/32 [00:56<29:26, 56.98s/it][A
  6%|▋         | 2/32 [01:38<23:51, 47.71s/it][A
  9%|▉         | 3/32 [02:19<21:34, 44.63s/it][A
 12%|█▎        | 4/32 [03:04<21:02, 45.09s/it][A
 16%|█▌        | 5/32 [03:59<21:44, 48.33s/it][A
 19%|█▉        | 6/32 [04:40<19:59, 46.15s/it][A
 22%|██▏       | 7/32 [05:25<18:56, 45.48s/it][A
 25%|██▌       | 8/32 [06:19<19:20, 48.36s/it][A
 28%|██▊       | 9/32 [07:08<18:35, 48.48s/it][A
 31%|███▏      | 10/32 [07:56<17:46, 48.49s/it][A
 34%|███▍      | 11/32 [08:40<16:29, 47.10s/it][A
 38%|███▊      | 12/32 [09:31<16:04, 48.24s/it][A
 41%|████      | 13/32 [10:20<15:20, 48.43s/it][A
 44%|████▍     | 14/32 [11:14<15:00, 50.05s/it][A
 47%|████▋     | 15/32 [12:04<14:10, 50.05s/it][A
 50%|█████     | 16/32 [12:53<13:14, 49.68s/it][A
 53%|█████▎    | 17/32 [13:38<12:06, 48.43s/it][A
 56%|█████▋    | 18/32 [14:29<11:28, 49.17s/i

第54次算ICA出现问题，出现问题的样本在[2160:2200]


 69%|██████▉   | 55/80 [28:01:11<10:54:00, 1569.61s/it]
  0%|          | 0/32 [00:00<?, ?it/s][A
  3%|▎         | 1/32 [01:08<35:23, 68.50s/it][A
  6%|▋         | 2/32 [01:40<23:24, 46.81s/it][A
  9%|▉         | 3/32 [02:21<21:30, 44.51s/it][A
 12%|█▎        | 4/32 [03:08<21:07, 45.25s/it][A
 16%|█▌        | 5/32 [03:55<20:40, 45.95s/it][A
 19%|█▉        | 6/32 [04:54<21:49, 50.37s/it][A
 22%|██▏       | 7/32 [05:41<20:28, 49.13s/it][A
 25%|██▌       | 8/32 [06:27<19:20, 48.37s/it][A
 28%|██▊       | 9/32 [07:23<19:25, 50.69s/it][A
 31%|███▏      | 10/32 [08:12<18:20, 50.03s/it][A
 34%|███▍      | 11/32 [08:55<16:46, 47.94s/it][A
 38%|███▊      | 12/32 [09:53<16:59, 50.97s/it][A
 41%|████      | 13/32 [10:41<15:51, 50.06s/it][A
 44%|████▍     | 14/32 [11:27<14:39, 48.85s/it][A
 47%|████▋     | 15/32 [12:09<13:14, 46.75s/it][A
 50%|█████     | 16/32 [12:55<12:28, 46.76s/it][A
 53%|█████▎    | 17/32 [13:37<11:17, 45.19s/it][A
 56%|█████▋    | 18/32 [14:16<10:06, 43.31s/

save at ./data/FBN/text3200-mlp.act-CanICA-SingleICA-max_iter-300_vicuna-7b-v1.5-muti-layer-wise_128_normal_mixing_std_True.npy
CPU times: user 12d 16h 23min 42s, sys: 2h 50min 41s, total: 12d 19h 14min 23s
Wall time: 1d 19h 29min 37s


(9984, 704512)

In [10]:
def cut_par_num(neuron_num_list, hidden_size=4096, intermediate_size=11008,
                num_layers=32, total_par=6738415616):
    """Print how many MLP parameters are removed by neuron pruning.

    Every layer's MLP is counted as ``hidden_size * intermediate_size * 3``
    parameters (presumably the gate/up/down projections — confirm against the
    model). A layer that keeps ``k`` neurons retains ``k * hidden_size * 3``
    of them.

    Args:
        neuron_num_list: Number of neurons kept in each layer, one entry per
            layer.
        hidden_size: Model hidden width. The default is the value that was
            previously hard-coded.
        intermediate_size: MLP width per layer (previously hard-coded).
        num_layers: Layer count used for the MLP total (previously
            hard-coded).
        total_par: Parameter count of the whole unpruned model (previously
            hard-coded).

    Returns:
        None. The statistics are printed only.
    """
    print("total parameters:", total_par)

    mlp_par_per_layer = hidden_size * intermediate_size * 3
    total_mlp = num_layers * mlp_par_per_layer
    print("total mlp parameters:", total_mlp)

    total_cut = 0
    for kept_neurons in neuron_num_list:
        total_cut += mlp_par_per_layer - (kept_neurons * hidden_size * 3)
    print("total cut parameters num:", total_cut)

    print(f"total cut mlp parameters: {total_cut / total_mlp:.4f}")
    print(f"total cut parameters: {total_cut / total_par:.4f}")
    print(f"parameters after cut: {total_par - total_cut:.4f}")

In [13]:
# A feature column is selected if any component exceeds 6.03 in absolute value.
any_mask = np.any(np.abs(normal_components) > 6.03, axis=0).reshape(1, -1)
print(any_mask.sum())

# View the flat selection as (32 layers, 2 signal groups, neurons). The two
# groups are presumably the up_proj and act_fn outputs of a layer — confirm.
mask = any_mask.reshape(32, 2, -1)

# 1 = keep the neuron. The first three and the last two layers stay fully
# kept; in between, a neuron is kept if either of its two groups selected it.
mask_matrix = np.ones((32, 11008))
mask_matrix[3:mask.shape[0] - 2] = np.any(mask[3:mask.shape[0] - 2], axis=1)

kept_per_layer = mask_matrix.sum(axis=1)
print(mask_matrix.sum())
print(kept_per_layer)
cut_par_num(kept_per_layer)
mask_matrix.shape

276714
243438.0
[11008. 11008. 11008.  4776.  5473.  6001.  6571.  7220.  7181.  7296.
  7414.  7467.  7488.  7553.  7594.  7533.  7887.  7633.  7596.  7212.
  7295.  7189.  7310.  7064.  7157.  6829.  6560.  6564.  6256.  6279.
 11008. 11008.]
total parameters: 6738415616
total mlp parameters: 4328521728
total cut parameters num: 1337155584.0
total cut mlp parameters: 0.3089
total cut parameters: 0.1984
parameters after cut: 5401260032.0000


(32, 11008)

In [14]:
from llmfact.pruner.pruner import PrunedLlamaModel
# Build the pruner from the loaded model and the (32, 11008) keep-mask.
pruner = PrunedLlamaModel(model, mask_matrix)
# NOTE(review): `model` is rebound to the result of `fit()`, so every later
# cell sees the pruned model, and re-running this cell would hand the already
# pruned model to the pruner again. Restart the kernel before re-running.
model = pruner.fit()

total parameters before pruned: 6738415616
total parameters after pruned: 5401260032
total cut num: 1337155584
pruned rate: 0.1984


In [2]:
# Scratch check: fraction of the 6738415616 total parameters removed when
# 5444132864 remain.
# NOTE(review): 5444132864 is not produced by any visible cell (the pruning
# run above reports 5401260032 parameters after pruning) — confirm its origin.
(6738415616 - 5444132864) / 6738415616

0.1920752333718918

In [15]:
from lm_eval import evaluator
import lm_eval
# Wrap the in-memory model (pruned in the cell above) for the lm-eval harness.
# The harness logs that other model kwargs may be ignored when `pretrained`
# is a model object rather than a string.
wrapper_model = lm_eval.models.huggingface.HFLM(pretrained=model, trust_remote_code=True)

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


In [16]:
# Zero-shot WikiText perplexity of the wrapped (pruned) model.
# `model_args` is intentionally not passed: it is only used to construct a
# model when `model` is a string, so with a pre-built wrapper it merely put a
# misleading hub id into the call. This also matches the benchmark call below.
results = evaluator.simple_evaluate(
    model=wrapper_model,
    tasks=["wikitext"],
    num_fewshot=0,
    task_manager=lm_eval.tasks.TaskManager(),
    batch_size=1)
results['results']

[Task: wikitext] metric word_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric word_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric byte_perplexity is defined, but aggregation is not. using default aggregation=weighted_perplexity
[Task: wikitext] metric byte_perplexity is defined, but higher_is_better is not. using default higher_is_better=False
[Task: wikitext] metric bits_per_byte is defined, but aggregation is not. using default aggregation=bits_per_byte
[Task: wikitext] metric bits_per_byte is defined, but higher_is_better is not. using default higher_is_better=False
Overwriting default num_fewshot of wikitext from None to 0
100%|██████████| 62/62 [00:00<00:00, 384.27it/s]
  0%|          | 0/62 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5943 > 4096). Running this sequence through t

{'wikitext': {'alias': 'wikitext',
  'word_perplexity,none': 16.724930608476676,
  'word_perplexity_stderr,none': 'N/A',
  'byte_perplexity,none': 1.6934611737796776,
  'byte_perplexity_stderr,none': 'N/A',
  'bits_per_byte,none': 0.7599749103409951,
  'bits_per_byte_stderr,none': 'N/A'}}

In [17]:
# Zero-shot evaluation of the wrapped model on six benchmark tasks.
# NOTE: `results` is rebound here, replacing the WikiText results of the
# previous cell.
results = evaluator.simple_evaluate( 
    model=wrapper_model,
    tasks=["piqa", "hellaswag", "winogrande", "openbookqa", "arc_easy", "arc_challenge"],
    num_fewshot=0,
    task_manager=lm_eval.tasks.TaskManager(),
    batch_size=1)
results['results']

Overwriting default num_fewshot of arc_challenge from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of openbookqa from None to 0
Overwriting default num_fewshot of winogrande from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 1172/1172 [00:01<00:00, 656.53it/s]
100%|██████████| 2376/2376 [00:03<00:00, 615.64it/s]
100%|██████████| 500/500 [00:00<00:00, 1476.18it/s]
100%|██████████| 1267/1267 [00:00<00:00, 48409.78it/s]
100%|██████████| 10042/10042 [00:06<00:00, 1515.06it/s]
100%|██████████| 1838/1838 [00:02<00:00, 621.07it/s]
Running loglikelihood requests: 100%|██████████| 62566/62566 [3:51:22<00:00,  4.51it/s]  


{'arc_challenge': {'alias': 'arc_challenge',
  'acc,none': 0.3779863481228669,
  'acc_stderr,none': 0.014169664520303164,
  'acc_norm,none': 0.3993174061433447,
  'acc_norm_stderr,none': 0.014312094557946681},
 'arc_easy': {'alias': 'arc_easy',
  'acc,none': 0.6822390572390572,
  'acc_stderr,none': 0.009554033064443069,
  'acc_norm,none': 0.6397306397306397,
  'acc_norm_stderr,none': 0.00985100258473252},
 'hellaswag': {'alias': 'hellaswag',
  'acc,none': 0.49352718581955785,
  'acc_stderr,none': 0.004989363276955276,
  'acc_norm,none': 0.657239593706433,
  'acc_norm_stderr,none': 0.0047366216988608345},
 'openbookqa': {'alias': 'openbookqa',
  'acc,none': 0.3,
  'acc_stderr,none': 0.020514426225627987,
  'acc_norm,none': 0.384,
  'acc_norm_stderr,none': 0.021772369465547118},
 'piqa': {'alias': 'piqa',
  'acc,none': 0.7257889009793254,
  'acc_stderr,none': 0.010408618664933516,
  'acc_norm,none': 0.736126224156692,
  'acc_norm_stderr,none': 0.010282996367695665},
 'winogrande': {'alia