In [None]:
import torch
import torch.nn as nn
import time
import psutil
import os
import tracemalloc
from transformers import AutoTokenizer
from modelscope.utils.hub import snapshot_download
from detection.Loader.mymodel_file.gptJ_cloud import gptJ_cloud
from detection.Loader.mymodel_file.gptJ_edge import gptJ_edge
from detection.SVD_model import SVDED_GPTJ_EDGE_Layer
from detection.MINI_PIPE_EVAL import load_and_tokenize_dataset

class PerformanceMonitor:
    """性能监控类，用于记录CPU时间和内存使用情况"""
    def __init__(self):
        self.process = psutil.Process(os.getpid())
        self.reset_stats()
        
    def reset_stats(self):
        """重置统计数据"""
        self.cloud_gpu_times = []
        self.edge_cpu_times = []
        self.network_times = []
        self.memory_snapshots = []
        self.token_count = 0
        self.layer_calls = 0
        
        # 详细的计时统计
        self.cloud_total_time = 0.0
        self.edge_total_time = 0.0
        self.network_total_time = 0.0
        
        # 内存统计
        self.initial_memory = self.get_memory_mb()
        self.peak_memory = self.initial_memory
        
    def get_memory_mb(self):
        """获取当前内存使用量(MB)"""
        return self.process.memory_info().rss / 1024 / 1024
    
    def get_cpu_percent(self):
        """获取CPU使用率"""
        return self.process.cpu_percent()
    
    def start_memory_tracking(self):
        """开始内存跟踪"""
        tracemalloc.start()
        
    def stop_memory_tracking(self):
        """停止内存跟踪并返回统计信息"""
        if tracemalloc.is_tracing():
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            return current / 1024 / 1024, peak / 1024 / 1024  # 转换为MB
        return 0, 0
    
    def record_cloud_time(self, time_taken):
        """记录云端GPU时间"""
        self.cloud_gpu_times.append(time_taken)
        self.cloud_total_time += time_taken
        
    def record_edge_time(self, time_taken):
        """记录边缘CPU时间"""
        self.edge_cpu_times.append(time_taken)
        self.edge_total_time += time_taken
        
    def record_network_time(self, time_taken):
        """记录网络传输时间"""
        self.network_times.append(time_taken)
        self.network_total_time += time_taken
        
    def record_memory_snapshot(self, phase=""):
        """记录内存快照"""
        current_memory = self.get_memory_mb()
        self.memory_snapshots.append({
            'phase': phase,
            'memory_mb': current_memory,
            'timestamp': time.time()
        })
        self.peak_memory = max(self.peak_memory, current_memory)
        
    def increment_counters(self):
        """增加计数器"""
        self.layer_calls += 1
        
    def increment_token_count(self):
        """增加token计数"""
        self.token_count += 1
        
    def print_detailed_report(self):
        """打印详细的性能报告"""
        print(f"\n{'='*70}")
        print(f"🔍 详细性能分析报告")
        print(f"{'='*70}")
        
        # 基本统计
        print(f"📊 基本统计信息:")
        print(f"   🔢 处理的Token数量: {self.token_count}")
        print(f"   🔢 总层调用次数: {self.layer_calls}")
        print(f"   🔢 平均每token层调用: {self.layer_calls/max(1, self.token_count):.1f}")
        
        # 时间统计
        print(f"\n⏱️  时间统计 (总计):")
        print(f"   ☁️  GPU云端总时间: {self.cloud_total_time:.4f}s")
        print(f"   🖥️  CPU边缘总时间: {self.edge_total_time:.4f}s")
        print(f"   🌐 网络传输总时间: {self.network_total_time:.4f}s")
        print(f"   🔄 总处理时间: {self.cloud_total_time + self.edge_total_time + self.network_total_time:.4f}s")
        
        # 平均时间统计
        if self.token_count > 0:
            print(f"\n⏱️  平均每Token时间:")
            print(f"   ☁️  GPU云端平均: {self.cloud_total_time/self.token_count:.4f}s")
            print(f"   🖥️  CPU边缘平均: {self.edge_total_time/self.token_count:.4f}s")
            print(f"   🌐 网络传输平均: {self.network_total_time/self.token_count:.4f}s")
            print(f"   🔄 总平均: {(self.cloud_total_time + self.edge_total_time + self.network_total_time)/self.token_count:.4f}s")
        
        # 内存统计
        current_memory = self.get_memory_mb()
        memory_diff = current_memory - self.initial_memory
        print(f"\n💾 内存使用统计:")
        print(f"   📈 初始内存: {self.initial_memory:.2f}MB")
        print(f"   📊 当前内存: {current_memory:.2f}MB")
        print(f"   📈 峰值内存: {self.peak_memory:.2f}MB")
        print(f"   📊 内存变化: {memory_diff:+.2f}MB")
        
        # CPU使用率
        cpu_percent = self.get_cpu_percent()
        print(f"   🔥 当前CPU使用率: {cpu_percent:.1f}%")
        
        # 获取内存跟踪信息
        if hasattr(self, '_tracemalloc_peak'):
            print(f"   🔍 内存跟踪峰值: {self._tracemalloc_peak:.2f}MB")
        
        # 时间分布分析
        if len(self.cloud_gpu_times) > 0:
            print(f"\n📈 GPU时间分布:")
            print(f"   最小: {min(self.cloud_gpu_times):.4f}s")
            print(f"   最大: {max(self.cloud_gpu_times):.4f}s")
            print(f"   平均: {sum(self.cloud_gpu_times)/len(self.cloud_gpu_times):.4f}s")
            
        if len(self.edge_cpu_times) > 0:
            print(f"\n📈 CPU时间分布:")
            print(f"   最小: {min(self.edge_cpu_times):.4f}s")
            print(f"   最大: {max(self.edge_cpu_times):.4f}s")
            print(f"   平均: {sum(self.edge_cpu_times)/len(self.edge_cpu_times):.4f}s")
        
        # 性能比较
        if self.cloud_total_time > 0 and self.edge_total_time > 0:
            ratio = self.edge_total_time / self.cloud_total_time
            print(f"\n🔍 性能比较:")
            print(f"   CPU/GPU时间比: {ratio:.2f}x")
            if ratio > 1:
                print(f"   💡 CPU比GPU慢 {ratio:.1f} 倍")
            else:
                print(f"   💡 CPU比GPU快 {1/ratio:.1f} 倍")
        
        print(f"{'='*70}")
        
    def print_memory_timeline(self):
        """打印内存使用时间线"""
        if len(self.memory_snapshots) > 0:
            print(f"\n📊 内存使用时间线:")
            for i, snapshot in enumerate(self.memory_snapshots):
                print(f"   {i+1}. {snapshot['phase']}: {snapshot['memory_mb']:.2f}MB")
                
    def get_summary_stats(self):
        """返回摘要统计信息"""
        return {
            'cloud_total_time': self.cloud_total_time,
            'edge_total_time': self.edge_total_time,
            'network_total_time': self.network_total_time,
            'token_count': self.token_count,
            'layer_calls': self.layer_calls,
            'memory_usage_mb': self.get_memory_mb(),
            'memory_peak_mb': self.peak_memory,
            'memory_diff_mb': self.get_memory_mb() - self.initial_memory
        }

class SVD_GPTJ_Edge_Model(nn.Module):
    """包含所有SVD层的完整edge模型，兼容原始edge模型接口"""
    def __init__(self, original_edge, svd_reduce_rate, device='cpu', svd_device='cpu',No_init=False):
        super().__init__()
        self.device = device
        self.svd_device = svd_device
        self.num_layers = original_edge.num_layers
        self.max_ctx = original_edge.max_ctx
        self.v_cache = [None] * self.num_layers
        
        print(f"🔄 开始SVD分解处理，压缩率: {svd_reduce_rate}")
        print(f"📊 总共需要处理 {self.num_layers} 层...")
        print(f"⚡ SVD分解设备: {svd_device}, 运行设备: {device}")
        
        # 用SVD压缩的层替换原始edge层
        self.svd_layers = nn.ModuleList()
        if(not No_init):
            for i in range(self.num_layers):
                print(f"  处理第 {i+1}/{self.num_layers} 层: ", end="")
                original_edge_layer = original_edge.layers[i]
                    # 奇数层跳过压缩
                if isinstance(svd_reduce_rate, list):

                    svd_layer = SVDED_GPTJ_EDGE_Layer(
                        gptj_edge_layer=original_edge_layer,
                        reduce_rate=svd_reduce_rate[i],
                        device=device,
                        svd_device=svd_device
                    )
                else:
                    svd_layer = SVDED_GPTJ_EDGE_Layer(
                        gptj_edge_layer=original_edge_layer,
                        reduce_rate=svd_reduce_rate,
                        device=device,
                        svd_device=svd_device
                    )
                print("跳过压缩 (奇数层)")
                self.svd_layers.append(svd_layer)
        
        print(f"🎉 所有层的SVD分解处理完成！")
    
    def forward_no_cache(self,x,layer_idx,attn_weights):
        output=self.svd_layers[layer_idx].forward_no_cache(
            x,  attn_weights
        )
        return output
    
    def forward_cache(self, x, layer_idx, attn_weights):
        """
        兼容原始edge模型的forward_cache接口
        Args:
            x: 输入tensor
            layer_idx: 层索引
            attn_weights: 注意力权重
        Returns:
            tuple: (v_cache, output_x) - 与原始edge模型相同的返回格式
        """
        # 使用SVD压缩的层进行前向传播
        # tim1=time.time()
        self.v_cache[layer_idx], output_x = self.svd_layers[layer_idx].forward_cache(
            x, self.v_cache[layer_idx], attn_weights
        )
        # tim2=time.time()
        # print(f"layer_{layer_idx}_forward_time:",tim2-tim1)
        
        # 应用sliding window到缓存
        if self.v_cache[layer_idx] is not None and self.v_cache[layer_idx].size(1) > self.max_ctx:
            self.v_cache[layer_idx] = self.v_cache[layer_idx][:, -self.max_ctx:, :]
        # tim3=time.time()
        # print(f"layer_{layer_idx}_memory_time:",tim3-tim2)

        return self.v_cache[layer_idx], output_x

class GPTJPipeline(nn.Module):
    def __init__(self, model_name='AI-ModelScope/gpt-j-6b', device_cloud='cuda:0', device_edge='cpu', svd_reduce_rate=0.5, use_compile=True,edge=None):
        super(GPTJPipeline, self).__init__()
        print(f"🚀 初始化GPTJPipeline...")
        print(f"📋 配置信息:")
        print(f"   - 模型: {model_name}")
        print(f"   - 云端设备: {device_cloud}")
        print(f"   - 边缘设备: {device_edge}")
        print(f"   - SVD压缩率: {svd_reduce_rate}")
        
        # 初始化性能监控器
        self.performance_monitor = PerformanceMonitor()
        
        # 使用 ModelScope 下载模型
        print(f"📥 使用ModelScope下载模型 {model_name}...")
        model_dir = snapshot_download(
            repo_id=model_name,
            cache_dir='./gpt-j-6b'
        )
        print(f"✅ 模型下载完成，路径: {model_dir}")
        
        # 使用本地模型路径加载 tokenizer
        print(f"🔤 加载tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        
        # 设置 pad_token 为 eos_token（GPT-J 没有 pad_token）
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        print(f"☁️  加载云端模型到 {device_cloud}...")
        self.cloud = gptJ_cloud(model_name=model_dir).to(device_cloud)
        print(f"🖥️  加载边缘模型到CPU...")
        # 强制 edge 放在 CPU
        original_edge = gptJ_edge(model_name=model_dir).to('cpu')
        self.embed = self.cloud.model.transformer.wte
        self.ln_f = self.cloud.model.transformer.ln_f
        self.lm_head = self.cloud.model.lm_head
        self.num_layers = len(self.cloud.q_weights)
        
        print(f"📊 模型共有 {self.num_layers} 层")
        
        # SVD压缩参数
        self.svd_reduce_rate = svd_reduce_rate
        self.use_compile = use_compile
        
        # 创建整个SVD edge模型
        print(f"🔧 创建SVD边缘模型...")
        # 如果有GPU，先在GPU上进行SVD分解，然后移到CPU
        svd_device = device_cloud if torch.cuda.is_available() else 'cpu'
        print(f"🔧 SVD分解将在 {svd_device} 上进行...")
        
        if(svd_reduce_rate!=-1):
            self.edge = SVD_GPTJ_Edge_Model(
                original_edge=original_edge,
                svd_reduce_rate=svd_reduce_rate,
                device='cpu',  # 最终运行在CPU上
                svd_device=svd_device  # 但SVD分解在GPU上进行
            )
        else:
            self.edge=self.edge = SVD_GPTJ_Edge_Model(
                original_edge=original_edge,
                svd_reduce_rate=svd_reduce_rate,
                device='cpu',  # 最终运行在CPU上
                svd_device=svd_device,  # 但SVD分解在GPU上进行
                No_init=True
            )
        
        print(f"✅ GPTJPipeline初始化完成！")
        print(f"🎯 准备开始推理，SVD压缩率: {self.svd_reduce_rate}")
    



    # def forward(self, input_ids, attention_mask=None):
    #     """
    #     Forward pass with RoPE and causal mask support,
    #     using cloud/edge split with caching.
    #     Args:
    #         input_ids (LongTensor): [B, T] input token IDs.
    #         attention_mask (BoolTensor, optional): [B, T] mask for valid tokens.
    #     Returns:
    #         Tensor: logits of shape [B, T, vocab_size]
    #     """
    #     # Reset caches for a fresh generation or evaluation
    #     for i in range(self.num_layers):
    #         self.cloud.k_cache[i] = None
    #         self.edge.v_cache[i] = None

    #     B, T = input_ids.size()
    #     device = input_ids.device

    #     # Token embeddings (GPT-J uses RoPE internally, no explicit positional embedding)
    #     x = self.embed(input_ids)  # [B, T, D]

    #     # Prepare attention mask: combine causal mask with token mask
    #     if attention_mask is None:
    #         attention_mask = torch.ones((B, T), device=device)
    #     # Expand token mask to [B, 1, T] for broadcasting
    #     token_mask = attention_mask.unsqueeze(1)  # [B, 1, T]
    #     # Causal mask [1, T, T]
    #     causal = torch.tril(torch.ones((T, T), device=device)).unsqueeze(0)
    #     # Final mask [B, T, T]: broadcast token_mask along query dimension
    #     attn_mask = token_mask * causal  # [B, T, T]

    #     # Cloud-Edge split forward
    #     for layer_idx in range(self.num_layers):
    #         # Move hidden and mask to cloud device
    #         x_cloud = x.to(self.cloud.device)
    #         mask_cloud = attn_mask.to(self.cloud.device)
    #         # cloud forward: returns (hidden, kv_cache, attn_weights)
    #         x_cloud, _, attn_w = self.cloud.forward_cache(
    #             x_cloud, layer_idx
    #         )
    #         # Move to edge for remaining computation
    #         x_edge = x_cloud.to(self.edge.device)
    #         attn_edge = attn_w.to(self.edge.device)
    #         x_edge, _ = self.edge.forward_cache(
    #             x_edge, layer_idx, attn_weights=attn_edge
    #         )
    #         # Bring back to main device
    #         x = x_edge.to(device)

    #     # Final normalization and LM head
    #     x = self.ln_f(x)           # [B, T, D]
    #     logits = self.lm_head(x)   # [B, T, vocab_size]
    #     return logits


    def forward(self, input_ids):
        # 1. 生成 padding mask: pad_token_id 位置为 0，其它为 1
        #    假设 self.config.pad_token_id 已经被设置
        attention_mask = (input_ids != self.tokenizer.pad_token_id).long()  # [B, T]

        # Reset caches for a fresh generation
        for i in range(self.num_layers):
            self.cloud.k_cache[i] = None
            self.edge.v_cache[i] = None

        # Statistics variables
        cloud_time = edge_time = net_time = 0.0
        layer_calls = 0
        bandwidth = 10  # MB/s

        # Embedding
        x = self.embed(input_ids)  # [B, T, D]

        # 层级迭代
        for layer_idx in range(self.num_layers):
            # Cloud forward：传入 attention_mask，用于内部做 pad+causal 屏蔽
            if hasattr(torch.cuda, 'synchronize'):
                torch.cuda.synchronize()
            t0 = time.time()
            _, _, attn_weights = self.cloud.forward_cache(x, layer_idx, attention_mask)
            if hasattr(torch.cuda, 'synchronize'):
                torch.cuda.synchronize()
            cloud_time += time.time() - t0

            # Edge forward（保持不变）
            x_cpu = x.to('cuda:0')
            attn_cpu = attn_weights.to('cuda:0')
            t1 = time.time()
            _, x_cpu = self.edge.forward_cache(x_cpu, layer_idx, attn_cpu)
            edge_time += time.time() - t1

            # 网络开销估算
            elements = attn_cpu.numel() * attn_cpu.element_size()
            net_time += elements / bandwidth / 1024 / 1024
            x = x_cpu.to(self.embed.weight.device)
            elements = x.numel() * x.element_size()
            net_time += elements / bandwidth / 1024 / 1024

            layer_calls += 1

        # Final normalization and LM head to get logits
        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits


    # def forward (self, input_ids):
    #     outputs = input_ids

    #     # Reset caches for a fresh generation
    #     for i in range (self.num_layers):
    #         self.cloud.k_cache [i] = None
    #         self.edge.v_cache [i] = None

    #     # Statistics variables
    #     cloud_time = 0.0
    #     edge_time = 0.0
    #     layer_calls = 0
    #     net_time = 0.0
    #     bandwidth = 10  # MB/s

    #     # Context window size
    #     max_ctx = self.cloud.max_ctx
    #     x = self.embed(outputs)
    #     # Process the input sequence step by step for causal language modeling
    #     # for token_idx in range (outputs.size(1) - 1):  # Exclude the last token for target generation
    #         # For each token in the sequence, we use the preceding tokens for input
    #         # cur_input = outputs [:, token_idx].unsqueeze(-1)  # Take current token as input
            

    #     for layer_idx in range (self.num_layers):
    #         # Use cache-enabled forward so attention spans all previous tokens
    #         if hasattr (torch.cuda, 'synchronize'):
    #             torch.cuda.synchronize()
    #         t0 = time.time()
    #         _, _, attn_weights = self.cloud.forward_cache (x, layer_idx)
    #         if hasattr (torch.cuda, 'synchronize'):
    #             torch.cuda.synchronize()
    #         cloud_time += time.time() - t0

    #         x_cpu = x.to('cuda:0')
    #         attn_cpu = attn_weights.to('cuda:0')
    #         t1 = time.time()
    #         _, x_cpu = self.edge.forward_cache (x_cpu, layer_idx, attn_cpu)
    #         edge_time += time.time() - t1

    #         elements = attn_cpu.numel() * attn_cpu.element_size ()  # B
    #         net_time += elements / bandwidth / 1024 / 1024
    #         x = x_cpu.to(self.embed.weight.device)
    #         elements = x.numel() * x.element_size ()  # B
    #         net_time += elements / bandwidth / 1024 / 1024
    #         layer_calls += 1

    #     # Final normalization and LM head to get logits
    #     x = self.ln_f (x)
    #     logits = self.lm_head (x)

    #     return logits

    def generate(self, prompt, max_length=50, temperature=1.0, top_k=50):
        """
        调用 forward 方法生成文本
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')[0].tolist()

        # 开始生成文本
        outputs = input_ids.copy()

        for token_idx in range(max_length):
            # 当前token输入到模型
            cur_input = torch.tensor([[outputs[-1]]]).to(self.embed.weight.device)
            logits = self.forward(cur_input)  # 调用forward方法

            # 使用 top-k + 温度采样代替贪心采样
            next_logits = logits[:, -1, :] / temperature
            topk_vals, topk_idx = torch.topk(next_logits, k=top_k, dim=-1)
            probs = torch.softmax(topk_vals, dim=-1)
            next_id = topk_idx[0, torch.multinomial(probs, num_samples=1).item()].item()
            
            outputs.append(next_id)
            
            # 如果遇到结束符，提前停止
            if next_id == self.tokenizer.eos_token_id:
                print(f"  遇到结束符，提前结束生成")
                break

        return self.tokenizer.decode(outputs, clean_up_tokenization_spaces=True)




Using /root/.cache/torch_extensions/py311_cu126 as PyTorch extensions root...
Emitting ninja build file /root/.cache/torch_extensions/py311_cu126/fused_svd_matmul/build.ninja...
Building extension module fused_svd_matmul...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module fused_svd_matmul...


ninja: no work to do.


In [None]:


# 基本配置
model_name    = 'AI-ModelScope/gpt-j-6b'
device_cloud  = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device_edge   = 'cuda:0'
svd_device    = device_cloud if torch.cuda.is_available() else 'cpu'
rates         = [round(i * 0.1, 1) for i in range(0,10)]   # [0.0, 0.1, …, 0.9]

# 1. 下载并加载原始 edge 模型
# original_edge = gptJ_edge(model_name=model_name).to(device_edge)
num_layers    = 28

# del original_edge

pipeline = GPTJPipeline(
    model_name=model_name,
    device_cloud=device_cloud,
    device_edge=device_edge,
    svd_reduce_rate=-1,  # 占位，无实际用到
    
)

scheme=[0.0 for _ in range(num_layers)]


🚀 初始化GPTJPipeline...
📋 配置信息:
   - 模型: AI-ModelScope/gpt-j-6b
   - 云端设备: cuda:0
   - 边缘设备: cuda:0
   - SVD压缩率: -1
📥 使用ModelScope下载模型 AI-ModelScope/gpt-j-6b...
Downloading Model from https://www.modelscope.cn to directory: ./gpt-j-6b/AI-ModelScope/gpt-j-6b




✅ 模型下载完成，路径: ./gpt-j-6b/AI-ModelScope/gpt-j-6b
🔤 加载tokenizer...
☁️  加载云端模型到 cuda:0...


Some weights of the model checkpoint at ./gpt-j-6b/AI-ModelScope/gpt-j-6b were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transforme

🖥️  加载边缘模型到CPU...


Some weights of the model checkpoint at ./gpt-j-6b/AI-ModelScope/gpt-j-6b were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transforme

📊 模型共有 28 层
🔧 创建SVD边缘模型...
🔧 SVD分解将在 cuda:0 上进行...
🔄 开始SVD分解处理，压缩率: -1
📊 总共需要处理 28 层...
⚡ SVD分解设备: cuda:0, 运行设备: cpu
🎉 所有层的SVD分解处理完成！
✅ GPTJPipeline初始化完成！
🎯 准备开始推理，SVD压缩率: -1


In [3]:
evaluation_results = {}

dataloader=load_and_tokenize_dataset("./minipile_cache",pipeline.tokenizer,1)

import pickle
with torch.no_grad():
    # 4.1 构建仅替换 svd_layers 的 edge 模型
    edge_model = pipeline.edge  # 直接复用对象
    temp = nn.ModuleList()

    # 加载模型缓存并添加到 svd_layers
    for i, rate in enumerate(scheme):
        cache_path = f"svd_models/svd_layer_{i}_rate_{rate}.pt"
        print(f"正在加载：{cache_path}")
        mod = torch.load(cache_path, map_location='cuda:0', weights_only=False)
        temp.append(mod)

    edge_model.svd_layers = temp
    edge_model.v_cache = [None] * num_layers
    
    # 4.2 将 pipeline.edge 指向新模型并评估
    pipeline.edge = edge_model.to('cuda:0')
    pipeline.cloud = pipeline.cloud.to('cuda:0')



Using the latest cached version of the dataset since JeanKaddour/minipile couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at minipile_cache/JeanKaddour___minipile/default/0.0.0/18ad1b0c701eaa0de03d3cecfdd769cbc70ffbd0 (last modified on Tue Jul 15 14:28:44 2025).


正在加载：svd_models/svd_layer_0_rate_0.0.pt
正在加载：svd_models/svd_layer_1_rate_0.0.pt
正在加载：svd_models/svd_layer_2_rate_0.0.pt
正在加载：svd_models/svd_layer_3_rate_0.0.pt
正在加载：svd_models/svd_layer_4_rate_0.0.pt
正在加载：svd_models/svd_layer_5_rate_0.0.pt
正在加载：svd_models/svd_layer_6_rate_0.0.pt
正在加载：svd_models/svd_layer_7_rate_0.0.pt
正在加载：svd_models/svd_layer_8_rate_0.0.pt
正在加载：svd_models/svd_layer_9_rate_0.0.pt
正在加载：svd_models/svd_layer_10_rate_0.0.pt
正在加载：svd_models/svd_layer_11_rate_0.0.pt
正在加载：svd_models/svd_layer_12_rate_0.0.pt
正在加载：svd_models/svd_layer_13_rate_0.0.pt
正在加载：svd_models/svd_layer_14_rate_0.0.pt
正在加载：svd_models/svd_layer_15_rate_0.0.pt
正在加载：svd_models/svd_layer_16_rate_0.0.pt
正在加载：svd_models/svd_layer_17_rate_0.0.pt
正在加载：svd_models/svd_layer_18_rate_0.0.pt
正在加载：svd_models/svd_layer_19_rate_0.0.pt
正在加载：svd_models/svd_layer_20_rate_0.0.pt
正在加载：svd_models/svd_layer_21_rate_0.0.pt
正在加载：svd_models/svd_layer_22_rate_0.0.pt
正在加载：svd_models/svd_layer_23_rate_0.0.pt
正在加载：svd_models/svd_layer_

In [None]:
from tqdm import tqdm
import math


def evaluate_minipile_gptj(model, batch_size: int = 1, cache_dir: str = "./minipile_cache", Dataloader=None) -> dict:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Load and tokenize dataset
    tokenizer = model.tokenizer  # already initialized in the pipeline
    dataloader = None
    if Dataloader is None:
        dataloader = load_and_tokenize_dataset(cache_dir, tokenizer, batch_size)
    else:
        dataloader = Dataloader

    # Initialize loss function
    criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=-100)

    # Evaluation loop
    total_loss = 0.0
    total_batches = 0

    # model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # 拿到完整的 input_ids, attention_mask, 和已经被 collator 设好 -100 的 labels
            input_ids    = batch['input_ids'].to(device)       # [B, T]
            attention_mask = batch['attention_mask'].to(device)# [B, T]
            labels       = batch['labels'].to(device)          # [B, T], pad 已经是 -100

            # 打印一下数据形状和内容以便调试
            print(f"输入形状: input_ids={input_ids.shape}, attention_mask={attention_mask.shape}, labels={labels.shape}")
            print(f"第一个样本前10个token - input_ids: {input_ids[0, :10]}")
            print(f"第一个样本前10个label: {labels[0, :10]}")
            
            with torch.no_grad():
                # 传递attention_mask给forward函数
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits  = outputs                     # [B, T, V]

            # 手动 shift：logits 丢掉最后一位，labels 丢掉第一位
            shift_logits = logits[:, :-1, :].contiguous()    # [B, T-1, V]
            shift_labels = labels[:, 1:].contiguous()        # [B, T-1]

            # 计算交叉熵 loss，ignore_index=-100 会跳过所有 pad 位置
            loss = criterion(
                shift_logits.view(-1, shift_logits.size(-1)),  # [(B*(T-1)), V]
                shift_labels.view(-1)                          # [(B*(T-1))]
            )
            print(f"当前batch loss: {loss.item()}")
            total_loss   += loss.item()
            total_batches+= 1
            
            # 只处理前几个batch进行调试
            if total_batches >= 3:
                break

        avg_loss = total_loss / total_batches
        perplexity = math.exp(avg_loss)

    return {"avg_loss": avg_loss, "perplexity": perplexity}

eval_result = evaluate_minipile_gptj(pipeline, batch_size=1, Dataloader=dataloader)

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

Evaluating:   7%|▋         | 9/125 [00:04<01:02,  1.85it/s]


KeyboardInterrupt: 

In [6]:
import importlib
importlib.reload(GPTJPipeline)

ImportError: module GPTJPipeline not in sys.modules