In [1]:
import torch
import time
import os
from transformers import AutoTokenizer
from modelscope.utils.hub import snapshot_download
from detection.Loader.mymodel_file.gptJ_cloud import gptJ_cloud
from detection.Loader.mymodel_file.gptJ_edge import gptJ_edge

from transformers import AutoModelForCausalLM,AutoTokenizer
from  datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from modelscope.utils.hub import snapshot_download
import torch

class GPTJPipeline:
    def __init__(self, model_name='AI-ModelScope/gpt-j-6b', device_cloud='cuda:0', device_edge='cpu'):
        # 使用 ModelScope 下载模型
        print(f"Downloading model {model_name} using ModelScope...")
        model_dir = snapshot_download(
            repo_id=model_name,
            cache_dir='./gpt-j-6b'
        )
        print(f"Model downloaded to: {model_dir}")
        
        # 使用本地模型路径加载 tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        
        # 设置 pad_token 为 eos_token（GPT-J 没有 pad_token）
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        
        self.cloud = gptJ_cloud(model_name=model_dir).to(device_cloud)
        # 强制 edge 放在 CPU
        self.edge = gptJ_edge(model_name=model_dir).to('cpu')
        
        # 获取 embedding 和输出层
        self.embed = self.cloud.model.transformer.wte
        self.ln_f = self.cloud.model.transformer.ln_f
        self.lm_head = self.cloud.model.lm_head
        self.num_layers = len(self.cloud.q_weights)
    def forward(self,prompt):
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')[0].tolist()
        outputs = input_ids.copy()
        # cur_id = torch.tensor([[outputs[-1]]]).to(self.embed.weight.device)
        x = self.embed(torch.tensor(outputs).to('cuda:0'))
        # x = self.embed(torch.tensor(outputs).to(self.embed.weight.device))
        
        for layer_idx in range(self.num_layers):
            # use cache-enabled forward so attention spans all previous tokens
            if hasattr(torch.cuda, 'synchronize'):
                torch.cuda.synchronize()
            t0 = time.time()
            _, _, attn_weights = self.cloud.forward_cache(x, layer_idx)
            if hasattr(torch.cuda, 'synchronize'):
                torch.cuda.synchronize()
            # cloud_time += time.time() - t0

            x_cpu = x.to('cpu')
            attn_cpu = attn_weights.to('cpu')
            # t1 = time.time()
            _, x_cpu = self.edge.forward_cache(x_cpu, layer_idx, attn_cpu)
            # edge_time += time.time() - t1
            
            elements = attn_cpu.numel() * attn_cpu.element_size()  # B
            # net_time += elements / bandwidth / 1024 / 1024
            x = x_cpu.to(self.embed.weight.device)
            elements = x.numel() * x.element_size()  # B
            # net_time += elements / bandwidth / 1024 / 1024
            # layer_calls += 1
            
        # final normalization and LM head to get logits
        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits

    def generate(self, prompt, max_length=50, temperature=1.0, top_k=50):
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')[0].tolist()
        outputs = input_ids.copy()

        # reset caches for a fresh generation
        for i in range(self.num_layers):
            self.cloud.k_cache[i] = None
            self.edge.v_cache[i] = None

        # 统计变量
        cloud_time = 0.0
        edge_time = 0.0
        layer_calls = 0
        net_time = 0.0
        bandwidth = 10  # MB/s

        # 上下文窗口大小
        max_ctx = self.cloud.max_ctx

        # 预热缓存：将 prompt 中每个 token 走一次 forward_cache
        for pos, token_id in enumerate(input_ids):
            # clamp 位置，防止越界
            pos_clamped = pos if pos < max_ctx else max_ctx - 1
            cur_id = torch.tensor([[token_id]]).to(self.embed.weight.device)
            
            # GPT-J 没有位置embedding，直接使用 token embedding
            x = self.embed(cur_id)
            
            for layer_idx in range(self.num_layers):
                # cloud on GPU
                if hasattr(torch.cuda, 'synchronize'):
                    torch.cuda.synchronize()
                t0 = time.time()
                _, _, attn_weights = self.cloud.forward_cache(x, layer_idx)
                if hasattr(torch.cuda, 'synchronize'):
                    torch.cuda.synchronize()
                cloud_time += time.time() - t0
                
                # edge on CPU: 把 x 和 attn_weights 都搬到 cpu
                x_cpu = x.to('cpu')
                attn_cpu = attn_weights.to('cpu')
                elements = attn_cpu.numel() * attn_cpu.element_size()  # B
                net_time += elements / bandwidth / 1024 / 1024  # s
                
                t1 = time.time()
                _, x_cpu = self.edge.forward_cache(x_cpu, layer_idx, attn_cpu)
                edge_time += time.time() - t1
                print(f"edge_time_{layer_idx}:",time.time()-t1)
                # 回到 GPU 继续下一层
                x = x_cpu.to(self.embed.weight.device)
                elements = x.numel() * x.element_size()  # B
                net_time += elements / bandwidth / 1024 / 1024
                layer_calls += 1

        # 真实生成阶段
        for _ in range(max_length):
            cur_id = torch.tensor([[outputs[-1]]]).to(self.embed.weight.device)
            x = self.embed(cur_id)
            # x = self.embed(torch.tensor(outputs).to(self.embed.weight.device))
            
            for layer_idx in range(self.num_layers):
                # use cache-enabled forward so attention spans all previous tokens
                if hasattr(torch.cuda, 'synchronize'):
                    torch.cuda.synchronize()
                t0 = time.time()
                _, _, attn_weights = self.cloud.forward_cache(x, layer_idx)
                if hasattr(torch.cuda, 'synchronize'):
                    torch.cuda.synchronize()
                cloud_time += time.time() - t0

                x_cpu = x.to('cpu')
                attn_cpu = attn_weights.to('cpu')
                t1 = time.time()
                _, x_cpu = self.edge.forward_cache(x_cpu, layer_idx, attn_cpu)
                edge_time += time.time() - t1
                
                elements = attn_cpu.numel() * attn_cpu.element_size()  # B
                net_time += elements / bandwidth / 1024 / 1024
                x = x_cpu.to(self.embed.weight.device)
                elements = x.numel() * x.element_size()  # B
                net_time += elements / bandwidth / 1024 / 1024
                layer_calls += 1
                
            # final normalization and LM head to get logits
            x = self.ln_f(x)
            logits = self.lm_head(x)
            
            # 用 top-k + 温度采样代替贪心 argmax
            next_logits = logits[:, -1, :] / temperature
            topk_vals, topk_idx = torch.topk(next_logits, k=top_k, dim=-1)
            probs = torch.softmax(topk_vals, dim=-1)
            next_id = topk_idx[0, torch.multinomial(probs, num_samples=1).item()].item()
            outputs.append(next_id)
            
            if next_id == self.tokenizer.eos_token_id:
                break

        # 打印平均耗时
        if layer_calls > 0:
            print(f"Avg GPU(cloud) per-layer: {cloud_time/layer_calls:.4f}s, CPU(edge) per-layer: {edge_time/layer_calls:.4f}s, net: {net_time/layer_calls:.4f}s")
            print(f"Avg GPU(cloud) per-token: {cloud_time/layer_calls+net_time/layer_calls:.4f}s, CPU(edge) per-token: {edge_time/layer_calls:.4f}s")
            
        return self.tokenizer.decode(outputs, clean_up_tokenization_spaces=True)





In [2]:
model_path = snapshot_download(
            repo_id='AI-ModelScope/gpt-j-6b',
            cache_dir='./gpt-j-6b'
        )
tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            local_files_only=True,
            trust_remote_code=True
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
from torch import nn



Downloading Model from https://www.modelscope.cn to directory: ./gpt-j-6b/AI-ModelScope/gpt-j-6b


In [3]:

def load_and_tokenize_dataset(cache_dir: str, tokenizer, batch_size: int = 1):

    ds = load_dataset("JeanKaddour/minipile", split="validation", cache_dir=cache_dir)

    # Tokenize dataset
    def tokenize_fn(examples):
        return tokenizer(examples['text'], padding=True, truncation=True, max_length=512)
    
    tokenized = ds.map(tokenize_fn, batched=True, remove_columns=["text"])

    # Group the dataset into blocks of model_max_length
    block_size = tokenizer.model_max_length
    def group_texts(examples):
        all_ids = sum(examples["input_ids"], [])
        total_len = (len(all_ids) // block_size) * block_size
        blocks = [all_ids[i:i + block_size] for i in range(0, total_len, block_size)]
        return {"input_ids": blocks}

    lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=["attention_mask"])

    # DataLoader setup
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    dataloader = DataLoader(lm_dataset, batch_size=batch_size, collate_fn=data_collator)

    return dataloader


dataloader=load_and_tokenize_dataset("./minipile_cache",tokenizer,1)

Using the latest cached version of the dataset since JeanKaddour/minipile couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at minipile_cache/JeanKaddour___minipile/default/0.0.0/18ad1b0c701eaa0de03d3cecfdd769cbc70ffbd0 (last modified on Tue Jul 15 14:28:44 2025).


In [4]:
model_name = 'AI-ModelScope/gpt-j-6b'
device_cloud = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device_edge = 'cuda:0'

pipeline = GPTJPipeline(model_name=model_name, device_cloud=device_cloud, device_edge=device_edge)


Downloading model AI-ModelScope/gpt-j-6b using ModelScope...




Downloading Model from https://www.modelscope.cn to directory: ./gpt-j-6b/AI-ModelScope/gpt-j-6b
Model downloaded to: ./gpt-j-6b/AI-ModelScope/gpt-j-6b


Some weights of the model checkpoint at ./gpt-j-6b/AI-ModelScope/gpt-j-6b were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transforme

In [5]:
prompt = "Once upon a time"
generated_text = pipeline.generate(prompt, max_length=50)
print(generated_text)
prompt = "China is a"
generated_text = pipeline.generate(prompt, max_length=50)
print(generated_text)

edge_time_0: 0.00906991958618164
edge_time_1: 0.011162281036376953
edge_time_2: 0.0074765682220458984
edge_time_3: 0.007546663284301758
edge_time_4: 0.007815122604370117
edge_time_5: 0.007494926452636719
edge_time_6: 0.0074841976165771484
edge_time_7: 0.008049488067626953
edge_time_8: 0.01416635513305664
edge_time_9: 0.0077266693115234375
edge_time_10: 0.009167194366455078
edge_time_11: 0.00717926025390625
edge_time_12: 0.00768589973449707
edge_time_13: 0.011844635009765625
edge_time_14: 0.00760197639465332
edge_time_15: 0.007195949554443359
edge_time_16: 0.0072896480560302734
edge_time_17: 0.0076525211334228516
edge_time_18: 0.007966279983520508
edge_time_19: 0.007086753845214844
edge_time_20: 0.007016658782958984
edge_time_21: 0.007021665573120117
edge_time_22: 0.007078647613525391
edge_time_23: 0.011701822280883789
edge_time_24: 0.007598161697387695
edge_time_25: 0.008166790008544922
edge_time_26: 0.007725238800048828
edge_time_27: 0.0072460174560546875
edge_time_0: 0.00766158103942

In [6]:
logits=pipeline.forward(prompt)
print(logits.shape)
predicted_ids = torch.argmax(logits, dim=-1)
predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_ids[0])  # 获取第一批次的词索引并转换为词

# 3. 输出生成的文本
generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
print(generated_text)

RuntimeError: Tensors must have same number of dimensions: got 3 and 2

In [None]:
from tqdm import tqdm
device='cuda:0'    # Evaluation loop
total_loss = 0.0
total_batches = 0
torch.cuda.empty_cache()
torch.cuda.synchronize()
# model=model.to(device)
for batch in tqdm(dataloader, desc="Evaluating"):
    input_ids = batch['input_ids'].to(device)
    print("yes")
    # target_ids = input_ids[:, 1:].contiguous()  # 去掉第一个token，生成目标序列
    target_ids=batch['labels'].to(device)
    print(batch)
    attn_mask=batch['attention_mask'].to(device)
    # input_ids = input_ids[:, :-1]  # 去掉最后一个token，作为模型的输入# For causal language modeling, the target is the input itself
    # Forward pass
    with torch.no_grad():
        # outputs = model(input_ids=input_ids,attention_mask=attn_mask)
        otuputs=model.forward(input_ids)
        logits = outputs.logits  
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    # Compute loss (CrossEntropyLoss expects target to be in shape [batch_size, seq_len])
    # We need to flatten logits and targets to compute cross-entropy loss correctly
    loss = criterion(logits.view(-1, logits.size(-1)), target_ids.view(-1))
    print(loss.item())
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    total_loss += loss.item()
    total_batches += 1
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

avg_loss = total_loss / total_batches
# perplexity = math.exp(avg_loss)

Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

yes
{'input_ids': tensor([[   48,    25,   198,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[  48,   25,  198,  ..., -100, -100, -100]])}





NameError: name 'model' is not defined