In [1]:
import argparse
import copy
import json
import os
import pickle
from itertools import chain

import numpy as np
import scipy
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected
from torch_sparse import SparseTensor
from tqdm import tqdm
from transformers import (
    AdamW,
    BitsAndBytesConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    get_linear_schedule_with_warmup,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Parse the raw graph dump; the keys read below are 'features', 'labels' and 'links'.
with open(r'data.json') as f:
    data = json.load(f)

# Node features as float32 and class labels as int64.
x = torch.tensor(data['features'], dtype=torch.float)
y = torch.tensor(data['labels'], dtype=torch.long)

# data['links'][i] holds the node indices linked from node i; flatten the
# per-node lists into one list of (source, target) pairs.
edges = [(src, dst) for src, targets in enumerate(data['links']) for dst in targets]

# Turn the pairs into a [2, num_edges] index tensor and symmetrise it.
edge_index = to_undirected(
    torch.tensor(edges, dtype=torch.long).t().contiguous(),
    num_nodes=x.size(0),
)


In [3]:
# The train / validation / stopping masks are transposed after loading so that
# the node axis comes first (presumably one column per split -- confirm against
# the data source). The test mask is a single mask and is used as stored.
train_mask, val_mask, stopping_mask = (
    torch.tensor(data[mask_key], dtype=torch.bool).t().contiguous()
    for mask_key in ('train_masks', 'val_masks', 'stopping_masks')
)

test_mask = torch.tensor(data['test_mask'], dtype=torch.bool)

In [4]:
# Square sparse adjacency over all len(y) nodes, built from the two rows of edge_index.
num_nodes = len(y)
source_nodes = edge_index[0].to(torch.long)
target_nodes = edge_index[1].to(torch.long)
adj_t = SparseTensor(
    row=source_nodes,
    col=target_nodes,
    sparse_sizes=(num_nodes, num_nodes),
)

In [5]:
# Bundle features, labels, adjacency and every mask into one Data object.
# NOTE: this rebinds `data`, which up to here referred to the dict parsed from
# data.json, so cells that index that dict cannot be re-run after this one.
data = Data(
    x=x,
    y=y,
    adj_t=adj_t,
    train_mask=train_mask,
    val_mask=val_mask,
    test_mask=test_mask,
    stopping_mask=stopping_mask,
)

In [6]:
# Display the Data summary to check which attributes it holds and their shapes.
data

Data(x=[11701, 300], y=[11701], adj_t=[11701, 11701, nnz=431726], train_mask=[11701, 20], val_mask=[11701, 20], test_mask=[11701], stopping_mask=[11701, 20])

In [7]:
# Load the label/node metadata. A context manager is used so the file handle
# is closed as soon as parsing finishes.
with open(r'metadata.json') as metadata_file:
    metadata = json.load(metadata_file)

In [8]:
# 'labels' is a mapping from class-id strings to category names, so its
# fallback is an empty dict; 'nodes' is a list of per-node records (dicts),
# so its fallback is an empty list.
labels = metadata.get('labels', {})
nodes = metadata.get('nodes', [])

In [9]:
# Display the class-id -> category-name mapping.
labels

{'0': 'Computational linguistics',
 '1': 'Databases',
 '2': 'Operating systems',
 '3': 'Computer architecture',
 '4': 'Computer security',
 '5': 'Internet protocols',
 '6': 'Computer file systems',
 '7': 'Distributed computing architecture',
 '8': 'Web technology',
 '9': 'Programming language topics'}

In [10]:
# List the fields carried by a node record.
nodes[0].keys()

dict_keys(['id', 'title', 'label', 'outlinks', 'tokens'])

In [12]:
# Inspect the first node record in full (the 'tokens' list makes this output long).
nodes[0]

{'id': 32473088,
 'title': 'Twilio',
 'label': 'Distributed computing architecture',
 'outlinks': [23862,
  28684,
  90451,
  93483,
  1691376,
  1954315,
  2420207,
  27156851,
  46967612],
 'tokens': ['twilio',
  'twilio',
  'cloud',
  'communications',
  'platform',
  'service',
  'cpaas',
  'company',
  'based',
  'san',
  'francisco',
  'california',
  'twilio',
  'allows',
  'software',
  'developers',
  'programmatically',
  'make',
  'receive',
  'phone',
  'calls',
  'send',
  'receive',
  'text',
  'messages',
  'perform',
  'communication',
  'functions',
  'using',
  'web',
  'service',
  'apis',
  'twilio',
  'founded',
  '2008',
  'jeff',
  'lawson',
  'evan',
  'cooke',
  'john',
  'wolthuis',
  'originally',
  'based',
  'seattle',
  'washington',
  'san',
  'francisco',
  'california',
  'twilio',
  'first',
  'major',
  'press',
  'coverage',
  'november',
  '2008',
  'result',
  'application',
  'built',
  'jeff',
  'lawson',
  'rickroll',
  'people',
  'investor',
 

In [25]:
# Token-count statistics over every node record that has a 'tokens' field.
token_lengths = [len(node['tokens']) for node in nodes if 'tokens' in node]

count = len(token_lengths)
total_length = sum(token_lengths)

# When no record has tokens, the maximum falls back to 0, the minimum to inf
# and the average to 0.
max_length = max(token_lengths, default=0)
min_length = min(token_lengths, default=float('inf'))
average_length = total_length / count if count > 0 else 0

print(f"Longest length: {max_length}")
print(f"Shortest length: {min_length}")
print(f"Average length: {average_length:.2f}")

Longest length: 15230
Shortest length: 1
Average length: 414.20


In [26]:
# Load the checkpoint at ../llama2-7b-hf with 8-bit weights onto cuda:0.
# Quantization is requested through `quantization_config`, because passing
# `load_in_8bit` directly to from_pretrained is deprecated.
model = LlamaForCausalLM.from_pretrained(
    '../llama2-7b-hf',
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
    use_safetensors=False,
    device_map='cuda:0',
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /root/miniconda3/envs/edgetoken/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /root/miniconda3/envs/edgetoken/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.63s/it]


In [27]:
# `model_max_length` is the tokenizer argument that sets the maximum sequence
# length; a plain `max_length` keyword is not the length-limit parameter.
tokenizer = LlamaTokenizer.from_pretrained('../llama2-7b-hf', model_max_length=4096)
# Pad on the left, so the last position of every row is a real token.
tokenizer.padding_side = "left"
# Reuse the unknown token for padding (presumably because the checkpoint
# defines no dedicated pad token -- confirm).
tokenizer.pad_token = tokenizer.unk_token

In [28]:
# Number of node records loaded from the metadata.
len(nodes)

11701

In [None]:
# Build one feature vector per node from the language model's input-embedding
# table. NOTE: despite its name, `edge_weights` collects per-node embeddings;
# the name is kept because later cells read it.
edge_weights = []

batch_size = 1  # number of nodes tokenized together per step; tune to fit memory
tokens_per_node = 10  # only this many leading tokens of each node are embedded
max_input_length = 512  # truncation limit, in tokenizer tokens, for each text

embed_layer = model.model.embed_tokens
embed_device = embed_layer.weight.device

with torch.no_grad():  # inference only, so no autograd bookkeeping is needed
    for i in tqdm(range(0, len(nodes), batch_size), desc="Processing batches"):
        # Join the leading tokens of every node in this batch into one string each.
        batch_texts = [
            ' '.join(node['tokens'][:tokens_per_node])
            for node in nodes[i:i + batch_size]
        ]

        batch_encoding = tokenizer(
            batch_texts,
            padding='longest',
            max_length=max_input_length,
            truncation=True,
            return_tensors="pt",
        ).to(embed_device)

        input_ids = batch_encoding['input_ids']
        attention_mask = batch_encoding['attention_mask']

        # Look up static input embeddings only; no transformer forward pass is
        # run. (An earlier variant used the final hidden state of the last
        # token from a full forward pass instead.)
        embeddings = embed_layer(input_ids)

        # Average over real tokens only. Padding positions are zeroed out via
        # the attention mask so they cannot skew the mean when batch_size > 1.
        # With batch_size == 1 nothing is padded, the mask is all ones, and
        # this is the plain mean over the sequence. The sum is accumulated in
        # float32 and cast back to the embedding dtype.
        mask = attention_mask.unsqueeze(-1).to(torch.float32)
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        sentence_embeddings = (embeddings.float() * mask).sum(dim=1) / token_counts
        sentence_embeddings = sentence_embeddings.to(embeddings.dtype).cpu()

        # Extending a list with a 2-D tensor appends one row (one node) at a time.
        edge_weights.extend(sentence_embeddings)

# Release cached GPU memory once, after all batches are done, rather than on
# every iteration.
torch.cuda.empty_cache()

Processing batches: 100%|██████████| 11701/11701 [00:12<00:00, 955.50it/s] 


In [32]:
# 转换为张量
edge_weights = torch.stack(edge_weights)
edge_weights=edge_weights.float()

In [33]:
# Replace the node features with the language-model embeddings. The check
# guards against a partial or interrupted embedding run, which would otherwise
# silently leave feature rows misaligned with the labels.
assert edge_weights.size(0) == data.y.size(0), (
    f"expected {data.y.size(0)} embedding rows, got {edge_weights.size(0)}"
)
data.x = edge_weights

In [36]:
# Display the Data summary again to confirm the new feature width of `x`.
data

Data(x=[11701, 4096], y=[11701], adj_t=[11701, 11701, nnz=431726], train_mask=[11701, 20], val_mask=[11701, 20], test_mask=[11701], stopping_mask=[11701, 20])

In [35]:
# Persist the whole Data object (features, labels, adjacency, masks) to disk.
torch.save(data, 'data_token_right_10.pt')