In [1]:
from collections import Counter
from string import punctuation
from tqdm.notebook import tqdm
import numpy as np
import networkx as nx

In [4]:
with open(r'F:\Project\pythonProject\amazon_ratings\amazon-meta.txt') as file:
    lines = [line.strip() for line in file]

FileNotFoundError: [Errno 2] No such file or directory: 'F:\\Project\\pythonProject\\amazon_ratings\\amazon-meta.txt'

In [2]:
lines[:20]

NameError: name 'lines' is not defined

Transform raw lines to a dictionary with graph nodes information. The dictionary keys are item ASINs, and the values contain item's title, a list of neighbors (similar item ASINs) and item's rating.

In [55]:
nodes = {}
i = 0
while i < len(lines):
    if not lines[i].startswith('ASIN:') or lines[i + 1] == 'discontinued product':
        i += 1
        continue
    
    idx = lines[i][6:]
    
    i += 1
    assert lines[i].startswith('title:')
    title = lines[i][7:]
    
    while not lines[i].startswith('similar:'):
        i += 1
    
    neighbors = lines[i].split()[2:]
    
    while not lines[i].startswith('reviews:'):
        i += 1
    
    num_reviews = int(lines[i].split()[2])
    if num_reviews == 0:
        continue
    
    rating = float(lines[i].split()[7])
    
    nodes[idx] = {
        'title': title,
        'neighbors': neighbors,
        'rating': rating
    }
    
    i += 1

len(nodes)

402735

In [56]:
Counter(len(dct['neighbors']) for dct in nodes.values())

Counter({5: 294006, 0: 78683, 1: 7936, 2: 7822, 3: 7424, 4: 6864})

In [57]:
Counter(dct['rating'] for dct in nodes.values())

Counter({5.0: 145835,
         4.5: 103563,
         4.0: 83458,
         3.5: 34405,
         3.0: 21227,
         2.5: 6561,
         2.0: 4117,
         1.0: 2732,
         1.5: 837})

Assign consecutive ids to nodes.

In [58]:
for i, dct in enumerate(nodes.values()):
    dct['num'] = i

Get a list of edges.

In [59]:
edges = set()
for dct in nodes.values():
    for neighbor_idx in dct['neighbors']:
        if neighbor_idx in nodes:
            neighbor_dct = nodes[neighbor_idx]
            u, v = dct['num'], neighbor_dct['num']
            if u > v:
                u, v = v, u
            
            edges.add((u, v))

edges = list(edges)
len(edges)

808236

Create a networkx graph.

In [60]:
graph = nx.Graph()
graph.add_nodes_from(range(len(nodes)))
graph.add_edges_from(edges)

Only leave the largest connected component of the graph.

In [61]:
comps = nx.connected_components(graph)
largest_comp = max(comps, key=len)
len(largest_comp)

279832

In [62]:
graph.remove_nodes_from([i for i in range(len(graph.nodes)) if i not in largest_comp])
len(graph.nodes)

279832

Only leave the largest connected component of the 5-core of the graph (we need to separate the largest connected component again, since taking the 5-core results in multiple connected components).

In [63]:
core = nx.k_core(graph, k=5)
len(core.nodes)

42400

In [64]:
comps = nx.connected_components(core)
largest_comp = max(comps, key=len)
len(largest_comp)

24492

In [65]:
core.remove_nodes_from([i for i in list(core.nodes) if i not in largest_comp])
len(core.nodes)

24492

Some more simple processing.

In [66]:
core_nodeset = set(core.nodes)

In [67]:
core_nodes = {dct['num']: dct for dct in nodes.values() if dct['num'] in core_nodeset}
len(core_nodes)

24492

In [68]:
core_nodes = [core_nodes[i] for i in range(len(nodes)) if i in core_nodes]
len(core_nodes)

24492

In [69]:
titles = [node['title'] for node in core_nodes]

In [70]:
ratings = [node['rating'] for node in core_nodes]

In [71]:
core = nx.convert_node_labels_to_integers(core, ordering='sorted')

In [72]:
edges = np.array(sorted(core.edges))
edges.shape

(93050, 2)

In [73]:
Counter(ratings)

Counter({4.5: 9010,
         5.0: 6560,
         4.0: 5678,
         3.5: 2183,
         3.0: 772,
         2.5: 173,
         2.0: 78,
         1.5: 21,
         1.0: 17})

Convert possible rating values into five classes.

In [74]:
rating_to_label = {5.0: 0, 4.5: 1, 4.0: 2, 3.5: 3, 3.0: 4, 2.5: 4, 2.0: 4, 1.5: 4, 1.0: 4}
labels = np.array([rating_to_label[rating] for rating in ratings])
labels.shape

(24492,)

In [75]:
Counter(labels)

Counter({1: 9010, 0: 6560, 2: 5678, 3: 2183, 4: 1061})

In [76]:
len(titles)

24492

In [3]:
with open('../../../new_datasets/amazon_ratings_texts.txt', 'w') as file:
    for title in titles:
        file.write(title + '\n')


FileNotFoundError: [Errno 2] No such file or directory: '../../../new_datasets/amazon_ratings_texts.txt'

Get average embeddings of words in item titles.

In [32]:
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_vectors

ImportError: cannot import name 'triu' from 'scipy.linalg' (F:\anaconda3\Lib\site-packages\scipy\linalg\__init__.py)

In [27]:
fasttext_path = datapath('/home/olegplatonov/fastText/cc.en.300.bin')

In [28]:
wv = load_facebook_vectors(fasttext_path)

In [29]:
translator = str.maketrans(punctuation, ' ' * len(punctuation))

embs = []
for title in titles:
    title = title.lower()
    title = title.translate(translator)
    title = title.split()
    
    emb = np.zeros(300, dtype=np.float32)
    for word in title:
        emb += wv[word]
    
    emb /= len(title)
    
    embs.append(emb)

embs = np.array(embs)
embs.shape

(24492, 300)

In [77]:
labels

array([1, 2, 1, ..., 4, 0, 0])

Create 10 random stratified train-val-test splits.

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
train_masks = []
val_masks = []
test_masks = []

num_data_splits = 10
for i in range(num_data_splits):
    full_idx = np.arange(len(labels))
    train_idx, val_and_test_idx = train_test_split(full_idx, test_size=0.5, random_state=i,
                                                   stratify=labels)

    val_idx, test_idx = train_test_split(val_and_test_idx, test_size=0.5, random_state=i,
                                         stratify=labels[val_and_test_idx])
    
    train_mask = np.array([False for _ in labels])
    train_mask[train_idx] = True
    train_masks.append(train_mask)
    
    val_mask = np.array([False for _ in labels])
    val_mask[val_idx] = True
    val_masks.append(val_mask)
    
    test_mask = np.array([False for _ in labels])
    test_mask[test_idx] = True
    test_masks.append(test_mask)

train_masks = np.vstack(train_masks)
val_masks = np.vstack(val_masks)
test_masks = np.vstack(test_masks)

In [32]:
np.savez_compressed('../../data/amazon_ratings.npz',
                    node_features=embs,
                    node_labels=labels,
                    edges=edges,
                    train_masks=train_masks,
                    val_masks=val_masks,
                    test_masks=test_masks)

Compute graph metrics: edge homophily, adjusted homophily and edge label informativeness.

In [42]:
def h_edge(graph, labels):
    h_edge = 0
    for u, v in graph.edges:
        if labels[u] == labels[v]:
            h_edge += 1

    h_edge /= len(graph.edges)
    
    return h_edge


def h_adj(graph, labels):
    num_classes = len(np.unique(labels))
    
    D = np.zeros((num_classes,))
    for u in graph.nodes:
        label = labels[u]
        D[label] += graph.degree(u)

    adjust = (D**2 / (len(graph.edges) * 2)**2).sum()

    h_adj = (h_edge(graph, labels) - adjust) / (1 - adjust)
    
    return h_adj


def li_edge(graph, labels):
    num_classes = len(np.unique(labels))
    
    class_probs = np.array([0 for _ in range(num_classes)], dtype=float)
    class_degree_weighted_probs = np.array([0 for _ in range(num_classes)], dtype=float)
    for u in graph.nodes:
        label = labels[u]
        class_probs[label] += 1
        class_degree_weighted_probs[label] += graph.degree(u)

    class_probs /= class_probs.sum()
    class_degree_weighted_probs /= class_degree_weighted_probs.sum()

    edge_probs = np.zeros((num_classes, num_classes))
    for u, v in graph.edges:
        label_u = labels[u]
        label_v = labels[v]
        edge_probs[label_u, label_v] += 1
        edge_probs[label_v, label_u] += 1

    edge_probs /= edge_probs.sum()
    
    edge_probs += 1e-8

    li_edge = 2 - (edge_probs * np.log(edge_probs)).sum() / (class_degree_weighted_probs * np.log(class_degree_weighted_probs)).sum()

    return li_edge


In [43]:
data = np.load(r'F:\Project\pythonProject\amazon_ratings\amazon_ratings.npz')

In [44]:
graph = nx.Graph()
graph.add_nodes_from(range(len(data['node_features'])))
graph.add_edges_from(data['edges'])

labels = data['node_labels']

In [45]:
h_edge(graph, labels)

0.3803761418592155

In [46]:
h_adj(graph, labels)

0.14023015893977192

In [47]:
li_edge(graph, labels)

0.039767270134040134

In [49]:
len(labels)

24492

# 更换embedding

In [2]:
import torch
import copy
import argparse
import numpy as np
import json
import scipy
from torch_geometric.data import Data
from torch_sparse import SparseTensor
from tqdm import tqdm
import os
import pickle
from torch.nn import CrossEntropyLoss
import json
from transformers import LlamaForCausalLM, LlamaTokenizer, AdamW, get_linear_schedule_with_warmup
from itertools import chain
from torch_geometric.utils import to_undirected
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

model = LlamaForCausalLM.from_pretrained(
                                    '../llama2-7b-hf',
                                    load_in_8bit=True,
                                    torch_dtype=torch.float16,
                                    use_safetensors=False,
                                    device_map='cuda:0'
                                )

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /root/miniconda3/envs/edgetoken/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /root/miniconda3/envs/edgetoken/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.82s/it]


In [4]:
tokenizer = LlamaTokenizer.from_pretrained('../llama2-7b-hf', max_length=4096)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.unk_token

In [1]:
with open('amazon_ratings_texts.txt', 'r', encoding='utf-8') as file:
    content = file.readlines()  # 读取所有行

In [2]:
# 去除每行末尾的换行符
content = [line.strip() for line in content]

# 将列表转换为NumPy数组
content = np.array(content)

NameError: name 'np' is not defined

In [7]:
tokenizer(content[3])

{'input_ids': [1, 22012, 3929, 1567, 313, 29928, 957, 498, 7532, 2155, 2187, 29897], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [4]:
content[4]

'Puddnhead Wilson : And, Those Extraordinary Twins (The Penguin English Library)'

In [21]:
edge_weights = []

batch_size = 8  # 根据实际情况调整批次大小


with torch.no_grad():  # 关闭梯度计算，减少显存使用
    for i in tqdm(range(0, len(content), batch_size), desc="Processing batches"):
        batch_texts = []
        for j in range(i, min(i + batch_size, len(content))):
            # 获取节点中的tokens
            text = content[j]
            
            text = f"This sentence: \"{text}\" means in a word:"
            
            # 使用tokenizer来获取文本长度
            tokenized_text = tokenizer(text, return_length=True)
            length = len(tokenized_text['input_ids'])
            
            # 如果长度超过最大输入长度，进行截断
            if length > 512:
                text = content[j][:496]
                text = f"This sentence: \"{text}\" means in a word:"
                
                # 再次检查长度，确保符合要求
                tokenized_text = tokenizer(text, return_length=True)
                assert len(tokenized_text['input_ids']) <= 512, "Text is still too long after truncation."
            
            batch_texts.append(text)

        # 批量处理
        batch_encoding = tokenizer(batch_texts, padding='longest', max_length=512, truncation=True, return_tensors="pt").to('cuda')

        input_ids = batch_encoding['input_ids']
        attention_mask = batch_encoding['attention_mask']

        # 计算每个句子嵌入的最终隐藏状态
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        sentence_embeddings = outputs.hidden_states[-1][:, -1, :].cpu()

        # 保存结果并释放内存
        edge_weights.extend(sentence_embeddings)
        del input_ids, attention_mask, outputs
        torch.cuda.empty_cache()

Processing batches:   0%|          | 0/3062 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Processing batches: 100%|██████████| 3062/3062 [10:05<00:00,  5.06it/s]


In [14]:
edge_weights = []

batch_size = 1  # 根据实际情况调整批次大小

max_input_length = 448  # 考虑到特殊token，实际文本长度设为4000

with torch.no_grad():  # 关闭梯度计算，减少显存使用
    for i in tqdm(range(0, len(content), batch_size), desc="Processing batches"):
        batch_texts = []
        for j in range(i, min(i + batch_size, len(content))):
            # 获取节点中的tokens
            text = content[j]
            
            
            # 使用tokenizer来获取文本长度
            tokenized_text = tokenizer(text, return_length=True)
            length = len(tokenized_text['input_ids'])
            batch_texts.append(text)

        # 批量处理
        batch_encoding = tokenizer(batch_texts, padding='longest', max_length=512, truncation=True, return_tensors="pt").to('cuda')

        input_ids = batch_encoding['input_ids'][:,1:10]
        attention_mask = batch_encoding['attention_mask']

        # 计算每个句子嵌入的最终隐藏状态
        # outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # sentence_embeddings = outputs.hidden_states[-1][:, -1, :].cpu()
        embeddings = model.model.embed_tokens(input_ids)
        sentence_embeddings=torch.mean(embeddings,dim=1).cpu()

        # 保存结果并释放内存
        edge_weights.extend(sentence_embeddings)
        # del input_ids, attention_mask, outputs
        torch.cuda.empty_cache()

Processing batches: 100%|██████████| 24492/24492 [00:31<00:00, 769.20it/s] 


In [15]:
tokenizer.batch_decode(input_ids)

['If You Take a Mouse Five-Book Set']

In [16]:
len(edge_weights)

24492

In [17]:
# 转换为张量
edge_weights = torch.stack(edge_weights)
edge_weights=edge_weights.float() 

In [18]:
edge_weights=F.normalize(edge_weights, p=2, dim=1)

In [19]:
file_path = f'../amazon_ratings/amazon_ratings.npz'
data = np.load(file_path)

In [20]:
np.savez_compressed('amazon_ratings_right_10.npz',
                    node_features=edge_weights,
                    node_labels=data['node_labels'],
                    edges=data['edges'],
                    train_masks=data['train_masks'],
                    val_masks=data['val_masks'],
                    test_masks=data['test_masks'])