In [3]:
import json
import zlib

import numpy as np
import torch
import torch.nn.functional as F

In [4]:
# Prefer Apple's Metal (MPS) backend when PyTorch reports it; otherwise run on the CPU.
_mps_ready = torch.backends.mps.is_available()
device = torch.device("mps" if _mps_ready else "cpu")
print("MPS is available!" if _mps_ready else "MPS is not available, using CPU instead")

MPS is available!


# 词表生成模块
目标：给每个n-gram切片一个独有的索引id

##### 将单词、句子转化为n-gram切片

In [19]:
def n_gram(word, n=3):
    """Return the character n-grams of ``word`` after wrapping it in ``#`` markers.

    The word is padded to ``'#' + word + '#'`` and a window of width ``n`` is
    slid across it one character at a time.  If the padded word is shorter
    than ``n`` the result is an empty list.
    """
    padded = '#' + word + '#'
    window_count = len(padded) - n + 1
    return [padded[start:start + n] for start in range(window_count)]

def lst_gram(lst, n=3):
    """Return the n-grams of every whitespace-separated token in ``lst``.

    ``lst`` is converted with ``str()`` and lower-cased before splitting; each
    token is then expanded with ``n_gram`` and the results are concatenated in
    token order.
    """
    tokens = str(lst).lower().split()
    return [gram for token in tokens for gram in n_gram(token, n)]

# Sanity check: a single token gets one '#' marker at each end before slicing.
print(n_gram('helloworld'))
# A sentence is split on whitespace first, so each word gets its own '#' markers.
print(lst_gram('hello world'))

['#he', 'hel', 'ell', 'llo', 'low', 'owo', 'wor', 'orl', 'rld', 'ld#']
['#he', 'hel', 'ell', 'llo', 'lo#', '#wo', 'wor', 'orl', 'rld', 'ld#']


## 读取文件
### 遍历文件中出现的每一个词，将其转化为n-gram切片后加入列表

In [5]:
# Directory and file names of the normalised claim / evidence JSON dumps.
file_path = './data/'
files = ['norm_train_claims.json', 'norm_dev_claims.json', 'norm_climate_evidences.json']

# Parse every file once, keyed by file name, then bind each payload to its own variable.
loaded_json = {}
for file in files:
    with open(file_path + file, 'r', encoding='utf-8') as f:
        loaded_json[file] = json.load(f)

train_claims = loaded_json['norm_train_claims.json']
dev_claims = loaded_json['norm_dev_claims.json']
evidence = loaded_json['norm_climate_evidences.json']

In [6]:
# Collect every distinct whitespace-separated token from the train claims,
# the dev claims and all evidence passages.
word_pool = set()
for claim in train_claims.values():
    word_pool.update(claim['norm_claim'].split())
for claim in dev_claims.values():
    word_pool.update(claim['norm_claim'].split())
for passage in evidence.values():
    word_pool.update(passage.split())
unique_words = list(word_pool)
print("Number of unique words in datasets:", len(unique_words))

Number of unique words in datasets: 461678


In [7]:
# Gather trigrams from every train/dev claim and from the evidence passages
# those claims cite (ids missing from `evidence` are skipped).
vocab = []
for claims in (train_claims, dev_claims):
    for key, value in claims.items():
        vocab.extend(lst_gram(value['norm_claim']))
        for evid_id in value['evidences']:
            if evid_id in evidence:
                vocab.extend(lst_gram(evidence[evid_id]))
# Disabled alternative: also fold in every evidence passage, not only the cited ones.
# for key, value in evidence.items():
#     vocab.extend(lst_gram(value))

# Deduplicate
vocab = list(set(vocab))
print("Number of unique vocab in datasets:", len(vocab))

NameError: name 'lst_gram' is not defined

添加特殊字段

In [23]:
# The padding and unknown-token placeholders take the first two positions,
# followed by every collected n-gram.
vocab_list = ['[PAD]', '[UNK]', *vocab]
print("Final vocab size:", len(vocab_list))

Final vocab size: 6131


In [25]:
# Persist the vocabulary, one entry per line.
with open("./data/vocab_true_evid.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in vocab_list)

In [65]:
# Evidence passages as a plain list (in dict order), plus the first passage
# on its own for quick experiments.
evid_text = [*evidence.values()]
evid_text_test = evid_text[0]

In [None]:
import numpy as np

# Size of the hashing space (e.g. 10000 or larger, depending on memory and
# how many collisions are acceptable).
HASH_SPACE_DIM = 10000

def hashing_trick(word, hash_space_dim=HASH_SPACE_DIM):
    """Map one token to a bucket index in ``[0, hash_space_dim)``.

    The index is the CRC-32 of the token's UTF-8 bytes, modulo
    ``hash_space_dim``.  The built-in ``hash()`` is deliberately avoided:
    for ``str`` it is salted per interpreter process, so the same token would
    land in a different bucket after every kernel restart.

    Args:
        word: token to hash; converted with ``str()`` first.
        hash_space_dim: number of buckets.

    Returns:
        int bucket index.
    """
    # "surrogatepass" keeps lone surrogates (possible in JSON-decoded text) encodable.
    payload = str(word).encode("utf-8", "surrogatepass")
    return zlib.crc32(payload) % hash_space_dim

def vocab_to_hash(vocab_list, hash_space_dim=HASH_SPACE_DIM):
    """Hash every entry of ``vocab_list`` and return the bucket indices as a list.

    The output has one index per input entry, in the same order.
    """
    hashed_vocab = []
    for word in vocab_list:
        hashed_vocab.append(hashing_trick(word, hash_space_dim))
    return hashed_vocab

def evidence_to_embedding(evidence, vocab_list, hash_space_dim=HASH_SPACE_DIM, embedding_dim=100):
    """Build a hashed count vector for one evidence item.

    Every element of ``evidence`` is hashed with ``hashing_trick`` and the
    matching position of the output vector is incremented.  The function does
    not tokenise: pass an iterable of n-grams (iterating a raw string would
    count single characters).

    Args:
        evidence: iterable of tokens / n-grams.
        vocab_list: kept for backward compatibility; the counts do not depend
            on it, so it is no longer hashed on every call.
        hash_space_dim: length of the returned vector.
        embedding_dim: kept for backward compatibility; unused.

    Returns:
        1-D ``numpy`` array of length ``hash_space_dim`` holding occurrence
        counts per bucket.
    """
    embedding = np.zeros(hash_space_dim)

    for word in evidence:
        # Raw occurrence count; TF-IDF or another weighting could replace it.
        embedding[hashing_trick(word, hash_space_dim)] += 1

    return embedding

# Example: a toy vocabulary and a toy, already tokenised evidence item.
# The demo uses its own variable names so that it does not overwrite the real
# `vocab_list` and `evidence` objects that other cells read.
demo_vocab_list = ["word1", "word2", "example", "ngram", "sample"]
demo_evidence = ["example", "word2", "ngram", "example"]

# Build the count vector for the toy evidence
embedding = evidence_to_embedding(demo_evidence, demo_vocab_list)
print("Evidence Embedding:\n", embedding)

储存到文件

In [8]:
# Write the vocabulary to disk, one entry per line.
vocab_file_path = './data/vocab_with_evid.txt'
with open(vocab_file_path, 'w', encoding='utf-8') as f:
    # The loop variable is `gram`, not `slice`: a module-level `slice` would
    # shadow the built-in for the rest of the kernel session.
    for gram in vocab_list:
        f.write(gram + '\n')

# GPU 版本

In [2]:
def hashing_trick(word, HASH_SPACE_DIM= 2000):
    """Map one token to a bucket index in ``[0, HASH_SPACE_DIM)``.

    Uses the CRC-32 of the token's UTF-8 bytes rather than the built-in
    ``hash()``, whose value for ``str`` is salted per interpreter process and
    therefore changes between kernel restarts.

    Args:
        word: token to hash; converted with ``str()`` first.
        HASH_SPACE_DIM: number of buckets.

    Returns:
        int bucket index.
    """
    # "surrogatepass" keeps lone surrogates (possible in JSON-decoded text) encodable.
    payload = str(word).encode("utf-8", "surrogatepass")
    return zlib.crc32(payload) % HASH_SPACE_DIM

def vocab_to_hash(vocab_path, HASH_SPACE_DIM = 2000):
    """Read a one-entry-per-line vocabulary file and hash every entry.

    Args:
        vocab_path: path of a UTF-8 text file with one vocabulary entry per line.
        HASH_SPACE_DIM: number of hash buckets passed on to ``hashing_trick``.

    Returns:
        dict mapping each entry (without its line terminator) to its bucket
        index, in file order.
    """
    # `with` closes the handle; the original left it open.
    with open(vocab_path, encoding='utf-8') as vocab_file:
        # Drop the trailing newline so keys and hashes describe the entry
        # itself rather than "entry\n".
        vocab_list = [line.rstrip('\n') for line in vocab_file]
    return {word: hashing_trick(word, HASH_SPACE_DIM) for word in vocab_list}

def n_gram_gpu(text_tensor, n=3):
    """Slide a width-``n`` window over ``text_tensor`` and return every slice, in order.

    Works on anything that supports ``len()`` and slicing; an input shorter
    than ``n`` yields an empty list.
    """
    windows = []
    for start in range(len(text_tensor) - n + 1):
        windows.append(text_tensor[start:start + n])
    return windows
    
def get_batch(evid_text, batch_size=1024, start_index = 0):
    """Return one slice of ``evid_text`` and the index where the next slice starts.

    The slice covers ``[start_index, start_index + batch_size)``; the returned
    index is ``start_index + batch_size`` even when that is past the end.
    """
    stop = start_index + batch_size
    return evid_text[start_index:stop], stop

def process_batch_char(batch_data, indices_tensor, HASH_SPACE_DIM, n_gram = 3, max_length = 100):
    """Encode a batch of texts as a fixed-width tensor of looked-up n-gram ids.

    For each text, every character n-gram (taken over the raw string, spaces
    included) is hashed into ``[0, HASH_SPACE_DIM)`` with ``hashing_trick``;
    the hash is used as a position in ``indices_tensor``.  Rows are then
    right-padded with 0 or truncated to exactly ``max_length`` columns.

    Args:
        batch_data: iterable of strings.
        indices_tensor: 1-D lookup tensor; it must have at least
            ``HASH_SPACE_DIM`` entries or the lookup goes out of range.
        HASH_SPACE_DIM: number of hash buckets.
        n_gram: window width in characters.
        max_length: number of columns in the result.

    Returns:
        Tensor of shape ``(len(batch_data), max_length)`` on the device of
        ``indices_tensor``.
    """
    # Follow the lookup table's device instead of assuming "mps" exists.
    device = indices_tensor.device

    batch_char_list = []
    for evidence in batch_data:
        # Hash the n-gram text itself.  hash() on a tensor slice is based on
        # object identity, so equal n-grams used to get unrelated ids.
        ngram_hashes = torch.tensor(
            [hashing_trick(evidence[i:i + n_gram], HASH_SPACE_DIM)
             for i in range(len(evidence) - n_gram + 1)],
            dtype=torch.long,
            device=device,
        )
        batch_char_list.append(indices_tensor[ngram_hashes])

    if not batch_char_list:
        # pad_sequence rejects an empty list; an empty batch maps to zero rows.
        return torch.zeros((0, max_length), dtype=indices_tensor.dtype, device=device)

    # Right-pad the rows of the batch to a common length.
    batch_char_tensor = torch.nn.utils.rnn.pad_sequence(batch_char_list, batch_first=True, padding_value=0)

    # Truncate or pad to exactly max_length columns.
    width = batch_char_tensor.size(1)
    if width > max_length:
        batch_char_tensor = batch_char_tensor[:, :max_length]
    elif width < max_length:
        batch_char_tensor = F.pad(batch_char_tensor, (0, max_length - width), value=0)

    return batch_char_tensor

def load_evd_text(vocab_path, evid_text,HASH_SPACE_DIM = 2000, n_gram = 3, max_length = 100, batch_size = 49152):
    """Encode all evidence texts batch by batch and collect the results on the CPU.

    Args:
        vocab_path: vocabulary file handed to ``vocab_to_hash``.
        evid_text: sequence of evidence strings.
        HASH_SPACE_DIM: number of hash buckets.
        n_gram: window width forwarded to ``process_batch_char``.
        max_length: row width forwarded to ``process_batch_char``.
        batch_size: number of texts per batch (previously hard-coded to 49152).

    Returns:
        list with one CPU tensor per batch, in input order.

    Raises:
        ValueError: if ``batch_size`` is not positive (the loop would never advance).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    hashed_vocab = vocab_to_hash(vocab_path = vocab_path,
                            HASH_SPACE_DIM = HASH_SPACE_DIM)
    # Use MPS only when it is actually available; fall back to the CPU otherwise.
    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
    # Lookup table of the vocabulary's bucket indices, in vocabulary order.
    indices_tensor = torch.tensor(list(hashed_vocab.values()), device=device)

    start_index = 0
    all_evd_chars = []
    while start_index < len(evid_text):
        print("current processed batch index",start_index)
        batch_data, start_index = get_batch(evid_text, batch_size, start_index)
        # Forward the caller's n_gram / max_length instead of fixed literals.
        batch_char_tensor = process_batch_char(batch_data, indices_tensor, HASH_SPACE_DIM,
                                               n_gram = n_gram, max_length = max_length)
        # Move each finished batch off the accelerator before keeping it.
        all_evd_chars.append(batch_char_tensor.to("cpu"))
    return all_evd_chars
    

In [89]:
# Build the entry -> bucket lookup from the saved vocabulary.
# The path-based vocab_to_hash names its second parameter HASH_SPACE_DIM;
# the lowercase keyword used here before did not match it.
hashed_vocab = vocab_to_hash(vocab_path = './data/vocab_true_evid.txt',
                            HASH_SPACE_DIM = 2000)
ngrams_list = list(hashed_vocab.keys())  # every vocabulary entry, as a list
indices_list = list(hashed_vocab.values())  # the matching bucket indices

# Move the hashes and indices onto the selected device (`device` comes from
# the availability check, so this also works without MPS).
# NOTE(review): `vocab_size` is not defined in any cell visible here — confirm
# where it comes from before running this cell.
ngrams_tensor = torch.tensor([hashing_trick(ngram, vocab_size) for ngram in ngrams_list]).to(device)
indices_tensor = torch.tensor(indices_list).to(device)

In [3]:
def process_batch_char(batch_data, indices_tensor, HASH_SPACE_DIM, n_gram = 3, max_length = 100):
    """Encode a batch of texts as a fixed-width tensor of looked-up n-gram ids.

    For each text, every character n-gram (taken over the raw string, spaces
    included) is hashed into ``[0, HASH_SPACE_DIM)`` with ``hashing_trick``;
    the hash is used as a position in ``indices_tensor``.  Rows are then
    right-padded with 0 or truncated to exactly ``max_length`` columns.

    Args:
        batch_data: iterable of strings.
        indices_tensor: 1-D lookup tensor; it must have at least
            ``HASH_SPACE_DIM`` entries or the lookup goes out of range.
        HASH_SPACE_DIM: number of hash buckets.
        n_gram: window width in characters.
        max_length: number of columns in the result.

    Returns:
        Tensor of shape ``(len(batch_data), max_length)`` on the device of
        ``indices_tensor``.
    """
    # Follow the lookup table's device instead of assuming "mps" exists.
    device = indices_tensor.device

    batch_char_list = []
    for evidence in batch_data:
        # Hash the n-gram text itself.  hash() on a tensor slice is based on
        # object identity, so equal n-grams used to get unrelated ids.
        ngram_hashes = torch.tensor(
            [hashing_trick(evidence[i:i + n_gram], HASH_SPACE_DIM)
             for i in range(len(evidence) - n_gram + 1)],
            dtype=torch.long,
            device=device,
        )
        batch_char_list.append(indices_tensor[ngram_hashes])

    if not batch_char_list:
        # pad_sequence rejects an empty list; an empty batch maps to zero rows.
        return torch.zeros((0, max_length), dtype=indices_tensor.dtype, device=device)

    # Right-pad the rows of the batch to a common length.
    batch_char_tensor = torch.nn.utils.rnn.pad_sequence(batch_char_list, batch_first=True, padding_value=0)

    # Truncate or pad to exactly max_length columns.
    width = batch_char_tensor.size(1)
    if width > max_length:
        batch_char_tensor = batch_char_tensor[:, :max_length]
    elif width < max_length:
        batch_char_tensor = F.pad(batch_char_tensor, (0, max_length - width), value=0)

    return batch_char_tensor

In [14]:
def load_claim_text_to_char(vocab_path, claim_df, HASH_SPACE_DIM, n_gram = 3, max_length = 100):
    """Encode every claim text as one row of a fixed-width tensor of n-gram ids.

    Each claim's ``'norm_claim'`` string is cut into character n-grams (over
    the raw string, spaces included); every n-gram is hashed into
    ``[0, HASH_SPACE_DIM)`` with ``hashing_trick`` and the hash is used as a
    position in the vocabulary's bucket-index table.  Rows are right-padded
    with 0 or truncated to exactly ``max_length`` columns.

    Args:
        vocab_path: vocabulary file handed to ``vocab_to_hash``.
        claim_df: mapping of claim id -> record with a ``'norm_claim'`` string.
        HASH_SPACE_DIM: number of hash buckets.
        n_gram: window width in characters.
        max_length: number of columns in the result.

    Returns:
        Tensor of shape ``(len(claim_df), max_length)`` on MPS when available,
        otherwise on the CPU.
    """
    hashed_vocab = vocab_to_hash(vocab_path = vocab_path,
                            HASH_SPACE_DIM = HASH_SPACE_DIM)
    # The selected device is now actually used; "mps" was hard-coded below before.
    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
    # Lookup table of the vocabulary's bucket indices, in vocabulary order.
    indices_tensor = torch.tensor(list(hashed_vocab.values()), device=device)

    all_char_list = []
    for values in claim_df.values():
        claim_text = values['norm_claim']
        # Hash the n-gram text itself.  hash() on a tensor slice is based on
        # object identity, so equal n-grams used to get unrelated ids.
        ngram_hashes = torch.tensor(
            [hashing_trick(claim_text[i:i + n_gram], HASH_SPACE_DIM)
             for i in range(len(claim_text) - n_gram + 1)],
            dtype=torch.long,
            device=device,
        )
        all_char_list.append(indices_tensor[ngram_hashes])

    if not all_char_list:
        # pad_sequence rejects an empty list; no claims means zero rows.
        return torch.zeros((0, max_length), dtype=indices_tensor.dtype, device=device)

    # Right-pad the rows to a common length.
    all_char_list = torch.nn.utils.rnn.pad_sequence(all_char_list, batch_first=True, padding_value=0)

    # Truncate or pad to exactly max_length columns.
    width = all_char_list.size(1)
    if width > max_length:
        all_char_list = all_char_list[:, :max_length]
    elif width < max_length:
        all_char_list = F.pad(all_char_list, (0, max_length - width), value=0)

    return all_char_list

In [15]:
# Encode the training claims and show how many rows came back.
all_char_list = load_claim_text_to_char(
    vocab_path='./data/vocab_true_evid.txt',
    claim_df=train_claims,
    HASH_SPACE_DIM=2000,
    n_gram=3,
    max_length=100,
)
len(all_char_list)

1228

In [16]:
# Encode the dev claims with the same settings and show how many rows came back.
dev_char = load_claim_text_to_char(
    vocab_path='./data/vocab_true_evid.txt',
    claim_df=dev_claims,
    HASH_SPACE_DIM=2000,
    n_gram=3,
    max_length=100,
)
len(dev_char)

154

In [8]:
def load_evd_text(vocab_path, evid_text,HASH_SPACE_DIM = 2000, n_gram = 3, max_length = 100, batch_size = 49152):
    """Encode all evidence texts batch by batch and collect the results on the CPU.

    Args:
        vocab_path: vocabulary file handed to ``vocab_to_hash``.
        evid_text: sequence of evidence strings.
        HASH_SPACE_DIM: number of hash buckets.
        n_gram: window width forwarded to ``process_batch_char``.
        max_length: row width forwarded to ``process_batch_char``.
        batch_size: number of texts per batch.

    Returns:
        list with one CPU tensor per batch, in input order.

    Raises:
        ValueError: if ``batch_size`` is not positive (the loop would never advance).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    hashed_vocab = vocab_to_hash(vocab_path = vocab_path,
                            HASH_SPACE_DIM = HASH_SPACE_DIM)
    # The selected device is now actually used; "mps" was hard-coded below before.
    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
    # Lookup table of the vocabulary's bucket indices, in vocabulary order.
    indices_tensor = torch.tensor(list(hashed_vocab.values()), device=device)

    start_index = 0
    all_evd_chars = []
    while start_index < len(evid_text):
        print("current processed batch index",start_index)
        batch_data, start_index = get_batch(evid_text, batch_size, start_index)
        # Forward the caller's n_gram / max_length instead of fixed literals.
        batch_char_tensor = process_batch_char(batch_data, indices_tensor, HASH_SPACE_DIM,
                                               n_gram = n_gram, max_length = max_length)
        # Move each finished batch off the accelerator before keeping it.
        all_evd_chars.append(batch_char_tensor.to("cpu"))
    return all_evd_chars

In [9]:
# Encode every evidence passage; the function prints the start index of each batch.
all_evd_chars = load_evd_text(
    vocab_path='./data/vocab_true_evid.txt',
    evid_text=list(evidence.values()),
    HASH_SPACE_DIM=2000,
    batch_size=49152,
)

current processed batch index 0
current processed batch index 49152
current processed batch index 98304
current processed batch index 147456
current processed batch index 196608
current processed batch index 245760
current processed batch index 294912
current processed batch index 344064
current processed batch index 393216
current processed batch index 442368
current processed batch index 491520


In [120]:
# Encode the evidence list in batches and keep the per-batch CPU tensors.
# This cell used to repeat the batching loop inline and called
# process_batch_char(batch_data, HASH_SPACE_DIM, device, n = 3), which does
# not match the function's parameters (batch_data, indices_tensor,
# HASH_SPACE_DIM, n_gram, max_length).  load_evd_text runs the same loop,
# prints the same progress lines and builds the lookup tensor itself.
batch_size = 49152
batched_char_list = load_evd_text(vocab_path = './data/vocab_true_evid.txt',
                                  evid_text = evid_text,
                                  HASH_SPACE_DIM = 2000,
                                  batch_size = batch_size)

current processed batch index 0
current processed batch index 49152
current processed batch index 98304
current processed batch index 147456
current processed batch index 196608
current processed batch index 245760
current processed batch index 294912
current processed batch index 344064
current processed batch index 393216
current processed batch index 442368
current processed batch index 491520


In [None]:

# Convert the n-grams into a tensor of hash buckets on the GPU.
# NOTE(review): `batch_ngrams` and `vocab_size` are not defined in any cell
# visible here — confirm where they come from before running this cell.
# NOTE(review): built-in hash() is salted per process for str and is
# identity-based for tensors, so these buckets are presumably not
# reproducible — verify against the element type of `batch_ngrams`.
batch_ngrams_tensor = torch.tensor([hash(ngram) % vocab_size for ngram in batch_ngrams]).to("mps")

# Lookup on the device: index the table with the bucket tensor.
hash_indices = indices_tensor[batch_ngrams_tensor]

print("Hash indices:", hash_indices)

In [None]:
# Pick the Metal (MPS) device when PyTorch reports it, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
