In [39]:
import re
import numpy as np
import nltk
from collections import Counter
import torch
import gensim
from gensim.models import Word2Vec
from torch import nn
from torch.nn.functional import one_hot
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize

In [40]:
def file2data(file_path: str) -> list:
    """Read an ABSA file and expand it into (sentence, target) pairs.

    The file is consumed in records of three lines: the first line is the
    sentence containing the ``$T$`` placeholder and the second line holds one
    or more comma-separated targets. The third line of each record is not
    read here (presumably the polarity label -- confirm against the dataset).

    Args:
        file_path (str): absa file path

    Returns:
        list: one ``(processed_sentence, target)`` tuple per target, where
        ``processed_sentence`` is the sentence with every ``$T$`` replaced by
        that target.
    """
    res = []

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Stop one line early so a dangling last line (e.g. a trailing blank line)
    # never makes ``lines[i + 1]`` run past the end of the file.
    for i in range(0, len(lines) - 1, 3):
        sentence = lines[i].strip()
        targets = lines[i + 1].strip().split(',')

        for target in targets:
            target = target.strip()
            # Plain string replacement: ``re.sub`` would interpret backslashes
            # in the target as escapes / group references in the replacement.
            processed_sentence = sentence.replace('$T$', target)
            res.append((processed_sentence, target))

    return res

In [41]:


def tokenize_sentence(sentence):
    """Tokenize ``sentence`` with NLTK's ``word_tokenize`` and return its result."""
    tokens = word_tokenize(sentence)
    return tokens

def build_vocab(data_tuples):
    """Build a torchtext vocabulary from the sentences in ``data_tuples``.

    Args:
        data_tuples: iterable of 2-item tuples; the first item is the sentence
            to tokenize, the second item is ignored.

    Returns:
        The object returned by ``build_vocab_from_iterator`` with the specials
        ``"<unk>"`` and ``"<pad>"`` registered.
    """
    token_counts = Counter(
        token
        for sentence, _target in data_tuples
        for token in tokenize_sentence(sentence)
    )
    # Only the distinct tokens are forwarded, so the per-token counts collected
    # above are not visible to build_vocab_from_iterator.
    distinct_tokens = token_counts.keys()
    return build_vocab_from_iterator([distinct_tokens], specials=["<unk>", "<pad>"])

def sentence_to_tensor(sentence, vocab):
    """Convert ``sentence`` into a 1-D ``torch.long`` tensor of vocabulary indices.

    Args:
        sentence: text to tokenize with ``tokenize_sentence``.
        vocab: mapping looked up as ``vocab[token]`` for every token.

    Returns:
        torch.Tensor: the token indices, in token order, with dtype ``torch.long``.
    """
    return torch.tensor(
        [vocab[token] for token in tokenize_sentence(sentence)],
        dtype=torch.long,
    )


In [42]:
# Fetch the NLTK 'punkt' package (presumably the tokenizer data word_tokenize needs);
# the captured output below shows it reports "already up-to-date" when present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VirmarQ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [43]:
def handle(data):
    """Build the vocabulary, padded index tensor and a Word2Vec-initialised embedding.

    Args:
        data: sized sequence of ``(sentence, target)`` tuples; only the
            sentence of each tuple is used here.

    Returns:
        tuple: ``(vocab, vocab_list, padded_tensor_data, embedding)`` where
        ``vocab`` is the vocabulary from ``build_vocab`` (default index set to
        ``<unk>``), ``vocab_list`` is its index-to-token list,
        ``padded_tensor_data`` is the batch-first tensor of token indices
        padded with the ``<pad>`` index, and ``embedding`` is a trainable
        ``nn.Embedding`` whose rows come from the Word2Vec model.

    Raises:
        ValueError: if ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError("handle() needs at least one (sentence, target) tuple")

    vocab = build_vocab(data)
    vocab.set_default_index(vocab["<unk>"])
    vocab_list = vocab.get_itos()
    pad_idx = vocab["<pad>"]

    tensor_data = [sentence_to_tensor(sentence, vocab) for sentence, _ in data]
    padded_tensor_data = pad_sequence(tensor_data, batch_first=True, padding_value=pad_idx)

    # Train Word2Vec on the *unpadded* sequences: feeding the padded rows would
    # fill most context windows with "<pad>" and distort the learned vectors.
    word_sentences = [[vocab_list[idx] for idx in ids.tolist()] for ids in tensor_data]
    word2vec_model = Word2Vec(sentences=word_sentences, vector_size=100, window=5, min_count=1, sg=1)

    vocab_size = len(vocab_list)
    embedding_dim = word2vec_model.vector_size
    embedding_weights = np.zeros((vocab_size, embedding_dim))
    for idx, word in enumerate(vocab_list):
        if idx == pad_idx:
            # Padding carries no meaning: keep its row at zero.
            continue
        if word in word2vec_model.wv:
            embedding_weights[idx] = word2vec_model.wv[word]
        else:
            # Tokens unknown to Word2Vec (e.g. "<unk>") get a random vector.
            embedding_weights[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

    # Convert the weight matrix to a PyTorch tensor.
    embedding_weights = torch.tensor(embedding_weights, dtype=torch.float32)

    # Create the Embedding layer from the weights; freeze=False keeps it
    # trainable and padding_idx stops gradient updates to the "<pad>" row.
    embedding = nn.Embedding.from_pretrained(embedding_weights, freeze=False, padding_idx=pad_idx)

    return vocab, vocab_list, padded_tensor_data, embedding

In [44]:
def trans2matrix(idx_data, embedding):
    """Look up the embedding vectors for a collection of token indices.

    Args:
        idx_data: token indices as a tensor or (nested) list of ints. Any
            shape is accepted, e.g. one sentence ``(seq_len,)`` or a batch
            ``(batch, seq_len)``; an empty list is allowed.
        embedding: callable applied to the index tensor (an ``nn.Embedding``
            in this notebook).

    Returns:
        Whatever ``embedding`` returns for the indices; for ``nn.Embedding``
        this is a tensor of shape ``idx_data.shape + (embedding_dim,)``.
    """
    # as_tensor reuses an existing long tensor instead of copying it element
    # by element; dtype=long also keeps an empty list a valid index tensor.
    indices = torch.as_tensor(idx_data, dtype=torch.long)
    return embedding(indices)


In [45]:
# Location of the training split. NOTE: machine-specific absolute path --
# adjust it before running this notebook elsewhere.
# The r"" prefix already keeps backslashes literal, so each separator is
# written once (r"...\\..." would put doubled separators into the path).
path = r"D:\Code\ABSA\data\train.raw"
data = file2data(path)


In [46]:
# Build the vocabulary, its index-to-token list, the padded index tensor and
# the embedding layer from the loaded (sentence, target) tuples.
vocab, vocab_list, idx_data, embedding = handle(data)

In [47]:
# Show the padded index row of the first sample (the trailing 1s in the
# captured output are the padding value used by handle()).
print(idx_data[0])

tensor([ 9294,  5122,  4952,  5384,    42,  9294, 10538,    25, 13030,    25,
        14010,  7927,  8587, 11283, 14124,  9739,  6273,  8879,    42,  9629,
        13067,  5682,  6211,  4922, 13981, 15027, 11909,  4922, 11954,    42,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1])


In [48]:
# Sanity check: map index 9294 to its token and back through the vocabulary,
# then run the (repeated) index through the embedding layer and show the shape.
input_indices = torch.tensor([vocab[vocab_list[9294]]] * 2)
print(input_indices)
output_vectors = embedding(input_indices)
print(output_vectors.size())

tensor([9294, 9294])
torch.Size([2, 100])


In [49]:
# Embed the padded index row of the third sample and print the resulting shape.
print(trans2matrix(idx_data[2], embedding).shape)

torch.Size([73, 100])
