In [1]:
import re
import numpy as np
import nltk
from collections import Counter
import torch
import gensim
from gensim.models import Word2Vec
from torch import nn
from torch.nn.functional import one_hot
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim




In [2]:
def file2data(file_path: str) -> list:
    """_summary_

    Args:
        file_path (str): absa file path

    Returns:
        list: every elements contain 2 item —— sentence, targets
    """
    res = []

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for i in range(0, len(lines), 3):
        sentence = lines[i].strip()
        targets = lines[i + 1].strip().split(',')

        for target in targets:
            target = target.strip()
            processed_sentence = re.sub(r'\$T\$', target, sentence)

            data_tuple = (processed_sentence, target)
            res.append(data_tuple)

    return res

In [3]:
def tokenize_sentence(sentence):
    return word_tokenize(sentence)

def build_vocab(data_tuples):
    counter = Counter()
    for sentence, _ in data_tuples:
        tokens = tokenize_sentence(sentence)
        counter.update(tokens)
    return build_vocab_from_iterator([counter.keys()], specials=["<unk>", "<pad>"])

def sentence_to_tensor(sentence, vocab, max_len):
    tokens = tokenize_sentence(sentence)
    token_ids = [vocab[token] for token in tokens]
    token_ids = token_ids[:max_len] + [vocab["<pad>"]] * (max_len - len(token_ids))
    return torch.tensor(token_ids, dtype=torch.long)


In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VirmarQ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
def generate_bio_labels(sentence, target, vocab, max_len):
    tokens = tokenize_sentence(sentence)
    labels = ["O"] * len(tokens)
    
    target_tokens = tokenize_sentence(target)
    target_len = len(target_tokens)
    
    for i in range(len(tokens) - target_len + 1):
        if tokens[i:i + target_len] == target_tokens:
            labels[i] = "B"
            for j in range(1, target_len):
                labels[i + j] = "I"
            break
    
    label_ids = [0 if label == "O" else 1 if label == "B" else 2 for label in labels]
    label_ids = label_ids[:max_len] + [0] * (max_len - len(label_ids))
    return torch.tensor(label_ids, dtype=torch.long)

In [6]:
def train_word2vec(sentences, embed_dim=256):
    model = Word2Vec(sentences, vector_size=embed_dim, window=5, min_count=1, workers=4)
    return model

def build_embedding_matrix(vocab, word2vec_model, embed_dim=256):
    embedding_matrix = np.zeros((len(vocab), embed_dim))
    for i, word in enumerate(vocab.get_itos()):
        if word in word2vec_model.wv:
            embedding_matrix[i] = word2vec_model.wv[word]
        else:
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embed_dim, ))
    return torch.tensor(embedding_matrix, dtype=torch.float32)


In [7]:
def data2vec(data):
    vocab = build_vocab(data)
    max_len = max(len(tokenize_sentence(sentence)) for sentence, _ in data)
    sentences = [tokenize_sentence(sentence) for sentence, _ in data]
    word2vec_model = train_word2vec(sentences)
    embedding_matrix = build_embedding_matrix(vocab, word2vec_model)

    sentence_tensors = []
    bio_labels = []

    for sentence, target in data:
        sentence_tensor = sentence_to_tensor(sentence, vocab, max_len)
        bio_label = generate_bio_labels(sentence, target, vocab, max_len)
        sentence_tensors.append(sentence_tensor)
        bio_labels.append(bio_label)

    # 将结果转换为Tensor
    sentence_tensors = torch.stack(sentence_tensors)
    bio_labels = torch.stack(bio_labels)

    # 输出结果
    print("词表大小：", len(vocab))
    print("句子词下标向量示例：", sentence_tensors[0])
    print("BIO标签示例：", bio_labels[0])

    return vocab, sentence_tensors, bio_labels, embedding_matrix

In [8]:
path = r"D:\\Code\\ABSA\\data\\train.raw"
data = file2data(path)
vocab, features, labels, embedding = data2vec(data)


词表大小： 15238
句子词下标向量示例： tensor([ 9294,  5122,  4952,  5384,    42,  9294, 10538,    25, 13030,    25,
        14010,  7927,  8587, 11283, 14124,  9739,  6273,  8879,    42,  9629,
        13067,  5682,  6211,  4922, 13981, 15027, 11909,  4922, 11954,    42,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1])
BIO标签示例： tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])


In [9]:
print(data[1])
print(features[1])
print(labels[1])

('musicmonday britney spears - lucky do you remember this song ? it ` s awesome . i love it .', 'britney spears')
tensor([10867,  6070, 13398,    26, 10340,  7464, 15157, 12394, 14028, 13324,
          451,  9629,  4921, 12715,  5558,    42,  9294, 10312,  9629,    42,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1])
tensor([0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0])


In [10]:
class AteData(Dataset):
    def __init__(self, sentence_idx, target_idx, embeddings):
        self.sentence_idx = sentence_idx
        self.target_idx = target_idx
        self.embeddings = embeddings

    def __len__(self):
        return len(self.sentence_idx)

    def trans2matrix(self, idx_vec):
        res = []
        for i in idx_vec:
            res.append(list(self.embeddings[i]))
        return torch.tensor(res)

    def __getitem__(self, index):
        sentence_tensor = self.sentence_idx[index]
        target_tensor = self.target_idx[index]
        sentence_embedding = self.trans2matrix(sentence_tensor)
        return sentence_embedding.long(), target_tensor

In [11]:
# 创建Dataset和DataLoader
dataset = AteData(features, labels, embedding)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [12]:
# 示例：检查DataLoader输出
for batch_idx, (sentence_embeddings, target_labels) in enumerate(dataloader):
    print(f"Batch {batch_idx + 1}")
    print("Sentence Embeddings Shape:", sentence_embeddings.shape)
    print("Target Labels Shape:", target_labels.shape)
    # break  # 仅检查第一个批次

Batch 1
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 2
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 3
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 4
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 5
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 6
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 7
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 8
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 9
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labels Shape: torch.Size([16, 73])
Batch 10
Sentence Embeddings Shape: torch.Size([16, 73, 256])
Target Labe