In [2]:
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
data_train = pd.read_csv('task1/train.tsv', sep='\t')
data_test = pd.read_csv('task1/test.tsv', sep='\t')

In [4]:
phrases = data_train['Phrase'].values
y = data_train['Sentiment'].values

## 简单的词向量

In [5]:
cv = CountVectorizer()
X = cv.fit_transform(phrases).toarray()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

## padding词向量

In [5]:
def build_vocabulary(raw_docs):
    vocab = {"<PAD>": 0}
    for doc in raw_docs:
        for word in doc.split():
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

vocab = build_vocabulary(phrases)

In [6]:
def text_to_indices(vocab, text):
    return [vocab[word] for word in text.split()]

In [7]:
X_indices = [text_to_indices(vocab, text) for text in phrases]

In [8]:
X_padded = pad_sequence([torch.tensor(indices) for indices in X_indices], batch_first=True, padding_value=0)

In [9]:
y = torch.tensor(y, dtype=torch.long)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, random_state=42)
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

  X_train = torch.tensor(X_train, dtype=torch.long)
  y_train = torch.tensor(y_train, dtype=torch.long)
  X_test = torch.tensor(X_test, dtype=torch.long)
  y_test = torch.tensor(y_test, dtype=torch.long)


In [20]:
class TextDataset(Dataset):
    def __init__(self,X,y) -> None:
        super().__init__()
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]

In [21]:
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

In [22]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [23]:
class LogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim) -> None:
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
    
    def forward(self, X):
        return self.linear(X)

In [42]:
class LogisticRegressionWithEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_dim) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.linear = nn.Linear(embed_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(dim=1)
        
        return self.linear(x)

## RNN

In [48]:
def TextRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, bidirectional=True)
        # self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.f1 = nn.Sequential(nn.Linear(hidden_dim*2, 64), nn.Softmax())
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1,0,2)
        x, _ = self.rnn(x)
        # x = x.mean(dim=1)
        return self.linear(x)

SyntaxError: invalid syntax (3989882823.py, line 1)

In [43]:
vocab_size = len(vocab)
embed_dim = 50  # 嵌入维度，可以调整
output_dim = len(np.unique(y))

In [44]:
model = LogisticRegressionWithEmbedding(vocab_size, embed_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [45]:
num_epochs = 100
losses = []
for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        output = model(batch_x)
        loss = criterion(output, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss
    
    if (epoch % 10 == 0):
        print(f"Epochs: {epoch}, loss={loss.item()}")

  1%|          | 1/100 [00:09<16:14,  9.84s/it]

Epochs: 0, loss=0.8985151648521423


 11%|█         | 11/100 [01:40<13:28,  9.08s/it]

Epochs: 10, loss=0.7739977240562439


 21%|██        | 21/100 [03:16<12:36,  9.58s/it]

Epochs: 20, loss=0.7139894366264343


 31%|███       | 31/100 [04:57<12:05, 10.52s/it]

Epochs: 30, loss=0.47800853848457336


 41%|████      | 41/100 [06:47<10:57, 11.14s/it]

Epochs: 40, loss=1.0031400918960571


 51%|█████     | 51/100 [08:36<08:53, 10.89s/it]

Epochs: 50, loss=0.7111338973045349


 61%|██████    | 61/100 [10:28<07:24, 11.40s/it]

Epochs: 60, loss=0.4461754262447357


 71%|███████   | 71/100 [12:21<05:26, 11.26s/it]

Epochs: 70, loss=0.9004294872283936


 81%|████████  | 81/100 [14:05<03:16, 10.33s/it]

Epochs: 80, loss=0.4519774615764618


 91%|█████████ | 91/100 [15:58<01:42, 11.42s/it]

Epochs: 90, loss=0.6723834276199341


100%|██████████| 100/100 [17:38<00:00, 10.58s/it]


In [46]:
model.eval()
with torch.no_grad():
    output = model(X_test)
    _, pred = torch.max(output, 1)
    acc = accuracy_score(y_test, pred)
    print(f"Test loss: {loss.item()}, acc:{acc}")

Test loss: 0.4245510995388031, acc:0.6433423042419583


In [47]:
res = model(X_test)
_, pred = torch.max(model(X_test),1 )