In [9]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Directory the kernel was started in; load_data() looks for "train.tsv" here.
base_dir = os.getcwd()  

In [10]:
def load_data(batch_size = 128, max_len = 15, data_path = None):
    """Load the phrase/sentiment TSV and build padded train/validation loaders.

    The rows are split 85/15 with a fixed ``random_state`` of 2. The vocabulary
    is built from the whitespace tokens of the training split only, and every
    phrase is encoded as exactly ``max_len`` token ids (truncated, or
    right-padded with the ``<PAD>`` id).

    Args:
        batch_size: Batch size used for both loaders.
        max_len: Number of token ids kept per phrase (must be >= 1).
        data_path: Tab-separated file with ``Phrase`` and ``Sentiment``
            columns. Defaults to ``train.tsv`` inside the module-level
            ``base_dir``.

    Returns:
        ``(train_loader, val_loader, vocab)``. ``vocab`` maps token -> id with
        ``"<PAD>"`` at 0 and ``"<UNK>"`` at 1; the training loader shuffles,
        the validation loader does not.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")
    if data_path is None:
        data_path = os.path.join(base_dir, "train.tsv")

    # keep_default_na=False: by default pandas parses cells such as "None",
    # "NA" or "null" as float NaN, which would make `.split()` below fail on
    # phrases that consist of exactly one of those words.
    df = pd.read_csv(data_path, sep="\t", keep_default_na=False)
    X_text = df["Phrase"].astype(str).tolist()
    y = df["Sentiment"].tolist()

    X_train, X_val, y_train, y_val = train_test_split(
        X_text, y, test_size=0.15, random_state=2
    )

    pad_token, unk_token = "<PAD>", "<UNK>"
    vocab = {pad_token: 0, unk_token: 1}
    # setdefault keeps ids contiguous in first-seen order and never lets a
    # literal "<PAD>"/"<UNK>" token in the data overwrite the reserved ids
    # (which would leave len(vocab) smaller than the largest id).
    for sentence in X_train:
        for word in sentence.split():
            vocab.setdefault(word, len(vocab))

    pad_idx, unk_idx = vocab[pad_token], vocab[unk_token]

    def encode_all(sentences):
        """Encode phrases into one (len(sentences), max_len) int64 tensor."""
        rows = []
        for sentence in sentences:
            idxs = [vocab.get(tok, unk_idx) for tok in sentence.split()[:max_len]]
            idxs.extend([pad_idx] * (max_len - len(idxs)))
            rows.append(idxs)
        # reshape keeps the 2-D shape even when `sentences` is empty.
        return torch.tensor(rows, dtype=torch.long).reshape(len(rows), max_len)

    train_dataset = TensorDataset(
        encode_all(X_train), torch.tensor(y_train, dtype=torch.long)
    )
    val_dataset = TensorDataset(
        encode_all(X_val), torch.tensor(y_val, dtype=torch.long)
    )

    train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size, shuffle=False)

    return train_loader, val_loader, vocab

In [11]:
def embedding(method, vocab_size, embedding_dim, vocab=None, glove_path=None, freeze=True):
    """Build the embedding layer, randomly initialised or filled from GloVe.

    Args:
        method: ``'random'`` for a fresh trainable ``nn.Embedding``, or
            ``'glove'`` to copy vectors from a GloVe text file.
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Width of each embedding vector.
        vocab: Token -> row-index mapping (only used for ``'glove'``). When
            omitted, the module-level ``vocab`` variable is used, which is
            what this function previously read implicitly.
        glove_path: Path of the GloVe text file (one ``word v1 v2 ...`` entry
            per line). Defaults to the path that used to be hard-coded here.
        freeze: Passed to ``nn.Embedding.from_pretrained``; ``True`` keeps the
            table fixed during training.

    Returns:
        An ``nn.Embedding`` of shape ``(vocab_size, embedding_dim)``. For
        ``'glove'``, rows whose token is missing from the file keep their
        standard-normal initialisation.

    Raises:
        ValueError: Unknown ``method``, no vocabulary available, or a matched
            GloVe vector whose length differs from ``embedding_dim``.
    """
    if method == 'random':
        return nn.Embedding(vocab_size, embedding_dim)
    if method != 'glove':
        raise ValueError("Unknown embedding method")

    if vocab is None:
        # Backward compatibility: fall back to the notebook-level variable.
        vocab = globals().get("vocab")
        if vocab is None:
            raise ValueError(
                "A vocabulary is required for GloVe embeddings; pass vocab=..."
            )
    if glove_path is None:
        glove_path = r"C:\Users\86178\Desktop\学习\glove.6B.100d.txt"

    # Tokens containing upper-case characters, grouped by their lower-case
    # form, so that e.g. "The" can borrow the vector of "the" when the file
    # has no entry for the cased spelling (the 6B GloVe files are lower-cased).
    cased_rows = {}
    for token, row in vocab.items():
        lowered = token.lower()
        if lowered != token:
            cased_rows.setdefault(lowered, []).append(row)

    emb_matrix = np.random.randn(vocab_size, embedding_dim)
    exact_rows = set()  # rows filled from an exact-spelling match
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            word = values[0]
            exact = vocab.get(word)
            fallback = cased_rows.get(word, ())
            if exact is None and not fallback:
                # Skip the float parsing for words that are not needed.
                continue
            if len(values) - 1 != embedding_dim:
                raise ValueError(
                    f"GloVe vector for {word!r} has {len(values) - 1} dimensions, "
                    f"expected {embedding_dim}"
                )
            vector = np.array(values[1:], dtype=float)
            if exact is not None:
                emb_matrix[exact] = vector
                exact_rows.add(exact)
            for row in fallback:
                # An exact-spelling match always wins over the fallback.
                if row not in exact_rows:
                    emb_matrix[row] = vector

    return nn.Embedding.from_pretrained(
        torch.tensor(emb_matrix, dtype=torch.float), freeze=freeze
    )

In [12]:
class TextCNN(nn.Module):
    """Convolutional text classifier over token-id sequences.

    Each kernel size gets its own ``Conv2d`` spanning the full embedding
    width; the feature maps are ReLU-activated, max-pooled over time,
    concatenated, passed through dropout and projected to ``num_classes``.

    Args:
        vocab_size: Number of rows of the embedding table; only used when
            ``embedding_layer`` is not supplied.
        embedding_dim: Width of the embedding vectors.
        num_classes: Number of output logits.
        embedding_layer: Optional ready-made embedding module. When ``None``
            a fresh ``nn.Embedding(vocab_size, embedding_dim)`` is created.
        kernel_sizes: Heights (in tokens) of the convolution windows.
        num_filters: Output channels per kernel size.
        dropout: Dropout probability applied before the final linear layer.

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_layer=None,
                 kernel_sizes=(3, 4, 5), num_filters=100, dropout=0.5):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one size")
        if embedding_layer is None:
            embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = embedding_layer
        # Shortest sequence every convolution can still slide over.
        self.min_len = max(kernel_sizes)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embedding_dim)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits (batch, num_classes)."""
        x = self.embedding(x)                                   # (B, L, D)
        deficit = self.min_len - x.size(1)
        if deficit > 0:
            # Zero-pad the time axis so the widest kernel still fits;
            # without this Conv2d raises for inputs shorter than the kernel.
            x = F.pad(x, (0, 0, 0, deficit))
        x = x.unsqueeze(1)                                      # (B, 1, L, D)
        feature_maps = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over time collapses each map to one value per filter.
        pooled = [F.max_pool1d(fm, fm.size(2)).squeeze(2) for fm in feature_maps]
        x = torch.cat(pooled, 1)                                # (B, filters * kernels)
        x = self.dropout(x)
        return self.fc(x)

In [13]:
class TextRNN(nn.Module):
    """Bidirectional single-layer LSTM text classifier.

    The final hidden states of the forward and backward directions are
    concatenated, passed through dropout and projected to ``num_classes``.
    The whole input sequence is fed to the LSTM as-is, so any padding
    positions are processed like ordinary tokens.

    Args:
        vocab_size: Number of rows of the embedding table; only used when
            ``embedding_layer`` is not supplied.
        embedding_dim: Width of the embedding vectors (LSTM input size).
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.
        embedding_layer: Optional ready-made embedding module. When ``None``
            a fresh ``nn.Embedding(vocab_size, embedding_dim)`` is created;
            previously ``None`` was stored and ``forward`` failed.
        dropout: Dropout probability applied to the concatenated hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes,
                 embedding_layer=None, dropout=0.3):
        super().__init__()
        if embedding_layer is None:
            embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True
        )

        self.dropout = nn.Dropout(dropout)
        # Two directions -> twice the hidden size.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits (batch, num_classes)."""
        embedded = self.embedding(x)                 # (B, L, D)
        _, (h_n, _) = self.lstm(embedded)            # h_n: (2, B, H)
        # Forward-direction state followed by backward-direction state, per sample.
        final_state = torch.cat([h_n[0], h_n[1]], dim=1)   # (B, 2H)
        final_state = self.dropout(final_state)
        return self.fc(final_state)

In [14]:
def forward_model(model, train_loader, dev_loader, num_epochs=50, device="cuda"):
    """Train ``model`` with Adam and early stopping on dev-set accuracy.

    After every epoch the model is evaluated on ``dev_loader``. Training
    stops once the accuracy has not improved for 10 consecutive epochs. The
    weights of the best-scoring epoch are loaded back into ``model`` before
    returning, so the caller does not end up with the last (possibly worse)
    epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        dev_loader: Iterable of ``(inputs, labels)`` validation batches.
        num_epochs: Maximum number of epochs.
        device: Target device. A CUDA device is replaced by the CPU when
            CUDA is not available instead of raising.

    Returns:
        The best dev accuracy observed, as a float in ``[0, 1]`` (``0.0`` if
        ``dev_loader`` yields no samples).
    """
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available; falling back to CPU.")
        device = torch.device("cpu")

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    best_dev_acc = 0
    best_state = None
    patience, patience_counter = 10, 0

    for epoch in range(num_epochs):
        model.train()
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for batch_x, batch_y in dev_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                preds = outputs.argmax(dim=1)
                correct += (preds == batch_y).sum().item()
                total += batch_y.size(0)
        # Guard against an empty dev set instead of dividing by zero.
        dev_acc = correct / total if total else 0.0

        print(f"Epoch {epoch+1}, Dev Acc = {dev_acc:.4f}")

        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            patience_counter = 0
            # Snapshot (clone) the weights; state_dict() alone returns live references.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return float(best_dev_acc)

In [None]:
# Experiment settings, collected here so they are not repeated across the calls below.
SEED = 0
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_CLASSES = 5
NUM_EPOCHS = 50

# Seed both generators: NumPy initialises the rows of the GloVe matrix that
# have no pretrained vector, torch drives weight init, dropout and shuffling.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when there is one instead of assuming "cuda" exists.
device = "cuda" if torch.cuda.is_available() else "cpu"

train_loader, val_loader, vocab = load_data()
embedding_layer = embedding(method = 'glove', vocab_size = len(vocab), embedding_dim = EMBEDDING_DIM)
# NOTE: both models receive the same embedding module. That is harmless for
# 'glove' because nn.Embedding.from_pretrained freezes the table by default;
# with a trainable embedding the second model would start from weights the
# first one has already updated.
CNNmodel = TextCNN(vocab_size = len(vocab), embedding_dim = EMBEDDING_DIM, num_classes = NUM_CLASSES, embedding_layer = embedding_layer)
RNNmodel = TextRNN(vocab_size = len(vocab), embedding_dim = EMBEDDING_DIM, hidden_dim = HIDDEN_DIM, num_classes = NUM_CLASSES, embedding_layer = embedding_layer)
print('-- CNN model --')
forward_model(CNNmodel, train_loader, val_loader, num_epochs=NUM_EPOCHS, device=device)
print('-- RNN model --')
forward_model(RNNmodel, train_loader, val_loader, num_epochs=NUM_EPOCHS, device=device)