In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [53]:
df_train = pd.read_csv('new_train.tsv', header=0, sep='\t')
train_text = list(np.array(df_train)[:, 0])
train_labels = list(np.array(df_train)[:, 1])
df_test = pd.read_csv('new_test.tsv', header=0, sep='\t')
test_text = list(np.array(df_test)[:, 0])
test_labels = list(np.array(df_test)[:, 1])

In [62]:
# 句子数据（简单示例）
# sentences = [("I love this movie", 1),
#              ("This is a great day", 1),
#              ("I hate this weather", 0),
#              ("This food is terrible", 0)]

train_sentences = list(zip(train_text, train_labels))
test_sentences = list(zip(test_text, test_labels))
sentences = train_sentences+test_sentences

# 构建词典
word_list = list(set(" ".join([s[0] for s in sentences]).split()))
word_dict = {w: i for i, w in enumerate(word_list)}
word_dict["<PAD>"] = 0  # 添加填充标记
vocab_size = len(word_dict)

max_len = max(len(s[0].split()) for s in sentences)

# 句子转换为索引（并填充）
def sentence_to_tensor(sentence):
    idxs = [word_dict[word] for word in sentence.split()]
    # 填充到 max_len
    idxs += [0] * (max_len - len(idxs))
    return torch.tensor(idxs, dtype=torch.long)

In [74]:

# 超参数
embed_dim = 512  # 词向量维度
num_classes = max([i for i in train_labels])+1  # 正向 or 负向

# CNN 模型
class SentimentCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(SentimentCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv = nn.Conv1d(in_channels=embed_dim, out_channels=16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(0.5)  # **新增 Dropout**

        # **动态计算 feature_dim**
        with torch.no_grad():
            sample_input = torch.randint(0, vocab_size, (1, max_len))  # 随机输入
            sample_output = self.pool(torch.relu(self.conv(self.embedding(sample_input).permute(0, 2, 1))))
            feature_dim = sample_output.shape[1] * sample_output.shape[2]

        self.fc = nn.Linear(feature_dim, num_classes)  # **修正 Linear 层**

    def forward(self, x):
        x = self.embedding(x)  # (batch, seq_len) -> (batch, seq_len, embed_dim)
        x = x.permute(0, 2, 1)  # (batch, embed_dim, seq_len)
        x = torch.relu(self.conv(x))
        x = self.pool(x)

        # print(f"Shape after pooling: {x.shape}")  # **检查展平前的形状**

        x = x.view(x.size(0), -1)  # **展平成 (batch_size, feature_dim)**
        x = self.dropout(x)  # **在全连接层前使用 Dropout**
        x = self.fc(x)
        return x


In [75]:
# 创建模型
model = SentimentCNN(vocab_size, embed_dim, num_classes)

# 损失函数 & 优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0002, weight_decay=1e-4)

# 准备数据
X_train = torch.stack([sentence_to_tensor(s[0]) for s in train_sentences])
y_train = torch.tensor([s[1] for s in train_sentences], dtype=torch.long)
X_test = torch.stack([sentence_to_tensor(s[0]) for s in test_sentences])
y_test = torch.tensor([s[1] for s in test_sentences], dtype=torch.long)

In [76]:
# 训练模型
train_losses = []
test_losses = []
for epoch in range(1000):
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    test_output = model(X_test)
    test_loss = criterion(test_output, y_test)

    train_losses.append(loss.item())  # 记录训练损失
    test_losses.append(test_loss.item())  # 记录测试损失

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
        print(f"Test Loss: {test_loss.item():.4f}")

plt.figure(figsize=(10, 5))
plt.plot(range(len(train_losses)), train_losses, label="Train Loss", color='blue')
plt.plot(range(len(test_losses)), test_losses, label="Test Loss", color='red')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training & Testing Loss Over Epochs")
plt.legend()
plt.grid()
plt.show()


Epoch 10, Loss: 1.5722
Test Loss: 1.5803
Epoch 20, Loss: 1.5274
Test Loss: 1.5421
Epoch 30, Loss: 1.4998
Test Loss: 1.5297
Epoch 40, Loss: 1.4873
Test Loss: 1.5286
Epoch 50, Loss: 1.4720
Test Loss: 1.5155
Epoch 60, Loss: 1.4512
Test Loss: 1.5202
Epoch 70, Loss: 1.4391
Test Loss: 1.5048
Epoch 80, Loss: 1.4245
Test Loss: 1.5093
Epoch 90, Loss: 1.4125
Test Loss: 1.5044
Epoch 100, Loss: 1.3941
Test Loss: 1.5019
Epoch 110, Loss: 1.3864
Test Loss: 1.5029
Epoch 120, Loss: 1.3661
Test Loss: 1.4961
Epoch 130, Loss: 1.3584
Test Loss: 1.5018
Epoch 140, Loss: 1.3383
Test Loss: 1.4965
Epoch 150, Loss: 1.3209
Test Loss: 1.4828
Epoch 160, Loss: 1.3091
Test Loss: 1.4977
Epoch 170, Loss: 1.2892
Test Loss: 1.4918
Epoch 180, Loss: 1.2774
Test Loss: 1.5010
Epoch 190, Loss: 1.2618
Test Loss: 1.4968
Epoch 200, Loss: 1.2529
Test Loss: 1.4979
Epoch 210, Loss: 1.2319
Test Loss: 1.5007
Epoch 220, Loss: 1.2171
Test Loss: 1.5027
Epoch 230, Loss: 1.2018
Test Loss: 1.4939
Epoch 240, Loss: 1.1861
Test Loss: 1.4964
E

KeyboardInterrupt: 

In [70]:
print(f"Test Loss: {criterion(model(X_test), y_test).item():.4f}")

Test Loss: 1.6394


In [51]:
# 预测
test_sentence = test_sentences[0]
X_test = sentence_to_tensor(test_sentence).unsqueeze(0)
pred = model(X_test).argmax(dim=1).item()
print(f"Sentence: '{test_sentence}', Prediction: {pred}")


Sentence: 'Once you get into its rhythm ... the movie becomes a heady experience .', Prediction: 1
