## 1. 导入必要的库

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

## 2. 加载数据

In [None]:
# Read the training and test splits from JSON files in the working directory.
train_df, test_df = (pd.read_json(path) for path in ('train.json', 'test.json'))

# Report the shape of each frame, then preview the first training rows.
print('Train data shape:', train_df.shape)
print('Test data shape:', test_df.shape)
train_df.head()

## 3. 数据预处理

In [None]:
def preprocess_text(text):
    """Normalise a raw review string and split it into tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the remainder is handed to
    NLTK's ``word_tokenize``.

    Args:
        text: The review text as a ``str``.

    Returns:
        Whatever ``word_tokenize`` returns for the cleaned text.
    """
    lowered = text.lower()
    # Strip punctuation and other symbols; letters, digits, '_' and spaces stay.
    cleaned = re.sub(r'[^\w\s]', '', lowered)
    return word_tokenize(cleaned)

# Tokenise the 'reviews' column of both splits into a new 'tokens' column.
for frame in (train_df, test_df):
    frame['tokens'] = frame['reviews'].apply(preprocess_text)

# Print the first few token lists of each split as a sanity check.
print('Train tokens:')
print(train_df['tokens'].head())
print('\nTest tokens:')
print(test_df['tokens'].head())

### 3.1 构建词汇表

In [None]:
# Flatten every training token list into one long list of tokens.
all_tokens = []
for tokens_list in train_df['tokens']:
    all_tokens.extend(tokens_list)

# Rank words from most to least frequent; equal counts keep first-seen order.
word_counts = Counter(all_tokens)
sorted_words = [word for word, _ in word_counts.most_common()]

# Ids start at 1, so 0 is never assigned to a vocabulary word.
vocab_to_int = {word: idx for idx, word in enumerate(sorted_words, start=1)}
int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}

print(f'Vocabulary size: {len(vocab_to_int)}')

### 3.2 文本序列化

In [None]:
# Replace each token by its vocabulary id; tokens missing from the
# vocabulary (possible in the test split) are mapped to 0.
for frame in (train_df, test_df):
    frame['review_int'] = frame['tokens'].apply(
        lambda words: [vocab_to_int.get(word, 0) for word in words]
    )

train_df.head()

### 3.3 序列填充

In [None]:
def pad_features(reviews_int, seq_length):
    """Left-pad or truncate integer sequences to a fixed length.

    Each sequence is right-aligned in its output row and the unused leading
    positions stay 0. Sequences longer than ``seq_length`` keep only their
    first ``seq_length`` items.

    Args:
        reviews_int: Sized iterable of integer sequences (lists or arrays).
        seq_length: Width of every output row.

    Returns:
        An integer ``numpy.ndarray`` of shape ``(len(reviews_int), seq_length)``.
    """
    features = np.zeros((len(reviews_int), seq_length), dtype=int)
    for i, row in enumerate(reviews_int):
        kept = np.asarray(row)[:seq_length]
        # An empty sequence must be skipped explicitly: the slice ``-0:``
        # selects the WHOLE row, and assigning an empty array to it raises
        # a broadcasting ValueError.
        if kept.size:
            features[i, -kept.size:] = kept
    return features

seq_length = 200

# Bring the encoded reviews of both splits to the same fixed width.
X_train, X_test = (
    pad_features(frame['review_int'], seq_length) for frame in (train_df, test_df)
)
# Training targets come from the 'sentiments' column.
y_train = train_df['sentiments'].values

print('Padded training data shape:', X_train.shape)

### 3.4 创建数据集和数据加载器

In [None]:
class ReviewDataset(Dataset):
    """Dataset pairing encoded reviews with their labels.

    Both NumPy arrays are converted to tensors once, at construction time:
    features become int64 and labels become float32.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X)
        targets = torch.from_numpy(y)
        self.X = features.to(torch.long)
        self.y = targets.to(torch.float)

    def __len__(self):
        # Number of samples equals the length of the feature tensor.
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

batch_size = 50

# Wrap the padded arrays in a dataset and serve them in shuffled mini-batches.
train_data = ReviewDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

## 4. 模型定义

### 4.1 LSTM 模型

In [None]:
class SentimentLSTM(nn.Module):
    """Unidirectional LSTM sentiment classifier.

    Token ids are embedded and run through a stacked LSTM; the LSTM output at
    the last time step goes through dropout, a linear layer and a sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Width of the final linear layer.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability applied between LSTM layers.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM only applies dropout between layers; with a single layer it
        # has no effect and just emits a warning, so it is disabled there.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob if n_layers > 1 else 0.0,
                            batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Score a batch of encoded reviews.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.
            hidden: Optional ``(h_0, c_0)`` tuple as produced by
                ``init_hidden``; ``None`` lets the LSTM start from zeros.

        Returns:
            ``(scores, hidden)``: ``scores`` has shape ``(batch,)`` and holds
            the sigmoid of the last output unit at the last time step;
            ``hidden`` is the LSTM's final ``(h_n, c_n)``.
        """
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Only the last time step is returned, so the head is applied to that
        # step alone instead of to every position of the sequence.
        last_step = lstm_out[:, -1, :]
        out = self.fc(self.dropout(last_step))
        sig_out = self.sig(out)
        # Keep the original contract: the last output unit, one value per sample.
        return sig_out[:, -1], hidden

    def init_hidden(self, batch_size):
        """Return zero ``(h_0, c_0)`` tensors on the model's device and dtype."""
        ref = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (torch.zeros(shape, dtype=ref.dtype, device=ref.device),
                torch.zeros(shape, dtype=ref.dtype, device=ref.device))

### 4.2 双向 LSTM 模型

In [None]:
class SentimentBiLSTM(nn.Module):
    """Bidirectional LSTM sentiment classifier.

    Token ids are embedded and run through a stacked bidirectional LSTM. The
    final state of each direction is concatenated and passed through dropout,
    a linear layer and a sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Width of the final linear layer.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state per direction.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability applied between LSTM layers.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM only applies dropout between layers; with a single layer it
        # has no effect and just emits a warning, so it is disabled there.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob if n_layers > 1 else 0.0,
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.3)
        # Two directions are concatenated, hence hidden_dim * 2 input features.
        self.fc = nn.Linear(hidden_dim * 2, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Score a batch of encoded reviews.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.
            hidden: Optional ``(h_0, c_0)`` tuple as produced by
                ``init_hidden``; ``None`` lets the LSTM start from zeros.

        Returns:
            ``(scores, hidden)``: ``scores`` has shape ``(batch,)`` and holds
            the sigmoid of the last output unit; ``hidden`` is the LSTM's
            final ``(h_n, c_n)``.
        """
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # The per-step output is [forward | reverse]. The forward direction
        # has read the whole sequence at the LAST step, but the reverse
        # direction has read it at the FIRST step; at the last step it has
        # seen a single token. Take each direction where it is complete.
        forward_final = lstm_out[:, -1, :self.hidden_dim]
        reverse_final = lstm_out[:, 0, self.hidden_dim:]
        summary = torch.cat((forward_final, reverse_final), dim=1)
        out = self.fc(self.dropout(summary))
        sig_out = self.sig(out)
        # Keep the original contract: the last output unit, one value per sample.
        return sig_out[:, -1], hidden

    def init_hidden(self, batch_size):
        """Return zero ``(h_0, c_0)`` tensors on the model's device and dtype."""
        ref = next(self.parameters())
        # One state per layer and per direction.
        shape = (self.n_layers * 2, batch_size, self.hidden_dim)
        return (torch.zeros(shape, dtype=ref.dtype, device=ref.device),
                torch.zeros(shape, dtype=ref.dtype, device=ref.device))

### 4.3 CNN 模型

In [None]:
class SentimentCNN(nn.Module):
    """Convolutional sentiment classifier with several filter widths.

    One ``Conv2d`` per entry of ``filter_sizes`` slides over the embedded
    sequence; each feature map is max-pooled over its full length and the
    pooled features are concatenated, dropped out, and fed to a linear layer
    followed by a sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        n_filters: Output channels of every convolution.
        filter_sizes: Iterable of kernel heights (tokens covered per filter).
        output_size: Width of the final linear layer.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_size, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Each kernel spans the full embedding width, so it only moves along
        # the sequence axis.
        conv_layers = []
        for fs in filter_sizes:
            conv_layers.append(
                nn.Conv2d(in_channels=1, out_channels=n_filters,
                          kernel_size=(fs, embedding_dim))
            )
        self.convs = nn.ModuleList(conv_layers)

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_size)
        self.dropout = nn.Dropout(dropout)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores for ``x``.

        Assumes ``x`` holds integer token ids shaped ``(batch, seq_len)``;
        the result then has shape ``(batch, output_size)``.
        """
        # Insert a channel axis so Conv2d sees a single input channel.
        embedded = self.embedding(x).unsqueeze(1)

        pooled = []
        for conv in self.convs:
            # The kernel consumed the embedding axis, leaving width 1 at dim 3.
            feature_map = torch.relu(conv(embedded)).squeeze(3)
            # Max over the whole remaining sequence axis.
            strongest = torch.max_pool1d(feature_map, feature_map.shape[2])
            pooled.append(strongest.squeeze(2))

        features = self.dropout(torch.cat(pooled, dim=1))
        return self.sig(self.fc(features))

## 5. 训练和评估

In [None]:
def train_model(model, train_loader, lr=0.001, epochs=5, print_every=100):
    """Train ``model`` in place with binary cross-entropy and Adam.

    Models exposing ``init_hidden`` are treated as recurrent: they are called
    as ``model(inputs, hidden)`` and must return ``(output, hidden)``. All
    other models are called as ``model(inputs)``.

    Args:
        model: The ``nn.Module`` to train; its outputs must lie in [0, 1].
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        lr: Learning rate for Adam.
        epochs: Number of passes over ``train_loader``.
        print_every: Log the loss every this many steps within an epoch.

    Returns:
        None. Progress is printed to stdout.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    is_recurrent = hasattr(model, 'init_hidden')
    model.train()

    for e in range(epochs):
        for i, (inputs, labels) in enumerate(train_loader):
            model.zero_grad()
            if is_recurrent:
                # Start every batch from a fresh zero state sized to THIS
                # batch. Samples in different (shuffled) batches are
                # unrelated, and the last batch may be smaller than the
                # others, so a carried-over state would be both meaningless
                # and possibly the wrong shape.
                h = model.init_hidden(inputs.size(0))
                output, _ = model(inputs, h)
            else:
                output = model(inputs)
            # Flatten to one score per sample. A bare squeeze() would turn a
            # batch of one into a 0-d tensor that no longer matches labels.
            output = output.reshape(-1)
            loss = criterion(output, labels.float())
            loss.backward()
            optimizer.step()

            if i % print_every == 0:
                print(f'Epoch: {e+1}/{epochs}... ',
                      f'Step: {i}... ',
                      f'Loss: {loss.item():.6f}')

## 6. 实例化和训练模型

In [None]:
# Hyper-parameters shared by the models below. Vocabulary ids start at 1,
# so the embedding table needs one extra row for id 0.
vocab_size = len(vocab_to_int) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

# Both recurrent models take exactly the same constructor arguments.
rnn_kwargs = dict(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
)

print('Training LSTM model...')
lstm_model = SentimentLSTM(**rnn_kwargs)
train_model(lstm_model, train_loader)

print('''
Training BiLSTM model...''')
bilstm_model = SentimentBiLSTM(**rnn_kwargs)
train_model(bilstm_model, train_loader)

print('''
Training CNN model...''')
n_filters = 100
filter_sizes = [3, 4, 5]
dropout = 0.5
cnn_model = SentimentCNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    n_filters=n_filters,
    filter_sizes=filter_sizes,
    output_size=output_size,
    dropout=dropout,
)
# The CNN is trained with a lower learning rate than train_model's default.
train_model(cnn_model, train_loader, lr=0.0005)

## 7. 预测并生成提交文件

In [None]:
def predict(model, X_test, batch_size=50):
    """Run ``model`` over ``X_test`` and return one score per row.

    Models exposing ``init_hidden`` are treated as recurrent: they are called
    as ``model(inputs, hidden)`` and must return ``(output, hidden)``. All
    other models are called as ``model(inputs)``. The model is switched to
    evaluation mode and left there.

    Args:
        model: A trained ``nn.Module``.
        X_test: Integer ``numpy.ndarray`` of encoded, padded reviews, one
            row per sample.
        batch_size: Rows scored per forward pass. It only affects speed and
            memory: every batch starts from a fresh hidden state, so the
            scores do not depend on how rows are grouped.

    Returns:
        A list of Python floats, in the same order as the rows of ``X_test``.
    """
    model.eval()
    test_data = torch.from_numpy(X_test).long()
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    is_recurrent = hasattr(model, 'init_hidden')

    predictions = []
    with torch.no_grad():
        for inputs in test_loader:
            if is_recurrent:
                # Fresh zero state sized to this batch: rows are independent
                # reviews and the final batch may be smaller than the rest.
                h = model.init_hidden(inputs.size(0))
                output, _ = model(inputs, h)
            else:
                output = model(inputs)
            # reshape(-1) keeps a batch of one as a 1-element list; a bare
            # squeeze() would yield a float that cannot be passed to extend.
            predictions.extend(output.reshape(-1).tolist())
    return predictions

print('''
Predicting with BiLSTM model...''')
predictions = predict(bilstm_model, X_test)
# Scores strictly above 0.5 become class 1, everything else class 0.
binary_predictions = [int(p > 0.5) for p in predictions]

# One row per test sample, identified by the test frame's index.
submission_columns = {'ID': test_df.index, 'sentiments': binary_predictions}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)

print('Submission file created successfully!')
submission_df.head()