In [28]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torchmetrics

import torch.nn.functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torchvision import transforms
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
import ast
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchmetrics import F1Score

In [29]:
x = pd.read_csv('x_rate_train.csv')
y = pd.read_csv('y_rate_train.csv')

In [30]:
x_test = pd.read_csv('x_rate_test.csv')
y_test = pd.read_csv('y_rate_test.csv')

In [31]:

def convert_str_to_list(df, column_name):
    df[column_name] = df[column_name].apply(ast.literal_eval)
    return df


# 转换列
x['lemmatized_text_with_pos'] = x['lemmatized_text_with_pos'].astype(str)
df = convert_str_to_list(x, 'lemmatized_text_with_pos')


In [32]:
def convert_str_to_list(df, column_name):
    def eval_literal(row):
        try:
            return ast.literal_eval(row)
        except Exception as e:
            print(f"Failed to convert: {row}")
            raise e

    df[column_name] = df[column_name].apply(eval_literal)
    return df

# Apply this to your DataFrame
x_test['lemmatized_text_with_pos'] = x_test['lemmatized_text_with_pos'].astype(str)
df_test = convert_str_to_list(x_test, 'lemmatized_text_with_pos')

In [33]:
train_df =pd.concat((df,y),axis=1)

In [34]:
test_df = pd.concat((df_test, y_test), axis = 1)

In [35]:

def build_vocab(train_df, min_freq):
    unk_token = '<UNK>'
#     pad_token = '<PAD>'
#     default_index = -1
    tokenizer = get_tokenizer("basic_english")
    
    vocab = build_vocab_from_iterator(train_df["lemmatized_text_with_pos"], min_freq= min_freq, specials=[unk_token])
    vocab.set_default_index(vocab[unk_token])
    return vocab

In [36]:
vocab = build_vocab(train_df, min_freq=5)

In [37]:
len(vocab.get_itos())

5310

In [38]:
def token2num(df, vocab):
    new_df = pd.DataFrame(columns=('lemmatized_text_with_pos', 'usefulCount', 'reviewCount', 'flagged', 'rate_diff'))
    for index, row in df.iterrows():
        new_df.loc[index, 'lemmatized_text_with_pos'] = [vocab[w] for w in row['lemmatized_text_with_pos']]
        new_df.loc[index, 'flagged'] = row['flagged']
        new_df.loc[index, 'usefulCount'] = row['usefulCount']
        new_df.loc[index, 'reviewCount'] = row['reviewCount']
        new_df.loc[index, 'rate_diff'] = row['rate_diff']
    return new_df

In [39]:
train_df = token2num(train_df, vocab)

In [40]:
test_df = token2num(test_df, vocab)

In [41]:
df_len_train = pd.DataFrame(train_df['lemmatized_text_with_pos'].apply(lambda x: len(x)).copy())
max_len_train = df_len_train['lemmatized_text_with_pos'].max()

In [42]:
df_len_test = pd.DataFrame(test_df['lemmatized_text_with_pos'].apply(lambda x: len(x)).copy())
max_len_test = df_len_test['lemmatized_text_with_pos'].max()

In [43]:
class MyDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        x = torch.tensor(self.df.iloc[idx, 0])
        y = torch.tensor(self.df.iloc[idx, 3])
        uc = torch.tensor(self.df.iloc[idx, 1])
        rd = torch.tensor(self.df.iloc[idx, 2])
        # rc = torch.tensor(self.df.iloc[idx, 2])
        # print(x)
        # print(y)
        # print(uc)
        # print(rc)
        if self.transform is not None:
            x = self.transform(x)
        return x, y, uc, rd

In [44]:
train_set = MyDataset(train_df)

In [45]:
test_set = MyDataset(test_df)

In [46]:
def collate_fn(batch):
    data = [item[0] for item in batch]
    lens = [len(item[0]) for item in batch] 
    ucounts = torch.tensor([item[2] for item in batch])
    rdiff = torch.tensor([item[3] for item in batch])
    targets = torch.tensor([item[1] for item in batch])
   
    padded_batch = pad_sequence(data, batch_first=True)
    return padded_batch, targets, lens, ucounts, rdiff

In [47]:
batch_size = 128
train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size, collate_fn=collate_fn)


In [48]:
test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size, collate_fn=collate_fn)

In [49]:
class ReviewEmbedder(nn.Module):
    def __init__(self, vocab_size, text_embed_dim, num_meta_features, meta_dim):
        super(ReviewEmbedder, self).__init__()
        self.text_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=text_embed_dim)
        
        # Meta embedding for numerical features like useful_counts and restaurant_ratings
        self.meta_embedding = nn.Linear(num_meta_features, meta_dim)

    def forward(self, text_indices, numerical_features):
        # Embed textual content
        text_embedded = self.text_embedding(text_indices)

        # Embed numerical features
        # Assuming numerical_features is a batch of [useful_counts, restaurant_ratings]
        meta_embedded = self.meta_embedding(numerical_features)

        # Concatenate embeddings along the last dimension
        combined_embeddings = torch.cat((text_embedded, meta_embedded.unsqueeze(1).expand(-1, text_embedded.size(1), -1)), dim=2)
        return combined_embeddings


In [50]:

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers, n_classes, dict_size, embedding_size, num_meta_features):
        super().__init__()
        
        self.embedding_size = embedding_size
        self.meta_size = int(self.embedding_size*0.5)
        self.dict_size = dict_size
        self.embedding = ReviewEmbedder(self.dict_size, self.embedding_size, num_meta_features, self.embedding_size)
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_classes = n_classes
        
        # Additional embedding layer for numerical features like usefulCounts
        self.meta_embedding = nn.Linear(num_meta_features, self.embedding_size)  # Embedding size for metadata to match text embedding size
        
        # Attention mechanism
        self.attention = nn.MultiheadAttention(embedding_size*2, num_heads=1, dropout=0.1, batch_first = True)
        
        self.lstm = nn.LSTM(input_size=self.input_size*2 + num_meta_features,  # Adjust input size to include meta embedding
                            hidden_size=self.hidden_size, 
                            num_layers=self.n_layers, 
                            batch_first=True)
        
        self.fc = nn.Linear(self.hidden_size, self.n_classes)  # Output layer
        
        # Normalization layers for numerical features
        self.usefulCount_norm = nn.BatchNorm1d(1)  # Assuming usefulCount is a single feature
        self.reviewCount_norm = nn.BatchNorm1d(1)  # Assuming res_diff is a single feature

    def forward(self, x, lens, numerical_features, max_length):
        # Normalization for numerical features
        numerical_features = numerical_features.float()
        usefulCount_normalized = self.usefulCount_norm(numerical_features[:, 0].unsqueeze(1))
        reviewCount_normalized = self.reviewCount_norm(numerical_features[:, 1].unsqueeze(1))

        # Combine normalized numerical features
        normalized_features = torch.cat((usefulCount_normalized, reviewCount_normalized), dim=1)
        embeddings = self.embedding(x, normalized_features)

        # Concatenate text embeddings with numerical metadata embeddings
        # combined_embeddings_with_metadata = torch.cat([embeddings, normalized_features.unsqueeze(1)], dim=-1)

        # Apply self-attention mechanism
        attention_output, _ = self.attention(embeddings, embeddings, embeddings)
        # Assuming attention_outputs are in the shape (batch, seq_len, features)
        padded = torch.nn.functional.pad(attention_output, (0, 0, 0, max_length - attention_output.size(1)), mode='constant', value=0) if attention_output.size(1) < max_length else attention_output[:, :max_length]
        attention_outputs_reshaped = padded.reshape(padded.size(0), -1)
       
        # Concatenate attention output with normalized numerical features
        combined_embeddings = torch.cat([attention_output, normalized_features.unsqueeze(1).expand(-1, attention_output.size(1), -1)], dim=-1)
        
        # LSTM layer
        packed_combined_embeddings = pack_padded_sequence(combined_embeddings, lens, batch_first=True, enforce_sorted=False)
        packed_output, (ht, ct) = self.lstm(packed_combined_embeddings)

        # Output layer
        padded_output, lengths = pad_packed_sequence(packed_output, batch_first=True)
        h_n = []
        for seq, length in zip(padded_output, lengths):
            h_n.append(seq[length-1, :])
        h_n_batch = torch.stack(h_n)
        output = self.fc(h_n_batch)

        return output, ht[-1], attention_outputs_reshaped




In [51]:
def train(epoch, model, train_loader, batch_size, device, loss_fn, optimizer):
    model.train()
    train_loss = 0.
    n_corrects = 0
    total = 0
    step = 0
    for idx, (texts, labels, lens, uc, rd) in enumerate(train_loader):
        optimizer.zero_grad()
        texts = texts.to(device)
        labels = labels.to(device)
        uc = uc.to(device)
        rd = rd.to(device)
        features = torch.cat((uc.unsqueeze(1), rd.unsqueeze(1)), dim=1)
        # print(features.shape)
        

        outputs, h, att = model(texts, lens, features, max_len_train)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # att = att.detach().cpu().numpy()
        train_loss += loss.item()
        _, predictions = torch.max(outputs, dim=1)
        n_corrects += labels.eq(predictions).sum().item()
        total += labels.size(0)
        step += 1
    train_accuracy = 100. * n_corrects / total
#     train_loss /= step
    return train_loss, train_accuracy

In [52]:

@torch.no_grad()
def test(model, test_loader, device):
    n_corrects = 0
    total = 0
#     embedding = embedding.to(device)
    model.eval()

    # Metrics
    precision = torchmetrics.Precision(task='multiclass', num_classes=2, average='macro').to(device)  # Adjust num_classes as needed
    recall = torchmetrics.Recall(task='multiclass', num_classes=2, average='macro').to(device)
    f1 = torchmetrics.F1Score(task='multiclass', num_classes=2, average='macro').to(device)
    accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=2, average='macro').to(device)
    hidden_state = []
    all_predictions = []
    all_labels = []
    att_layer = []

    for texts, labels, lens, uc, rd in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)
        uc = torch.tensor(uc, dtype=torch.int).to(device)
        rd = torch.tensor(rd, dtype=torch.float).to(device)
        features = torch.cat((uc.unsqueeze(1), rd.unsqueeze(1)), dim=1)
        # print(features.shape)
        
        outputs, hidden, attention = model(texts, lens, features, max_len_test)
        _, predictions = torch.max(outputs, dim=1)

        # Update metrics
        precision.update(predictions, labels)
        recall.update(predictions, labels)
        f1.update(predictions, labels)
        accuracy.update(predictions, labels)

        n_corrects += labels.eq(predictions).sum().item()
        total += labels.size(0)

        hidden_state.append(hidden.detach().cpu().numpy())
        att_layer.append(attention.detach().cpu().numpy())
        # Collect all labels and predictions
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())

    # Compute final metric values
    final_precision = precision.compute()
    final_recall = recall.compute()
    final_f1 = f1.compute()
    final_accuracy = accuracy.compute() * 100  # Convert to percentage

    hidden_states = np.concatenate(hidden_state, axis=0)
    att_layers = np.concatenate(att_layer, axis=0)

    test_accuracy = 100. * n_corrects / total
    # Compute the confusion matrix
    cm = confusion_matrix(all_labels, all_predictions)
    
    # return cm
    return test_accuracy, final_accuracy, final_precision, final_recall, final_f1, hidden_states, cm, att_layers


In [53]:
n_epochs = 30
lr = 3e-4

embedding_size = 200
hidden_size = 1024
n_layers = 3
dict_size = len(vocab.get_itos())
n_classes = 2

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print(device)
# device = torch.device("cpu")
model = RNN(embedding_size, hidden_size, n_layers, n_classes, dict_size, embedding_size, 2).to(device)
# model.init_hidden(device)
print(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

RNN(
  (embedding): ReviewEmbedder(
    (text_embedding): Embedding(5310, 200)
    (meta_embedding): Linear(in_features=2, out_features=200, bias=True)
  )
  (meta_embedding): Linear(in_features=2, out_features=200, bias=True)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=400, out_features=400, bias=True)
  )
  (lstm): LSTM(402, 1024, num_layers=3, batch_first=True)
  (fc): Linear(in_features=1024, out_features=2, bias=True)
  (usefulCount_norm): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (reviewCount_norm): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [54]:
scheduler = optim.lr_scheduler.CyclicLR(optimizer, 
                                        cycle_momentum=False,
                                        base_lr=0.0001,
                                        max_lr=0.01,
                                        step_size_up=5)
for epoch in range(n_epochs):
    train_loss, train_accuracy = train(epoch, model, train_loader, batch_size, device, loss_fn, optimizer)
    current_lr = optimizer.param_groups[0]["lr"]
    print(f'Epoch: {epoch+1}/{n_epochs} | Train Loss: {train_loss:.4f} | Acc: {train_accuracy:.2f}% | lr: {current_lr:.6f}')
#     scheduler.step()

Epoch: 1/30 | Train Loss: 34.9433 | Acc: 79.81% | lr: 0.000100
Epoch: 2/30 | Train Loss: 30.2037 | Acc: 82.71% | lr: 0.000100
Epoch: 3/30 | Train Loss: 32.0080 | Acc: 81.78% | lr: 0.000100
Epoch: 4/30 | Train Loss: 30.2483 | Acc: 83.24% | lr: 0.000100
Epoch: 5/30 | Train Loss: 29.7696 | Acc: 83.39% | lr: 0.000100
Epoch: 6/30 | Train Loss: 29.2878 | Acc: 83.33% | lr: 0.000100
Epoch: 7/30 | Train Loss: 27.8676 | Acc: 84.47% | lr: 0.000100
Epoch: 8/30 | Train Loss: 28.3052 | Acc: 84.56% | lr: 0.000100
Epoch: 9/30 | Train Loss: 27.9193 | Acc: 84.32% | lr: 0.000100
Epoch: 10/30 | Train Loss: 27.4374 | Acc: 84.39% | lr: 0.000100
Epoch: 11/30 | Train Loss: 27.3834 | Acc: 84.44% | lr: 0.000100
Epoch: 12/30 | Train Loss: 27.2290 | Acc: 84.65% | lr: 0.000100
Epoch: 13/30 | Train Loss: 26.6667 | Acc: 84.99% | lr: 0.000100
Epoch: 14/30 | Train Loss: 26.6557 | Acc: 84.97% | lr: 0.000100
Epoch: 15/30 | Train Loss: 25.8286 | Acc: 85.32% | lr: 0.000100
Epoch: 16/30 | Train Loss: 25.2915 | Acc: 86.04% 

In [55]:

test_accuracy, test_acc, test_precision, test_recall, test_f1, hidden_states, cm, attention = test(model, test_loader, device)

  uc = torch.tensor(uc, dtype=torch.int).to(device)
  rd = torch.tensor(rd, dtype=torch.float).to(device)


In [56]:
test_accuracy

80.7738814993954

In [57]:
print(f"{test_acc:.4f}")
print(f"{test_precision:.4f}")
print(f"{test_recall:.4f}")
print(f"{test_f1:.4f}")

80.8004
0.8145
0.8080
0.8068


1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1