In [1]:
import torch
from torch import nn
from torch import optim
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
np.random.seed(42)

In [4]:
DATA_PATH = "tripadvisor_hotel_reviews.csv"

In [5]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [6]:
neutral_range = {"low": 4, "high": 5}
df["Sentiment"] = "neutral"
df["Sentiment"].loc[df["Rating"] < neutral_range["low"]] = "negative"
df["Sentiment"].loc[df["Rating"] >= neutral_range["high"]] = "positive"
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"].loc[df["Rating"] < neutral_range["low"]] = "negative"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sentiment"].loc[df["Rating"] >= neutral_range["high"]] = "positive"


Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,neutral
1,ok nothing special charge diamond member hilto...,2,negative
2,nice rooms not 4* experience hotel monaco seat...,3,negative
3,"unique, great stay, wonderful time hotel monac...",5,positive
4,"great stay great stay, went seahawk game aweso...",5,positive


In [13]:
X_train, X_test, y_train, y_test = train_test_split(df["Review"], df["Sentiment"], test_size=0.2,
                                                                stratify=df["Sentiment"])

In [14]:
class RandomBaseline:
    def __init__(self):
        self.categories = {}

    def fit(self, data, target_col):
        cat_names = data[target_col].unique()
        agg = data.groupby(target_col).count()
        for n in cat_names:
            self.categories[n] = agg.loc[n][0] / len(data)

    def predict(self, data):
        return np.random.choice(list(self.categories.keys()), len(data), list(self.categories.values()))

In [15]:
rb = RandomBaseline()
rb.fit(df.iloc[X_train.index], "Sentiment")

In [17]:
pred = rb.predict(X_test)
accuracy_score(y_test, pred)

0.3266650402537204

In [18]:
tokenizer = get_tokenizer('basic_english')
tokenizer("the place was nice")

['the', 'place', 'was', 'nice']

In [19]:
def tokenized_review_iterator(reviews):
    for r in reviews:
        yield tokenizer(r)

vocab = build_vocab_from_iterator(tokenized_review_iterator(X_train), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
vocab(['the', 'place', 'was', 'nice'])

[32, 31, 3978, 15]

In [20]:
target_map = {
    "positive": 0,
    "neutral": 1,
    "negative": 2
}

text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: target_map[x]

class ReviewDataset(Dataset):

    def __init__(self, X, y, text_pipeline, label_pipeline):
        self.X = X
        self.y = y
        self.text_pipeline = text_pipeline
        self.label_pipeline = label_pipeline

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        text = torch.tensor(self.text_pipeline(self.X.iloc[idx]))
        length = torch.tensor(len(text))
        label = torch.tensor(self.label_pipeline(self.y.iloc[idx]))
        return {"text": text, "length": length, "labels": label}

train_dataset = ReviewDataset(X_train, y_train, text_pipeline, label_pipeline)
test_dataset = ReviewDataset(X_test, y_test, text_pipeline, label_pipeline)

In [22]:
def collate(batch):
    batch.sort(key=lambda x: x["length"], reverse=True)
    text, lengths, labels = zip(*[d.values() for d in batch])
    text = torch.nn.utils.rnn.pad_sequence(text, batch_first=True)
    lengths = torch.stack(lengths)
    labels = torch.stack(labels)
    return text, lengths, labels

In [23]:
class SentimentLSTM(nn.Module):

    def __init__(self, vocab_size, embed_dim, hidden_size, n_layers, num_class):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers=n_layers, batch_first=True)
        self.drop = nn.Dropout(0.5)
        self.batch_norm = nn.BatchNorm1d(n_layers * hidden_size)
        self.dense = nn.Linear(n_layers * hidden_size, num_class)

    def dense_parameters(self):
        return list(self.lstm.parameters()) + list(self.dense.parameters())

    def forward(self, encoded_text, lengths):
        batch_size = lengths.shape[0]
        embedded = self.embedding(encoded_text)
        packed_embeded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True)
        _, (hidden, cell) = self.lstm(packed_embeded)
        hidden = hidden.permute([1, 0, 2]).contiguous().view(batch_size, -1)
        hidden = self.drop(hidden)
        hidden = self.batch_norm(hidden)
        hidden = self.dense(hidden)
        return hidden

In [24]:
# training parameters
n_epoch = 20
lr = 1e-4
batch_size = 512

# model parameters
embedding_dim = 256
hidden_size = 128
n_layers = 3

model = SentimentLSTM(len(vocab), embedding_dim, hidden_size, n_layers, 3)

losses = {"train": [], "validation": []}
accuracies = {"train": [], "validation": []}

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)
validation_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)

criterion = nn.CrossEntropyLoss()
optimizer_sparse = optim.SparseAdam(model.embedding.parameters(), lr=lr)
optimizer_dense = optim.Adam(model.dense_parameters(), lr=lr)

model = model.to(device)

In [27]:
for n in range(n_epoch):
    epoch_loss = []
    epoch_acc = []
    for encoded_text, lengths, labels in train_loader:
        model = model.train()
        optimizer_dense.zero_grad()
        optimizer_sparse.zero_grad()

    encoded_text, lengths, labels = encoded_text.to(device), lengths.to(device), labels.to(device)
    y_pred = model(encoded_text, lengths)
    loss = criterion(y_pred, labels)

    loss.backward()
    optimizer_sparse.step()
    optimizer_dense.step()

    epoch_loss.append(loss.item())
    acc = accuracy_score(labels.detach().cpu(), y_pred.argmax(1).detach().cpu())
    epoch_acc.append(acc)

    avg_loss = (sum(epoch_loss) / len(epoch_loss))
    avg_acc = (sum(epoch_acc) / len(epoch_acc))
    print(f"epoch:{n} train_loss: {avg_loss:.4f}; train_acc: {avg_acc:.4f}")
    losses["train"].append(avg_loss)
    accuracies["train"].append(avg_acc)

    epoch_loss = []
    epoch_acc = []
    with torch.no_grad():
        for encoded_text, lengths, labels in validation_loader:
            model = model.eval()

    encoded_text, lengths, labels = encoded_text.to(device), lengths.to(device), labels.to(device)
    y_pred = model(encoded_text, lengths)
    loss = criterion(y_pred, labels)

    epoch_loss.append(loss.item())
    acc = accuracy_score(labels.detach().cpu(), y_pred.argmax(1).detach().cpu())
    epoch_acc.append(acc)

    avg_loss = (sum(epoch_loss) / len(epoch_loss))
    avg_acc = (sum(epoch_acc) / len(epoch_acc))
    print(f"epoch:{n} validation_loss: {avg_loss:.4f}; validation_acc: {avg_acc:.4f}")
    losses["validation"].append(avg_loss)
    accuracies["validation"].append(avg_acc)

epoch:0 train_loss: 1.1038; train_acc: 0.5000
epoch:0 validation_loss: 1.1203; validation_acc: 0.3333
epoch:1 train_loss: 0.9223; train_acc: 0.6250
epoch:1 validation_loss: 1.1332; validation_acc: 0.0000
epoch:2 train_loss: 0.9955; train_acc: 0.5000
epoch:2 validation_loss: 1.1262; validation_acc: 0.0000
epoch:3 train_loss: 1.3479; train_acc: 0.2500
epoch:3 validation_loss: 1.0920; validation_acc: 0.6667
epoch:4 train_loss: 1.0519; train_acc: 0.5000
epoch:4 validation_loss: 1.0944; validation_acc: 0.0000
epoch:5 train_loss: 1.1653; train_acc: 0.2500
epoch:5 validation_loss: 1.0981; validation_acc: 0.3333
epoch:6 train_loss: 1.1428; train_acc: 0.3750
epoch:6 validation_loss: 1.1154; validation_acc: 0.0000
epoch:7 train_loss: 1.1496; train_acc: 0.3750
epoch:7 validation_loss: 1.0780; validation_acc: 0.3333
epoch:8 train_loss: 1.3012; train_acc: 0.2500
epoch:8 validation_loss: 1.0894; validation_acc: 0.6667
epoch:9 train_loss: 1.1920; train_acc: 0.5000
epoch:9 validation_loss: 1.0594; val