**Dataset**: https://www.kaggle.com/datasets/ilhamfp31/yelp-review-dataset/data

In [1]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
import string
from collections import Counter
import numpy as np
from argparse import Namespace
import collections
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
args = Namespace(
    raw_train_dataset_csv="drive/MyDrive/Colab/Yelp/train.csv",
    raw_test_dataset_csv="drive/MyDrive/Colab/Yelp/test.csv",
    proportion_subset_of_train=0.1,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="reviews_with_splits_lite.csv",
    seed=1337
)

In [4]:
# Read raw data
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])

In [5]:
# Select 10% of data
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())

review_subset = []

for _, item_list in sorted(by_rating.items()):

    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])

review_subset = pd.DataFrame(review_subset)

In [6]:
# Splitting the subset by rating to create our new train, val, and test splits
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)

    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    final_list.extend(item_list)

final_reviews = pd.DataFrame(final_list)

In [7]:
print(final_reviews.shape)
print(final_reviews.head())

(56000, 3)
   rating                                             review  split
0       1  Terrible place to work for I just heard a stor...  train
1       1  3 hours, 15 minutes-- total time for an extrem...  train
2       1  My less than stellar review is for service.   ...  train
3       1  I'm granting one star because there's no way t...  train
4       1  The food here is mediocre at best. I went afte...  train


In [8]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

final_reviews.review = final_reviews.review.apply(preprocess_text)

In [9]:
# Mapping positive and negative reviews
mapping_dict = {1 : 'Negative', 2 : 'Positive'}
final_reviews['rating'] = final_reviews['rating'].map(mapping_dict)

In [10]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,Negative,terrible place to work for i just heard a stor...,train
1,Negative,"hours , minutes total time for an extremely s...",train
2,Negative,my less than stellar review is for service . w...,train
3,Negative,i m granting one star because there s no way t...,train
4,Negative,the food here is mediocre at best . i went aft...,train


In [11]:
final_reviews.to_csv("drive/MyDrive/Colab/Yelp/reviews_with_splits.csv",index = False)

In [12]:
class ReviewDataset(Dataset):
    def __init__(self, review_df, vectorizer):
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split == "val"]
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split == "test"]
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train' : (self.train_df, self.train_size),
                             'val' : (self.val_df, self.validation_size),
                             'test' : (self.test_df, self.test_size)}
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """Load dataset and make a new vectorizer from scratch
        Args:
            review_csv (str): location of the dataset
        Returns:
            an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        return cls(review_df,ReviewVectorizer.from_dataframe(review_df))

    def get_vectorizer(self):
        """ returns the vectorizer"""
        return self._vectorizer

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        Args:
            index (int): the index to the data point
        Returns:
            a dict of the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]
        review_vector = self._vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {'x_data' : review_vector,
               'y_target' : rating_index}

    def get_num_batches(self,batch_size):
        """Given a batch size, return the number of batches in the dataset
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size


In [13]:
class Vocabulary(object):
    """Class to process text and extract Vocabulary for mapping"""

    def __init__(self, token_to_idx = None, add_unk = True, unk_token = "<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx:token for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token

        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.
        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token or the UNK index if token isn't present.
        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
            for the UNK functionality
        """

        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)



In [14]:
class ReviewVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""

    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        """Create a collapsed one-hot vector for the review
        Args:
            review (str): the review
        Returns:
            one_hot (np.ndarray): the collapsed one-hot encoding
        """
        one_hot = np.zeros(len(self.review_vocab), dtype = np.float32)

        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, review_df, cutoff = 25):
        """Instantiate the vectorizer from the dataset dataframe
        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): the parameter for frequency­based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk = True)
        rating_vocab = Vocabulary(add_unk = False)

        # Add ratings
        for rating in sorted(set(set(review_df.rating))):
            rating_vocab.add_token(rating)

        #Add top words if count > cutoff
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Intantiate a ReviewVectorizer from a serializable dictionary
        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

    def to_serializable(self):
        """Create the serializable dictionary for caching
        Returns:
            contents (dict): the serializable dictionary
        """
        return {'review_vocab': self.review_vocab.to_serializable(),
        'rating_vocab': self.rating_vocab.to_serializable()}

In [15]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [16]:
import torch.nn as nn
import torch.nn.functional as F

class ReviewClassifier(nn.Module):
    """ a simple perceptron based classifier"""
    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features = num_features,
                            out_features = 1)

    def forward(self, x_in, apply_sigmoid=False):
        """The forward pass of the classifier
        Args:
            x_in (torch.Tensor): an input data tensor
            x_in.shape should be (batch, num_features)
            apply_sigmoid (bool): a flag for the sigmoid activation
                                should be false if used with the cross­entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch,).
        """
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out


In [17]:
args = Namespace(
    # Data and path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='drive/MyDrive/Colab/Yelp/reviews_with_splits.csv',
    save_dir='model_storage/ch3/yelp/',
    vectorizer_file='vectorizer.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=10,
    seed=1337,
    cuda = True
    # Runtime options omitted for space
)

In [18]:
def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [19]:
import torch.optim as optim

def make_train_state(args):
    return {'epoch_index' : 0,
            'train_loss' : [],
            'train_acc' : [],
            'val_loss' : [],
            'val_acc' : [],
            'test_loss' : -1,
            'test_acc' : -1}
train_state = make_train_state(args)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

print("Using CUDA: {}".format(args.cuda))

# handle dirs
handle_dirs(args.save_dir)


# dataset and vectorizer
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()

# model
classifier = ReviewClassifier(num_features = len(vectorizer.review_vocab))
classifier = classifier.to(args.device)

# Loass and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr = args.learning_rate)

Using CUDA: False


In [20]:
for epoch_index in range(args.num_epochs):
    train_state["epoch_indx"] = epoch_index

    # Iterate over training dataset

    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split("train")
    batch_generator = generate_batches(dataset, batch_size = args.batch_size,
                                      device = args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # Training routine is 5 steps

        # Step 1 : zero the gradients
        optimizer.zero_grad()

        # Step 2 : compute the output
        y_pred = classifier(x_in = batch_dict['x_data'].float())

        # Step 3 : compute the loss
        loss = loss_func(y_pred, batch_dict["y_target"].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss)/(batch_index + 1)

        # Step 4 : use loss to produce gradients
        loss.backward()

        # Step 5 : use optimizer to take gradient step
        optimizer.step()

        # -----------------------------------------------
        # Compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict["y_target"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    print("Epoch # {0} : Train Loss : {1} Train accuracy : {2}".format(epoch_index, running_loss, running_acc))

    # Iterate over val dataset
    # setup: batch generator, set loss and acc to 0, set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset,
                                        batch_size=args.batch_size,
                                        device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):
        # Step 1 : compute the output
        y_pred = classifier(x_in = batch_dict["x_data"].float())

        # Step 2 : compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    print("Epoch # {0} : Val Loss : {1} Val accuracy : {2}".format(epoch_index, running_loss, running_acc))

Epoch # 0 : Train Loss : 0.48157445531265414 Train accuracy : 83.9103349673203
Epoch # 0 : Val Loss : 0.3813157329192529 Val accuracy : 88.62980769230768
Epoch # 1 : Train Loss : 0.32947911448728007 Train accuracy : 90.26756535947712
Epoch # 1 : Val Loss : 0.30913234628163844 Val accuracy : 90.31249999999999
Epoch # 2 : Train Loss : 0.2745377444753463 Train accuracy : 91.71772875816993
Epoch # 2 : Val Loss : 0.2735499019806201 Val accuracy : 91.26201923076923
Epoch # 3 : Train Loss : 0.24356742695071337 Train accuracy : 92.39174836601306
Epoch # 3 : Val Loss : 0.252897524375182 Val accuracy : 91.77884615384616
Epoch # 4 : Train Loss : 0.222848537603235 Train accuracy : 92.83854166666671
Epoch # 4 : Val Loss : 0.23897184706651242 Val accuracy : 91.95913461538463
Epoch # 5 : Train Loss : 0.20773092720633238 Train accuracy : 93.28533496732028
Epoch # 5 : Val Loss : 0.22973635746882506 Val accuracy : 92.03124999999997
Epoch # 6 : Train Loss : 0.19574191508924263 Train accuracy : 93.6146854

In [21]:
dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                    batch_size=args.batch_size,
                                    device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()
for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)
    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)
train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [22]:
print("Test loss: {:.3f}".format(train_state['test_loss']))
print("Test Accuracy: {:.2f}".format(train_state['test_acc']))

Test loss: 0.215
Test Accuracy: 91.95


In [23]:
def predict_rating(review, classifier, vectorizer,decision_threshold=0.5):
    """Predict the rating of a review
    Args:
        review (str): the text of the review
        classifier (ReviewClassifier): the trained model
        vectorizer (ReviewVectorizer): the corresponding vectorizer
        decision_threshold (float): The numerical boundary which
                                    separates the rating classes
    """
    classifier = classifier#.to('cuda')
    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review))#.to('cuda')
    result = classifier(vectorized_review.view(1, -1))
    probability_value = F.sigmoid(result).item()
    index = 1
    if probability_value < decision_threshold:
        index = 0
    return vectorizer.rating_vocab.lookup_index(index)
test_review = "this is a pretty awesome book"
prediction = predict_rating(test_review, classifier, vectorizer)
print(f"{test_review} -> {prediction}")

this is a pretty awesome book -> Positive


In [24]:
# Sort weights
fc1_weights = classifier.fc1.weight.detach()[0].cpu()
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

# Top 20 words
print("Influential words in Positive Reviews:")
print("­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­")
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Positive Reviews:
­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­
delicious
amazing
great
fantastic
vegas
excellent
awesome
perfect
love
yummy
pleasantly
yum
wonderful
best
ngreat
favorite
reasonable
solid
loved
helpful


In [25]:
# Top 20 negative words
print("Influential words in Negative Reviews:")
print("­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­")
indices.reverse()
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Negative Reviews:
­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­­
worst
mediocre
bland
horrible
rude
terrible
awful
meh
overpriced
tasteless
disgusting
disappointing
dirty
ok
not
poor
poorly
disappointment
elsewhere
unfriendly


In [26]:
# Save the model
model_path = "drive/MyDrive/Colab/Yelp/model.pth"
torch.save(classifier.state_dict(), model_path)