## LSTM Example

This notebook must be moved to the root project directory to run; it walks through all the steps involved in creating
and training an LSTM classifier for Amazon reviews.

In [1]:
import torch
from torch import nn
import model_runner
import embeddings
import dataset
import os
from models.lstm import review_LSTM

### Importing the dataset

In [2]:
# Reader to get train and val data from csv
reader = dataset.csv_reader()

# Open formatted_train.csv
train_path = os.path.join(os.path.curdir, "dataset", "formatted_train.csv")
reader.open_csv(train_path, skip_header=True)
train_reviews = reader.read(-1)

# And formatted_val.csv
val_path = os.path.join(os.path.curdir, "dataset", "formatted_val.csv")
reader.open_csv(val_path, skip_header=True)
val_reviews = reader.read(-1)

### Embedding text reviews to vector representations

In [3]:
# Intialize the embedder to use the glove-wiki-gigaword-50 embedding dictionary
# https://github.com/piskvorky/gensim-data#:~:text=org/licenses/pddl/-,glove%2Dwiki%2Dgigaword%2D50,-400000
review_embedder = embeddings.review_embedder(review_labels=["negative", "positive"], embedding_model="glove-wiki-gigaword-50", oov_feature=True, title_body_feature=True)

In [4]:
# If we tried to embed our entire training dataset into feature vectors at once, it would utilize over 100GB of RAM,
# which is unrealistic for most computers. The batched_review_embedder_sampler class embeds samples a few at a time,
# reducing RAM usage compared to a traditional DataLoader, at the expense of needing to re-embed the reviews every epoch
train_sampler = embeddings.batched_review_embedder_sampler(train_reviews, review_embedder, batch_size=50)
val_sampler = embeddings.batched_review_embedder_sampler(val_reviews, review_embedder, batch_size=50)

# Testing the samplers
x_sample, y_sample = next(iter(train_sampler))
x_sample.data.shape, x_sample.batch_sizes[0], y_sample.shape



(torch.Size([4206, 52]), tensor(50), torch.Size([50, 2]))

### Model creation

In [5]:
# Model hyperparameters
input_size = review_embedder.feature_embedding_size
hidden_size = 100
output_size = 2
num_layers = 1

# The output classifier makes a final prediction of the label based on the final hidden state of the LSTM; this could be
# any feedforward architecture, but as a starting point a single linear layer is used. The output should have a softmax
# activation in order for out cross entropy loss function to work well
output_classifier = nn.Sequential(nn.Linear(hidden_size * num_layers, output_size), nn.Softmax(dim=1))

# The full model consists of an LSTM RNN and the output classifier defined above
model = review_LSTM(input_size, hidden_size, output_classifier, num_layers)

### Model training

In [6]:
# Model optimizer objects
optim = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

In [7]:
# Create a model runner to handle training
runner = model_runner.runner(model_name="LSTM_test", model=model, optimizer=optim, loss_fn=loss_fn)

In [8]:
# Train that model!
from wakepy import keep

with keep.running() as k:
    print("Successfully locked PC to prevent it sleeping during training!" if k.success else "Wasn't able to lock PC from sleeping during training!")
    runner.train(train_sampler, num_epochs=1, val_batch_iterable=val_sampler, autosave_interval_epochs=1)

Successfully locked PC to prevent it sleeping during training!


Training Epoch 17: 100%|██████████| 38400/38400 [14:50<00:00, 43.12batches/s, batch loss=0.423, epoch train accuracy=87.31%]
Evaluating model predictions: 100%|██████████| 9600/9600 [03:18<00:00, 48.26batches/s]


Reached epoch save interval, saving model state...


### Results

In [9]:
# Open the test dataset
test_path = os.path.join(os.path.curdir, "dataset", "formatted_test.csv")
reader.open_csv(test_path, skip_header=True)
test_reviews = reader.read(-1)
test_sampler = embeddings.batched_review_embedder_sampler(test_reviews, review_embedder, batch_size=50)

In [10]:
# Evaluate the model's performance on the test dataset
y_test, yhat_test = runner.predict_dataset(test_sampler)

Evaluating model predictions: 100%|██████████| 10400/10400 [03:32<00:00, 49.02batches/s]


In [11]:
# We can now look at testing metrics using our predictions; here, we only consider accuracy
from sklearn.metrics import accuracy_score

print(y_test.shape, yhat_test.shape, accuracy_score(y_test, yhat_test), sep='\n')

torch.Size([519995])
torch.Size([519995])
0.8714949182203675
