In [1]:
import torch
from torch import nn
import model_runner
import embeddings
import dataset
import os

### Importing the dataset

In [2]:
# Reader to get training and test data from csv
reader = dataset.csv_reader()

# Open train.csv
train_path = os.path.join(os.path.curdir, "dataset", "train_small.csv")
reader.open_csv(train_path)
train_reviews = reader.read(-1)

# And test.csv
test_path = os.path.join(os.path.curdir, "dataset", "test_small.csv")
reader.open_csv(test_path)
test_reviews = reader.read(-1)

### Embedding text reviews to vector representations

In [3]:
# Intialize the embedder to use the glove-wiki-gigaword-50 embedding dictionary
# https://github.com/piskvorky/gensim-data#:~:text=org/licenses/pddl/-,glove%2Dwiki%2Dgigaword%2D50,-400000
review_embedder = embeddings.review_embedder()
review_embedder.load_embedding_model("glove-wiki-gigaword-50")

In [4]:
# This dict denotes how review labels (strings) should be mapped to one-hot encodings (tensors)
# Note that this mapping is for the original dataset,the dataset with remapped labels will look different
review_label_mapping = {
    "1": torch.tensor([1., 0., 0., 0., 0.]),
    "2": torch.tensor([0., 1., 0., 0., 0.]),
    "3": torch.tensor([0., 0., 1., 0., 0.]),
    "4": torch.tensor([0., 0., 0., 1., 0.]),
    "5": torch.tensor([0., 0., 0., 0., 1.]),
}

In [5]:
# Creating final embeddings
# Based on this dict and the embedding scheme selected above (glove-wiki-gigaword-50),
# we can embed our review text and labels to tensors
# 
# Note that there are a few extra parameters to the embedder which aren't shown below:
# 
# oov_feature: creates an extra label feature which is zero usually, except for when a word
#   of a review can't be embedded because it is not contained in the word vector list
#   of the chosen embedding scheme (the word is then "out-of-vocab"). When an OOV word
#   is encountered in the review text and oov_feature is True (default), then the resulting
#   word vector is a bunch of zeroes, plus a one in the oov_feature postition; when
#   this happens and oov_feature is False, the word is simply skipped.
# 
# title_body_feature: similar to oov_feature, this creates an extra label feature which is
#   zero for words appearing in the title of the review and one for words appearing in the
#   body of the review.
train_features, train_labels = review_embedder.embed_dataset_features_and_labels(train_reviews, review_label_mapping)
test_features, test_labels = review_embedder.embed_dataset_features_and_labels(test_reviews, review_label_mapping)

Embedding features: 100%|██████████| 10000/10000 [00:02<00:00, 3947.91it/s]
Embedding features: 100%|██████████| 10000/10000 [00:02<00:00, 3899.72it/s]


In [6]:
# Constructing a data reader for our embedded reviews
# This is similar to a DataLoader, but we can't use that with variable-length data :(
# NOTE: this reader implementation is too slow to run on the full dataset and needs to be updated!
train_sampler = embeddings.embedded_review_random_sampler(train_features, train_labels)
test_sampler = embeddings.embedded_review_random_sampler(test_features, test_labels)

# Testing the sampler
x_sample, y_sample = next(iter(train_sampler))
x_sample.shape, y_sample.shape

(torch.Size([190, 52]), torch.Size([1, 5]))

### Model creation

In [7]:
# Torch has a built-in LSTM class which does what we need
model = nn.LSTM(input_size=52, hidden_size=100, num_layers=1, batch_first=True)

In [8]:
class review_LSTM(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_classifier: nn.Module, batch_first: bool = True):
        super().__init__()
        self._LSTM = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=batch_first)
        self._output_classifier = output_classifier
        super().add_module("LSTM", self._LSTM)
    
    def forward(self, x: torch.Tensor | nn.utils.rnn.PackedSequence):
        # Give the review data to the LSTM to munch on
        output, (h_n, c_n) = self._LSTM.forward(x)
        
        # c_n is the cell state of the LSTM given all the data it has seen so far, and is supposed to
        # represent the LSTM's overall interpretation of the data; it can be used as a feature vector
        # for the output classifier to make a final class prediction. Feed it to the output classifier!
        yhat = self._output_classifier.forward(c_n)
        
        # Return the results from the output classifier
        return yhat

In [9]:
input_size = 52
hidden_size = 100
output_size = 5

output_classifier = nn.Sequential(
    nn.Linear(hidden_size, output_size),
    nn.Softmax(dim=1)
)

model = review_LSTM(input_size, hidden_size, output_classifier)

### Model training

In [10]:
# Model optimizer objects
optim = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

In [11]:
# Create a model runner to handle training
runner = model_runner.runner(model_name="LSTM_test", model=model, optimizer=optim, loss_fn=loss_fn)

In [12]:
# Train that model!
runner.train(train_sampler, test_sampler, num_epochs=10, autosave_interval_epochs=1)

Epoch 14: 100%|██████████| 10000/10000 [00:46<00:00, 215.81it/s, batch loss=0.915, epoch train accuracy=37.0%]


{'most_accurate_model_epoch': 13, 'self._epoch': 14, 'epochs_since_starting_training': 1, 'self._loss_history': [1.6068969820857049, 1.5920192344069481, 1.5794373497307301, 1.5999930676281453, 1.5752094769895078, 1.6102991409838199, 1.577257874405384, 1.5670897438526155, 1.5874886572241784, 1.5601403160870075, 1.5795987164139749, 1.5588250208079815, 1.5186069102168083, 1.498301039505005], 'self._train_acc_history': [0.21902190219021903, 0.2508250825082508, 0.26752675267526754, 0.24542454245424541, 0.2737273727372737, 0.23542354235423543, 0.27262726272627263, 0.2861286128612861, 0.2591259125912591, 0.29812981298129815, 0.2765276527652765, 0.29572957295729574, 0.34493449344934496, 0.37013701370137014], 'self._val_acc_history': [0.2381, 0.2649, 0.2618, 0.2308, 0.2829, 0.2457, 0.2649, 0.2953, 0.2707, 0.3251, 0.2924, 0.3051, 0.37, 0.3407]}


Epoch 15: 100%|██████████| 10000/10000 [00:46<00:00, 214.47it/s, batch loss=1.299, epoch train accuracy=38.5%]


{'most_accurate_model_epoch': 15, 'self._epoch': 15, 'epochs_since_starting_training': 2, 'self._loss_history': [1.6068969820857049, 1.5920192344069481, 1.5794373497307301, 1.5999930676281453, 1.5752094769895078, 1.6102991409838199, 1.577257874405384, 1.5670897438526155, 1.5874886572241784, 1.5601403160870075, 1.5795987164139749, 1.5588250208079815, 1.5186069102168083, 1.498301039505005, 1.4848255165100097], 'self._train_acc_history': [0.21902190219021903, 0.2508250825082508, 0.26752675267526754, 0.24542454245424541, 0.2737273727372737, 0.23542354235423543, 0.27262726272627263, 0.2861286128612861, 0.2591259125912591, 0.29812981298129815, 0.2765276527652765, 0.29572957295729574, 0.34493449344934496, 0.37013701370137014, 0.3852385238523852], 'self._val_acc_history': [0.2381, 0.2649, 0.2618, 0.2308, 0.2829, 0.2457, 0.2649, 0.2953, 0.2707, 0.3251, 0.2924, 0.3051, 0.37, 0.3407, 0.3761]}


Epoch 16: 100%|██████████| 10000/10000 [00:46<00:00, 215.26it/s, batch loss=0.909, epoch train accuracy=40.6%]


{'most_accurate_model_epoch': 16, 'self._epoch': 16, 'epochs_since_starting_training': 3, 'self._loss_history': [1.6068969820857049, 1.5920192344069481, 1.5794373497307301, 1.5999930676281453, 1.5752094769895078, 1.6102991409838199, 1.577257874405384, 1.5670897438526155, 1.5874886572241784, 1.5601403160870075, 1.5795987164139749, 1.5588250208079815, 1.5186069102168083, 1.498301039505005, 1.4848255165100097, 1.4713312471032143], 'self._train_acc_history': [0.21902190219021903, 0.2508250825082508, 0.26752675267526754, 0.24542454245424541, 0.2737273727372737, 0.23542354235423543, 0.27262726272627263, 0.2861286128612861, 0.2591259125912591, 0.29812981298129815, 0.2765276527652765, 0.29572957295729574, 0.34493449344934496, 0.37013701370137014, 0.3852385238523852, 0.40594059405940597], 'self._val_acc_history': [0.2381, 0.2649, 0.2618, 0.2308, 0.2829, 0.2457, 0.2649, 0.2953, 0.2707, 0.3251, 0.2924, 0.3051, 0.37, 0.3407, 0.3761, 0.3989]}


Epoch 17:   3%|▎         | 314/10000 [00:01<00:48, 199.42it/s, batch loss=1.747, epoch train accuracy=37.1%]


KeyboardInterrupt: 