In [13]:
import torch
from torch import nn
import model_runner
import embeddings
import dataset
import os

### Importing the dataset

In [14]:
# Reader to get training and test data from csv
reader = dataset.csv_reader()

# Open train.csv
train_path = os.path.join(os.path.curdir, "dataset", "train.csv")
reader.open_csv(train_path)
train_reviews = reader.read(-1)

# And test.csv
test_path = os.path.join(os.path.curdir, "dataset", "test.csv")
reader.open_csv(test_path)
test_reviews = reader.read(-1)

### Embedding text reviews to vector representations

In [15]:
# Intialize the embedder to use the glove-wiki-gigaword-50 embedding dictionary
# https://github.com/piskvorky/gensim-data#:~:text=org/licenses/pddl/-,glove%2Dwiki%2Dgigaword%2D50,-400000
review_embedder = embeddings.review_embedder()
review_embedder.load_embedding_model("glove-wiki-gigaword-50")

In [16]:
# This dict denotes how review labels (strings) should be mapped to one-hot encodings (tensors)
# Note that this mapping is for the original dataset,the dataset with remapped labels will look different
review_label_mapping = {
    "1": torch.tensor([1., 0., 0., 0., 0.]),
    "2": torch.tensor([0., 1., 0., 0., 0.]),
    "3": torch.tensor([0., 0., 1., 0., 0.]),
    "4": torch.tensor([0., 0., 0., 1., 0.]),
    "5": torch.tensor([0., 0., 0., 0., 1.]),
}

In [17]:
# Creating final embeddings
# Based on this dict and the embedding scheme selected above (glove-wiki-gigaword-50),
# we can embed our review text and labels to tensors
# 
# Note that there are a few extra parameters to the embedder which aren't shown below:
# 
# oov_feature: creates an extra label feature which is zero usually, except for when a word
#   of a review can't be embedded because it is not contained in the word vector list
#   of the chosen embedding scheme (the word is then "out-of-vocab"). When an OOV word
#   is encountered in the review text and oov_feature is True (default), then the resulting
#   word vector is a bunch of zeroes, plus a one in the oov_feature postition; when
#   this happens and oov_feature is False, the word is simply skipped.
# 
# title_body_feature: similar to oov_feature, this creates an extra label feature which is
#   zero for words appearing in the title of the review and one for words appearing in the
#   body of the review.
# train_features, train_labels = review_embedder.embed_dataset_features_and_labels(train_reviews, review_label_mapping)
# test_features, test_labels = review_embedder.embed_dataset_features_and_labels(test_reviews, review_label_mapping)

In [18]:
# New sampler, which can run on the full dataset
# TODO desc
train_sampler = embeddings.review_embedder_sampler(train_reviews, review_embedder, review_label_mapping)
test_sampler = embeddings.review_embedder_sampler(test_reviews, review_embedder, review_label_mapping)

# Testing the samplers
x_sample, y_sample = next(iter(train_sampler))
x_sample.shape, y_sample.shape



(torch.Size([54, 52]), torch.Size([1, 5]))

### Model creation

In [19]:
# Torch has a built-in LSTM class which does what we need
model = nn.LSTM(input_size=52, hidden_size=100, num_layers=1, batch_first=True)

In [20]:
class review_LSTM(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_classifier: nn.Module, batch_first: bool = True):
        super().__init__()
        self._LSTM = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=batch_first)
        self._output_classifier = output_classifier
        super().add_module("LSTM", self._LSTM)
    
    def forward(self, x: torch.Tensor | nn.utils.rnn.PackedSequence):
        # Give the review data to the LSTM to munch on
        output, (h_n, c_n) = self._LSTM.forward(x)
        
        # c_n is the cell state of the LSTM given all the data it has seen so far, and is supposed to
        # represent the LSTM's overall interpretation of the data; it can be used as a feature vector
        # for the output classifier to make a final class prediction. Feed it to the output classifier!
        yhat = self._output_classifier.forward(c_n)
        
        # Return the results from the output classifier
        return yhat

In [21]:
input_size = 52
hidden_size = 100
output_size = 5

output_classifier = nn.Sequential(
    nn.Linear(hidden_size, output_size),
    nn.Softmax(dim=1)
)

model = review_LSTM(input_size, hidden_size, output_classifier)

### Model training

In [22]:
# Model optimizer objects
optim = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

In [23]:
# Create a model runner to handle training
runner = model_runner.runner(model_name="LSTM_test_full_dataset", model=model, optimizer=optim, loss_fn=loss_fn)

In [24]:
# Train that model!
runner.train(train_sampler, test_sampler, num_epochs=5, autosave_interval_epochs=1)

Training Epoch 43:   0%|          | 10241/3000000 [00:51<4:10:17, 199.08batches/s, batch loss=0.914, epoch train accuracy=42.05%]


KeyboardInterrupt: 