In [1]:
import torch
from torch import nn
import model_runner
import embeddings
import dataset
import os

### Importing the dataset

In [2]:
# Reader to get train and val data from csv
reader = dataset.csv_reader()

# Open formatted_train.csv
train_path = os.path.join(os.path.curdir, "dataset", "formatted_train.csv")
reader.open_csv(train_path, skip_header=True)
train_reviews = reader.read(-1)

# And formatted_val.csv
val_path = os.path.join(os.path.curdir, "dataset", "formatted_val.csv")
reader.open_csv(val_path, skip_header=True)
val_reviews = reader.read(-1)

### Embedding text reviews to vector representations

In [3]:
# Intialize the embedder to use the glove-wiki-gigaword-50 embedding dictionary
# https://github.com/piskvorky/gensim-data#:~:text=org/licenses/pddl/-,glove%2Dwiki%2Dgigaword%2D50,-400000
review_embedder = embeddings.review_embedder(embedding_model="glove-wiki-gigaword-50")

In [4]:
# This dict denotes how review labels (strings) should be mapped to one-hot encodings (tensors)
# Note that this mapping is for the original dataset,the dataset with remapped labels will look different
review_label_mapping = {
    "negative": torch.tensor([1., 0.]),
    "positive": torch.tensor([0., 1.]),
}

In [5]:
# Creating final embeddings
# Based on this dict and the embedding scheme selected above (glove-wiki-gigaword-50),
# we can embed our review text and labels to tensors
# 
# Note that there are a few extra parameters to the embedder which aren't shown below:
# 
# oov_feature: creates an extra label feature which is zero usually, except for when a word
#   of a review can't be embedded because it is not contained in the word vector list
#   of the chosen embedding scheme (the word is then "out-of-vocab"). When an OOV word
#   is encountered in the review text and oov_feature is True (default), then the resulting
#   word vector is a bunch of zeroes, plus a one in the oov_feature postition; when
#   this happens and oov_feature is False, the word is simply skipped.
# 
# title_body_feature: similar to oov_feature, this creates an extra label feature which is
#   zero for words appearing in the title of the review and one for words appearing in the
#   body of the review.
# train_features, train_labels = review_embedder.embed_dataset_features_and_labels(train_reviews, review_label_mapping)
# test_features, test_labels = review_embedder.embed_dataset_features_and_labels(test_reviews, review_label_mapping)

In [6]:
# New sampler, which can run on the full dataset
# TODO desc
train_sampler = embeddings.batched_review_embedder_sampler(train_reviews, review_embedder, review_label_mapping, batch_size=50)
val_sampler = embeddings.batched_review_embedder_sampler(val_reviews, review_embedder, review_label_mapping, batch_size=50)

# Testing the samplers
x_sample, y_sample = next(iter(train_sampler))
x_sample, y_sample.shape



(PackedSequence(data=tensor([[-0.9534, -0.1661, -0.7835,  ..., -0.4313,  0.0000,  0.0000],
         [ 0.4162,  0.0087, -0.0458,  ..., -0.2288,  0.0000,  0.0000],
         [-0.4248, -0.5238,  0.2449,  ...,  0.3188,  0.0000,  0.0000],
         ...,
         [ 0.1163,  0.5390, -0.3951,  ...,  0.0845,  0.0000,  1.0000],
         [-0.2658,  0.5187,  0.0656,  ...,  1.2443,  0.0000,  1.0000],
         [-0.2931,  0.3508,  1.1328,  ..., -0.3749,  0.0000,  1.0000]]), batch_sizes=tensor([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
         50, 48, 48, 47, 45, 45, 45, 45, 45, 45, 45, 45, 43, 43, 41, 40, 40, 39,
         38, 38, 38, 37, 36, 35, 35, 35, 35, 34, 34, 33, 33, 32, 30, 30, 30, 30,
         30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 25, 25,
         25, 24, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16, 16, 16, 16,
         16, 15, 15, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 10,  9, 

### Model creation

In [7]:
class review_LSTM(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_classifier: nn.Module):
        super().__init__()
        self._LSTM = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self._output_classifier = output_classifier
        self._hidden_size = hidden_size
        super().add_module("LSTM", self._LSTM)
    
    def forward(self, x: torch.Tensor | nn.utils.rnn.PackedSequence):
        # Give the review data to the LSTM to munch on
        output, (h_n, c_n) = self._LSTM.forward(x)
        
        # c_n is the cell state of the LSTM given all the data it has seen so far, and is supposed to
        # represent the LSTM's overall interpretation of the data; it can be used as a feature vector
        # for the output classifier to make a final class prediction. Reshape it to [batch_size x hidden_size],
        # then feed it to the output classifier
        c_n = torch.reshape(c_n, (-1, self._hidden_size))
        yhat = self._output_classifier.forward(c_n)
        
        # Return the results from the output classifier
        return yhat

In [8]:
input_size = 52
hidden_size = 100
output_size = 2

output_classifier = nn.Linear(hidden_size, output_size)

model = review_LSTM(input_size, hidden_size, output_classifier)

### Model training

In [9]:
# Model optimizer objects
optim = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

In [10]:
# Create a model runner to handle training
runner = model_runner.runner(model_name="LSTM_test_full_formatted_dataset", model=model, optimizer=optim, loss_fn=loss_fn)

In [11]:
# Train that model!
runner.train(train_sampler, val_sampler, num_epochs=10, autosave_interval_epochs=1)

Training Epoch 1: 100%|██████████| 38400/38400 [14:18<00:00, 44.74batches/s, batch loss=0.323, epoch train accuracy=76.95%]
Evaluating model accuracy: 100%|██████████| 9600/9600 [02:46<00:00, 57.83batches/s] 


This epoch was the most accurate so far: validation accuracy = 84.62%. Saving model state...
Reached epoch save interval, saving model state...


Training Epoch 2: 100%|██████████| 38400/38400 [29:32<00:00, 21.67batches/s, batch loss=0.375, epoch train accuracy=85.23%]    
Evaluating model accuracy: 100%|██████████| 9600/9600 [02:40<00:00, 59.69batches/s] 


This epoch was the most accurate so far: validation accuracy = 86.52%. Saving model state...
Reached epoch save interval, saving model state...


Training Epoch 3:   0%|          | 100/38400 [00:03<19:57, 31.99batches/s, batch loss=0.398, epoch train accuracy=86.32%] 


KeyboardInterrupt: 

In [None]:
runner._train_acc_history, runner._val_acc_history

([0.21902190219021903,
  0.2508250825082508,
  0.26752675267526754,
  0.24542454245424541,
  0.2737273727372737,
  0.23542354235423543,
  0.27262726272627263,
  0.2861286128612861,
  0.2591259125912591,
  0.29812981298129815,
  0.2765276527652765,
  0.29572957295729574,
  0.34493449344934496,
  0.37013701370137014,
  0.3852385238523852,
  0.40594059405940597,
  0.4146414641464146,
  0.42674267426742674,
  0.43504350435043504,
  0.4452445244524452,
  0.45104510451045104,
  0.446044604460446,
  0.45934593459345935,
  0.4667466746674667,
  0.4673467346734673,
  0.47664766476647663,
  0.48444844484448446,
  0.48434843484348433,
  0.49134913491349136,
  0.49704970497049705,
  0.5022502250225023,
  0.5052505250525052,
  0.5104510451045104,
  0.5104510451045104,
  0.5163516351635163,
  0.5221522152215221,
  0.5275527552755276,
  0.5318531853185319,
  0.5287528752875288,
  0.5398539853985399,
  0.5472547254725473,
  0.5508550855085509,
  0.005842424242424245,
  0.005907070707070706,
  0.595454