### Loading the Data

In [1]:
import os
import urllib.request
import tarfile

os.makedirs("data", exist_ok=True)

if not os.path.exists("data/aclImdb_v1.tar.gz"):
        #download database
        print("downloading database...")
        urllib.request.urlretrieve("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", "data/aclImdb_v1.tar.gz")
        print("download complete")

if not os.path.exists("data/aclImdb/"):
        #extract database
        print("extracting database...")
        with tarfile.open("data/aclImdb_v1.tar.gz", "r:gz") as tar:
                tar.extractall(path="data")
        print("database extracted")


In [2]:
import os
import glob

def read_imdb_data(data_dir='data/aclImdb'):
    data = {}
    labels = {}

     # Loop over the two splits: training and testing
    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        # Loop over both sentiment categories: positive and negative
        for sentiment in ['pos', 'neg']:
            data[data_type][sentiment] = []
            labels[data_type][sentiment] = []

            # Construct path to all text files of the current split and sentiment
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            files = glob.glob(path)

            # Read each review text file
            for f in files:
                with open(f, encoding="utf-8") as review:
                    data[data_type][sentiment].append(review.read())
                    # Assign label 1 for 'pos' and 0 for 'neg'
                    labels[data_type][sentiment].append(1 if sentiment == 'pos' else 0)

            # Sanity check: ensure that every text has a matching label
            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [3]:
data, labels = read_imdb_data()
print("IMDb reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDb reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [4]:
import json
if not os.path.exists("data/data.json"):
        # Save the loaded review texts into a JSON file
        json.dump(data, open('data/data.json', 'w'))
        # Save the sentiment labels into another JSON file
        json.dump(labels, open('data/labels.json', 'w'))

In [5]:
# Open and load the movie review data from 'data.json'
f=open('data/data.json')
data = json.load(f)
f.close()

# Open and load the sentiment labels from 'labels.json'
f=open('data/labels.json')
labels = json.load(f)
f.close()

In [6]:
print("IMDb reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDb reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


# 1. Understanding the Data
function that gets the average, maximum, and minimum word length of a dataset.

In [7]:
def understanding_data(data, set):
        maxlength = 0
        minlength = 99999999999999999999999
        total = 0

        for sentiment in data[set]:
                for entry in data[set][sentiment]:
                        word_count = len(entry.split())
                        total += word_count
                        if word_count > maxlength:
                                maxlength = word_count
                        if word_count < minlength:
                                minlength = word_count
        set_size = len(data[set]['pos'])+len(data[set]['neg'])
        print("{} set avg word count: {}, max word count: {}, min word count {}".format(set, total/set_size, maxlength, minlength))


In [8]:
understanding_data(data, "train")
understanding_data(data, "test")

train set avg word count: 233.7872, max word count: 2470, min word count 10
test set avg word count: 228.52668, max word count: 2278, min word count 4


#### Word Length Results:
| | avg word count| max word count | min word count|
|---|---|---|---|
|train set| 233.7872 | 2470 | 10|
|test set | 229.52688 | 2278 | 4|

The word count of the reviews seem to vary quite drastically, with the longest review being almost 2500 words,\
and the shortest being only 4, and an average of about 230 words. Since the LSTM model reads one word at at time,\
reviews that are very long (~2500 words) might make training take significantly more time/memory, while on the\
other hand reviews that are very short (<10 words) probably won't give the model enough context to train on. 

# 2. Creating a Balanced Validation Split

In [9]:
from sklearn.utils import shuffle

def split_data(data, labels, val_size = 3000):
        pos_data, pos_labels = shuffle(data['train']['pos'], labels['train']['pos'], random_state=420) #keeps the shuffling consistent
        neg_data, neg_labels = shuffle(data['train']['neg'], labels['train']['neg'], random_state=420)

        val_size_pos = int(val_size/2)
        val_size_neg = val_size - val_size_pos

        train_set = pos_data[val_size_pos:] + neg_data[val_size_neg:]

        val_set = pos_data[:val_size_pos] + neg_data[:val_size_neg]

        test_set = data['test']['pos'] + data['test']['neg']

        print("Train: {} | Val: {} | Test: {}".format(
                len(train_set), len(val_set), len(test_set)
        ))
        print("IMDB data: train = pos {} / neg {} , val = pos {} / neg {}, test = pos {} / neg {}".format(
                len(pos_data) - val_size_pos, len(neg_data) - val_size_neg, 
                val_size_pos, val_size_neg, 
                len(data['test']['pos']), len(data['test']['neg'])
        ))

        train_labels = pos_labels[val_size_pos:] + neg_labels[val_size_neg:]
        val_labels = pos_labels[:val_size_pos] + neg_labels[:val_size_neg]
        test_labels = labels['test']['pos'] + labels['test']['neg']

        return train_set, val_set, test_set, train_labels, val_labels, test_labels

In [10]:
train_set, val_set, test_set, train_labels, val_labels, test_labels = split_data(data, labels)

Train: 22000 | Val: 3000 | Test: 25000
IMDB data: train = pos 11000 / neg 11000 , val = pos 1500 / neg 1500, test = pos 12500 / neg 12500


# 3.  Data Preprocessing
- Convert text to lowercase
- Remove punctuation
- Tokenize text
- Remove stop words
- Stemming

In [11]:
# This line imports the Natural Language Toolkit (NLTK) library, which provides various tools and resources for working with human language data.
import nltk

# This imports the stopwords module from NLTK, which contains a list of common English words (like "the", "a", "is")
# that often don't carry significant meaning in text analysis and are usually removed.
from nltk.corpus import stopwords

# This imports the PorterStemmer class from NLTK.
# Stemming is the process of reducing words to their root or base form (e.g., "running" becomes "run").
# The Porter stemmer is a widely used algorithm for this purpose.
from nltk.stem.porter import *

# This imports the re module, which provides support for regular expressions. Regular expressions are powerful tools for pattern matching and text manipulation.
import re

#  This imports the BeautifulSoup library, which is used for parsing HTML and XML documents. It's helpful for extracting text content from web pages or documents that might contain HTML tags.
from bs4 import BeautifulSoup

# This defines a function named review_to_words that takes a single argument review, which is expected to be a string containing the text of a movie review.
def review_to_words(review):
    # This line downloads the list of stopwords from NLTK if it hasn't been downloaded already. The quiet=True argument suppresses the download output.
    nltk.download("stopwords", quiet=True)

    # This creates an instance of the PorterStemmer class, which we'll use later for stemming words.
    stemmer = PorterStemmer()

    # This line uses BeautifulSoup to parse the input review as an HTML document ("html.parser" specifies the parser to use).
    # Then, .get_text() extracts the visible text content, effectively removing any HTML tags that might be present in the review.
    text = BeautifulSoup(review, "html.parser").get_text()

    # text.lower(): It converts the entire text to lowercase. This ensures that words like "The" and "the" are treated as the same.
    # re.sub(r"[^a-zA-Z0-9]", " ", ...): It replaces any character that is not an uppercase letter (a-z), a lowercase letter (A-Z), or a digit (0-9) with a space.
    # This helps in removing punctuation marks and other special characters.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # This line splits the processed text into a list of individual words using whitespace as the delimiter.
    words = text.split()

    # This line uses a list comprehension to filter out stopwords.
    # It iterates through the words list and keeps only those words that are not present in the English stopwords list provided by NLTK.
    words = [w for w in words if w not in stopwords.words("english")]

    #  This line uses another list comprehension to perform stemming.
    # It iterates through the filtered words and applies the stem() method of the PorterStemmer to each word, reducing it to its root form.
    words = [PorterStemmer().stem(w) for w in words] # stem

    return words

In [12]:
import json

#if data has not been processed
if not os.path.exists("data/processed/"):
        os.makedirs("data/processed/")

        train_set_processed = []
        for entry in train_set:
                train_set_processed.append(review_to_words(entry))

        val_set_processed = []
        for entry in val_set:
                val_set_processed.append(review_to_words(entry))

        test_set_processed = []
        for entry in test_set:
                test_set_processed.append(review_to_words(entry))

        # Save into a JSON file
        with open('data/processed/train_set_processed.json', 'w') as f:
                json.dump(train_set_processed, open('data/processed/train_set_processed.json', 'w'))
        with open('data/processed/test_set_processed.json', 'w') as f:
                json.dump(test_set_processed, open('data/processed/test_set_processed.json', 'w'))
        with open('data/processed/val_set_processed.json', 'w') as f:
                json.dump(val_set_processed, open('data/processed/val_set_processed.json', 'w'))


# 4. Implementing an LSTM Text Classifier

In [13]:
# Open and load preprocessed data
f=open('data/processed/test_set_processed.json')
test_set_processed = json.load(f)
f.close()

f=open('data/processed/train_set_processed.json')
train_set_processed = json.load(f)
f.close()

f=open('data/processed/val_set_processed.json')
val_set_processed = json.load(f)
f.close()

#display preprocessed data sample
print(train_set_processed[0])

['seen', 'movi', 'must', 'read', 'three', 'thor', 'heyerdahl', 'book', 'kon', 'tiki', 'ra', 'aku', 'aku', 'activ', 'look', 'copi', 'movi', 'thesi', 'peruvian', 'migrat', 'polynesia', 'aliv', 'well', 'consid', 'crew', 'gp', 'old', 'fashion', 'valv', 'tube', 'radio', '6', 'watt', 'output', 'voyag', 'heroic', 'say', 'least', 'pleas', 'repli', 'messag', 'tell', 'locat', 'copi', 'video', 'would', 'interest', 'buy']


### Build word_dict \+ Tokenize

to tokenize the data into integers for the LSTM

In [14]:
import numpy as np

def build_dict(data, vocab_size = 5000):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""

    # A dict storing the words that appear in the reviews along with how often they occur
    word_count = {}

    # This outer loop iterates through each sentence (which is a list of words) in the input data.
    # The inner loop iterates through each word within the current sentence
    for sentence in data:
        for word in sentence:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1

    # Sort the words found in `data` so that sorted_words[0] is the most frequently appearing word and
    # sorted_words[-1] is the least frequently appearing word.

    sorted_words = sorted(word_count, key=word_count.get, reverse=True)

    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2 # 'infrequent' labels

    return word_dict

In [15]:
word_dict = build_dict(train_set_processed)
print(word_dict)
len(word_dict.keys())

{'movi': 2, 'film': 3, 'one': 4, 'like': 5, 'time': 6, 'good': 7, 'make': 8, 'charact': 9, 'get': 10, 'see': 11, 'watch': 12, 'stori': 13, 'even': 14, 'would': 15, 'realli': 16, 'well': 17, 'scene': 18, 'look': 19, 'show': 20, 'much': 21, 'end': 22, 'go': 23, 'peopl': 24, 'bad': 25, 'great': 26, 'also': 27, 'first': 28, 'love': 29, 'think': 30, 'way': 31, 'act': 32, 'play': 33, 'made': 34, 'thing': 35, 'could': 36, 'know': 37, 'say': 38, 'seem': 39, 'work': 40, 'plot': 41, 'two': 42, 'year': 43, 'actor': 44, 'seen': 45, 'come': 46, 'mani': 47, 'take': 48, 'life': 49, 'want': 50, 'never': 51, 'littl': 52, 'tri': 53, 'best': 54, 'ever': 55, 'man': 56, 'give': 57, 'better': 58, 'still': 59, 'perform': 60, 'find': 61, 'feel': 62, 'part': 63, 'back': 64, 'someth': 65, 'actual': 66, 'use': 67, 'director': 68, 'interest': 69, 'lot': 70, 'real': 71, 'old': 72, 'cast': 73, 'though': 74, 'live': 75, 'star': 76, 'enjoy': 77, 'guy': 78, 'new': 79, 'noth': 80, 'music': 81, '10': 82, 'anoth': 83, 'r

4998

In [16]:
def convert_and_pad(word_dict, sentence, pad=500):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict

    working_sentence = [NOWORD] * pad

    #  We iterate through the words in the input sentence. We also slice the sentence up to the pad length to handle reviews longer than our desired sequence length
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ

    return working_sentence, min(len(sentence), pad)


def convert_and_pad_data(word_dict, data, pad=500):
    result = []
    lengths = []

    # We iterate through each sentence in the input data.
    for sentence in data:
        # For each sentence, we call the convert_and_pad function to get the processed integer sequence (converted) and its original length (leng).
        converted, leng = convert_and_pad(word_dict, sentence, pad)

        # We append the converted sequence to our result list and the leng to our lengths list.
        result.append(converted)
        lengths.append(leng)

    return np.array(result), np.array(lengths)

In [17]:
train_set_tokenized, train_set_tokenized_len = convert_and_pad_data(word_dict, train_set_processed)
val_set_tokenized, val_set_tokenized_len = convert_and_pad_data(word_dict, val_set_processed)
test_set_tokenized, test_set_tokenized_len = convert_and_pad_data(word_dict, test_set_processed)

In [18]:
print(train_set_tokenized[0:5])
print(val_set_tokenized[0:5])

[[ 45   2 136 ...   0   0   0]
 [244 122 408 ...   0   0   0]
 [  5   1 338 ...   0   0   0]
 [ 12   2   4 ...   0   0   0]
 [120 960 139 ...   0   0   0]]
[[4321  120 1016 ...    0    0    0]
 [  60   93   44 ...    0    0    0]
 [ 400  901  230 ...    0    0    0]
 [3129  185  575 ...    0    0    0]
 [   1  505   85 ...    0    0    0]]


In [19]:
print(data['train']['pos'][0:5])
print(labels['train']['pos'][0:5])
print()

print(train_set[0:5])
print(train_labels[0:5])
print()

print(train_set_processed[0:5])
print(train_labels[0:5])
print()

print(train_set_tokenized[0:5])
print(train_labels[0:5])
print()

['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.', 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV\'s "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina\'s pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D\'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detect

### Implementing the LSTM text classifier

While coding the LSTM text classifier, I learned that there are some additional settings for the embedding\
layer. `padding_idx` will make the LSTM model ignore "0" - which is the padding we used when preprocessing\
data. Also, during the forward step, we will have to grab each batch's last non zero value. Otherwise if \
those steps are not done then the "0" padding will update the model's state and mess up the output.\
\
Additionally, the padding makes all lines end in the same pattern, which also causes bias in the model's\
training. \
\
I will experiment with different factors that affect the LSTM, including:
- Learning Rate
- Batch Size
- Number of epochs
- Hidden size
- Number of layers
- Optimization Algorithm



In [20]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
        def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
                super().__init__()

                # This creates the embedding layer. It takes integer indices (representing words) as input and maps them to dense vectors of size (embedding_dim)
                self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

                 # This defines the LSTM layer.
                self.lstm = nn.LSTM(embedding_dim, # The input size to the LSTM (the dimensionality of the embeddings).
                                    hidden_dim, # The number of hidden units in the LSTM
                                    num_layers= n_layers, # This specifies the number of LSTM layers to stack.
                                    batch_first=True # This is a crucial parameter. It tells the LSTM that the input and output tensors will have the batch size as the first dimension (i.e., [batch_size, seq_length, feature_dim]). This is a common convention in PyTorch.    
                                    )
                # This creates a fully connected (linear) layer that will take the output from the LSTM and produce a single output value for each review.
                # The input size is hidden_dim (the dimensionality of the final hidden state we'll use), and the output size is 1 (for binary classification).
                self.fc = nn.Linear(hidden_dim, 1)
        
        def forward(self, x):
                emb, _ = self.embedding(x), None
                out, _ = self.lstm(emb)                                   # [B, L, H]
                lengths = (x != 0).sum(dim=1)                             # [B]
                # grab each batch’s last non-pad timestep
                last_hid = out[torch.arange(out.size(0)), lengths-1]      # [B, H]
                return self.fc(last_hid)       

In [21]:
import torch

def train_model(epochs, model, train_loader, device, optimizer, loss_fn, val_loader):
        # The following code is similar to what we implement in Project Assignment 1
        for epoch in range(1, epochs + 1):
                model.train()
                total_loss = 0.0
                count = 0
                running_loss = 0.0
                for batch in train_loader:
                        batch_X, batch_y = batch

                        batch_X = batch_X.to(device)
                        batch_y = batch_y.to(device)

                        optimizer.zero_grad()
                        output = model.forward(batch_X).squeeze(1)

                        loss = loss_fn(output, batch_y)
                        loss.backward()
                        optimizer.step()

                        total_loss += loss.data.item()
                        running_loss += loss.data.item()

                        count+=1
                        if count%100 == 0:
                                print("Epoch: {}, BCELoss: {}".format(epoch, running_loss/100.0))
                                running_loss = 0.0

                avg_train_loss = total_loss / len(train_loader)

                #---validation---
                model.eval()
                total_val_loss = 0.0
                total_correct = 0
                total_samples = 0
                with torch.no_grad():
                        for batch in val_loader:
                                batch_X, batch_y = batch
                                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                                output = model(batch_X).squeeze(1)
                                total_val_loss += loss_fn(output, batch_y).item()

                                preds = (torch.sigmoid(output) >= 0.5).float()
                                total_correct += (preds == batch_y).sum().item()
                                total_samples  += batch_y.size(0)
                avg_val_loss = total_val_loss / len(val_loader)
                val_acc     = total_correct / total_samples
                # ——— LOGGING ———
                print(f"Epoch {epoch}/{epochs} — "
                f"Avg Train Loss: {avg_train_loss:.4f} | "
                f"Val Loss:   {avg_val_loss:.4f} | "
                f"Val Acc:    {val_acc:.4f}")

In [22]:
import torch

def create_tensors(train_set_tokenized, train_labels):
        train_data_tensor = torch.tensor(train_set_tokenized, dtype=torch.long)
        train_label_tensor = torch.tensor(train_labels, dtype=torch.float)  # or torch.long for class indices

        return train_data_tensor, train_label_tensor

In [23]:
from torch.utils.data import TensorDataset, DataLoader

def create_dataloader(train_data_tensor, train_label_tensor, val_data_tensor, val_label_tensor, batch_size):
        train_dataset = TensorDataset(train_data_tensor, train_label_tensor)
        train_loader = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=4)

        val_dataset = TensorDataset(val_data_tensor, val_label_tensor)
        val_loader = DataLoader(val_dataset, batch_size, shuffle=False, num_workers=2)

        return train_loader, val_loader

### Function to debug LSTM classifier

Function to test whether the LSTM model can theoretically learn. Take a small batch of data, \
and try to overfit the data. If the error can approach zero, that means that the classifier can learn.\
Sanity check to make sure that the model works, and that we just need to fine tune certain\
parameters, rather than the model actually not functioning.

In [24]:
def test():# grab a tiny batch
        # Convert to PyTorch tensors
        train_data_tensor, train_label_tensor = create_tensors(train_set_tokenized, train_labels)
        val_data_tensor, val_label_tensor = create_tensors(val_set_tokenized, val_labels)

        # Create dataset and dataloader, set batch size
        train_loader, val_loader = create_dataloader(train_data_tensor, train_label_tensor,val_data_tensor, val_label_tensor, 64)

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        inputs, targets = next(iter(train_loader))
        inputs, targets = inputs[:16].to(device), targets[:16].to(device)

        model = LSTMClassifier(embedding_dim=128,hidden_dim = 128,n_layers=1,vocab_size=5000)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001) #Adam algorithm

        criterion = nn.BCEWithLogitsLoss()  # Binary cross entropy

        # Print initial probs so we can see change
        with torch.no_grad():
                init_logits = model(inputs).squeeze()
                print("Initial preds (sigmoid):", torch.sigmoid(init_logits).cpu().numpy()[:5])

        # try to drive loss → 0
        model.train()
        for epoch in range(100):
                optimizer.zero_grad()
                logits = model(inputs)
                loss = criterion(logits.squeeze(), targets)
                loss.backward()
                optimizer.step()
                if epoch % 10 == 0:
                        print(f"[tiny overfit] epoch {epoch:3d} loss = {loss.item():.4f}")

        # Print final probs
        with torch.no_grad():
                final_logits = model(inputs).squeeze()
                print("Final preds (sigmoid):", torch.sigmoid(final_logits).cpu().numpy()[:5])

In [24]:
test()

Initial preds (sigmoid): [0.50147635 0.49961942 0.48961756 0.524693   0.50284606]
[tiny overfit] epoch   0 loss = 0.6991
[tiny overfit] epoch  10 loss = 0.3470
[tiny overfit] epoch  20 loss = 0.0919
[tiny overfit] epoch  30 loss = 0.0164
[tiny overfit] epoch  40 loss = 0.0049
[tiny overfit] epoch  50 loss = 0.0068
[tiny overfit] epoch  60 loss = 0.0066
[tiny overfit] epoch  70 loss = 0.0058
[tiny overfit] epoch  80 loss = 0.0047
[tiny overfit] epoch  90 loss = 0.0038
Final preds (sigmoid): [0.00276094 0.00174304 0.00292544 0.00376894 0.00281168]


### Find optimal learning rate of LSTM:
Here we run some tests to find what learning rate produces the best results: 

|Learning Rate|Validation Accuracy|
|---|---|
|0.0001|76.67%|
|0.001|82.63%|
|0.01|86.10%|
|0.1|57.43%|

According to these results, the accuracy of the model increases as the learning rate increases, \
starting at ~77% at LR = 0.0001 to its peak at ~86% at LR = 0.01. As the learning rate gets even \
larger, the accuracy takes a big dip, with ~57% accuracy at LR = 0.1, which is only slightly \
better than guessing. 

The 0.0001 LR is okay, but it is not the best because the learning rate might be too low, which \
results in the model likely underfitting on the data. This is likely true due to the fact that \
the gradual increase in the learning rate slowly increases the accuracy, resulting in a more \
tight fit on the data. On the other hand, a LR of 0.1 is far to high and results in the model \
overshooting and not converging on a solution.

It also seems like for LR = 0.001, the BCELoss came to a stop at a minimum of 0.35, whereas \
for LR = 0.01 the BCELoss continued to show a downward trend at the second epoch, reaching a \
value of around 0.30. Which seems promising as the BCELoss might still decrease as the epochs \
increase.

**Therefore we will use the optimal learning rate, 0.01, for the model.**

In [25]:
import torch
import torch.nn as nn

def learning_rate_experiment(lr): 

        # Convert to PyTorch tensors
        train_data_tensor, train_label_tensor = create_tensors(train_set_tokenized, train_labels)
        val_data_tensor, val_label_tensor = create_tensors(val_set_tokenized, val_labels)

        # Create dataset and dataloader, set batch size
        train_loader, val_loader = create_dataloader(train_data_tensor, train_label_tensor,val_data_tensor, val_label_tensor, 64)

        # We set the training loss here
        loss_fn = nn.BCEWithLogitsLoss()  # Binary cross entropy
        
        #set device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


        for learning_rate in lr:
                epochs = 2 #keep epoch count low to test

                model = LSTMClassifier(embedding_dim=128,hidden_dim = 128,n_layers=1,vocab_size=5000)
                model.to(device)
                
                #set optimizer
                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) #Adam algorithm

                print("learning rate {}:".format(learning_rate))
                train_model(epochs, model, train_loader, device, optimizer, loss_fn, val_loader)
                print()


In [26]:
learning_rate_experiment([0.0001, 0.001, 0.01, 0.1])

learning rate 0.0001:
Epoch: 1, BCELoss: 0.6878650635480881
Epoch: 1, BCELoss: 0.6800026500225067
Epoch: 1, BCELoss: 0.6723925423622131
Epoch 1/2 — Avg Train Loss: 0.6774 | Val Loss:   0.6572 | Val Acc:    0.6247
Epoch: 2, BCELoss: 0.6194528722763062
Epoch: 2, BCELoss: 0.5557245442271233
Epoch: 2, BCELoss: 0.5087797981500626
Epoch 2/2 — Avg Train Loss: 0.5525 | Val Loss:   0.4897 | Val Acc:    0.7667

learning rate 0.001:
Epoch: 1, BCELoss: 0.6366023629903793
Epoch: 1, BCELoss: 0.5180443289875984
Epoch: 1, BCELoss: 0.4651417461037636
Epoch 1/2 — Avg Train Loss: 0.5260 | Val Loss:   0.3880 | Val Acc:    0.8293
Epoch: 2, BCELoss: 0.36244247272610663
Epoch: 2, BCELoss: 0.3836392605304718
Epoch: 2, BCELoss: 0.3698554764688015
Epoch 2/2 — Avg Train Loss: 0.3709 | Val Loss:   0.4033 | Val Acc:    0.8263

learning rate 0.01:
Epoch: 1, BCELoss: 0.6356582817435265
Epoch: 1, BCELoss: 0.4761842629313469
Epoch: 1, BCELoss: 0.4456121869385242
Epoch 1/2 — Avg Train Loss: 0.5039 | Val Loss:   0.3983 

### Find Optimal Batch Size of LSTM:

we will try different batch sizes of the LSTM:
32, 64, 128, 256

currently: LR = 0.01

|Batch Size|Validation Accuracy|Min Val Loss|
|---|---|---|
|32|86.43%|0.33|
|64|86.00%|0.32|
|128|86.60%|0.31|
|256|85.50%|0.35|

It seems like batch sizes 32-128 seem to perform similarly, with 256 slightly underperforming \
compared to the other batch sizes. However, out of all the different batch sizes, 128 seems to \
have slightly better perfomance compared to all the others, with the highest validation accuracy \
(86.60%) and the lowest minimum validation loss (0.31).

**The batch size 128 will be used**

In [26]:
import torch
import torch.nn as nn

def batch_size_experiment(bs): 

        # Convert to PyTorch tensors
        train_data_tensor, train_label_tensor = create_tensors(train_set_tokenized, train_labels)
        val_data_tensor, val_label_tensor = create_tensors(val_set_tokenized, val_labels)

        # We set the training loss here
        loss_fn = nn.BCEWithLogitsLoss()  # Binary cross entropy
        
        #set device
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        epochs = 2 #keep epoch count low to test

        for batchsize in bs:
                # Create dataset and dataloader, set batch size
                train_loader, val_loader = create_dataloader(train_data_tensor, train_label_tensor,val_data_tensor, val_label_tensor, batchsize)

                model = LSTMClassifier(embedding_dim=128,hidden_dim = 128,n_layers=1,vocab_size=5000)
                model.to(device)

                #set optimizer
                optimizer = torch.optim.Adam(model.parameters(), lr=0.01) #Adam algorithm

                print("batch size {}:".format(batchsize))
                train_model(epochs, model, train_loader, device, optimizer, loss_fn, val_loader)
                print()


In [27]:
batch_size_experiment([32, 64, 128, 256])

batch size 32:
Epoch: 1, BCELoss: 0.6362595546245575
Epoch: 1, BCELoss: 0.49490546345710756
Epoch: 1, BCELoss: 0.43934594944119454
Epoch: 1, BCELoss: 0.397197747528553
Epoch: 1, BCELoss: 0.3708895942568779
Epoch: 1, BCELoss: 0.3633361154794693
Epoch 1/2 — Avg Train Loss: 0.4409 | Val Loss:   0.3608 | Val Acc:    0.8450
Epoch: 2, BCELoss: 0.311865691319108
Epoch: 2, BCELoss: 0.33126242712140086
Epoch: 2, BCELoss: 0.3560057195276022
Epoch: 2, BCELoss: 0.33107839673757555
Epoch: 2, BCELoss: 0.34370814450085163
Epoch: 2, BCELoss: 0.33533686235547067
Epoch 2/2 — Avg Train Loss: 0.3325 | Val Loss:   0.3308 | Val Acc:    0.8643

batch size 64:
Epoch: 1, BCELoss: 0.6083816361427307
Epoch: 1, BCELoss: 0.4654716789722443
Epoch: 1, BCELoss: 0.4003325566649437
Epoch 1/2 — Avg Train Loss: 0.4760 | Val Loss:   0.3533 | Val Acc:    0.8497
Epoch: 2, BCELoss: 0.29645327508449554
Epoch: 2, BCELoss: 0.27452757745981216
Epoch: 2, BCELoss: 0.33267221361398697
Epoch 2/2 — Avg Train Loss: 0.3041 | Val Loss: 

### Find optimal hidden size of LSTM:

we will try four different hidden sizes of LSTM:\
64, 128, 256, 512\
`embedding_dim` will be the same as `hidden_dim` size. \

currently: LR = 0.01, Batch Size = 128

Results: 
|Hidden Size|Binary Cross Entropy|
|---|---|
|64|0.692|

In [None]:
import torch
import torch.nn as nn

def hidden_dim_experiment(hidden_dimnesions): 

        # Convert to PyTorch tensors
        train_data_tensor, train_label_tensor = create_tensors(train_set_tokenized, train_labels)
        val_data_tensor, val_label_tensor = create_tensors(val_set_tokenized, val_labels)

        # Create dataset and dataloader, set batch size
        train_loader, val_loader = create_dataloader(train_data_tensor, train_label_tensor,val_data_tensor, val_label_tensor, 128) #128 batch size

        # We set the training loss here
        loss_fn = nn.BCEWithLogitsLoss()  # Binary cross entropy
        
        for hd in hidden_dimnesions:
                epochs = 2 #keep epoch count low to test

                model = LSTMClassifier(embedding_dim=hd,hidden_dim = hd,n_layers=1,vocab_size=5000)

                #set optimizer
                optimizer = torch.optim.Adam(model.parameters(), lr=0.01) #Adam algorithm, LR = 0.01

                #select device
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model.to(device)

                print("hidden size {}:".format(hd))
                train_model(epochs, model, train_loader, device, optimizer, loss_fn, val_loader)
                print()


### Find optimal number of layers of LSTM

here we will try and find the optimal number of layers:

|Layers|Accuracy|Min Val Loss|
|---|---|---|
||||

In [None]:
hidden_dim_experiment([64, 128, 256, 512])

hidden size 64:
Epoch: 1, BCELoss: 0.5904939913749695
Epoch 1/2 — Avg Train Loss: 0.5388 | Val Loss:   0.3982 | Val Acc:    0.8363
Epoch: 2, BCELoss: 0.3303091543912888
Epoch 2/2 — Avg Train Loss: 0.3378 | Val Loss:   0.3369 | Val Acc:    0.8717

hidden size 128:
Epoch: 1, BCELoss: 0.5749831840395927
Epoch 1/2 — Avg Train Loss: 0.5198 | Val Loss:   0.3983 | Val Acc:    0.8363
Epoch: 2, BCELoss: 0.3116022069752216
Epoch 2/2 — Avg Train Loss: 0.3110 | Val Loss:   0.3159 | Val Acc:    0.8683

hidden size 256:
Epoch: 1, BCELoss: 0.544188356101513
Epoch 1/2 — Avg Train Loss: 0.4748 | Val Loss:   0.3339 | Val Acc:    0.8653
Epoch: 2, BCELoss: 0.27497520998120306
Epoch 2/2 — Avg Train Loss: 0.2825 | Val Loss:   0.3319 | Val Acc:    0.8587

hidden size 512:
Epoch: 1, BCELoss: 0.5169095033407212
Epoch 1/2 — Avg Train Loss: 0.4585 | Val Loss:   0.3495 | Val Acc:    0.8563
Epoch: 2, BCELoss: 0.25979474648833273
