# **FFNN Model**

In [19]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
from argparse import ArgumentParser

In [21]:
unk = '<UNK>'
class FFNN(nn.Module):
    def __init__(self, input_dim, h):
        super(FFNN, self).__init__()
        self.h = h
        self.W1 = nn.Linear(input_dim, h)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.output_dim = 5
        self.W2 = nn.Linear(h, self.output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)

    def forward(self, input_vector):
       # Ensure input_vector has a batch dimension
        if input_vector.dim() == 1:
          input_vector = input_vector.unsqueeze(0)
        # obtain first hidden layer representation
        hidden_rep = self.activation(self.W1(input_vector))

        # obtain output layer representation
        z = self.W2(hidden_rep)

        # obtain probability dist.
        predicted_vector = self.softmax(z)

        return predicted_vector

In [23]:
# Returns:
# vocab = A set of strings corresponding to the vocabulary
def make_vocab(data):
    vocab = set()
    for document, _ in data:
        for word in document:
            vocab.add(word)
    return vocab

In [24]:
# Returns:
# vocab = A set of strings corresponding to the vocabulary including <UNK>
# word2index = A dictionary mapping word/token to its index (a number in 0, ..., V - 1)
# index2word = A dictionary inverting the mapping of word2index
def make_indices(vocab):
    vocab_list = sorted(vocab)
    vocab_list.append(unk)
    word2index = {}
    index2word = {}
    for index, word in enumerate(vocab_list):
        word2index[word] = index
        index2word[index] = word
    vocab.add(unk)
    return vocab, word2index, index2word

In [25]:
# Returns:
# vectorized_data = A list of pairs (vector representation of input, y)
def convert_to_vector_representation(data, word2index):
    vectorized_data = []
    for document, y in data:
        vector = torch.zeros(len(word2index))
        for word in document:
            index = word2index.get(word, word2index[unk])
            vector[index] += 1
        vectorized_data.append((vector, y))
    return vectorized_data

In [26]:
def load_data(train_data, val_data):
    with open(train_data) as training_f:
        training = json.load(training_f)
    with open(val_data) as valid_f:
        validation = json.load(valid_f)

    tra = []
    val = []
    for elt in training:
        tra.append((elt["text"].split(),int(elt["stars"]-1)))
    for elt in validation:
        val.append((elt["text"].split(),int(elt["stars"]-1)))

    return tra, val

In [27]:
import os
from argparse import ArgumentParser

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-hd", "--hidden_dim", type=int, required=True, help="hidden_dim")
    parser.add_argument("-e", "--epochs", type=int, required=True, help="num of epochs to train")
    parser.add_argument("--train_data", required=True, help="/content/FFNN/training.json")
    parser.add_argument("--val_data", required=True, help="/content/FFNN/validation.json")
    parser.add_argument("--test_data", default="to fill", help="/content/FFNN/test.json")
    parser.add_argument('--do_train', action='store_true')

    arg_list = [
        "--hidden_dim", "16",
        "--epochs", "10",
        "--train_data", "/content/FFNN/training.json",
        "--val_data", "/content/FFNN/validation.json"
    ]
    args = parser.parse_args(arg_list)

    # Fix random seeds
    random.seed(42)
    torch.manual_seed(42)

    # Load data
    print("========== Loading data ==========")
    train_data, valid_data = load_data(args.train_data, args.val_data)
    vocab = make_vocab(train_data)
    vocab, word2index, index2word = make_indices(vocab)

    print("========== Vectorizing data ==========")
    train_data = convert_to_vector_representation(train_data, word2index)
    valid_data = convert_to_vector_representation(valid_data, word2index)

    model = FFNN(input_dim=len(vocab), h=args.hidden_dim)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-5)

    # Ensure 'results' directory exists
    os.makedirs("results", exist_ok=True)

    # Open the output file in write mode
    with open("results/FFNNtest.out", "w") as f:
        f.write("Training and Validation Results\n")
        f.write("==============================\n")

        print("========== Training for {} epochs ==========".format(args.epochs))
        for epoch in range(args.epochs):
            # Training phase
            model.train()
            optimizer.zero_grad()
            correct = 0
            total = 0
            start_time = time.time()
            random.shuffle(train_data)
            minibatch_size = 16
            N = len(train_data)

            for minibatch_index in tqdm(range(N // minibatch_size)):
                optimizer.zero_grad()
                loss = None
                for example_index in range(minibatch_size):
                    input_vector, gold_label = train_data[minibatch_index * minibatch_size + example_index]
                    predicted_vector = model(input_vector)
                    predicted_label = torch.argmax(predicted_vector)
                    correct += int(predicted_label == gold_label)
                    total += 1
                    example_loss = model.compute_Loss(predicted_vector.view(1, -1), torch.tensor([gold_label]))
                    if loss is None:
                        loss = example_loss
                    else:
                        loss += example_loss
                loss = loss / minibatch_size
                loss.backward()
                optimizer.step()

            training_accuracy = correct / total
            epoch_time = time.time() - start_time
            f.write(f"Epoch {epoch + 1} - Training accuracy: {training_accuracy:.4f}, Training time: {epoch_time:.2f} seconds\n")

            # Validation phase
            model.eval()
            correct = 0
            total = 0
            with torch.no_grad():
                for minibatch_index in tqdm(range(len(valid_data) // minibatch_size)):
                    loss = None
                    for example_index in range(minibatch_size):
                        input_vector, gold_label = valid_data[minibatch_index * minibatch_size + example_index]
                        predicted_vector = model(input_vector)
                        predicted_label = torch.argmax(predicted_vector)
                        correct += int(predicted_label == gold_label)
                        total += 1
                        example_loss = model.compute_Loss(predicted_vector.view(1, -1), torch.tensor([gold_label]))
                        if loss is None:
                            loss = example_loss
                        else:
                            loss += example_loss
                    loss = loss / minibatch_size

            validation_accuracy = correct / total
            f.write(f"Epoch {epoch + 1} - Validation accuracy: {validation_accuracy:.4f}\n")

        f.write("Training and validation complete.\n")




100%|██████████| 500/500 [00:17<00:00, 29.22it/s]
100%|██████████| 50/50 [00:00<00:00, 111.10it/s]
100%|██████████| 500/500 [00:17<00:00, 28.28it/s]
100%|██████████| 50/50 [00:00<00:00, 132.42it/s]
100%|██████████| 500/500 [00:18<00:00, 27.38it/s]
100%|██████████| 50/50 [00:00<00:00, 132.53it/s]
100%|██████████| 500/500 [00:17<00:00, 28.99it/s]
100%|██████████| 50/50 [00:00<00:00, 127.68it/s]
100%|██████████| 500/500 [00:17<00:00, 28.57it/s]
100%|██████████| 50/50 [00:00<00:00, 102.25it/s]
100%|██████████| 500/500 [00:18<00:00, 27.40it/s]
100%|██████████| 50/50 [00:00<00:00, 130.62it/s]
100%|██████████| 500/500 [00:17<00:00, 28.81it/s]
100%|██████████| 50/50 [00:00<00:00, 132.57it/s]
100%|██████████| 500/500 [00:16<00:00, 29.45it/s]
100%|██████████| 50/50 [00:00<00:00, 131.39it/s]
100%|██████████| 500/500 [00:17<00:00, 28.55it/s]
100%|██████████| 50/50 [00:00<00:00, 108.26it/s]
100%|██████████| 500/500 [00:17<00:00, 28.63it/s]
100%|██████████| 50/50 [00:00<00:00, 132.23it/s]


In [28]:
import json
import numpy as np

def load_data(file_path):
    """Load JSON data from the specified file path."""
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

def calculate_statistics(data):
    """Calculate number of examples, average words per review, and star rating distribution."""
    num_examples = len(data)
    word_counts = [len(review['text'].split()) for review in data]
    avg_words_per_review = np.mean(word_counts)

    # Count star ratings distribution
    star_ratings = [int(review['stars']) for review in data]
    unique, counts = np.unique(star_ratings, return_counts=True)
    star_distribution = dict(zip(unique, counts))

    # Convert star distribution to percentages
    star_distribution_percentage = {star: count / num_examples * 100 for star, count in star_distribution.items()}

    return num_examples, avg_words_per_review, star_distribution_percentage

# Load each dataset and calculate statistics
datasets = {
    "Training": "training.json",
    "Validation": "validation.json",
    "Test": "test.json"
}

for dataset_name, file_path in datasets.items():
    data = load_data(file_path)
    num_examples, avg_words, star_distribution = calculate_statistics(data)

    # Print out the statistics
    print(f"{dataset_name} Set:")
    print(f"  Number of Examples: {num_examples}")
    print(f"  Average Words per Review: {avg_words:.2f}")
    print(f"  Star Ratings Distribution: {star_distribution}")
    print("-" * 40)


FileNotFoundError: [Errno 2] No such file or directory: 'training.json'

# **RNN Model**

In [52]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
import string
from argparse import ArgumentParser
import pickle

In [53]:
unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html
class RNN(nn.Module):
    def __init__(self, input_dim, h):  # Add relevant parameters
        super(RNN, self).__init__()
        self.h = h
        self.numOfLayer = 1
        self.rnn = nn.RNN(input_dim, h, self.numOfLayer, nonlinearity='tanh')
        self.W = nn.Linear(h, 5)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
      # Step 1: Pass inputs through the RNN layer to obtain hidden layer representation
      _, hidden = self.rnn(inputs)  # Obtain the final hidden state from the RNN

      # Step 2: Pass the hidden state through a linear layer to obtain output layer representations
      z = self.W(hidden[-1])  # Use the final hidden state for prediction

      # Step 3: Obtain probability distribution over classes
      predicted_vector = self.softmax(z)

      return predicted_vector

In [54]:
def load_data(train_data_path, val_data_path):
    """Load JSON data from the specified training and validation file paths."""
    with open(train_data_path) as training_f:
        training = json.load(training_f)
    with open(val_data_path) as valid_f:
        validation = json.load(valid_f)

    tra = [(elt["text"].split(), int(elt["stars"] - 1)) for elt in training]
    val = [(elt["text"].split(), int(elt["stars"] - 1)) for elt in validation]
    return tra, val

In [55]:
import pickle

try:
    with open('word_embedding.pkl', 'rb') as f:
        word_embedding = pickle.load(f)
    print("File loaded successfully.")
except pickle.UnpicklingError:
    print("UnpicklingError: The file is not a valid pickle file.")
except Exception as e:
    print(f"Error: {e}")

File loaded successfully.


In [57]:
import pickle
import numpy as np

# Create a random word embedding dictionary
vocab = ["<UNK>", "example", "words", "here"]
word_embedding = {word: np.random.rand(50) for word in vocab}

# Save to pickle file
with open('word_embedding.pkl', 'wb') as f:
    pickle.dump(word_embedding, f)

In [63]:
import sys
from argparse import ArgumentParser

if __name__ == "__main__":
    # Check if running in an interactive environment
    if 'ipykernel_launcher' in sys.argv[0] or 'google.colab' in sys.modules:
        # Set default values for arguments in interactive environments
        class Args:
            hidden_dim = 32
            epochs = 10
            train_data = "/content/RNN/training.json"
            val_data = "/content/RNN/validation.json"
            test_data = "/content/RNN/test.json"
            do_train = True
        args = Args()
    else:
        parser = ArgumentParser()
        parser.add_argument("-hd", "--hidden_dim", type=int, required=True, help="hidden_dim")
        parser.add_argument("-e", "--epochs", type=int, required=True, help="num of epochs to train")
        parser.add_argument("--train_data", required=True, help="path to training data")
        parser.add_argument("--val_data", required=True, help="path to validation data")
        parser.add_argument("--test_data", default="to fill", help="path to test data")
        parser.add_argument('--do_train', action='store_true')
        args = parser.parse_args()

    print("========== Loading data ==========")
    train_data, valid_data = load_data(args.train_data, args.val_data)

    print("========== Vectorizing data ==========")
    model = RNN(50, args.hidden_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Load pre-trained embeddings
    with open('./word_embedding.pkl', 'rb') as f:
        word_embedding = pickle.load(f)

    stopping_condition = False
    epoch = 0
    last_train_accuracy = 0
    last_validation_accuracy = 0

    while not stopping_condition:
      random.shuffle(train_data)
      model.train()
      # You will need further code to operationalize training, ffnn.py may be helpful
      print("Training started for epoch {}".format(epoch + 1))
      train_data = train_data
      correct = 0
      total = 0
      minibatch_size = 16
      N = len(train_data)

      loss_total = 0
      loss_count = 0
      for minibatch_index in tqdm(range(N // minibatch_size)):
          optimizer.zero_grad()
          loss = None
          for example_index in range(minibatch_size):
              input_words, gold_label = train_data[minibatch_index * minibatch_size + example_index]
              input_words = " ".join(input_words)

              # Remove punctuation
              input_words = input_words.translate(input_words.maketrans("", "", string.punctuation)).split()

              # Look up word embedding dictionary and handle missing embeddings with 'unk'
              vectors = [torch.tensor(word_embedding[i.lower()], dtype=torch.float32) if i.lower() in word_embedding else torch.tensor(word_embedding[unk], dtype=torch.float32) for i in input_words]

              # Stack the tensors into a single tensor of shape [sequence_length, embedding_dim]
              vectors = torch.stack(vectors).view(len(vectors), 1, -1).float()
              output = model(vectors)

              # Get loss
              example_loss = model.compute_Loss(output.view(1,-1), torch.tensor([gold_label]))

              # Get predicted label
              predicted_label = torch.argmax(output)

              correct += int(predicted_label == gold_label)
              # print(predicted_label, gold_label)
              total += 1
              if loss is None:
                  loss = example_loss
              else:
                  loss += example_loss

          loss = loss / minibatch_size
          loss_total += loss.data
          loss_count += 1
          loss.backward()
          optimizer.step()
      print(loss_total/loss_count)
      print("Training completed for epoch {}".format(epoch + 1))
      print("Training accuracy for epoch {}: {}".format(epoch + 1, correct / total))
      trainning_accuracy = correct/total


      model.eval()
      correct = 0
      total = 0
      random.shuffle(valid_data)
      print("Validation started for epoch {}".format(epoch + 1))
      valid_data = valid_data

      for input_words, gold_label in tqdm(valid_data):
          input_words = " ".join(input_words)
          input_words = input_words.translate(input_words.maketrans("", "", string.punctuation)).split()
          # Look up word embeddings and ensure each tensor is float32
          vectors = [torch.tensor(word_embedding[i.lower()], dtype=torch.float32) if i.lower() in word_embedding else torch.tensor(word_embedding[unk], dtype=torch.float32) for i in input_words]

          # Stack the tensors into a single tensor with dtype=torch.float32
          vectors = torch.stack(vectors).view(len(vectors), 1, -1)
          output = model(vectors)
          predicted_label = torch.argmax(output)
          correct += int(predicted_label == gold_label)
          total += 1
          # print(predicted_label, gold_label)
      print("Validation completed for epoch {}".format(epoch + 1))
      print("Validation accuracy for epoch {}: {}".format(epoch + 1, correct / total))
      validation_accuracy = correct/total

      if validation_accuracy < last_validation_accuracy and trainning_accuracy > last_train_accuracy:
          stopping_condition=True
          print("Training done to avoid overfitting!")
          print("Best validation accuracy is:", last_validation_accuracy)
      else:
          last_validation_accuracy = validation_accuracy
          last_train_accuracy = trainning_accuracy

      epoch += 1


Training started for epoch 1


100%|██████████| 1000/1000 [04:03<00:00,  4.11it/s]


tensor(1.6440)
Training completed for epoch 1
Training accuracy for epoch 1: 0.206875
Validation started for epoch 1


100%|██████████| 800/800 [00:05<00:00, 135.68it/s]


Validation completed for epoch 1
Validation accuracy for epoch 1: 0.4
Training started for epoch 2


100%|██████████| 1000/1000 [03:56<00:00,  4.22it/s]


tensor(1.6399)
Training completed for epoch 2
Training accuracy for epoch 2: 0.2045625
Validation started for epoch 2


100%|██████████| 800/800 [00:04<00:00, 160.01it/s]


Validation completed for epoch 2
Validation accuracy for epoch 2: 0.4
Training started for epoch 3


100%|██████████| 1000/1000 [04:01<00:00,  4.14it/s]


tensor(1.6472)
Training completed for epoch 3
Training accuracy for epoch 3: 0.19775
Validation started for epoch 3


100%|██████████| 800/800 [00:06<00:00, 125.07it/s]


Validation completed for epoch 3
Validation accuracy for epoch 3: 0.01
Training started for epoch 4


100%|██████████| 1000/1000 [04:00<00:00,  4.16it/s]


tensor(1.6444)
Training completed for epoch 4
Training accuracy for epoch 4: 0.2
Validation started for epoch 4


100%|██████████| 800/800 [00:05<00:00, 152.47it/s]


Validation completed for epoch 4
Validation accuracy for epoch 4: 0.4
Training started for epoch 5


100%|██████████| 1000/1000 [03:57<00:00,  4.20it/s]


tensor(1.6434)
Training completed for epoch 5
Training accuracy for epoch 5: 0.2028125
Validation started for epoch 5


100%|██████████| 800/800 [00:05<00:00, 154.25it/s]

Validation completed for epoch 5
Validation accuracy for epoch 5: 0.01
Training done to avoid overfitting!
Best validation accuracy is: 0.4



