<a href="https://colab.research.google.com/github/SusanLL/CS4375_HW/blob/main/CS_4375_HW2_FFNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FFNN Model**

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
from argparse import ArgumentParser

In [2]:
unk = '<UNK>'
class FFNN(nn.Module):
    def __init__(self, input_dim, h):
        super(FFNN, self).__init__()
        self.h = h
        self.W1 = nn.Linear(input_dim, h)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.output_dim = 5
        self.W2 = nn.Linear(h, self.output_dim)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)

    def forward(self, input_vector):
       # Ensure input_vector has a batch dimension
        if input_vector.dim() == 1:
          input_vector = input_vector.unsqueeze(0)
        # obtain first hidden layer representation
        hidden_rep = self.activation(self.W1(input_vector))

        # obtain output layer representation
        z = self.W2(hidden_rep)

        # obtain probability dist.
        predicted_vector = self.softmax(z)

        return predicted_vector

In [3]:
# Returns:
# vocab = A set of strings corresponding to the vocabulary
def make_vocab(data):
    vocab = set()
    for document, _ in data:
        for word in document:
            vocab.add(word)
    return vocab

In [4]:
# Returns:
# vocab = A set of strings corresponding to the vocabulary including <UNK>
# word2index = A dictionary mapping word/token to its index (a number in 0, ..., V - 1)
# index2word = A dictionary inverting the mapping of word2index
def make_indices(vocab):
    vocab_list = sorted(vocab)
    vocab_list.append(unk)
    word2index = {}
    index2word = {}
    for index, word in enumerate(vocab_list):
        word2index[word] = index
        index2word[index] = word
    vocab.add(unk)
    return vocab, word2index, index2word

In [5]:
# Returns:
# vectorized_data = A list of pairs (vector representation of input, y)
def convert_to_vector_representation(data, word2index):
    vectorized_data = []
    for document, y in data:
        vector = torch.zeros(len(word2index))
        for word in document:
            index = word2index.get(word, word2index[unk])
            vector[index] += 1
        vectorized_data.append((vector, y))
    return vectorized_data

In [6]:
def load_data(train_data, val_data):
    with open(train_data) as training_f:
        training = json.load(training_f)
    with open(val_data) as valid_f:
        validation = json.load(valid_f)

    tra = []
    val = []
    for elt in training:
        tra.append((elt["text"].split(),int(elt["stars"]-1)))
    for elt in validation:
        val.append((elt["text"].split(),int(elt["stars"]-1)))

    return tra, val

In [7]:
import os
from argparse import ArgumentParser
from tqdm import tqdm
import random
import torch
import time

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-hd", "--hidden_dim", type=int, required=True, help="hidden_dim")
    parser.add_argument("-e", "--epochs", type=int, required=True, help="num of epochs to train")
    parser.add_argument("--train_data", required=True, help="/content/FFNN/training.json")
    parser.add_argument("--val_data", required=True, help="/content/FFNN/validation.json")
    parser.add_argument("--test_data", default="to fill", help="/content/FFNN/test.json")
    parser.add_argument('--do_train', action='store_true')

    arg_list = [
        "--hidden_dim", "16",
        "--epochs", "10",
        "--train_data", "/content/FFNN/training.json",
        "--val_data", "/content/FFNN/validation.json"
    ]
    args = parser.parse_args(arg_list)

    # Fix random seeds
    random.seed(42)
    torch.manual_seed(42)

    # Load data
    print("========== Loading data ==========")
    train_data, valid_data = load_data(args.train_data, args.val_data)
    vocab = make_vocab(train_data)
    vocab, word2index, index2word = make_indices(vocab)

    print("========== Vectorizing data ==========")
    train_data = convert_to_vector_representation(train_data, word2index)
    valid_data = convert_to_vector_representation(valid_data, word2index)

    model = FFNN(input_dim=len(vocab), h=args.hidden_dim)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-5)

    print("Training and Validation Results")
    print("==============================")

    print("========== Training for {} epochs ==========".format(args.epochs))
    for epoch in range(args.epochs):
        # Training phase
        model.train()
        optimizer.zero_grad()
        correct = 0
        total = 0
        start_time = time.time()
        random.shuffle(train_data)
        minibatch_size = 16
        N = len(train_data)

        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                input_vector, gold_label = train_data[minibatch_index * minibatch_size + example_index]
                predicted_vector = model(input_vector)
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
                example_loss = model.compute_Loss(predicted_vector.view(1, -1), torch.tensor([gold_label]))
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss
            loss = loss / minibatch_size
            loss.backward()
            optimizer.step()

        training_accuracy = correct / total
        epoch_time = time.time() - start_time
        print(f"Epoch {epoch + 1} - Training accuracy: {training_accuracy:.4f}, Training time: {epoch_time:.2f} seconds")

        # Validation phase
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for minibatch_index in tqdm(range(len(valid_data) // minibatch_size)):
                loss = None
                for example_index in range(minibatch_size):
                    input_vector, gold_label = valid_data[minibatch_index * minibatch_size + example_index]
                    predicted_vector = model(input_vector)
                    predicted_label = torch.argmax(predicted_vector)
                    correct += int(predicted_label == gold_label)
                    total += 1
                    example_loss = model.compute_Loss(predicted_vector.view(1, -1), torch.tensor([gold_label]))
                    if loss is None:
                        loss = example_loss
                    else:
                        loss += example_loss
                loss = loss / minibatch_size

        validation_accuracy = correct / total
        print(f"Epoch {epoch + 1} - Validation accuracy: {validation_accuracy:.4f}")

    print("Training and validation complete.")


Training and Validation Results


100%|██████████| 500/500 [00:17<00:00, 28.84it/s]


Epoch 1 - Training accuracy: 0.5242, Training time: 17.36 seconds


100%|██████████| 50/50 [00:00<00:00, 122.75it/s]


Epoch 1 - Validation accuracy: 0.5413


100%|██████████| 500/500 [00:19<00:00, 26.13it/s]


Epoch 2 - Training accuracy: 0.5766, Training time: 19.15 seconds


100%|██████████| 50/50 [00:00<00:00, 119.31it/s]


Epoch 2 - Validation accuracy: 0.5763


100%|██████████| 500/500 [00:17<00:00, 28.84it/s]


Epoch 3 - Training accuracy: 0.6041, Training time: 17.36 seconds


100%|██████████| 50/50 [00:00<00:00, 132.10it/s]


Epoch 3 - Validation accuracy: 0.5687


100%|██████████| 500/500 [00:17<00:00, 27.86it/s]


Epoch 4 - Training accuracy: 0.6356, Training time: 17.96 seconds


100%|██████████| 50/50 [00:00<00:00, 98.79it/s]


Epoch 4 - Validation accuracy: 0.5875


100%|██████████| 500/500 [00:18<00:00, 27.17it/s]


Epoch 5 - Training accuracy: 0.6502, Training time: 18.42 seconds


100%|██████████| 50/50 [00:00<00:00, 124.64it/s]


Epoch 5 - Validation accuracy: 0.5813


100%|██████████| 500/500 [00:18<00:00, 27.32it/s]


Epoch 6 - Training accuracy: 0.6565, Training time: 18.31 seconds


100%|██████████| 50/50 [00:00<00:00, 132.21it/s]


Epoch 6 - Validation accuracy: 0.6025


100%|██████████| 500/500 [00:18<00:00, 27.65it/s]


Epoch 7 - Training accuracy: 0.6996, Training time: 18.09 seconds


100%|██████████| 50/50 [00:00<00:00, 93.45it/s]


Epoch 7 - Validation accuracy: 0.5437


100%|██████████| 500/500 [00:16<00:00, 29.78it/s]


Epoch 8 - Training accuracy: 0.6999, Training time: 16.80 seconds


100%|██████████| 50/50 [00:00<00:00, 133.70it/s]


Epoch 8 - Validation accuracy: 0.5962


100%|██████████| 500/500 [00:16<00:00, 30.17it/s]


Epoch 9 - Training accuracy: 0.7255, Training time: 16.58 seconds


100%|██████████| 50/50 [00:00<00:00, 139.47it/s]


Epoch 9 - Validation accuracy: 0.6200


100%|██████████| 500/500 [00:16<00:00, 30.16it/s]


Epoch 10 - Training accuracy: 0.7412, Training time: 16.59 seconds


100%|██████████| 50/50 [00:00<00:00, 136.67it/s]

Epoch 10 - Validation accuracy: 0.6062
Training and validation complete.





In [8]:
import json
import numpy as np

def load_data(file_path):
    """Load JSON data from the specified file path."""
    # Construct the full file path using os.path.join
    full_path = os.path.join('/content/FFNN', file_path)
    with open(full_path, 'r') as f:  # Open the file using the constructed path
        data = json.load(f)
    return data

def calculate_statistics(data):
    """Calculate number of examples, average words per review, and star rating distribution."""
    num_examples = len(data)
    word_counts = [len(review['text'].split()) for review in data]
    avg_words_per_review = np.mean(word_counts)

    # Count star ratings distribution
    star_ratings = [int(review['stars']) for review in data]
    unique, counts = np.unique(star_ratings, return_counts=True)
    star_distribution = dict(zip(unique, counts))

    # Convert star distribution to percentages
    star_distribution_percentage = {star: (count / num_examples) * 100 for star, count in star_distribution.items()}

    return num_examples, avg_words_per_review, star_distribution_percentage

# Load each dataset and calculate statistics
datasets = {
    "Training": "training.json",
    "Validation": "validation.json",
    "Test": "test.json"
}

for dataset_name, file_path in datasets.items():
    data = load_data(file_path)
    num_examples, avg_words, star_distribution = calculate_statistics(data)

    # Print out the statistics
    print(f"{dataset_name} Set:")
    print(f"  Number of Examples: {num_examples}")
    print(f"  Average Words per Review: {avg_words:.2f}")
    print(f"  Star Ratings Distribution: {star_distribution}")
    print("-" * 40)

Training Set:
  Number of Examples: 8000
  Average Words per Review: 141.25
  Star Ratings Distribution: {1: 40.0, 2: 40.0, 3: 20.0}
----------------------------------------
Validation Set:
  Number of Examples: 800
  Average Words per Review: 140.37
  Star Ratings Distribution: {1: 40.0, 2: 40.0, 3: 20.0}
----------------------------------------
Test Set:
  Number of Examples: 800
  Average Words per Review: 109.77
  Star Ratings Distribution: {3: 20.0, 4: 40.0, 5: 40.0}
----------------------------------------
