<a href="https://colab.research.google.com/github/SusanLL/CS4375_HW/blob/main/CS_4375_HW2_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **RNN Model**

In [28]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
import string
from argparse import ArgumentParser
import pickle

In [29]:
# Key of the fallback embedding: the training and validation loops look up
# word_embedding[unk] for any token that is missing from the embedding dictionary.
unk = '<UNK>'
class RNN(nn.Module):
    """Single-layer tanh RNN followed by a linear classifier.

    The final hidden state of the recurrent layer is projected onto
    ``output_dim`` classes and returned as log-probabilities, which is the
    input format expected by the ``nn.NLLLoss`` stored in ``self.loss``.

    Args:
        input_dim: Size of each input vector (the word-embedding dimension).
        h: Size of the recurrent hidden state.
        output_dim: Number of target classes. Defaults to 5, the value that
            was previously hard-coded, so existing ``RNN(input_dim, h)`` calls
            behave exactly as before.
    """

    def __init__(self, input_dim, h, output_dim=5):
        super(RNN, self).__init__()
        self.h = h
        self.numOfLayer = 1
        # nn.RNN is used with its default batch_first=False, so inputs are
        # laid out as (seq_len, batch, input_dim).
        self.rnn = nn.RNN(input_dim, h, self.numOfLayer, nonlinearity='tanh')
        self.W = nn.Linear(h, output_dim)
        # dim=1 normalises over the class axis of the (batch, output_dim) logits.
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the negative log-likelihood of ``gold_label``.

        Args:
            predicted_vector: Log-probabilities of shape (batch, output_dim),
                as produced by ``forward``.
            gold_label: Tensor of class indices of shape (batch,).
        """
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
        """Classify a sequence and return per-class log-probabilities.

        Args:
            inputs: Float tensor of shape (seq_len, batch, input_dim).

        Returns:
            Tensor of shape (batch, output_dim) holding log-probabilities.
        """
        # Step 1: run the sequence through the RNN; only the final hidden
        # state, shaped (num_layers, batch, h), is needed.
        _, hidden = self.rnn(inputs)

        # Step 2: project the last layer's final hidden state onto the classes.
        z = self.W(hidden[-1])

        # Step 3: convert the logits into log-probabilities.
        predicted_vector = self.softmax(z)

        return predicted_vector

In [30]:
import json

def load_data(train_data_path, val_data_path):
    """Read the training and validation JSON files and tokenise their reviews.

    Each file must contain a JSON list of records with a "text" string and a
    numeric "stars" value. Every record becomes a ``(tokens, label)`` pair in
    which ``tokens`` is the whitespace-split text and ``label`` is the star
    rating shifted down by one.

    NOTE(review): a later cell in this notebook defines another ``load_data``
    that takes a single ``file_path``; whichever definition runs last wins.

    Returns:
        A ``(training_examples, validation_examples)`` tuple of lists.
    """

    def _read_json(path):
        # Parse one JSON file and hand back its decoded content.
        with open(path) as handle:
            return json.load(handle)

    def _to_examples(records):
        # Convert raw review records into (tokens, zero-based label) pairs.
        examples = []
        for record in records:
            tokens = record["text"].split()
            label = int(record["stars"] - 1)
            examples.append((tokens, label))
        return examples

    # Both files are read before any record is converted.
    raw_training = _read_json(train_data_path)
    raw_validation = _read_json(val_data_path)

    return _to_examples(raw_training), _to_examples(raw_validation)

In [31]:
import pickle
import numpy as np

# Create a random word embedding dictionary
# Each of the four vocabulary entries gets an unseeded 50-dimensional vector
# drawn uniformly from [0, 1), so the values differ on every run.
# NOTE(review): with this placeholder vocabulary the training cell maps every
# token other than "example", "words" and "here" to the single <UNK> vector;
# confirm whether real pretrained embeddings were meant to be used instead.
vocab = ["<UNK>", "example", "words", "here"]
word_embedding = {word: np.random.rand(50) for word in vocab}

# Save to pickle file
# NOTE(review): mode 'wb' replaces any existing word_embedding.pkl in the
# working directory; verify that no real embedding file lives at this path.
with open('word_embedding.pkl', 'wb') as f:
    pickle.dump(word_embedding, f)

In [32]:
import pickle

# Reload the embedding dictionary from disk into `word_embedding`.
# Failures are reported with print() instead of being re-raised, so on error
# the cell still completes and `word_embedding` keeps whatever value it had
# before this cell ran.
# pickle.load can execute arbitrary code: only load files you trust.
try:
    with open('word_embedding.pkl', 'rb') as f:
        word_embedding = pickle.load(f)
    print("File loaded successfully.")
except pickle.UnpicklingError:
    # The file exists but its content is not a valid pickle stream.
    print("UnpicklingError: The file is not a valid pickle file.")
except Exception as e:
    # Any other failure (for example a missing file) is only reported.
    print(f"Error: {e}")

File loaded successfully.


In [35]:
# Train the RNN with mini-batch updates, validate after every epoch, and stop
# early once validation accuracy has failed to improve for
# `max_no_improvement` consecutive epochs.
#
# NOTE(review): `args`, `model`, `optimizer`, `train_data` and `valid_data` are
# read here but are not defined in any cell visible in this notebook; confirm
# that the cell creating them runs before this one on a fresh kernel.

# Initialize variables to store results
results = {"training": [], "validation": []}
best_validation_accuracy = 0.0
no_improvement_epochs = 0  # Counter for early stopping
max_no_improvement = 3  # Stop after 3 epochs with no improvement
minibatch_size = 32
epoch = 0

# Translation table that deletes ASCII punctuation; built once instead of once
# per review.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _embed_review(tokens):
    """Convert one tokenised review into a (seq_len, 1, embedding_dim) tensor.

    Punctuation is stripped, each remaining word is lower-cased and looked up
    in `word_embedding`, and words without an entry use the `unk` vector. A
    review with no words left after stripping is represented by a single
    `unk` vector so the model always receives at least one time step.
    """
    cleaned = " ".join(tokens).translate(_PUNCTUATION_TABLE).split()
    rows = [
        word_embedding[word.lower()] if word.lower() in word_embedding else word_embedding[unk]
        for word in cleaned
    ]
    if not rows:
        rows = [word_embedding[unk]]
    # One bulk conversion instead of one tensor per word.
    return torch.tensor(np.array(rows), dtype=torch.float32).view(len(rows), 1, -1)


while epoch < args.epochs:
    random.shuffle(train_data)
    model.train()
    print(f"Training started for epoch {epoch + 1}")

    # Training metrics
    correct = 0
    total = 0
    N = len(train_data)

    loss_total = 0.0
    loss_count = 0

    # Training loop: only full mini-batches are used; a trailing partial batch
    # is skipped for this epoch.
    for minibatch_index in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        loss = None
        for example_index in range(minibatch_size):
            input_words, gold_label = train_data[minibatch_index * minibatch_size + example_index]

            # Look up word embeddings
            vectors = _embed_review(input_words)
            output = model(vectors)

            # Calculate loss
            example_loss = model.compute_Loss(output.view(1, -1), torch.tensor([gold_label]))
            predicted_label = torch.argmax(output)
            correct += int(predicted_label == gold_label)
            total += 1

            loss = example_loss if loss is None else loss + example_loss

        loss = loss / minibatch_size
        # .item() stores a plain float, so the running total holds no tensor.
        loss_total += loss.item()
        loss_count += 1
        loss.backward()
        optimizer.step()

    training_accuracy = correct / total
    results["training"].append((epoch + 1, training_accuracy, loss_total / loss_count))
    print(f"Training accuracy for epoch {epoch + 1}: {training_accuracy:.4f}")

    # Validation
    model.eval()
    correct = 0
    total = 0

    print(f"Validation started for epoch {epoch + 1}")
    # Gradients are not needed for evaluation; disabling them saves time and
    # memory without changing the predictions.
    with torch.no_grad():
        for input_words, gold_label in tqdm(valid_data):
            vectors = _embed_review(input_words)
            output = model(vectors)
            predicted_label = torch.argmax(output)
            correct += int(predicted_label == gold_label)
            total += 1

    validation_accuracy = correct / total
    results["validation"].append((epoch + 1, validation_accuracy))
    print(f"Validation accuracy for epoch {epoch + 1}: {validation_accuracy:.4f}")

    # Early stopping condition
    if validation_accuracy > best_validation_accuracy:
        best_validation_accuracy = validation_accuracy
        no_improvement_epochs = 0  # Reset counter if improvement occurs
    else:
        no_improvement_epochs += 1

    if no_improvement_epochs >= max_no_improvement:
        print(f"Early stopping triggered after {no_improvement_epochs} epochs with no improvement.")
        break

    epoch += 1

# Print final results
# A separate loop variable keeps the `epoch` counter above from being overwritten.
print("\nTraining and Validation Results")
print("=" * 30)
for epoch_number, train_acc, train_loss in results["training"]:
    print(f"Epoch {epoch_number}: Training Accuracy = {train_acc:.4f}, Loss = {train_loss:.4f}")
for epoch_number, val_acc in results["validation"]:
    print(f"Epoch {epoch_number}: Validation Accuracy = {val_acc:.4f}")


Training started for epoch 1


100%|██████████| 500/500 [04:45<00:00,  1.75it/s]


Training accuracy for epoch 1: 0.2044
Validation started for epoch 1


100%|██████████| 800/800 [00:07<00:00, 104.81it/s]


Validation accuracy for epoch 1: 0.0000
Training started for epoch 2


100%|██████████| 500/500 [04:37<00:00,  1.80it/s]


Training accuracy for epoch 2: 0.1964
Validation started for epoch 2


100%|██████████| 800/800 [00:07<00:00, 102.10it/s]


Validation accuracy for epoch 2: 0.2000
Training started for epoch 3


100%|██████████| 500/500 [04:43<00:00,  1.76it/s]


Training accuracy for epoch 3: 0.1971
Validation started for epoch 3


100%|██████████| 800/800 [00:06<00:00, 123.28it/s]


Validation accuracy for epoch 3: 0.4000
Training started for epoch 4


100%|██████████| 500/500 [04:38<00:00,  1.80it/s]


Training accuracy for epoch 4: 0.1994
Validation started for epoch 4


100%|██████████| 800/800 [00:07<00:00, 107.40it/s]


Validation accuracy for epoch 4: 0.4000
Training started for epoch 5


100%|██████████| 500/500 [04:35<00:00,  1.82it/s]


Training accuracy for epoch 5: 0.2021
Validation started for epoch 5


100%|██████████| 800/800 [00:07<00:00, 106.22it/s]


Validation accuracy for epoch 5: 0.0000
Training started for epoch 6


100%|██████████| 500/500 [04:31<00:00,  1.84it/s]


Training accuracy for epoch 6: 0.2057
Validation started for epoch 6


100%|██████████| 800/800 [00:06<00:00, 122.56it/s]

Validation accuracy for epoch 6: 0.0000
Early stopping triggered after 3 epochs with no improvement.

Training and Validation Results
Epoch 1: Training Accuracy = 0.2044, Loss = 1.6915
Epoch 2: Training Accuracy = 0.1964, Loss = 1.6998
Epoch 3: Training Accuracy = 0.1971, Loss = 1.6967
Epoch 4: Training Accuracy = 0.1994, Loss = 1.6900
Epoch 5: Training Accuracy = 0.2021, Loss = 1.6904
Epoch 6: Training Accuracy = 0.2057, Loss = 1.7023
Epoch 1: Validation Accuracy = 0.0000
Epoch 2: Validation Accuracy = 0.2000
Epoch 3: Validation Accuracy = 0.4000
Epoch 4: Validation Accuracy = 0.4000
Epoch 5: Validation Accuracy = 0.0000
Epoch 6: Validation Accuracy = 0.0000





In [34]:
import json
import numpy as np

def load_data(file_path, base_dir='/content/RNN'):
    """Load JSON data from a file located under ``base_dir``.

    NOTE(review): this redefines the two-argument ``load_data`` from an earlier
    cell; after this cell runs, the earlier signature is no longer callable.

    Args:
        file_path: File name (or relative path) of the JSON file.
        base_dir: Directory the path is resolved against. Defaults to the
            Colab location that was previously hard-coded, so existing
            single-argument calls keep working.

    Returns:
        The decoded JSON content.
    """
    # os.path.join keeps an absolute file_path as-is and ignores base_dir.
    full_path = os.path.join(base_dir, file_path)
    with open(full_path, 'r') as f:
        data = json.load(f)
    return data

def calculate_statistics(data):
    """Summarise a list of review records.

    Args:
        data: Sequence of dicts, each with a "text" string and a numeric
            "stars" value.

    Returns:
        A tuple ``(num_examples, avg_words_per_review, star_distribution)``:
        the number of records, the mean whitespace-token count per review
        (0.0 for empty input), and a dict mapping each star rating to the
        percentage of records carrying it. Keys and values are plain Python
        ``int``/``float`` so the dict prints cleanly on every NumPy version.
    """
    num_examples = len(data)
    word_counts = [len(review['text'].split()) for review in data]
    # np.mean([]) would return nan with a RuntimeWarning; report 0.0 instead.
    avg_words_per_review = float(np.mean(word_counts)) if word_counts else 0.0

    # Count star ratings distribution (np.unique returns the ratings sorted).
    star_ratings = [int(review['stars']) for review in data]
    unique, counts = np.unique(star_ratings, return_counts=True)

    # Convert star distribution to percentages using native Python numbers.
    star_distribution_percentage = {
        int(star): (int(count) / num_examples) * 100
        for star, count in zip(unique, counts)
    }

    return num_examples, avg_words_per_review, star_distribution_percentage

# Load each dataset and calculate statistics
# Maps a display name to the JSON file name that load_data resolves against
# its base directory.
datasets = {
    "Training": "training.json",
    "Validation": "validation.json",
    "Test": "test.json"
}

# NOTE(review): the saved output of this cell shows a balanced training split
# but validation containing only 1-3 star reviews and test only 3-5 star
# reviews; confirm the splits are intended before comparing accuracies.
for dataset_name, file_path in datasets.items():
    # `data` is rebound on every iteration, so after the loop it holds the
    # last dataset only.
    data = load_data(file_path)
    num_examples, avg_words, star_distribution = calculate_statistics(data)

    # Print out the statistics
    print(f"{dataset_name} Set:")
    print(f"  Number of Examples: {num_examples}")
    print(f"  Average Words per Review: {avg_words:.2f}")
    print(f"  Star Ratings Distribution: {star_distribution}")
    print("-" * 40)

Training Set:
  Number of Examples: 16000
  Average Words per Review: 124.69
  Star Ratings Distribution: {1: 20.0, 2: 20.0, 3: 20.0, 4: 20.0, 5: 20.0}
----------------------------------------
Validation Set:
  Number of Examples: 800
  Average Words per Review: 140.37
  Star Ratings Distribution: {1: 40.0, 2: 40.0, 3: 20.0}
----------------------------------------
Test Set:
  Number of Examples: 800
  Average Words per Review: 109.77
  Star Ratings Distribution: {3: 20.0, 4: 40.0, 5: 40.0}
----------------------------------------
