<a href="https://colab.research.google.com/github/RanxduG/Celestial-Flames/blob/main/Question1answer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Libraries

In [None]:
# models.py
import numpy as np
import collections
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils

## MODELS FOR PART 1

In [None]:
class ConsonantVowelClassifier(object):
    """Base interface for the consonant/vowel classifiers in this file.

    Concrete classifiers override :meth:`predict`.
    """

    def predict(self, context):
        """Classify *context*; the base implementation always raises.

        :param context: the input a concrete classifier bases its decision on
        :raises Exception: always, because only subclasses implement this
        """
        raise Exception("Only implemented in subclasses")

In [None]:
class FrequencyBasedClassifier(ConsonantVowelClassifier):
    """Classifier that decides from the final character of the context only.

    It compares how often that character was counted in the consonant table
    versus the vowel table.
    """

    def __init__(self, consonant_counts, vowel_counts):
        """Store the two lookup tables, each indexed by a single character.

        :param consonant_counts: per-character counts for the consonant class
        :param vowel_counts: per-character counts for the vowel class
        """
        self.consonant_counts = consonant_counts
        self.vowel_counts = vowel_counts

    def predict(self, context):
        """Return 0 when the consonant count is strictly larger, otherwise 1.

        :param context: non-empty sequence; only ``context[-1]`` is inspected
        :return: 0 or 1 (ties go to 1)
        """
        last_char = context[-1]
        consonant_wins = self.consonant_counts[last_char] > self.vowel_counts[last_char]
        return 0 if consonant_wins else 1

In [None]:
class RNNClassifier(nn.Module):
    """Bidirectional-LSTM classifier that maps a sequence of indices to 2 logits."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate):
        """Build the embedding, bidirectional LSTM, dropout and output layers.

        :param vocab_size: number of rows in the embedding table
        :param embedding_dim: size of each embedding vector
        :param hidden_dim: LSTM hidden size per direction
        :param dropout_rate: dropout probability applied before the output layer
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim * 2, 2)  # *2: forward and backward states are concatenated

    def forward(self, x):
        """Return logits of shape [batch, 2] for a [batch, seq_len] index tensor.

        :param x: LongTensor of indices, batch first
        :return: unnormalised scores for the two classes
        """
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)
        # Use the final hidden state of each direction. Slicing the per-step
        # output at the last time step would give a backward state that has
        # only read the final token, discarding the rest of the sequence.
        sequence_summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(self.dropout(sequence_summary))

    def predict(self, x):
        """Return the arg-max class index for each sequence in *x*.

        Switches the module to evaluation mode (it is not switched back) and
        runs without gradient tracking.

        :param x: LongTensor of indices, batch first
        :return: LongTensor of shape [batch] with predicted class ids
        """
        self.eval()
        with torch.no_grad():
            return self.forward(x).argmax(dim=1)


In [None]:
def train_frequency_based_classifier(cons_exs, vowel_exs):
    """Build a FrequencyBasedClassifier from last-character counts.

    :param cons_exs: iterable of non-empty examples for the consonant class
    :param vowel_exs: iterable of non-empty examples for the vowel class
    :return: a FrequencyBasedClassifier holding one Counter per class, each
        keyed by the final element of the examples
    """
    consonant_counts = collections.Counter(example[-1] for example in cons_exs)
    vowel_counts = collections.Counter(example[-1] for example in vowel_exs)
    return FrequencyBasedClassifier(consonant_counts, vowel_counts)

In [None]:
def train_rnn_classifier(args, train_cons_exs, train_vowel_exs, dev_cons_exs, dev_vowel_exs, vocab_index):
    """Train an RNNClassifier on consonant/vowel examples and report dev accuracy.

    Consonant examples are labelled 0 and vowel examples 1. The per-epoch
    summed loss, every dev prediction and the dev accuracy are printed.

    :param args: not used by this function
    :param train_cons_exs: list of training strings labelled 0
    :param train_vowel_exs: list of training strings labelled 1
    :param dev_cons_exs: list of development strings labelled 0
    :param dev_vowel_exs: list of development strings labelled 1
    :param vocab_index: object supporting ``vocab_index[char]`` and ``len()``
    :return: the trained RNNClassifier
    :raises ValueError: if no training examples are given
    """
    # Hyperparameters (the notes are carried over from the original author)
    embedding_dim = 50  # do not change
    hidden_dim = 128  # larger values increase execution time
    vocab_size = len(vocab_index)
    dropout_rate = 0.7  # do not increase
    batch_size = 128  # larger values increase execution time
    num_epochs = 50  # do not change

    def encode(examples):
        """Convert strings to one padded LongTensor [num_examples, max_len]."""
        # NOTE(review): pad_sequence pads with 0, which is also a real
        # vocabulary index; harmless only if all examples have equal length.
        return rnn_utils.pad_sequence(
            [torch.tensor([vocab_index[c] for c in ex], dtype=torch.long) for ex in examples],
            batch_first=True,
        )

    train_data = train_cons_exs + train_vowel_exs
    if not train_data:
        raise ValueError("train_rnn_classifier needs at least one training example")
    labels = [0] * len(train_cons_exs) + [1] * len(train_vowel_exs)

    model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, dropout_rate)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.003)

    train_indices_tensor = encode(train_data)
    train_labels_tensor = torch.tensor(labels, dtype=torch.long)
    num_examples = train_indices_tensor.size(0)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        # The data is laid out as all consonant examples followed by all vowel
        # examples, so without shuffling nearly every mini-batch would contain
        # a single class. Draw a fresh random order each epoch.
        order = torch.randperm(num_examples)
        for start in range(0, num_examples, batch_size):
            batch_ids = order[start:start + batch_size]
            batch_x = train_indices_tensor[batch_ids]
            batch_y = train_labels_tensor[batch_ids]

            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')

    # Evaluation on the development set
    model.eval()
    dev_data = dev_cons_exs + dev_vowel_exs
    dev_labels = [0] * len(dev_cons_exs) + [1] * len(dev_vowel_exs)
    if not dev_data:
        # Nothing to score; avoid padding an empty list and dividing by zero.
        print('No development examples provided; skipping evaluation.')
        return model

    with torch.no_grad():
        predicted_labels = model.predict(encode(dev_data))

    correct_predictions = (predicted_labels == torch.tensor(dev_labels)).sum().item()

    for example, prediction, actual in zip(dev_data, predicted_labels.tolist(), dev_labels):
        result = "Correct" if prediction == actual else "Incorrect"
        print(f'Example: {example}, Predicted: {prediction}, Actual: {actual}, Result: {result}')

    accuracy = correct_predictions / len(dev_labels)
    print(f'Accuracy on Development Set: {accuracy:.4f}')

    return model


## DATA LOADING AND TRAINING

In [None]:
def load_examples(file_path):
    """Read one example per line from *file_path*.

    Only the line terminator is removed. Leading and trailing spaces are kept
    because space is a vocabulary character and the classifiers look at the
    last character of each example; a full ``strip()`` would silently change
    both the examples and their lengths.

    :param file_path: path of the text file to read
    :return: list of lines without their trailing newline characters
    """
    with open(file_path, 'r') as file:
        return [line.rstrip('\r\n') for line in file]

def create_vocab_index(examples):
    """Map every distinct character found in *examples* to an integer id.

    Ids are assigned in sorted character order, so the same set of characters
    always produces the same mapping.

    :param examples: iterable of strings
    :return: dict from character to its 0-based id
    """
    ordered_chars = sorted({ch for example in examples for ch in example})
    return dict(zip(ordered_chars, range(len(ordered_chars))))

if __name__ == "__main__":
    # Read the four example files: train consonant/vowel, then dev consonant/vowel.
    train_cons_exs, train_vowel_exs, dev_cons_exs, dev_vowel_exs = [
        load_examples(example_file)
        for example_file in (
            'train-consonant-examples.txt',
            'train-vowel-examples.txt',
            'dev-consonant-examples.txt',
            'dev-vowel-examples.txt',
        )
    ]

    # The vocabulary covers every character seen in the training or dev data.
    vocab_index = create_vocab_index(
        train_cons_exs + train_vowel_exs + dev_cons_exs + dev_vowel_exs
    )

    # Fit the RNN classifier; args is passed as None.
    model = train_rnn_classifier(
        None, train_cons_exs, train_vowel_exs, dev_cons_exs, dev_vowel_exs, vocab_index
    )

Epoch 1/50, Loss: 98.8607
Epoch 2/50, Loss: 107.0596
Epoch 3/50, Loss: 64.1336
Epoch 4/50, Loss: 51.5022
Epoch 5/50, Loss: 49.4648
Epoch 6/50, Loss: 49.5472
Epoch 7/50, Loss: 48.9637
Epoch 8/50, Loss: 48.9198
Epoch 9/50, Loss: 48.7752
Epoch 10/50, Loss: 48.4145
Epoch 11/50, Loss: 48.1088
Epoch 12/50, Loss: 47.8295
Epoch 13/50, Loss: 47.3403
Epoch 14/50, Loss: 46.8559
Epoch 15/50, Loss: 46.4586
Epoch 16/50, Loss: 45.7394
Epoch 17/50, Loss: 45.1634
Epoch 18/50, Loss: 44.4679
Epoch 19/50, Loss: 44.0345
Epoch 20/50, Loss: 43.1315
Epoch 21/50, Loss: 42.4870
Epoch 22/50, Loss: 43.0631
Epoch 23/50, Loss: 41.9525
Epoch 24/50, Loss: 41.2354
Epoch 25/50, Loss: 41.2580
Epoch 26/50, Loss: 40.6848
Epoch 27/50, Loss: 39.9812
Epoch 28/50, Loss: 38.9078
Epoch 29/50, Loss: 38.4998
Epoch 30/50, Loss: 37.9506
Epoch 31/50, Loss: 37.4029
Epoch 32/50, Loss: 36.8168
Epoch 33/50, Loss: 36.2430
Epoch 34/50, Loss: 35.5242
Epoch 35/50, Loss: 34.8855
Epoch 36/50, Loss: 34.4566
Epoch 37/50, Loss: 33.3000
Epoch 38/

'class LanguageModel(object):\n    def __init__(self, model_emb, model_dec, vocab_index):\n        self.model_emb = model_emb\n        self.model_dec = model_dec\n        self.vocab_index = vocab_index\n        self.hidden_size = model_dec.hidden_size\n        self.output_layer = nn.Linear(self.hidden_size, len(vocab_index))  # Linear layer to map to vocab size\n\n\nclass RNNLanguageModel(nn.Module):\n    def __init__(self, embedding_layer, rnn_layer, vocab_size):\n        super(RNNLanguageModel, self).__init__()\n        self.model_emb = embedding_layer  # Embedding layer\n        self.model_dec = rnn_layer  # GRU/LSTM or any RNN layer\n        self.fc = nn.Linear(rnn_layer.hidden_size, vocab_size)  # Linear layer to project to vocab size\n        self.dropout = nn.Dropout(0.5)  # Dropout to prevent overfitting\n\n    def forward(self, inputs):\n        embeddings = self.model_emb(inputs)  # Convert inputs to embeddings\n        embeddings = self.dropout(embeddings)\n        rnn_outpu

## MODELS FOR PART 2

In [None]:
'''class LanguageModel(object):
    def __init__(self, model_emb, model_dec, vocab_index):
        self.model_emb = model_emb
        self.model_dec = model_dec
        self.vocab_index = vocab_index
        self.hidden_size = model_dec.hidden_size
        self.output_layer = nn.Linear(self.hidden_size, len(vocab_index))  # Linear layer to map to vocab size


class RNNLanguageModel(nn.Module):
    def __init__(self, embedding_layer, rnn_layer, vocab_size):
        super(RNNLanguageModel, self).__init__()
        self.model_emb = embedding_layer  # Embedding layer
        self.model_dec = rnn_layer  # GRU/LSTM or any RNN layer
        self.fc = nn.Linear(rnn_layer.hidden_size, vocab_size)  # Linear layer to project to vocab size
        self.dropout = nn.Dropout(0.5)  # Dropout to prevent overfitting

    def forward(self, inputs):
        embeddings = self.model_emb(inputs)  # Convert inputs to embeddings
        embeddings = self.dropout(embeddings)
        rnn_output, hidden = self.model_dec(embeddings)  # Get RNN outputs
        rnn_output = self.dropout(rnn_output)
        logits = self.fc(rnn_output)  # Project RNN outputs to vocab size
        return logits, hidden


def train_lm(args, train_text, dev_text, vocab_index):
    # Define hyperparameters
    embedding_dim = 128  # Increased embedding size
    hidden_size = 128  # Increased hidden size
    num_layers = 3  # More layers for more complex representation
    learning_rate = 0.0005  # Adjust learning rate
    num_epochs = 20  # Increase number of epochs
    batch_size = 32
    vocab_size = len(vocab_index)

    # Instantiate the RNN model
    rnn_model = RNNLanguageModel(
        nn.Embedding(vocab_size, embedding_dim),
        nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=0.3),
        vocab_size  # Pass vocab_size to the linear layer
    )

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

    # Early stopping parameters
    best_dev_accuracy = 0
    early_stop_patience = 5
    patience_counter = 0

    # Training loop
    for epoch in range(num_epochs):
        rnn_model.train()  # Set model to training mode
        total_loss = 0
        correct_predictions = 0
        total_predictions = 0

        for i in range(0, len(train_text) - batch_size, batch_size):
            inputs = torch.tensor([[vocab_index[char] for char in seq] for seq in train_text[i:i + batch_size]])
            targets = torch.tensor([[vocab_index[char] for char in seq[1:]] + [vocab_index[train_text[i][-1]]] for seq in train_text[i:i + batch_size]])

            # Forward pass
            optimizer.zero_grad()
            output, _ = rnn_model(inputs)

            # Reshape output and targets for loss function
            batch_size, seq_len, vocab_output_size = output.shape
            output = output.view(batch_size * seq_len, vocab_output_size)  # Reshape to (batch_size * seq_len, vocab_size)
            targets = targets.view(-1)  # Flatten targets

            # Compute loss
            loss = criterion(output, targets)
            total_loss += loss.item()

            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), max_norm=1.0)  # Clip gradients
            optimizer.step()

            # Accuracy calculation
            _, predicted = torch.max(output, 1)
            correct_predictions += (predicted == targets).sum().item()
            total_predictions += targets.size(0)

        # Validate on the dev set
        rnn_model.eval()  # Set model to evaluation mode
        dev_correct = 0
        dev_total = 0

        with torch.no_grad():
            for i in range(0, len(dev_text) - batch_size, batch_size):
                inputs = torch.tensor([[vocab_index[char] for char in seq] for seq in dev_text[i:i + batch_size]])
                targets = torch.tensor([[vocab_index[char] for char in seq[1:]] + [vocab_index[dev_text[i][-1]]] for seq in dev_text[i:i + batch_size]])

                output, _ = rnn_model(inputs)
                output = output.view(batch_size * seq_len, vocab_output_size)
                targets = targets.view(-1)

                _, predicted = torch.max(output, 1)
                dev_correct += (predicted == targets).sum().item()
                dev_total += targets.size(0)

        dev_accuracy = (dev_correct / dev_total) * 100
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}, Training Accuracy: {(correct_predictions / total_predictions) * 100:.2f}%, Dev Accuracy: {dev_accuracy:.2f}%")

        # Learning rate scheduler step
        scheduler.step(total_loss)

        # Early stopping
        if dev_accuracy > best_dev_accuracy:
            best_dev_accuracy = dev_accuracy
            patience_counter = 0  # Reset patience if accuracy improves
        else:
            patience_counter += 1  # Increment if no improvement

        if patience_counter >= early_stop_patience:
            print("Early stopping due to no improvement.")
            break

    return rnn_model


def load_examples(file_path):
    with open(file_path, 'r') as file:
        return [line.strip() for line in file.readlines()]


if __name__ == "__main__":
    # Load training and development examples
    train_cons_exs = load_examples('train-consonant-examples.txt')
    train_vowel_exs = load_examples('train-vowel-examples.txt')
    dev_cons_exs = load_examples('dev-consonant-examples.txt')
    dev_vowel_exs = load_examples('dev-vowel-examples.txt')

    # Combine all the training and development data
    train_text = ''.join(train_cons_exs + train_vowel_exs)
    dev_text = ''.join(dev_cons_exs + dev_vowel_exs)

    # Create vocabulary index directly from the text
    vocab_set = set(train_text + dev_text)
    vocab_index = {char: idx for idx, char in enumerate(sorted(vocab_set))}

    # Train the language model
    model = train_lm(None, train_text, dev_text, vocab_index)

    print("Training complete. RNN language model is ready.")'''


### lm.py code

In [None]:
# lm.py

import argparse
import time
from models import *
from utils import *

####################################################
# DO NOT MODIFY THIS FILE IN YOUR FINAL SUBMISSION #
####################################################


def _parse_args():
    """
    Command-line arguments to the system. --model switches between the main modes you'll need to use.
    The other arguments are provided for convenience.
    """
    parser = argparse.ArgumentParser(description='lm.py')
    parser.add_argument('--model', type=str, default='UNIFORM', help='model to run (UNIFORM or RNN)')
    parser.add_argument('--train_path', type=str, default='text8-100k.txt', help='path to train set')
    parser.add_argument('--dev_path', type=str, default='text8-dev.txt', help='path to dev set')

    # Add this line to ignore unknown arguments from Jupyter/Colab
    args, unknown = parser.parse_known_args()  # This will allow extra arguments (like -f) to be ignored
    return args



def read_text(file):
    """
    Reads a text file, strips surrounding whitespace from every line and
    concatenates the lines. Prints the number of characters read.
    :param file: path of the file to read
    :return: The text in the given file as a single string
    """
    # The context manager closes the handle; join avoids repeated string copies.
    with open(file) as handle:
        all_text = "".join(line.strip() for line in handle)
    print("%i chars read in" % len(all_text))
    return all_text


def print_evaluation(text, lm):
    """
    Scores the given text with the language model and prints three metrics: the log probability of the text
    (treating the text as one long sequence), the average log probability (that value divided by the text length),
    and the perplexity (the averaged "branching factor" of the model).
    The model is queried with a single space as the initial context.
    :param text: the text to evaluate
    :param lm: model to evaluate; must provide get_log_prob_sequence(next_chars, context)
    """
    log_prob = lm.get_log_prob_sequence(text, " ")
    avg_log_prob = log_prob / len(text)
    print("Log prob of text %f" % log_prob)
    print("Avg log prob: %f" % avg_log_prob)
    print("Perplexity: %f" % np.exp(-avg_log_prob))


if __name__ == '__main__':
    # NOTE(review): start_time is recorded but never read in the visible code.
    start_time = time.time()
    args = _parse_args()
    print(args)

    train_text = read_text(args.train_path)
    dev_text = read_text(args.dev_path)

    # The vocabulary is the lowercase letters a to z plus space
    vocab = [chr(ord('a') + i) for i in range(0, 26)] + [' ']
    vocab_index = Indexer()
    for char in vocab:
        vocab_index.add_and_get_index(char)
    print(repr(vocab_index))

    print("First 100 characters of train:")
    print(train_text[0:100])
    system_to_run = args.model
    # Train (or construct) the requested model
    if system_to_run == "RNN":
        model = train_lm(args, train_text, dev_text, vocab_index)
    elif system_to_run == "UNIFORM":
        model = UniformLanguageModel(len(vocab))
    else:
        # The message names the values this branch chain actually accepts.
        raise Exception("Pass in either UNIFORM or RNN to run the appropriate system")

    print_evaluation(dev_text, model)

Namespace(model='UNIFORM', train_path='text8-100k.txt', dev_path='text8-dev.txt')
99999 chars read in
499 chars read in
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']
First 100 characters of train:
anarchism originated as a term of abuse first used against early working class radicals including th
Log prob of text -1644.622596
Avg log prob: -3.295837
Perplexity: 27.000000


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# RNN Language Model Definition
class RNNLanguageModel(nn.Module):
    """Embedding -> GRU -> linear network that emits one score per vocabulary entry."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """Create the three layers.

        :param vocab_size: number of embedding rows and of output scores
        :param embed_size: width of each embedding vector
        :param hidden_size: width of the GRU hidden state
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden_state):
        """Run the network over a batch-first tensor of indices.

        :param x: index tensor fed to the embedding layer
        :param hidden_state: initial GRU state, or None
        :return: tuple (scores for every position, final GRU state)
        """
        rnn_features, next_hidden = self.rnn(self.embedding(x), hidden_state)
        return self.fc(rnn_features), next_hidden

# Function to chunk data for training/evaluation
def chunk_data(text, chunk_size, vocab_index):
    """Cut *text* into fixed-size chunks paired with next-character targets.

    Chunk starts step by ``chunk_size`` and stop before ``len(text) - chunk_size``,
    so every target chunk (the input shifted by one character) has full length.

    :param text: string to split
    :param chunk_size: number of characters per chunk
    :param vocab_index: object whose ``index_of(char)`` returns the character's id
    :return: tuple (input tensor, target tensor), one row per chunk
    """
    def encode(piece):
        return [vocab_index.index_of(ch) for ch in piece]

    encoded_pairs = [
        (encode(text[start:start + chunk_size]), encode(text[start + 1:start + chunk_size + 1]))
        for start in range(0, len(text) - chunk_size, chunk_size)
    ]
    input_data = [inputs for inputs, _ in encoded_pairs]
    target_data = [targets for _, targets in encoded_pairs]
    return torch.tensor(input_data), torch.tensor(target_data)

# Training loop
def train_lm(args, train_text, dev_text, vocab_index):
    """Train the GRU language model on *train_text*, evaluating after each epoch.

    The text is cut into fixed-size chunks that are processed in order, one
    chunk per optimisation step, with the hidden state carried from chunk to
    chunk (detached so gradients do not flow across chunk boundaries).

    :param args: not used by this function
    :param train_text: training string
    :param dev_text: development string, passed to ``evaluate`` every epoch
    :param vocab_index: vocabulary object accepted by ``chunk_data``; must support ``len()``
    :return: the trained RNNLanguageModel
    """
    # Hyperparameters
    embed_size = 128
    hidden_size = 256
    chunk_size = 100
    epochs = 10

    # Model, loss, optimizer
    vocab_size = len(vocab_index)
    model = RNNLanguageModel(vocab_size, embed_size, hidden_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Process data into chunks
    train_input, train_target = chunk_data(train_text, chunk_size, vocab_index)

    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so switch back to training
        # mode at the start of every epoch.
        model.train()
        hidden_state = None  # the GRU starts each epoch from its default state

        for batch_idx in range(len(train_input)):
            inputs = train_input[batch_idx].unsqueeze(0)  # add batch dimension
            targets = train_target[batch_idx].unsqueeze(0)

            optimizer.zero_grad()
            output, hidden_state = model(inputs, hidden_state)

            # Detach so the next chunk does not backpropagate into this one.
            hidden_state = hidden_state.detach()

            loss = criterion(output.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}')

        # Evaluate on dev set after each epoch
        print("Evaluating on dev set...")
        evaluate(dev_text, model, vocab_index, chunk_size)

    return model

# Evaluation function to calculate log prob, perplexity, and accuracy
def evaluate(dev_text, model, vocab_index, chunk_size):
    """Print log probability, average log probability, perplexity and accuracy.

    The dev text is chunked with ``chunk_data``; characters that do not fit
    into a full chunk are not scored. The model is put into evaluation mode
    and left there.

    :param dev_text: string to score
    :param model: callable as ``model(inputs, hidden_state)`` returning (scores, hidden_state)
    :param vocab_index: vocabulary object accepted by ``chunk_data``; must support ``len()``
    :param chunk_size: number of characters per chunk
    :return: None
    """
    import math  # local import: this cell's import block does not provide it

    model.eval()
    dev_input, dev_target = chunk_data(dev_text, chunk_size, vocab_index)
    vocab_size = len(vocab_index)

    total_log_prob = 0.0
    total_count = 0
    total_correct = 0
    criterion = nn.CrossEntropyLoss(reduction='sum')  # summed negative log-likelihood

    with torch.no_grad():
        hidden_state = None
        for chunk_inputs, chunk_targets in zip(dev_input, dev_target):
            inputs = chunk_inputs.unsqueeze(0)
            targets = chunk_targets.unsqueeze(0)
            output, hidden_state = model(inputs, hidden_state)

            total_log_prob -= criterion(output.view(-1, vocab_size), targets.view(-1)).item()
            total_count += targets.numel()

            # Accuracy: arg-max score versus the true next character
            predictions = output.argmax(dim=-1)
            total_correct += (predictions == targets).sum().item()

    if total_count == 0:
        # The text is too short to form a single chunk; avoid dividing by zero.
        print("No complete chunks in dev text; nothing to evaluate.")
        return

    avg_log_prob = total_log_prob / total_count
    # Perplexity is exp of the NEGATIVE average log probability over all scored
    # characters, so it is always >= 1; exp(+avg) averaged per chunk is not a perplexity.
    perplexity = math.exp(-avg_log_prob)
    accuracy = total_correct / total_count * 100  # percentage

    # Structured output
    print(f'Log prob of text: {total_log_prob:.6f}')
    print(f'Avg log prob: {avg_log_prob:.6f}')
    print(f'Perplexity: {perplexity:.6f}')
    print(f'Accuracy: {accuracy:.2f}%')

# Main part to load data and run the model
def main():
    """Load the text8 train/dev files, build the vocabulary helper and train the model."""
    # Placeholder paths
    train_path = 'text8-100k.txt'
    dev_path = 'text8-dev.txt'

    def load_text(path):
        """Return the file's lines stripped, concatenated and lower-cased."""
        # Line terminators are not in the vocabulary below, so they must not
        # reach chunk_data; stripping per line matches read_text in lm.py.
        with open(path, 'r') as f:
            return ''.join(line.strip() for line in f).lower()

    train_text = load_text(train_path)
    dev_text = load_text(dev_path)

    # Lowercase letters a to z plus space
    vocab = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ']

    class VocabIndex:
        """Minimal list-backed vocabulary offering index_of, char_of and len()."""

        def __init__(self, vocab):
            self.vocab = vocab

        def index_of(self, char):
            # Raises ValueError for characters that are not in the vocabulary.
            return self.vocab.index(char)

        def char_of(self, index):
            return self.vocab[index]

        def __len__(self):
            return len(self.vocab)  # lets len(vocab_index) work

    vocab_index = VocabIndex(vocab)

    # Run the training function
    args = {'model': 'RNN', 'train_path': train_path, 'dev_path': dev_path}
    model = train_lm(args, train_text, dev_text, vocab_index)

if __name__ == "__main__":
    main()


Epoch 0, Batch 0, Loss: 3.2877674102783203
Epoch 0, Batch 100, Loss: 2.031649112701416
Epoch 0, Batch 200, Loss: 1.8407940864562988
Epoch 0, Batch 300, Loss: 2.266396999359131
Epoch 0, Batch 400, Loss: 1.6026535034179688
Epoch 0, Batch 500, Loss: 2.265138864517212
Epoch 0, Batch 600, Loss: 1.7970640659332275
Epoch 0, Batch 700, Loss: 2.388536214828491
Epoch 0, Batch 800, Loss: 2.1008963584899902
Epoch 0, Batch 900, Loss: 2.5031557083129883
Evaluating on dev set...
Log prob of text: -970.840714
Avg log prob: -2.427102
Perplexity: 0.091863
Accuracy: 30.00%
Epoch 1, Batch 0, Loss: 2.597507953643799
Epoch 1, Batch 100, Loss: 2.3390021324157715
Epoch 1, Batch 200, Loss: 1.8133666515350342
Epoch 1, Batch 300, Loss: 2.378232479095459
Epoch 1, Batch 400, Loss: 1.6543529033660889
Epoch 1, Batch 500, Loss: 2.191693067550659
Epoch 1, Batch 600, Loss: 2.011711597442627
Epoch 1, Batch 700, Loss: 2.402981758117676
Epoch 1, Batch 800, Loss: 2.2110376358032227
Epoch 1, Batch 900, Loss: 2.2995042800903

KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import argparse


#####################
# MODELS FOR PART 2 #
#####################

class LanguageModel(object):
    """Interface for character-level language models; both methods must be overridden."""

    def get_log_prob_single(self, next_char, context):
        """
        Score a single character given what precedes it, i.e. return
        log P(next_char | context) using the natural logarithm.
        :param next_char: the character to score
        :param context: the text that precedes next_char
        :return: the log probability
        """
        raise Exception("Only implemented in subclasses")

    def get_log_prob_sequence(self, next_chars, context):
        """
        Score several characters that follow the context, i.e. return
        log P(nc1, nc2, nc3, ... | context) = log P(nc1 | context) + log P(nc2 | context, nc1) + ...
        using the natural logarithm.
        :param next_chars: the characters to score, in order
        :param context: the text that precedes next_chars
        :return: the summed log probability
        """
        raise Exception("Only implemented in subclasses")


class UniformLanguageModel(LanguageModel):
    """Language model that gives every character the probability 1 / voc_size."""

    def __init__(self, voc_size):
        """
        :param voc_size: number of characters in the vocabulary
        """
        self.voc_size = voc_size

    def get_log_prob_single(self, next_char, context):
        """Return log(1 / voc_size); both arguments are ignored."""
        uniform_prob = 1.0 / self.voc_size
        return np.log(uniform_prob)

    def get_log_prob_sequence(self, next_chars, context):
        """Return log(1 / voc_size) multiplied by the number of characters scored."""
        per_char_log_prob = np.log(1.0 / self.voc_size)
        return per_char_log_prob * len(next_chars)


class RNNLanguageModel(LanguageModel):
    """LanguageModel adapter around a trained network.

    ``model_emb`` is called as ``model_emb(indices, None)`` and must return
    ``(scores, hidden_state)`` with one score vector per input position.
    ``model_dec`` is applied to those scores when it is compatible with them.
    """

    def __init__(self, model_emb, model_dec, vocab_index):
        """
        :param model_emb: network mapping a [1, seq_len] index tensor to (scores, hidden_state)
        :param model_dec: optional module applied to the scores, or None
        :param vocab_index: character-to-id lookup (dict, list, or object with index_of)
        """
        self.model_emb = model_emb
        self.model_dec = model_dec
        self.vocab_index = vocab_index

    def _index_of(self, char):
        """Look up a character id in whichever vocabulary container was supplied."""
        vocab = self.vocab_index
        # Indexer-style objects expose index_of, lists expose index; a plain
        # dict (as built elsewhere in this file) has neither and is subscripted.
        for lookup_name in ("index_of", "index"):
            lookup = getattr(vocab, lookup_name, None)
            if lookup is not None:
                return lookup(char)
        return vocab[char]

    def _log_probs(self, chars):
        """Return a [len(chars), vocab] tensor of next-character log probabilities."""
        indices = torch.tensor([[self._index_of(c) for c in chars]], dtype=torch.long)
        with torch.no_grad():
            scores, _ = self.model_emb(indices, None)
            decoder = self.model_dec
            if decoder is not None:
                # A Linear decoder whose input width does not match the scores
                # cannot be applied; in that case model_emb is taken to emit
                # vocabulary logits already (as RNNLanguageModelImpl does).
                # NOTE(review): ambiguous if hidden size equals vocabulary size.
                in_features = getattr(decoder, "in_features", None)
                if in_features is None or in_features == scores.size(-1):
                    scores = decoder(scores)
            return torch.log_softmax(scores, dim=-1)[0]

    def get_log_prob_single(self, next_char, context):
        """
        Return log P(next_char | context), natural log.
        :param next_char: the character to score
        :param context: non-empty string preceding next_char
        :return: the log probability as a float
        """
        return self._log_probs(context)[-1, self._index_of(next_char)].item()

    def get_log_prob_sequence(self, next_chars, context):
        """
        Return the summed log probability of next_chars following context.

        The network is run once over context + next_chars instead of once per
        scored character; this assumes the score at a position depends only on
        the characters up to that position (true for a unidirectional RNN).
        :param next_chars: string of characters to score, in order
        :param context: non-empty string preceding next_chars
        :return: the summed log probability
        :raises ValueError: if context is empty
        """
        if len(next_chars) == 0:
            return 0
        if len(context) == 0:
            raise ValueError("context must contain at least one character")
        full_text = context + next_chars
        log_probs = self._log_probs(full_text[:-1])
        # Row len(context) - 1 holds the prediction for next_chars[0].
        first_row = len(context) - 1
        return sum(
            log_probs[first_row + offset, self._index_of(char)].item()
            for offset, char in enumerate(next_chars)
        )


class RNNLanguageModelImpl(nn.Module):
    """Embedding -> GRU -> linear network that emits one score per vocabulary entry."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """
        :param vocab_size: number of embedding rows and of output scores
        :param embed_size: width of each embedding vector
        :param hidden_size: width of the GRU hidden state
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)  # projects GRU features to vocabulary scores

    def forward(self, x, hidden_state=None):
        """Run the network over a batch-first tensor of indices.

        :param x: index tensor fed to the embedding layer
        :param hidden_state: initial GRU state; None (the default) lets the GRU
            start from its default state, so callers may omit it
        :return: tuple (scores for every position, final GRU state)
        """
        embedded = self.embedding(x)
        out, hidden_state = self.rnn(embedded, hidden_state)
        return self.fc(out), hidden_state


def chunk_data(text, chunk_size, vocab_index):
    """Cut *text* into fixed-size chunks paired with next-character targets.

    Chunk starts step by ``chunk_size`` and stop before ``len(text) - chunk_size``,
    so every target chunk (the input shifted by one character) has full length.

    :param text: string to split
    :param chunk_size: number of characters per chunk
    :param vocab_index: mapping subscripted as ``vocab_index[char]`` to get an id
    :return: tuple (input tensor, target tensor), one row per chunk
    """
    def encode(piece):
        return [vocab_index[ch] for ch in piece]

    encoded_pairs = [
        (encode(text[start:start + chunk_size]), encode(text[start + 1:start + chunk_size + 1]))
        for start in range(0, len(text) - chunk_size, chunk_size)
    ]
    input_data = [inputs for inputs, _ in encoded_pairs]
    target_data = [targets for _, targets in encoded_pairs]
    return torch.tensor(input_data), torch.tensor(target_data)


def train_lm(args, train_text, dev_text, vocab_index):
    """Build the model selected by ``args.model`` and, for RNN, train it.

    For UNIFORM there is nothing to train: the model is constructed, the dev
    set is evaluated once with the uniform scorer, and the model is returned.
    For RNN the text is chunked and trained on one chunk per step, with an
    evaluation after every epoch.

    :param args: object with a ``model`` attribute equal to 'RNN' or 'UNIFORM'
    :param train_text: training string
    :param dev_text: development string
    :param vocab_index: mapping accepted by ``chunk_data``; must support ``len()``
    :return: an RNNLanguageModel or UniformLanguageModel
    :raises ValueError: if ``args.model`` is neither 'RNN' nor 'UNIFORM'
    """
    # Hyperparameters
    embed_size = 128
    hidden_size = 256
    chunk_size = 100
    epochs = 10

    vocab_size = len(vocab_index)

    if args.model == 'UNIFORM':
        model = UniformLanguageModel(vocab_size)
        # Tell evaluate() explicitly which scorer to use: with no network and
        # the default model_type='RNN' it has no output to score.
        print("Evaluating on dev set...")
        evaluate(dev_text, None, vocab_index, chunk_size, model_type='UNIFORM')
        return model
    if args.model != 'RNN':
        raise ValueError("Unknown model type: " + args.model)

    model_emb = RNNLanguageModelImpl(vocab_size, embed_size, hidden_size)
    # RNNLanguageModelImpl already projects to vocabulary scores, so no further
    # (untrained) decoder layer is needed; Identity fills the wrapper's slot.
    model_dec = nn.Identity()
    model = RNNLanguageModel(model_emb, model_dec, vocab_index)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model_emb.parameters(), lr=0.001)

    # Process data into chunks
    train_input, train_target = chunk_data(train_text, chunk_size, vocab_index)

    for epoch in range(epochs):
        # evaluate() switches the network to eval mode; switch back each epoch.
        model_emb.train()
        hidden_state = None  # the GRU starts each epoch from its default state

        for batch_idx in range(len(train_input)):
            inputs = train_input[batch_idx].unsqueeze(0)  # add batch dimension
            targets = train_target[batch_idx].unsqueeze(0)

            optimizer.zero_grad()
            output, hidden_state = model_emb(inputs, hidden_state)

            # Detach so the next chunk does not backpropagate into this one.
            hidden_state = hidden_state.detach()

            loss = criterion(output.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}')

        # Evaluate on dev set after each epoch
        print("Evaluating on dev set...")
        evaluate(dev_text, model_emb, vocab_index, chunk_size, model_type='RNN')

    return model


def evaluate(dev_text, model_emb, vocab_index, chunk_size, model_type='RNN'):
    """Print the average cross-entropy loss and accuracy on the dev text.

    The dev text is chunked with ``chunk_data`` and each chunk is scored on its
    own (no hidden state is carried between chunks). When no network is given,
    uniform scores are used regardless of ``model_type``, since there is
    nothing else to score with.

    :param dev_text: string to score
    :param model_emb: network called as ``model_emb(inputs, None)``, or None
    :param vocab_index: mapping accepted by ``chunk_data``; must support ``len()``
    :param chunk_size: number of characters per chunk
    :param model_type: 'RNN' or 'UNIFORM'
    :return: None
    :raises ValueError: if ``model_type`` is neither 'RNN' nor 'UNIFORM'
    """
    if model_type not in ('RNN', 'UNIFORM'):
        raise ValueError("Unknown model type: " + str(model_type))

    use_network = model_type == 'RNN' and model_emb is not None
    if use_network:
        model_emb.eval()  # evaluation mode; the caller must re-enable training

    # Process dev text into input and target chunks
    dev_input, dev_target = chunk_data(dev_text, chunk_size, vocab_index)
    if len(dev_input) == 0:
        # The text is too short to form a single chunk; avoid dividing by zero.
        print("No complete chunks in dev text; nothing to evaluate.")
        return

    vocab_size = len(vocab_index)
    criterion = nn.CrossEntropyLoss()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_idx in range(len(dev_input)):
            inputs = dev_input[batch_idx].unsqueeze(0)  # add batch dimension
            targets = dev_target[batch_idx].unsqueeze(0)

            if use_network:
                output, _ = model_emb(inputs, None)
            else:
                # Equal score for every vocabulary entry at every position
                output = torch.full((inputs.size(0), inputs.size(1), vocab_size), 1.0 / vocab_size).to(inputs.device)

            loss = criterion(output.view(-1, vocab_size), targets.view(-1))
            total_loss += loss.item()

            # Accuracy: arg-max score versus the true next character
            _, predicted = output.max(2)
            correct += (predicted == targets).sum().item()
            total += targets.numel()

    avg_loss = total_loss / len(dev_input)
    accuracy = correct / total

    print(f"Evaluation loss: {avg_loss:.4f}")
    print(f"Evaluation accuracy: {accuracy:.4f}")


def main():
    """Parse the command line, load the train/dev text, build the vocabulary and train."""
    parser = argparse.ArgumentParser(description='Language Model Training and Evaluation')
    parser.add_argument('--model', type=str, default='UNIFORM', help='Model to run (UNIFORM or RNN)')
    parser.add_argument('--train_path', type=str, default='text8-100k.txt', help='Path to train set')
    parser.add_argument('--dev_path', type=str, default='text8-dev.txt', help='Path to dev set')

    # parse_known_args ignores arguments the parser does not define
    args, unknown = parser.parse_known_args()

    # Load data
    with open(args.train_path, 'r') as f:
        train_text = f.read()

    with open(args.dev_path, 'r') as f:
        dev_text = f.read()

    # Build the vocabulary from BOTH texts: a character that occurs only in
    # the dev text would otherwise raise KeyError when the dev set is chunked.
    vocab = sorted(set(train_text) | set(dev_text))
    vocab_index = {char: idx for idx, char in enumerate(vocab)}

    # Train model
    model = train_lm(args, train_text, dev_text, vocab_index)

    print("Training complete!")


if __name__ == '__main__':
    main()


Evaluating on dev set...


UnboundLocalError: local variable 'output' referenced before assignment