### Try COLAB

In [64]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
if IN_COLAB:
    !pip3 install torch matplotlib torchmetrics scikit-image segmentation-models-pytorch

# Import

In [65]:
import torch
from torch import nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence # for padding data

import pandas as pd # for making csv file

from sklearn.metrics import accuracy_score, f1_score

import wandb

import seaborn as sns
import matplotlib.pyplot as plt

import random


### CUDA

In [66]:
# GPU Support?
if torch.cuda.is_available():
    print("Using the GPU")
    device = torch.device('cuda')
else:
    print("Using the CPU")
    device = torch.device('cpu')

Using the GPU


# Generate Dataset

In [67]:
# Generate a dataset of sequences containing characters 'a', 'b', and 'c' in order
def gen_data(N=10000, min_len=3, max_len=20) -> list:
    """Generates a dataset of sequences containing characters 'a', 'b', and 'c' in order.
    Parameters: N: int, number of sequences to generate
                min_len: int, minimum length of a sequence
                max_len: int, maximum length of a sequence
    Returns: list of strings, each string is a sequence of characters 'a', 'b', and 'c' in order"""
    dataset = []
    max_member = max_len // 3 # three times this number is less than max_len
    min_member = min_len // 3 # three times this number is less than min_len
    if min_len < 3:
        min_len = 3
        print("Minimum length must be at least 3. Setting min_len to 3.")
    if max_len > 1000:
        max_len = 1000
        print("Maximum length must be at most 1000. Setting max_len to 1000.")

    for n in range(N):
        if n % 4 == 0: # 25% of the time add actual member of formal language family
            length = random.randint(min_member, max_member)
            sequence = 'a' * length + 'b' * length + 'c' * length
            dataset.append(sequence)
        else:
            length = random.randint(min_len-3, max_len-3)  # Random sequence length between 3 and (20-3 = 17). 3 is the smallest possible length for a sequence to be in the language
            counts = [1, 1, 1] # Initialize counts for 'a', 'b', and 'c'. Will have at least one of each letter.

            # Distribute the length among a, b, and c
            for i in range(length):
                counts[random.randint(0, 2)] += 1

            # Ensure alphabetical order and create the sequence
            sequence = 'a' * counts[0] + 'b' * counts[1] + 'c' * counts[2]
            dataset.append(sequence)
    return dataset

### Functions for labels and encoding

In [68]:
# Get labels
def get_labels(data):
    y = torch.zeros(len(data))
    for i, sequence in enumerate(data):
        if sequence == 'a'*(len(sequence)//3) + 'b'*(len(sequence)//3) + 'c'*(len(sequence)//3):
            y[i] = 1
    return y

In [69]:
# Create a one-hot encoding of the sequences and a labels tensor
def one_hot_encode(sequence):
    encoded = torch.zeros(len(sequence), 3)
    for i, char in enumerate(sequence):
        encoded[i, 'abc'.index(char)] = 1
    return encoded

## One-Hot Encoding

In [70]:
# Example of one-hot encoding
dataset = gen_data()
encoded_dataset = [one_hot_encode(sequence) for sequence in dataset]
print(encoded_dataset[0], "\n", dataset[0])

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]) 
 abc


In [71]:
# pad sequences
padded_dataset = pad_sequence(encoded_dataset, batch_first=True)
print(padded_dataset.shape)

torch.Size([10000, 20, 3])


In [72]:
def get_encoded_dataset(N, min_len=3, max_len=20):
    dataset = gen_data(N, min_len, max_len)
    encoded_dataset = [one_hot_encode(sequence) for sequence in dataset]
    padded_dataset = pad_sequence(encoded_dataset, batch_first=True)
    y = get_labels(dataset)
    return padded_dataset, y

In [73]:
# Make training data
train_size = int(0.8*len(dataset)) # 80/20 train/test split
test_size = len(dataset) - train_size

In [74]:

train_data = padded_dataset[:train_size]
test_data = padded_dataset[train_size:]
y_train = get_labels(dataset[:train_size])
y_test = get_labels(dataset[train_size:])
print(train_data.shape, test_data.shape)
print(y_train.shape, y_test.shape)

torch.Size([8000, 20, 3]) torch.Size([2000, 20, 3])
torch.Size([8000]) torch.Size([2000])


In [75]:
y_train.sum()/len(y_train) # % of sequences that are in the language

tensor(0.3180)

In [76]:
train_loader = DataLoader(list(zip(train_data, y_train)), batch_size=128, shuffle=True)
test_loader = DataLoader(list(zip(test_data, y_test)), batch_size=test_data.size(0), shuffle=False)

In [77]:
# Test train_loader
for data, labels in train_loader:
    print(data.shape, labels.shape)
    print(data[0], labels[0])
    break

torch.Size([128, 20, 3]) torch.Size([128])
tensor([[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]) tensor(0.)


# RNN Model

In [78]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # x -> (batch_length, seq_length, input_size/vocab_size)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        out, _ = self.rnn(input)
        # out -> (batch_size, seq_length, hidden_size)
        # reshape to get last output
        out = out[:, -1, :]
        out = self.fc(out)
        return out 


# LSTM Model

In [79]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # x -> (batch_length, seq_length, input_size)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        out, _ = self.lstm(input)
        # out -> (batch_size, seq_length, hidden_size)
        # reshape to get last output
        out = out[:, -1, :]
        out = self.fc(out)

        return out


# MLOps and Sweep

# Artifact

In [80]:
# # Make dataset with labels to csv
# df = pd.DataFrame(dataset)
# df['label'] = get_labels(dataset)
# df.to_csv('formal_language.csv', index=False)
# min_len, max_len = min(len(seq) for seq in dataset), max(len(seq) for seq in dataset)

# # Instantiate a WandB run
# wandb.login()
# run = wandb.init(project="formal_language_rnn_lstm")

# # Create an artifact for data
# artifact = wandb.Artifact(f"FL_data_{str(min_len)}_{str(max_len)}", type="dataset") 
# artifact.add_file("formal_language.csv") 
# run.log_artifact(artifact)

# Sweeping

In [81]:
# Sweep config
sweep_config = {
    "method": "random",
    "metric": {"name": "loss", "goal": "minimize"},
    "parameters": {
        "model": { "values": ["RNN", "LSTM"] },
        "epochs": {"values": [200, 500] },    
        "optimizer": { "values": ["SGD", "Adam"] },
        "hidden_size": {
            "values": [2, 20, 50]
        },
        "num_layers": {
            "values": [1, 2]
        },
        "learning_rate": {
            "values": [0.001, 0.01]
        }
    }
}

In [82]:
def train(config=None):
    with wandb.init(project="formal_language_rnn_lstm", config=config):
        config = wandb.config

        # Get hyperparameters
        hidden_size = config.hidden_size
        num_layers = config.num_layers
        learning_rate = config.learning_rate
        num_epochs = config.epochs

        # Input size and number of classes
        num_classes = 1 # binary classification
        input_size = 3 # 'a' 'b' 'c'

        # Set criterion
        criterion = nn.BCEWithLogitsLoss()

        # Get model
        if config.model == "RNN":
            model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
        else:
            model = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)
        
        # Get optimizer
        if config.optimizer == "SGD":
            optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        else:
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        wandb.watch(model, criterion, log="all")

        # Training loop
        for epoch in range(num_epochs):
            for data, labels in train_loader:
                data, labels = data.to(device), labels.to(device)

                # Forward pass
                outputs = model(data)
                loss = criterion(outputs.squeeze(), labels)

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            if (epoch+1) % 10 == 0:
                wandb.log({"epoch": epoch+1, "loss": loss.item()})
            if (epoch+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        print("Finished training")

In [83]:
# Initialize sweep_id
# sweep_id = wandb.sweep(sweep_config, project="formal_language_rnn_lstm")

In [84]:
# Run the sweep
# wandb.agent(sweep_id, function=train, count=10)

# Train based on Sweep

In [99]:
# params
input_size =  3 # 'a' 'b' 'c'
num_classes = 1 # binary classification
hidden_size = 40 # based on sweep
num_layers = 1 # hyperparameter; can be tuned


criterion = nn.BCEWithLogitsLoss() # "hyperparameter" (maybe BCE without LogitsLoss is better?)
learning_rate = 0.005 # Between sweep rates (strongly uncorrelated with loss)

In [103]:
rnn = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)

# Training loop RNN
num_epochs = 30 # slower convergence than LSTM
for epoch in range(num_epochs):
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)
        
        # Forward pass
        outputs = rnn(data)
        loss = criterion(outputs.squeeze(), labels) # BCEWithLogitsLoss expects 1D input, output from RNN is 2D
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
print("Finished Training")

Epoch [10/30], Loss: 0.1164
Epoch [20/30], Loss: 0.1519
Epoch [30/30], Loss: 0.3469
Finished Training


In [104]:
lstm = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)
optimizer = optim.Adam(lstm.parameters(), lr=learning_rate) # Adam (no vanishing gradient problem)

# Training loop LSTM
num_epochs = 30 # convergence faster with Adam
for epoch in range(num_epochs):
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)

        # Forward pass
        outputs = lstm(data)
        loss = criterion(outputs.squeeze(), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
print("Finished training")

Epoch [10/30], Loss: 0.2475
Epoch [20/30], Loss: 0.3070
Epoch [30/30], Loss: 0.7901
Finished training


# Evaluation

In [105]:
def report_evaluation(model, loader):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for data, label in loader:
            data, label = data.to(device), label.to(device)

            # Get the model's predictions
            output = model(data)
            pred = torch.round(torch.sigmoid(output)) # Sigmoid to get probabilities, round to get binary predictions

            y_true.extend(label.tolist())
            y_pred.extend(pred.tolist())

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return accuracy, f1

## Report F1 Score (and accuracy)

In [107]:
rnn_accuracy, rnn_f1 = report_evaluation(rnn, test_loader)
lstm_accuracy, lstm_f1 = report_evaluation(lstm, test_loader)


print(f'RNN Test Accuracy: {rnn_accuracy*100:.2f}%')
print(f'RNN F1 Score: {rnn_f1:.2f}')
print(f'LSTM Test Accuracy: {lstm_accuracy*100:.2f}%')
print(f'LSTM F1 Score: {lstm_f1:.2f}')

RNN Test Accuracy: 81.15%
RNN F1 Score: 0.71
LSTM Test Accuracy: 84.75%
LSTM F1 Score: 0.79


# Generalization and Plot

In [108]:
# Generate new test data of len 21 to 30 to see how well the models generalize
long_data, y_long = get_encoded_dataset(1000, min_len=21, max_len=30)
long_loader = DataLoader(list(zip(long_data, y_long)), batch_size=long_data.size(0), shuffle=False)

In [109]:
def f1_of_seq_len(model, loader):
    model.eval()
    length_to_true_pred = {}
    with torch.no_grad():
        for data, label in loader:
            data, label = data.to(device), label.to(device)

            # Get the model's predictions
            output = model(data)
            pred = torch.round(torch.sigmoid(output))
            print(pred.sum().item())
            
            # Get sequence lengths
            seq_lengths = (data.sum(dim=2) != 0).sum(dim=1).tolist()
            for i, length in enumerate(seq_lengths):
                if length not in length_to_true_pred:
                    length_to_true_pred[length] = [[], []]
                length_to_true_pred[length][0].append(label[i].item())
                length_to_true_pred[length][1].append(pred[i].item())
    f1_scores = {}
    for length, (true, pred) in length_to_true_pred.items():
        f1_scores[length] = f1_score(true, pred, average='binary', zero_division=0)
    return f1_scores

In [110]:
rnn_f1 = f1_of_seq_len(rnn, long_loader)
lstm_f1 = f1_of_seq_len(lstm, long_loader)
print(rnn_f1)
print(lstm_f1)

22.0
461.0
{24: 0.0, 29: 0.0, 25: 0.0, 27: 0.0, 30: 0.0, 21: 0.0, 26: 0.0, 22: 0.0, 28: 0.0, 23: 0.0}
{24: 0.0, 29: 0.0, 25: 0.0, 27: 0.0, 30: 0.0, 21: 0.0, 26: 0.0, 22: 0.0, 28: 0.0, 23: 0.0}
