### Try COLAB

In [None]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False
if IN_COLAB:
    !pip3 install torch matplotlib torchmetrics scikit-image segmentation-models-pytorch

# Import

In [None]:
import torch
from torch import nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence # for padding data

from sklearn.metrics import accuracy_score, f1_score

import wandb

import seaborn as sns

### CUDA

In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using the GPU" if device.type == 'cuda' else "Using the CPU")

# Generate Dataset

In [None]:
import random

# Build N strings of the shape a+b+c+ (letters always in alphabetical order).
N = 1000 # Size of dataset (number of sequences)
dataset = []

for n in range(N):
    if n % 4 != 0:
        # 75% of the entries: random block sizes. Every letter appears at least
        # once and `length` (3..17) extra letters are spread at random over the
        # three blocks, so these strings are 6..20 characters long. The three
        # counts can still end up equal, in which case the string is a member.
        length = random.randint(3, 17)
        counts = [1, 1, 1]
        for _ in range(length):
            counts[random.randint(0, 2)] += 1
        sequence = ''.join(letter * count for letter, count in zip('abc', counts))
    else:
        # 25% of the entries: guaranteed members a^k b^k c^k with k in 1..6.
        length = random.randint(1, 6)
        sequence = ''.join(letter * length for letter in 'abc')
    dataset.append(sequence)

In [None]:
# Report the shortest and longest sequence in the dataset.
max_len = max(map(len, dataset))
min_len = min(map(len, dataset))
print(min_len, max_len) # Expected to be 3 and 20 for a dataset of this size

### Functions for labels and encoding

In [None]:
# Label each sequence by membership in the language a^k b^k c^k
def get_labels(data):
    """Return a 1-D float tensor with 1 for members of a^k b^k c^k and 0 otherwise.

    A sequence is a member when it equals k 'a's, then k 'b's, then k 'c's with
    k = len(sequence) // 3; lengths not divisible by 3 can therefore never
    match. The empty string matches with k = 0.
    """
    def is_member(candidate):
        k = len(candidate) // 3
        return candidate == 'a' * k + 'b' * k + 'c' * k

    flags = [1.0 if is_member(candidate) else 0.0 for candidate in data]
    return torch.tensor(flags, dtype=torch.get_default_dtype())

In [None]:
# One-hot encode a single sequence over the alphabet 'abc'
def one_hot_encode(sequence):
    """Return a (len(sequence), 3) float tensor with one 1 per row.

    Column 0/1/2 stands for 'a'/'b'/'c'. A symbol outside 'abc' raises
    ValueError (from str.index).
    """
    n_symbols = len(sequence)
    columns = ['abc'.index(symbol) for symbol in sequence]
    encoded = torch.zeros(n_symbols, 3)
    if columns:
        # Set encoded[row, column] = 1 for every position in one indexed write.
        encoded[torch.arange(n_symbols), torch.tensor(columns)] = 1
    return encoded

## One-Hot Encoding

In [None]:
# Encode every string; show the first encoding next to the string it came from.
encoded_dataset = list(map(one_hot_encode, dataset))
print(encoded_dataset[0], "\n", dataset[0])

In [None]:
# Zero-pad all encoded sequences to the longest length and stack them into one
# (num_sequences, max_length, 3) tensor.
padded_dataset = pad_sequence(encoded_dataset, batch_first=True, padding_value=0.0)
print(padded_dataset.size())

In [None]:
# 80/20 train/test split by position in the dataset
train_size = int(0.8*N)
test_size = N - train_size

# Inputs come from the padded tensor, labels from the raw strings; both are cut
# at the same index so they stay aligned.
train_data, test_data = padded_dataset[:train_size], padded_dataset[train_size:]
y_train, y_test = (
    get_labels(part) for part in (dataset[:train_size], dataset[train_size:])
)
print(train_data.shape, test_data.shape)
print(y_train.shape, y_test.shape)

In [None]:
# Pair each padded sequence with its label. Both loaders use a single batch
# holding the whole split (open question from the author: what should the batch size be?).
train_pairs = list(zip(train_data, y_train))
test_pairs = list(zip(test_data, y_test))
train_loader = DataLoader(train_pairs, batch_size=len(train_data), shuffle=True)
test_loader = DataLoader(test_pairs, batch_size=len(test_data), shuffle=False)

In [None]:
# Sanity check: fetch the first batch from train_loader and inspect it.
data, labels = next(iter(train_loader))
print(data.shape, labels.shape)
print(data[0], labels[0])

# RNN Model

In [None]:
class RNN(nn.Module):
    """Sequence classifier: an nn.RNN followed by a linear layer on the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # batch_first=True: x -> (batch_size, seq_length, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input):
        """Return logits of shape (batch_size, num_classes) for a batch of sequences."""
        # Create the zero initial state on the input's own device and dtype
        # instead of relying on a module-level `device` global, so the model
        # works wherever its input lives.
        h0 = torch.zeros(
            self.num_layers, input.size(0), self.hidden_size,
            device=input.device, dtype=input.dtype,
        )
        out, _ = self.rnn(input, h0)
        # out -> (batch_size, seq_length, hidden_size); keep the last time step.
        # NOTE(review): if the batch is right-padded (as pad_sequence produces in
        # this notebook), the last step of a shorter sequence is a padding position.
        out = out[:, -1, :]
        return self.fc(out)


# Training (TODO: wrap in a function and prepare a hyperparameter sweep)

In [None]:

# Model and optimisation settings
input_size = 3  # width of the one-hot encoding: 'a' 'b' 'c'
num_classes = 1  # a single logit for binary classification
hidden_size = 128  # hyperparameter; can be tuned
num_layers = 1  # hyperparameter; can be tuned
learning_rate = 0.005  # hyperparameter; can be tuned

# Loss works on raw logits, so the model itself applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()

rnn = RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
).to(device)
optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)

In [None]:
# Training loop: every epoch makes one pass over train_loader and takes one
# optimizer step per batch.
num_epochs = 1000
rnn.train()  # ensure training mode, e.g. when re-running after the evaluation cell
for epoch in range(num_epochs):
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)

        # Forward pass
        outputs = rnn(data)
        # The loss needs logits and targets of the same shape. The model emits
        # (batch, 1) and the labels are (batch,), so drop only the trailing
        # dimension; a bare squeeze() would also collapse a batch of size 1.
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation (TODO: wrap in a function and verify that the metrics are valid)

In [None]:
def _collect_predictions(model, loader):
    """Run `model` over every batch of `loader`; return (predictions, actuals) as flat lists."""
    predictions, actuals = [], []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for inputs, labels in loader:
            # Move inputs to the same device as the model
            inputs = inputs.to(device)

            # Threshold the sigmoid of the logits into binary predictions (0 or 1)
            outputs = model(inputs)
            pred = torch.round(torch.sigmoid(outputs))

            # Drop the trailing logit dimension so predictions are a flat list
            # like the labels.
            predictions.extend(pred.squeeze(-1).tolist())
            actuals.extend(labels.tolist())
    return predictions, actuals


# Set the model to evaluation mode
rnn.eval()

# Predictions and actual labels for the test and the training split
test_predictions, test_actuals = _collect_predictions(rnn, test_loader)
train_predictions, train_actuals = _collect_predictions(rnn, train_loader)

# Calculate accuracy and F1 score (F1 is computed on the test split)
train_accuracy = accuracy_score(train_actuals, train_predictions)
test_accuracy = accuracy_score(test_actuals, test_predictions)
f1 = f1_score(test_actuals, test_predictions)

print(f'Training Accuracy: {train_accuracy*100:.2f}%')
print(f'Test Accuracy: {test_accuracy*100:.2f}%')
print(f'F1 Score: {f1:.2f}')