In [32]:
import random
import string
from tqdm import tqdm

# Define constants
MAX_LENGTH = 200
OOD_MAX_LENGTH = 400
# VALID_CHARACTERS = ["s", "a", "b", "e", "p"]
VALID_CHARACTERS = ["p", "e", "b", "a", "s"]
MAIN_CHARACTERS = ["a", "b"]
START_TOKEN = "s"
END_TOKEN = "e"
PADDING_TOKEN = "p"
VALID_RATIO = 0.5  # Half of the dataset should be valid a*b* strings

# Function to generate valid a*b* strings
def generate_valid_string(min_length = 0, max_length = MAX_LENGTH):
    num_a = random.randint(0, max_length - 2)
    num_b = random.randint(min(0, min_length - num_a), max_length - 2 - num_a)
    valid_str = "a" * num_a + "b" * num_b
    num_p = random.randint(0, max_length - len(valid_str) - 2)
    return (
        START_TOKEN
        + PADDING_TOKEN * num_p
        + valid_str
        + END_TOKEN
        + PADDING_TOKEN * ((max_length - len(valid_str) - 2) - num_p)
    )

def generate_valid(length):
    num_a = random.randint(0, length)
    num_b = length - num_a
    return "a" * num_a + "b" * num_b

# Function to generate invalid strings
def generate_invalid_string(min_length = 1, max_length = MAX_LENGTH):
    length = random.randint(min_length, max_length - 2)
    if length == 1:
        num_p = random.randint(0, max_length - 2)
        return START_TOKEN + PADDING_TOKEN * num_p + "ba" + END_TOKEN + PADDING_TOKEN * (max_length - 2 - num_p)

    if random.random() < 0.5:
      while True:
          # Random string of a's and b's which isn't a valid a*b* string
          invalid_str = "".join(random.choices(MAIN_CHARACTERS, k=length))
          if "ba" in invalid_str:
              break
      num_p = random.randint(0, max_length - len(invalid_str) - 2)
      return (
          START_TOKEN
          + PADDING_TOKEN * num_p
          + invalid_str
          + END_TOKEN
          + PADDING_TOKEN * (max_length - len(invalid_str) - 2)
      )
    else:
      string = generate_valid(length)
      index = random.randint(0, length - 2)
      string[index] = 'b'
      string[index + 1] = 'a'
      num_p = random.randint(0, max_length - len(invalid_str) - 2)
      return (
          START_TOKEN
          + PADDING_TOKEN * num_p
          + invalid_str
          + END_TOKEN
          + PADDING_TOKEN * (max_length - len(invalid_str) - 2)
      )

# Generate dataset
dataset = []
num_samples = 1000  # Total number of samples

for _ in range(num_samples):
    while True:
        x = generate_valid_string()
        if not (len(x) == MAX_LENGTH):
            continue
        x += PADDING_TOKEN * (OOD_MAX_LENGTH - MAX_LENGTH)
        if not (len(x) == OOD_MAX_LENGTH):
            continue
        dataset.append((x, 1))
        break

# remove duplicates
valid_dataset = list(set(dataset))
dataset = []

for _ in range(num_samples):
    while True:
        x = generate_invalid_string()
        if not (len(x) == MAX_LENGTH):
            continue
        x += PADDING_TOKEN * (OOD_MAX_LENGTH - MAX_LENGTH)
        if not (len(x) == OOD_MAX_LENGTH):
            continue
        dataset.append((x, 0))
        break

# Remove all duplicates
invalid_dataset = list(set(dataset))[:len(valid_dataset)]
print(len(valid_dataset))
print(len(invalid_dataset))

split = len(valid_dataset) * 4 // 5;
train_dataset = valid_dataset[:split] + invalid_dataset[:split]
test_dataset = valid_dataset[split:] + invalid_dataset[split:]

num_ood_samples = 1000
dataset = []
for _ in range(num_ood_samples):
    while True:
        x = generate_valid_string(min_length=MAX_LENGTH + 2, max_length=OOD_MAX_LENGTH)
        if not (len(x) == OOD_MAX_LENGTH):
            continue
        dataset.append((x, 1))
        break

# remove duplicates
ood_valid_dataset = list(set(dataset))
dataset = []

for _ in range(num_ood_samples):
    while True:
        x = generate_invalid_string(min_length=MAX_LENGTH + 2, max_length=OOD_MAX_LENGTH)
        if not (len(x) == OOD_MAX_LENGTH):
            continue
        dataset.append((x, 0))
        break

# Remove all duplicates
ood_invalid_dataset = list(set(dataset))[:len(ood_valid_dataset)]
print(len(ood_valid_dataset), len(ood_invalid_dataset))

ood_dataset = ood_valid_dataset + ood_invalid_dataset
print(len(train_dataset))
print(len(test_dataset))
print(len(ood_dataset))


# Write to file
with open("train_dataset_padded.txt", "w") as f:
    for data, label in train_dataset:
        f.write(f"{data} {label}\n")

with open("test_dataset_padded.txt", "w") as f:
    for data, label in test_dataset:
        f.write(f"{data} {label}\n")

with open("ood_dataset_padded.txt", "w") as f:
    for data, label in ood_dataset:
        f.write(f"{data} {label}\n")

966
966
991 991
1544
388
1982


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Define constants
MAX_LENGTH = 200
OOD_MAX_LENGTH = 400
# VALID_CHARACTERS = ["s", "a", "b", "e", "p"]
VALID_CHARACTERS = ["p", "e", "b", "a", "s"]
MAIN_CHARACTERS = ["a", "b"]
START_TOKEN = "s"
END_TOKEN = "e"
PADDING_TOKEN = "p"
VALID_RATIO = 0.5  # Half of the dataset should be valid a*b* strings
VOCAB_SIZE = len(VALID_CHARACTERS)
EMBEDDING_DIM = 6
NUM_HEADS = 1
NUM_LAYERS = 1
HIDDEN_DIM = 1
BATCH_SIZE = 512

# Mapping characters to indices
char_to_index = {ch: idx for idx, ch in enumerate(VALID_CHARACTERS)}


# Transformer model
class ActivationDatasetGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super(ActivationDatasetGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(torch.zeros(1, OOD_MAX_LENGTH, embedding_dim))
        encoder_layers = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, hidden_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoder
        x = self.transformer_encoder(x)
        return x

state_dict = torch.load('/content/1head_1layer_embed6batch512hidden1_200max400ood_total_transformer_model.pth')

datagen_model = ActivationDatasetGenerator(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

del state_dict['fc.weight']
del state_dict['fc.bias']

datagen_model.load_state_dict(state_dict)



<All keys matched successfully>

In [None]:
import random
import string
from tqdm import tqdm

# Define constants
MAX_LENGTH = 200
OOD_MAX_LENGTH = 400
# VALID_CHARACTERS = ["s", "a", "b", "e", "p"]
VALID_CHARACTERS = ["p", "e", "b", "a", "s"]
MAIN_CHARACTERS = ["a", "b"]
START_TOKEN = "s"
END_TOKEN = "e"
PADDING_TOKEN = "p"
VALID_RATIO = 0.5  # Half of the dataset should be valid a*b* strings

# Function to generate valid a*b* strings
def generate_valid_string(min_length = 0, max_length = MAX_LENGTH):
    num_a = random.randint(0, max_length - 2)
    num_b = random.randint(min(0, min_length - num_a), max_length - 2 - num_a)
    valid_str = "a" * num_a + "b" * num_b
    num_p = random.randint(0, max_length - len(valid_str) - 2)
    return (
        START_TOKEN
        + PADDING_TOKEN * num_p
        + valid_str
        + END_TOKEN
        + PADDING_TOKEN * ((max_length - len(valid_str) - 2) - num_p)
    )

# Function to generate invalid strings
def generate_invalid_string(min_length = 1, max_length = MAX_LENGTH):
    length = random.randint(min_length, max_length - 2)
    if length == 1:
        num_p = random.randint(0, max_length - 2)
        return START_TOKEN + PADDING_TOKEN * num_p + "ba" + END_TOKEN + PADDING_TOKEN * (max_length - 2 - num_p)
    while True:
        # Random string of a's and b's which isn't a valid a*b* string
        invalid_str = "".join(random.choices(MAIN_CHARACTERS, k=length))
        if "ba" in invalid_str:
            break
    num_p = random.randint(0, max_length - len(invalid_str) - 2)
    return (
        START_TOKEN
        + PADDING_TOKEN * num_p
        + invalid_str
        + END_TOKEN
        + PADDING_TOKEN * (max_length - len(invalid_str) - 2)
    )

# Generate dataset
dataset = []
num_samples = 1000  # Total number of samples

for _ in range(num_samples):
    while True:
        x = generate_valid_string()
        if not (len(x) == MAX_LENGTH):
            continue
        x += PADDING_TOKEN * (OOD_MAX_LENGTH - MAX_LENGTH)
        if not (len(x) == OOD_MAX_LENGTH):
            continue
        dataset.append(x)
        break

# remove duplicates
valid_dataset = list(set(dataset))
dataset = []

for _ in range(num_samples):
    while True:
        x = generate_invalid_string()
        if not (len(x) == MAX_LENGTH):
            continue
        x += PADDING_TOKEN * (OOD_MAX_LENGTH - MAX_LENGTH)
        if not (len(x) == OOD_MAX_LENGTH):
            continue
        dataset.append(x)
        break

# Remove all duplicates
invalid_dataset = list(set(dataset))[:len(valid_dataset)]

dataset = invalid_dataset

char_to_index = {ch: idx for idx, ch in enumerate(VALID_CHARACTERS)}
def encode_string(string):
    return [char_to_index[char] for char in string]

class ActivationDataset(Dataset):
    def __init__(self):
        self.data = []
        self.labels = []

    def add_data(self, string, tensor):
        for i in range(len(string)):
            if i == 0:
                continue
            elif string[i - 1] == 'a' and string[i] == 'a':
                self.data.append(torch.cat((tensor[:,i-1], tensor[:,i])))
                self.labels.append(torch.Tensor([1]).long())
            elif string[i - 1] == 'b' and string[i] == 'b':
                self.data.append(torch.cat((tensor[:,i-1], tensor[:,i])))
                self.labels.append(torch.Tensor([2]).long())
            elif string[i - 1] == 'a' and string[i] == 'b':
                self.data.append(torch.cat((tensor[:,i-1], tensor[:,i])))
                self.labels.append(torch.Tensor([3]).long())
            elif string[i - 1] == 'b' and string[i] == 'a':
                self.data.append(torch.cat((tensor[:,i-1], tensor[:,i])))
                self.labels.append(torch.Tensor([4]).long())
            else:
                if i % 5 == 0:
                    self.data.append(torch.cat((tensor[:,i-1], tensor[:,i])))
                    self.labels.append(torch.Tensor([0]).long())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        label = self.labels[idx]
        return data, torch.tensor(
            label, dtype=torch.float32
        )

activation_dataset = ActivationDataset()

with torch.no_grad():
    for i in range(len(dataset)):
      data_val = encode_string(dataset[i])
      activation_val = datagen_model(torch.tensor(data_val, dtype=torch.long))
      activation_dataset.add_data(dataset[i], activation_val)

print(len(activation_dataset))
print(activation_dataset.data[0])
print(activation_dataset.labels[0])
torch.save(activation_dataset, 'activations2_20000_split.pt')


203522
tensor([[-2.0460, -0.1504,  1.6408, -0.4820, -0.8400,  1.7815],
        [-2.0039, -0.3235,  1.5289, -0.5143, -0.6699,  1.9068]])
tensor([1])


In [None]:
class FeatureProbe(nn.Module):
    def __init__(self):
        super(FeatureProbe, self).__init__()
        # lol this is tiny
        self.mlp = nn.Linear(12, 5)

    def forward(self, x):
        x = self.mlp(x.reshape(len(x),-1))
        return x

BATCH_SIZE = 512
EPOCHS = 5
model = FeatureProbe()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu" and torch.backends.mps.is_available():
    device = torch.device("mps")
model.to(device)

dataloader = DataLoader(activation_dataset, batch_size=BATCH_SIZE, shuffle=True)
for epoch in range(EPOCHS):
    model.train()
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = criterion(outputs.squeeze(), labels.to(device).long().squeeze())
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item()}")
    # Evaluate the model
    model.eval()
    correct = torch.zeros(5).to(device)
    total = torch.zeros(5).to(device)
    for inputs, labels in dataloader:
        outputs = model(inputs.to(device))
        predicted = torch.argmax(torch.round(outputs).squeeze(), axis=-1)
        # print(labels.shape)
        # print(predicted.shape)
        # atrocious code
        for i in range(5):
            total[i] += torch.sum(labels.to(device) == i).item()
            correct[i] += torch.sum((predicted.squeeze().to(device) == labels.squeeze().to(device)) * (labels.squeeze().to(device) == i)).item()

    print(correct)
    print(total)
    for i in range(5):
        print(i, correct[i] / total[i], total[i])


  return data, torch.tensor(


Epoch 1/5, Loss: 0.38158777356147766
tensor([44389., 39635., 39792., 36813., 36997.], device='cuda:0')
tensor([44557., 39636., 39811., 39757., 39761.], device='cuda:0')
0 tensor(0.9962, device='cuda:0') tensor(44557., device='cuda:0')
1 tensor(1.0000, device='cuda:0') tensor(39636., device='cuda:0')
2 tensor(0.9995, device='cuda:0') tensor(39811., device='cuda:0')
3 tensor(0.9260, device='cuda:0') tensor(39757., device='cuda:0')
4 tensor(0.9305, device='cuda:0') tensor(39761., device='cuda:0')
Epoch 2/5, Loss: 0.156574547290802
tensor([44418., 39636., 39808., 39386., 39307.], device='cuda:0')
tensor([44557., 39636., 39811., 39757., 39761.], device='cuda:0')
0 tensor(0.9969, device='cuda:0') tensor(44557., device='cuda:0')
1 tensor(1., device='cuda:0') tensor(39636., device='cuda:0')
2 tensor(0.9999, device='cuda:0') tensor(39811., device='cuda:0')
3 tensor(0.9907, device='cuda:0') tensor(39757., device='cuda:0')
4 tensor(0.9886, device='cuda:0') tensor(39761., device='cuda:0')
Epoch 3/

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Define constants for model
# VALID_CHARACTERS = ["s", "a", "b", "e", "p"]
VALID_CHARACTERS = ["p", "e", "b", "a", "s"]
MAIN_CHARACTERS = ["a", "b"]
START_TOKEN = "s"
END_TOKEN = "e"
PADDING_TOKEN = "p"
VALID_RATIO = 0.5  # Half of the dataset should be valid a*b* strings
VOCAB_SIZE = len(VALID_CHARACTERS)
EMBEDDING_DIM = 6
NUM_HEADS = 2
NUM_LAYERS = 1
HIDDEN_DIM = 1
BATCH_SIZE = 512
EPOCHS = 5
MAX_LENGTH = 100
OOD_MAX_LENGTH = 500

# Mapping characters to indices
char_to_index = {ch: idx for idx, ch in enumerate(VALID_CHARACTERS)}

# Custom dataset class
class StringDataset(Dataset):
    def __init__(self, file_path):
        self.data = []
        self.labels = []
        with open(file_path, "r") as f:
            for line in f:
                parts = line.strip().split(" ")
                self.data.append(parts[0])
                self.labels.append(int(parts[1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        string = self.data[idx]
        label = self.labels[idx]
        encoded = self.encode_string(string)
        return torch.tensor(encoded, dtype=torch.long), torch.tensor(
            label, dtype=torch.float32
        )

    def encode_string(self, string):
        return [char_to_index[char] for char in string]


# Transformer model
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(torch.zeros(1, OOD_MAX_LENGTH, embedding_dim))
        encoder_layers = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, hidden_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(OOD_MAX_LENGTH * embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoder
        x = self.transformer_encoder(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return self.sigmoid(x)

# Prepare dataset and dataloader
dataset = StringDataset("train_dataset_padded (1).txt")
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize model, loss function, and optimizer
model = TransformerClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu" and torch.backends.mps.is_available():
    device = torch.device("mps")
model.to(device)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = criterion(outputs.squeeze(), labels.to(device))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item()}")
    # Evaluate the model
    model.eval()
    correct = 0
    total = 0
    for inputs, labels in dataloader:
        outputs = model(inputs.to(device))
        predicted = torch.round(outputs)
        total += labels.size(0)
        correct += (predicted.squeeze().to(device) == labels.to(device)).sum().item()
    print(f"Accuracy: {correct/total}")
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# Training loop
for epoch in range(EPOCHS):
    model.train()
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = criterion(outputs.squeeze(), labels.to(device))
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item()}")
    # Evaluate the model
    model.eval()
    correct = 0
    total = 0
    for inputs, labels in dataloader:
        outputs = model(inputs.to(device))
        predicted = torch.round(outputs)
        total += labels.size(0)
        correct += (predicted.squeeze().to(device) == labels.to(device)).sum().item()
    print(f"Accuracy: {correct/total}")

# Save the trained model
torch.save(model.state_dict(), "1head_1layer_embed6revbatch512hidden1_100max500ood_total_transformer_model.pth")

test_dataset = StringDataset("test_dataset_padded (1).txt")
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Print accuracy of the model
model.eval()
correct = 0
total = 0
for inputs, labels in test_dataloader:
    outputs = model(inputs.to(device))
    predicted = torch.round(outputs)
    total += labels.size(0)
    correct += (predicted.squeeze().to(device) == labels.to(device)).sum().item()
print(f"Accuracy: {correct/total}")



Epoch 1/5, Loss: 0.0643521100282669
Accuracy: 0.9852941176470589
Epoch 2/5, Loss: 0.09316340833902359
Accuracy: 0.9913097454996896
Epoch 3/5, Loss: 0.0352286621928215
Accuracy: 0.9910724066162778
Epoch 4/5, Loss: 0.025056665763258934
Accuracy: 0.9913827728484318
Epoch 5/5, Loss: 0.04972020909190178
Accuracy: 0.9928341914046811
Epoch 1/5, Loss: 0.02033993974328041
Accuracy: 0.9934731807061744
Epoch 2/5, Loss: 0.028123032301664352
Accuracy: 0.9935827217292876
Epoch 3/5, Loss: 0.03850043565034866
Accuracy: 0.9931719428926132
Epoch 4/5, Loss: 0.022266890853643417
Accuracy: 0.9941943257750028
Epoch 5/5, Loss: 0.04888172447681427
Accuracy: 0.9939296016358126
Accuracy: 0.9940484883890756


In [None]:
ood_dataset = StringDataset("ood_dataset_padded (1).txt")
ood_dataloader = DataLoader(ood_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Print accuracy of the model
model.eval()
correct = 0
total = 0
for inputs, labels in ood_dataloader:
    outputs = model(inputs.to(device))
    predicted = torch.round(outputs)
    total += labels.size(0)
    correct += (predicted.squeeze().to(device) == labels.to(device)).sum().item()
print(f"Accuracy: {correct/total}")

Accuracy: 0.9719818488813845


In [None]:
# Print 20 examples which are wrongly classified
print("20 examples which are wrongly classified")
count = 0
for inputs, labels in dataloader:
    outputs = model(inputs.to(device))
    predicted = torch.round(outputs)
    for i in range(len(predicted)):
        if count == 20:
            break
        if predicted[i] != labels[i]:
            # Convert back to string of a's and b's
            string = "".join([VALID_CHARACTERS[int(idx)] for idx in inputs[i]])
            print(string, labels[i].item(), predicted[i].item())
            count += 1

20 examples which are wrongly classified
saabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbepppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp

In [None]:
import numpy as np
model_2 = TransformerClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

model_2.load_state_dict(torch.load('1head_1layer_embed6revbatch512hidden1_100max500ood_total_transformer_model.pth'))
model_2.to(device)

# Print 20 examples which are wrongly classified
# print("20 examples which are wrongly classified")
count = 0
matrix = np.zeros((2, 2))
num_pad_zeros = np.zeros((500))
for inputs, labels in dataloader:
    outputs = model_2(inputs.to(device))
    predicted = torch.round(outputs)
    for i in range(len(predicted)):
        matrix[int(predicted[i]), int(labels[i])] += 1
        if predicted[i] != labels[i]:
            # Convert back to string of a's and b's
            num_start_pad = 0
            for j in range(len(inputs[i])):
                if VALID_CHARACTERS[int(inputs[i][j])] != 'p':
                    break
                num_start_pad += 1
            num_pad_zeros[num_start_pad] += 1
print(matrix)
print(num_pad_zeros)



[[5.4745e+04 6.6200e+02]
 [2.9000e+01 5.4112e+04]]
[691.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0. 

In [None]:
ood_dataset = StringDataset("ood_dataset_gappadded.txt")
ood_dataloader = DataLoader(ood_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Print accuracy of the model
model_2.eval()
correct = 0
total = 0
for inputs, labels in ood_dataloader:
    outputs = model_2(inputs.to(device))
    predicted = torch.round(outputs)
    total += labels.size(0)
    correct += (predicted.squeeze().to(device) == labels.to(device)).sum().item()
print(f"Accuracy: {correct/total}")

Accuracy: 0.9748629848229342


In [None]:
print(model_2.transformer_encoder.layers[0].self_attn.in_proj_weight)

Parameter containing:
tensor([[-0.1173, -0.4868, -0.2787, -0.3836,  0.4869,  0.4462],
        [-0.3077,  0.1080, -0.0703,  0.2785,  0.0020,  0.3271],
        [-0.3017, -0.1836, -0.0864, -0.2321, -0.2330, -0.4434],
        [-0.0861,  0.6321, -0.4590, -0.3281, -0.5029, -0.4216],
        [ 0.4099, -0.1211, -0.1174, -0.1267, -0.3700, -0.1985],
        [ 0.3000,  0.4385, -0.3269,  0.3167,  0.4170, -0.3065],
        [ 0.1610, -0.0720,  0.3529,  0.4052,  0.0425,  0.2735],
        [-0.3191, -0.0640,  0.2417, -0.0950,  0.0050, -0.3506],
        [-0.3651, -0.1423, -0.1854, -0.2698,  0.1397,  0.0741],
        [ 0.4333,  0.2350, -0.0254,  0.4441,  0.2188,  0.1552],
        [ 0.1111, -0.1373, -0.0707,  0.4105,  0.1355,  0.3309],
        [-0.2359, -0.2420, -0.0904,  0.2508,  0.3944, -0.0200],
        [-0.2139,  0.3799, -0.1082, -0.3886,  0.0216, -0.0505],
        [-0.2125,  0.3423, -0.3185,  0.3012, -0.3671,  0.2281],
        [-0.4219,  0.1814,  0.2821,  0.0127,  0.1133,  0.3603],
        [-0.2622, 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

VALID_CHARACTERS = ["s", "a", "b", "e", "p"]
MAIN_CHARACTERS = ["a", "b"]
START_TOKEN = "s"
END_TOKEN = "e"
PADDING_TOKEN = "p"
VALID_RATIO = 0.5  # Half of the dataset should be valid a*b* strings
VOCAB_SIZE = len(VALID_CHARACTERS)
EMBEDDING_DIM = 6
NUM_HEADS = 2
NUM_LAYERS = 1
HIDDEN_DIM = 1
BATCH_SIZE = 512
EPOCHS = 5
MAX_LENGTH = 200
OOD_MAX_LENGTH = 400

# Transformer model
class TransformerDebugClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super(TransformerDebugClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(torch.zeros(1, OOD_MAX_LENGTH, embedding_dim))
        encoder_layer = TransformerEncoderLayerWithAttention(
            embedding_dim, num_heads, hidden_dim
        )
        self.transformer_encoder = TransformerEncoderWithAttention(encoder_layer, num_layers)
        self.fc = nn.Linear(OOD_MAX_LENGTH * embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        print(x.shape)
        x = self.embedding(x) + self.pos_encoder
        print(x.shape)
        x, attn = self.transformer_encoder(x)
        print(x.shape)
        print(attn)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return self.sigmoid(x)

In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

VALID_CHARACTERS = ["s", "a", "b", "e", "p"]
MAIN_CHARACTERS = ["a", "b"]
START_TOKEN = "s"
END_TOKEN = "e"
PADDING_TOKEN = "p"
VALID_RATIO = 0.5  # Half of the dataset should be valid a*b* strings
VOCAB_SIZE = len(VALID_CHARACTERS)
EMBEDDING_DIM = 6
NUM_HEADS = 2
NUM_LAYERS = 1
HIDDEN_DIM = 1
BATCH_SIZE = 512
EPOCHS = 5
MAX_LENGTH = 200
OOD_MAX_LENGTH = 400

# Mapping characters to indices
char_to_index = {ch: idx for idx, ch in enumerate(VALID_CHARACTERS)}

# Custom dataset class
class StringDataset(Dataset):
    def __init__(self, file_path):
        self.data = []
        self.labels = []
        with open(file_path, "r") as f:
            for line in f:
                parts = line.strip().split(" ")
                self.data.append(parts[0])
                self.labels.append(int(parts[1]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        string = self.data[idx]
        label = self.labels[idx]
        encoded = self.encode_string(string)
        return torch.tensor(encoded, dtype=torch.long), torch.tensor(
            label, dtype=torch.float32
        )

    def encode_string(self, string):
        return [char_to_index[char] for char in string]

# Transformer model
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(torch.zeros(1, OOD_MAX_LENGTH, embedding_dim))
        encoder_layers = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, hidden_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(OOD_MAX_LENGTH * embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoder
        x = self.transformer_encoder(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return self.sigmoid(x)

model_3 = TransformerClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

model_3.load_state_dict(torch.load('1head_1layer_embed6batch512hidden1_200max400ood_total_transformer_model.pth', map_location=torch.device('cpu')))

ood_dataset = StringDataset("ood_dataset_padded.txt")
ood_dataloader = DataLoader(ood_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Print accuracy of the model
model_3.eval()
correct = 0
total = 0
device = 'cpu'
for inputs, labels in ood_dataloader:
    outputs = model_3(inputs.to(device))
    predicted = torch.round(outputs)
    total += labels.size(0)
    correct += (predicted.squeeze().to(device) == labels.to(device)).sum().item()
print(f"Accuracy: {correct/total}")




Accuracy: 0.9727547931382442


In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

def is_valid(string):
  return ("ba" not in string)

VALID_CHARACTERS = ["s", "a", "b", "e", "p"]
MAIN_CHARACTERS = ["a", "b"]
START_TOKEN = "s"
END_TOKEN = "e"
PADDING_TOKEN = "p"
VALID_RATIO = 0.5  # Half of the dataset should be valid a*b* strings
VOCAB_SIZE = len(VALID_CHARACTERS)
EMBEDDING_DIM = 6
NUM_HEADS = 2
NUM_LAYERS = 1
HIDDEN_DIM = 1
BATCH_SIZE = 512
EPOCHS = 5
MAX_LENGTH = 200
OOD_MAX_LENGTH = 400

# Transformer model
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(torch.zeros(1, OOD_MAX_LENGTH, embedding_dim))
        encoder_layers = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, hidden_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(OOD_MAX_LENGTH * embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoder
        x = self.transformer_encoder(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return self.sigmoid(x)

model = TransformerClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

model.load_state_dict(torch.load('1head_1layer_embed6batch512hidden1_200max400ood_total_transformer_model.pth', map_location=torch.device('cpu')))

class AdversaryNet(nn.Module):
    def __init__(self, x):
        super(AdversaryNet, self).__init__()
        self.logits = x

    def forward(self):
        # sigmoid to sum to 1, batchsize
        probs = torch.sigmoid(self.logits)
        result = torch.zeros(5,len(self.logits)).long()
        log_prob = torch.zeros(5)
        for i in range(5):
            curr_result = torch.bernoulli(probs).long()
            log_prob[i] = torch.sum(torch.log(probs * curr_result + (1 - probs) * (1 - curr_result)))
            result[i,:] = curr_result
        return result, log_prob

def make_fooling_character(X, model, string_length=20):
    """
    Generate a fooling distribution: an invalid string that the model classifies
    as valid. In this case, start with random noise and perform gradient ascent.

    Input: Tensor of shape (string_length) between [0, 1], sample a and b from that
    Model: Pretrained predictor model

    Returns: Fooling distribution
    """

    net = AdversaryNet(X)
    optimizer = optim.Adam([net.logits], lr=0.01)

    num_episodes = 1000
    for episode in range(num_episodes):
        # Generate an episode
        actions, log_prob = net()

        string = torch.zeros(5, OOD_MAX_LENGTH).long()
        string[:,0] = 0
        string[:,1:string_length + 1] = actions + 1
        string[:,string_length + 1] = 3
        string[:,string_length + 2:] = 4

        # Simulate the environment and get rewards
        rewards = model.forward(string)  # Reward based on sampled actions
        # print(rewards)

        # Calculate the policy gradient
        policy_gradient = torch.zeros_like(log_prob)
        for i in range(5):
          policy_gradient[i] = (rewards[i][0]) * log_prob[i]
        policy_gradient = policy_gradient.sum()

        # Update the policy network
        optimizer.zero_grad()
        policy_gradient.backward()
        optimizer.step()

        # Print progress
        if episode % 100 == 0:
            print(f'Episode {episode}: Policy Gradient = {policy_gradient.item()}')
            print(net.logits)
            print(rewards.mean())

make_fooling_character(torch.zeros(20).requires_grad_(), model)

Episode 0: Policy Gradient = -0.30830949544906616
tensor([ 0.0100, -0.0100,  0.0100,  0.0100,  0.0100,  0.0100,  0.0100, -0.0100,
         0.0100,  0.0100, -0.0100, -0.0100,  0.0100,  0.0100,  0.0100, -0.0100,
        -0.0100,  0.0100,  0.0100, -0.0100], requires_grad=True)
tensor(0.0044, grad_fn=<MeanBackward0>)
Episode 100: Policy Gradient = -0.8414623141288757
tensor([ 0.2731,  0.3374,  0.4244,  0.4008,  0.3439,  0.1844,  0.3444,  0.2955,
         0.1875,  0.1238,  0.0893,  0.1798,  0.0324,  0.0545,  0.0743, -0.0716,
        -0.0245,  0.0544,  0.1404,  0.0916], requires_grad=True)
tensor(0.0115, grad_fn=<MeanBackward0>)
Episode 200: Policy Gradient = -0.12202877551317215
tensor([0.4214, 0.4827, 0.6143, 0.6338, 0.4996, 0.3066, 0.5284, 0.3704, 0.3276,
        0.2400, 0.1361, 0.3238, 0.1117, 0.0597, 0.1474, 0.0357, 0.0519, 0.1241,
        0.1061, 0.0736], requires_grad=True)
tensor(0.0017, grad_fn=<MeanBackward0>)
Episode 300: Policy Gradient = -0.01457495428621769
tensor([0.5413, 0.63

KeyboardInterrupt: 

In [152]:
import itertools
VALID_CHARACTERS = ["p", "e", "b", "a", "s"]
char_to_index = {ch: idx for idx, ch in enumerate(VALID_CHARACTERS)}

def is_valid(string):
  return ("ba" not in string)

def generate_strings(length):
    # Define the characters
    characters = ['a', 'b']

    strings = []
    # Generate all combinations of characters of given length
    for i in range(length):
      strings.append("a" * i + "b" * (length - i))

    return strings

# Generate all strings of length 8
strings_length_8 = generate_strings(20)

model = TransformerClassifier(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

model.load_state_dict(torch.load('1head_1layer_embed6batch512hidden1_200max400ood_total_transformer_model.pth', map_location=torch.device('cpu')))

for string in strings_length_8:
    str2 = "s" + "p"*5 + string + "e" + "p"*(400 - 27)
    if (model.forward(torch.tensor([char_to_index[char] for char in str2], dtype=torch.long)) > 0.7 and is_valid(str2)):
      print(str2)

spppppaaaaaaaabbbbbbbbbbbbeppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
spppppaaaaaaaaabbbbbbbbbbbeppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
spppppaaaaaaaaaabbbbbbbbbbeppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp

In [137]:
text = "spaaaabaeppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp"

class TransformerClassifier2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super(TransformerClassifier2, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Parameter(torch.zeros(1, OOD_MAX_LENGTH, embedding_dim))
        encoder_layers = nn.TransformerEncoderLayer(
            embedding_dim, num_heads, hidden_dim
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(OOD_MAX_LENGTH * embedding_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoder
        x = self.transformer_encoder(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return self.sigmoid(x)

    def return_pre_fc(self, x):
        x = self.embedding(x) + self.pos_encoder
        x = self.transformer_encoder(x)
        x = x.view(x.size(0), -1)
        sum = self.fc.weight.data.squeeze() * x
        return sum

model = TransformerClassifier2(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

model.load_state_dict(torch.load('1head_1layer_embed6batch512hidden1_200max400ood_total_transformer_model.pth', map_location=torch.device('cpu')))
vals = model.return_pre_fc(torch.tensor([char_to_index[char] for char in text], dtype=torch.long))
vals = vals.view(400, 6).sum(axis=-1)
print(vals[0:12])
print(model.forward(torch.tensor([char_to_index[char] for char in text], dtype=torch.long)))

tensor([ 0.0241,  1.6128, -0.1938,  0.0503,  0.2070, -0.1356, -0.8409,  0.0285,
         0.2954,  0.3082,  0.4625,  0.4214], grad_fn=<SliceBackward0>)
tensor([[0.5452]], grad_fn=<SigmoidBackward0>)


In [138]:
import matplotlib.colors as mcolors
from matplotlib import cm
import numpy as np

from IPython.core.display import display, HTML

def format(value):
  c, l = value
  return "<span style='color:{};'>{}</span>".format(l,c)

def format_chars(chars,numbers):
    print(chars)
    print(numbers)
    numbers = np.array(numbers.detach()).astype(float)
    norm = mcolors.Normalize(vmin=-1, vmax=1)
    cmap = cm.RdYlGn
    colors = cmap(norm(numbers))
    hexcolor = [mcolors.to_hex(c) for c in colors]
    text = " ".join(list(map(format, zip(chars,hexcolor))))
    text = "<div style='font-size:14pt;font-weight:bold;background-color:#000000;padding:8px'>" + text + "</div>"
    display(HTML(text))
    return colors

format_chars(list(text[0:20]), vals[0:20])

['s', 'p', 'a', 'a', 'a', 'a', 'b', 'a', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p']
tensor([ 0.0241,  1.6128, -0.1938,  0.0503,  0.2070, -0.1356, -0.8409,  0.0285,
         0.2954,  0.3082,  0.4625,  0.4214,  0.3329,  0.2554,  0.2074,  0.1694,
         0.2186,  0.1501,  0.1455,  0.2175], grad_fn=<SliceBackward0>)


array([[0.97954633, 0.99138793, 0.72103037, 1.        ],
       [0.        , 0.40784314, 0.21568627, 1.        ],
       [0.99623222, 0.88319877, 0.55309496, 1.        ],
       [0.96201461, 0.98400615, 0.6970396 , 1.        ],
       [0.84313725, 0.93387159, 0.54002307, 1.        ],
       [0.99730873, 0.91657055, 0.60907343, 1.        ],
       [0.80084583, 0.14763552, 0.15209535, 1.        ],
       [0.97954633, 0.99138793, 0.72103037, 1.        ],
       [0.75686275, 0.89665513, 0.48419839, 1.        ],
       [0.74117647, 0.8898885 , 0.47404844, 1.        ],
       [0.56732026, 0.81437908, 0.40653595, 1.        ],
       [0.62637447, 0.8402153 , 0.412995  , 1.        ],
       [0.71764706, 0.87973856, 0.45882353, 1.        ],
       [0.79607843, 0.9135717 , 0.50957324, 1.        ],
       [0.84313725, 0.93387159, 0.54002307, 1.        ],
       [0.87435602, 0.94709727, 0.57708574, 1.        ],
       [0.83529412, 0.93048827, 0.5349481 , 1.        ],
       [0.88604383, 0.95201845,

In [140]:
text2 = "spaabbbbbbbbeppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp"
text3 = "spaaaaaaabbbeppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp"
model2 = TransformerClassifier2(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

model2.load_state_dict(torch.load('1head_1layer_embed6batch512hidden1_200max400ood_total_transformer_model.pth', map_location=torch.device('cpu')))


vals2 = model2.return_pre_fc(torch.tensor([char_to_index[char] for char in text2], dtype=torch.long))
vals3 = model2.return_pre_fc(torch.tensor([char_to_index[char] for char in text3], dtype=torch.long))
vals2 = vals2.view(400, 6).sum(axis=-1)
vals3 = vals3.view(400, 6).sum(axis=-1)

format_chars(list(text2[0:20]), vals2[0:20])
format_chars(list(text3[0:20]), vals3[0:20])



['s', 'p', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p']
tensor([ 0.0162,  1.4855,  0.2543,  0.0503, -1.1846, -1.0383, -0.8250, -0.7753,
        -0.4368, -0.5359, -0.4594, -0.3800,  0.1013,  0.2502,  0.2274,  0.1753,
         0.2210,  0.0988,  0.1550,  0.2178], grad_fn=<SliceBackward0>)


['s', 'p', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p']
tensor([ 0.0271,  1.6128, -0.0767, -0.1219, -0.4615,  0.3049, -0.1548,  0.0332,
         0.0323, -0.4665, -0.4311, -0.4319,  0.0949,  0.2601,  0.2199,  0.1239,
         0.2239,  0.1597,  0.1537,  0.1924], grad_fn=<SliceBackward0>)


array([[0.97954633, 0.99138793, 0.72103037, 1.        ],
       [0.        , 0.40784314, 0.21568627, 1.        ],
       [0.99853902, 0.95470973, 0.67304883, 1.        ],
       [0.9976163 , 0.92610534, 0.62506728, 1.        ],
       [0.98039216, 0.59738562, 0.34117647, 1.        ],
       [0.74117647, 0.8898885 , 0.47404844, 1.        ],
       [0.99700115, 0.90703576, 0.59307958, 1.        ],
       [0.97370242, 0.98892734, 0.71303345, 1.        ],
       [0.97370242, 0.98892734, 0.71303345, 1.        ],
       [0.98039216, 0.59738562, 0.34117647, 1.        ],
       [0.98592849, 0.63737024, 0.35963091, 1.        ],
       [0.98592849, 0.63737024, 0.35963091, 1.        ],
       [0.92695117, 0.9692426 , 0.64905805, 1.        ],
       [0.78823529, 0.91018839, 0.50449827, 1.        ],
       [0.82745098, 0.92710496, 0.52987313, 1.        ],
       [0.90941945, 0.96186082, 0.62506728, 1.        ],
       [0.82745098, 0.92710496, 0.52987313, 1.        ],
       [0.88019992, 0.94955786,

In [153]:
text4 = "spppppaaaaaaaabbbbbbbbbbbbeppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp"
model2 = TransformerClassifier2(
    VOCAB_SIZE, EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, NUM_LAYERS
)

model2.load_state_dict(torch.load('1head_1layer_embed6batch512hidden1_200max400ood_total_transformer_model.pth', map_location=torch.device('cpu')))
vals4 = model2.return_pre_fc(torch.tensor([char_to_index[char] for char in text4], dtype=torch.long))
vals4 = vals4.view(400, 6).sum(axis=-1)

format_chars(list(text4[0:50]), vals2[0:50])

['s', 'p', 'p', 'p', 'p', 'p', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p']
tensor([ 1.6211e-02,  1.4855e+00,  2.5426e-01,  5.0346e-02, -1.1846e+00,
        -1.0383e+00, -8.2501e-01, -7.7528e-01, -4.3678e-01, -5.3587e-01,
        -4.5938e-01, -3.8005e-01,  1.0128e-01,  2.5020e-01,  2.2745e-01,
         1.7535e-01,  2.2096e-01,  9.8830e-02,  1.5501e-01,  2.1776e-01,
         1.0477e-01,  7.6503e-02, -1.5959e-02,  4.7588e-02,  2.1500e-02,
         5.9272e-02,  5.6882e-02,  1.8914e-02, -4.9604e-02,  6.7174e-03,
         1.5475e-02,  6.5280e-02,  1.0134e-02,  2.7041e-02,  6.8334e-02,
         6.3026e-03, -1.0717e-02,  4.0223e-02, -2.3945e-02, -5.3492e-02,
        -3.6565e-02, -3.2506e-02,  8.0918e-04, -2.4274e-02, -4.6167e-02,
        -4.6820e-04, -2.4270e-02, -5.9357e-02,  1.9951e-02, -4.5203e-02],
       grad_fn=<Sl

array([[0.98539023, 0.99384852, 0.7290273 , 1.        ],
       [0.        , 0.40784314, 0.21568627, 1.        ],
       [0.79607843, 0.9135717 , 0.50957324, 1.        ],
       [0.96201461, 0.98400615, 0.6970396 , 1.        ],
       [0.64705882, 0.        , 0.14901961, 1.        ],
       [0.64705882, 0.        , 0.14901961, 1.        ],
       [0.81622453, 0.16239908, 0.15240292, 1.        ],
       [0.85428681, 0.21168781, 0.16370627, 1.        ],
       [0.98592849, 0.63737024, 0.35963091, 1.        ],
       [0.96793541, 0.50742022, 0.29965398, 1.        ],
       [0.98177624, 0.60738178, 0.34579008, 1.        ],
       [0.99254133, 0.70157632, 0.39653979, 1.        ],
       [0.92695117, 0.9692426 , 0.64905805, 1.        ],
       [0.79607843, 0.9135717 , 0.50957324, 1.        ],
       [0.81960784, 0.92372165, 0.52479815, 1.        ],
       [0.86851211, 0.94463668, 0.56908881, 1.        ],
       [0.82745098, 0.92710496, 0.52987313, 1.        ],
       [0.92695117, 0.9692426 ,

In [38]:
str2 = "ps" + "babaababaabaaabbabaae" + "p"*(400 - 23)
print(model.forward(torch.tensor([char_to_index[char] for char in str2], dtype=torch.long)))
print(model.forward(torch.tensor([char_to_index[char] for char in str2], dtype=torch.long)))

tensor([[1.0000]], grad_fn=<SigmoidBackward0>)
