In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pickle
import numpy as np

# 1. Prepare the data
class AccountSequenceDataset(Dataset):
    def __init__(self, sequences, max_len=16):
        self.sequences = sequences
        self.max_len = max_len

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_sequence = sequence[:-1]
        target_sequence = sequence[1:]

        # Pad input and target sequences to max_len
        input_sequence = input_sequence + [padding_token] * (self.max_len - len(input_sequence))
        target_sequence = target_sequence + [padding_token] * (self.max_len - len(target_sequence))

        return torch.tensor(input_sequence, dtype=torch.long), torch.tensor(target_sequence, dtype=torch.long)


# Read encoded_sequences from file. 0 -> 101
objects = []
with (open("encoded_sequences.pkl", "rb")) as openfile:
    while True:
        try:
            objects.append(pickle.load(openfile))
        except EOFError:
            break

encoded_sequences = objects[0]
print(f"Length of data: {len(encoded_sequences)}")
print(f"Ex: {encoded_sequences[0]}")

#sequences = [[1, 29, 34, 12, 45], ...]
max_account_number = max([max(seq) for seq in encoded_sequences]) # 101
padding_token = max_account_number + 1  # 102
dataset = AccountSequenceDataset(encoded_sequences)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 2. Define the transformer model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src).transpose(0, 1)  # Transpose to (sequence_length, batch_size, d_model)
        tgt = self.embedding(tgt).transpose(0, 1)  # Transpose to (sequence_length, batch_size, d_model)
        x = self.transformer(src, tgt)
        x = self.fc(x)
        return x.transpose(0, 1)  # Transpose back to (batch_size, sequence_length, vocab_size)




# Hyperparameters
vocab_size = padding_token + 1          # 103 (0->101 + 102)
d_model = 512
nhead = 8
num_layers = 6
model = TransformerModel(vocab_size, d_model, nhead, num_layers)

# 3. Train the model
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "mps"
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 10
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        # Remove the last element from the target sequence before passing it to the model
        outputs = model(inputs, targets[:, :-1])

        # Apply masking to ignore padding tokens
        mask = (targets[:, 1:] != padding_token).view(-1)  # Create a mask for non-zero elements
        loss = criterion(outputs.reshape(-1, vocab_size)[mask], targets[:, 1:].reshape(-1)[mask])

        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')


Length of data: 6873
Ex: [18, 18, 18, 18, 1, 6]


  loss = criterion(outputs.reshape(-1, vocab_size)[mask], targets[:, 1:].reshape(-1)[mask])


Epoch 1/10, Loss: 0.3572855591773987
Epoch 2/10, Loss: 0.5234084725379944
Epoch 3/10, Loss: 0.02439974993467331
Epoch 4/10, Loss: 0.11817251145839691
Epoch 5/10, Loss: 0.012505429796874523
Epoch 6/10, Loss: 0.007246554363518953
Epoch 7/10, Loss: 0.0849609524011612
Epoch 8/10, Loss: 0.05347998067736626
Epoch 9/10, Loss: 0.2576196789741516
Epoch 10/10, Loss: 0.026256121695041656


In [2]:
# save the model
torch.save(model.state_dict(), 'models/2023-04-13_2.pt')

In [3]:
# Predict next element in a given sequence
def predict_next_element(model, input_sequence, device):
    model.eval()
    with torch.no_grad():
        input_sequence = torch.tensor(input_sequence, dtype=torch.long).unsqueeze(0).to(device)
        input_length = input_sequence.size(1)

        # Initialize target sequence with zeros
        tgt = torch.zeros(1, input_length, dtype=torch.long).to(device)

        # Autoregressive generation
        for i in range(input_length):
            output = model(input_sequence, tgt)
            output = output.argmax(dim=2)  # Get the index of the predicted element
            if i + 1 < input_length:
                tgt[:, i + 1] = output[:, i]

        predicted_next_element = output[0, -1].item()
    return predicted_next_element


In [21]:
# Provide an input sequence and predict next element
input_sequence = []
predicted_next_element = predict_next_element(model, input_sequence, device)
print(f"Next element in sequence {input_sequence} is {predicted_next_element}")


Next element in sequence [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0] is 75


In [15]:
# Function to find anomalies

def sequence_log_prob(model, input_sequence, device):
    model.eval()
    with torch.no_grad():
        input_sequence = torch.tensor(input_sequence, dtype=torch.long).unsqueeze(0).to(device)
        input_length = input_sequence.size(1)

        # Initialize target sequence with zeros
        tgt = torch.zeros(1, input_length, dtype=torch.long).to(device)

        log_probs = []
        for i in range(input_length - 1):
            output = model(input_sequence, tgt)
            prob = torch.softmax(output, dim=2)  # Convert logits to probabilities
            log_prob = torch.log(prob[0, i, input_sequence[0, i + 1]])
            log_probs.append(log_prob.item())

            tgt[:, i + 1] = input_sequence[:, i + 1]

    return np.mean(log_probs)


In [22]:
# Get probabilities and then classify outliers
sequences_log_probs = [sequence_log_prob(model, seq, device) for seq in encoded_sequences]

# Calculate the threshold (e.g., mean minus two standard deviations)
threshold = np.mean(sequences_log_probs) - 2 * np.std(sequences_log_probs)

outlier_sequences = [seq for seq, log_prob in zip(encoded_sequences, sequences_log_probs) if log_prob < threshold]

print("Outlier sequences:")
for outlier_seq in outlier_sequences:
    print(outlier_seq)
