In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

import pandas as pd
from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# set pandas options

pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Read data
df=pd.read_csv(Path('data/book.txt'), 
    sep='|', 
    header=0,
    encoding='utf-8', 
    low_memory=False, 
    decimal=',', 
    parse_dates=['voucher_date', 'last_update'])

df = df[df['voucher_type'] == 'VB']
df=df.sort_values(by=['voucher_no','sequence_no'])

# set account dtype to int
df['account'] = df['account'].astype(int)

# collapse by voucher_no and list account_no
df = df.groupby('voucher_no').agg(
    {'account':list}
)

# Replace with your actual data
sequences = df.account.tolist()

# Flatten the sequences and create a set of unique account numbers
flat_sequences = [account for seq in sequences for account in seq]
unique_accounts = set(flat_sequences)

# Encode the account numbers as integers
encoder = LabelEncoder()
encoder.fit(list(unique_accounts))

# save encoder
import pickle
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [60]:
len(np.intersect1d(np.array(list(unique_accounts)), encoder.classes_))

102

In [35]:
df2=pd.read_csv(Path('data/book.txt'), 
    sep='|', 
    header=0,
    encoding='utf-8', 
    low_memory=False, 
    decimal=',', 
    parse_dates=['voucher_date', 'last_update'])

df2.account.nunique()

367

In [34]:
df3 = df2.loc[df2.voucher_type == "VB"]
df3.account.nunique()

102

In [39]:
df3.groupby("voucher_no").account.list()

AttributeError: 'SeriesGroupBy' object has no attribute 'list'

In [195]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

import pandas as pd
from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# set pandas options

pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Read data
df=pd.read_csv(Path('data/book.txt'), 
    sep='|', 
    header=0,
    encoding='utf-8', 
    low_memory=False, 
    decimal=',', 
    parse_dates=['voucher_date', 'last_update'])

df = df[df['voucher_type'] == 'VB']
df=df.sort_values(by=['voucher_no','sequence_no'])

# set account dtype to int
df['account'] = df['account'].astype(int)

# collapse by voucher_no and list account_no
df = df.groupby('voucher_no').agg(
    {'account':list}
)

# Replace with your actual data
sequences = df.account.tolist()

# Flatten the sequences and create a set of unique account numbers
flat_sequences = [account for seq in sequences for account in seq]
unique_accounts = set(flat_sequences)

# Encode the account numbers as integers
encoder = LabelEncoder()
encoder.fit(list(unique_accounts))

# save encoder
import pickle
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)


# Encode the sequences and pad them with -1 to a fixed length: maximum sequence length
max_seq_length = max([len(seq) for seq in sequences])
encoded_sequences = [encoder.transform(seq).tolist() for seq in sequences]
padded_sequences = [seq + [0] * (max_seq_length - len(seq)) for seq in encoded_sequences]
#padded_sequences = [seq + [0] * (max_seq_length - len(seq)) for seq in sequences]

with open('encoded_sequences.pkl', 'wb') as f:
    pickle.dump(encoded_sequences, f)

# Create input-output pairs for training
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Create input-output pairs for training
data = []
for seq in padded_sequences:
    for i in range(1, len(seq)):
        data.append((seq[:i], seq[i]))

with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)

class AccountSequenceDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_1d = [torch.tensor([t], dtype=torch.long) for t in targets]
    targets_padded = pad_sequence(targets_1d, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded


dataset = AccountSequenceDataset(data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def generate_mask(self, size):
        mask = torch.tril(torch.ones(size, size) == 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, x):
        embed = self.embedding(x)
        tgt_mask = self.generate_mask(x.size(1)).to(device)

        src_key_padding_mask = (x == 0)
        tgt_key_padding_mask = (x == 0)
        output = self.transformer(embed, embed, tgt_mask=tgt_mask, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask)
        logits = self.fc(output)
        return logits


device = torch.device("cpu")

# Model and training parameters
vocab_size = len(unique_accounts) + 1
d_model = 128
nhead = 4
num_layers = 2
lr = 0.001
epochs = 10

model = TransformerModel(vocab_size, d_model, nhead, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)



In [215]:
max_val = 0
min_val = 101
for seq in encoded_sequences:
    max_val = max(max_val, max(seq))
    min_val = min(min_val, min(seq))

print(max_val)
print(min_val)

101
0


In [212]:
df_unique = pd.DataFrame(np.array(list(unique_accounts)), columns=["acc"])
df_unique.acc.nunique()

102

In [214]:
len(encoder.classes_)

102

In [186]:
dataloader.dataset[0]

(tensor([18]), tensor(18))

In [189]:
for batch in dataloader:
    print(batch.shape)
    break

RuntimeError: stack expects each tensor to be equal size, but got [6] at entry 0 and [12] at entry 1

In [131]:
# Training loop
for epoch in range(epochs):

    model.train()
    for batch_idx, (x, y) in enumerate(dataloader):
        print('Batch index: ', batch_idx)
        print('Batch x size: ', x[0].size())
        print('Batch x label: ', x[1])
        print('Batch y size: ', y[0].size())
        print('Batch y label: ', y[1])

        x, y = x.to(device), y.to(device)
        print(f"Shape x: {x.shape}")
        print(f"Shape y: {y.shape}")
        optimizer.zero_grad()
        logits = model(x)
        logits_reshaped = logits.view(-1, logits.size(-1))
        y_reshaped = y.view(-1)
        valid_indices = y_reshaped != 0
        logits_masked = torch.masked_select(logits_reshaped, valid_indices.unsqueeze(1)).view(-1, logits.size(-1))
        y_masked = torch.masked_select(y_reshaped, valid_indices)
        loss = criterion(logits_masked, y_masked)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

# save the model
torch.save(model.state_dict(), 'models/LSTM_model.pt')

# apply outlier detection
def calculate_reconstruction_loss(model, dataloader, device):
    losses = []
    model.eval()
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            losses.append(loss.item())
    return losses

reconstruction_losses = calculate_reconstruction_loss(model, dataloader, device)

import numpy as np

mean_loss = np.mean(reconstruction_losses)
std_loss = np.std(reconstruction_losses)
threshold = mean_loss + 2 * std_loss

outliers = [seq for seq, loss in zip(sequences, reconstruction_losses) if loss > threshold]

print(f"Number of outliers: {len(outliers)}")

# save outliers
with open('Result/outliers.pkl', 'wb') as f:
    pickle.dump(outliers, f)


Batch index:  0
Batch x size:  torch.Size([15])
Batch x label:  tensor([64,  1,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
Batch y size:  torch.Size([1])
Batch y label:  tensor([0])
Shape x: torch.Size([32, 15])
Shape y: torch.Size([32, 1])


AssertionError: expecting key_padding_mask shape of (15, 32), but got torch.Size([32, 15])

In [181]:
dataloader.dataset[45][1]

tensor(97)

In [130]:
x.shape

torch.Size([15, 32])