## CNN based learning anomaly patterns

In [13]:
%reload_ext autoreload
%autoreload 2

In [14]:
from collections import defaultdict
from multiprocessing import cpu_count

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

import scipy.stats
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["font.size"] = 7
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
plt.rcParams['axes.linewidth'] = 1.0
plt.rcParams['axes.grid'] = True

import warnings
warnings.filterwarnings('ignore')

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed = 1
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

In [334]:
import pathlib
from sklearn.preprocessing import minmax_scale

class TSDatasetAnomalyPatterns(Dataset):
    def __init__(self, path: pathlib.Path):
        super(TSDatasetAnomalyPatterns, self).__init__()
        self.df = pd.read_json(str(path), orient='records', lines=True)
        self.time_series = torch.tensor([
            [minmax_scale(li, feature_range=(0, 1)) for li in self.df.loc[:, "time_series"].values]
        ], dtype=torch.float32)  # use float64 to avoid error

        self.joined_categories = self.df.loc[:, ["anomaly_pattern", "anomaly_position"]].apply(
            lambda x: "/".join(x.dropna().astype(str).values), axis=1
        ).to_numpy()

        # string labels to int labels
        self.label_to_category = {i: v for i, v in enumerate(np.unique(self.joined_categories))}
        category_to_label = {v: k for k, v in self.label_to_category.items()}
        self.labels = torch.tensor([
            category_to_label[c] for c in self.joined_categories
        ], dtype=torch.int64)

    def __getitem__(self, idx):
        label = self.labels[idx]
        category = self.label_to_category[label.item()]
        return self.time_series[:, idx], label, category

    def __len__(self):
        return len(self.df)
    
    def number_of_class(self) -> int:
        return len(self.label_to_category.keys())

In [332]:
dataset = TSDatasetAnomalyPatterns(pathlib.Path("../samples/tsdr_anomaly_patterns/labeled_tsdr_anomaly_patterns_20221202-024759.jsonl"))

In [333]:
import torch.utils.data

train_dataset, test_dataset = torch.utils.data.random_split(dataset=dataset, lengths=[0.7, 0.3], generator=torch.Generator().manual_seed(42))
display(len(train_dataset), len(test_dataset))

836

357

In [352]:
# ref. https://github.com/pytorch/examples/blob/f82f5626b6432b8d0b08d58cc91f3bdbb355a772/mnist/main.py

class CNN1d(nn.Module):
    def __init__(self, num_classes: int):
        super(CNN1d, self).__init__()

        self.conv1 = nn.Conv1d(1, 64, kernel_size=4)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=4)
        self.conv3 = nn.Conv1d(128, 256, kernel_size=4)
        # self.dropout1 = nn.Dropout(0.25)
        # self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(21504, 128)  # the number of datapoints in a metric = 60/15 * 45 (45min * 15sec interval)
        self.fc2 = nn.Linear(128, num_classes)  # The number of class is 13 (chaos types) * 2 (anomaly position) + 2 (normal and unknown)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = self.conv3(x)
        x = F.relu(x)
        x = F.max_pool1d(x, kernel_size=5, stride=2)
        # x = self.dropout1(x)
        x = torch.flatten(x, 1)  # Is this necessary?
        x = self.fc1(x)
        x = F.relu(x)
        # x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

In [353]:
def train(log_interval, model, device, train_loader, optimizer, loss_fn, epoch):
    model.train()
    for batch_idx, (data, target, category) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def test(model, device, test_loader, loss_fn):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target, category in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += loss_fn(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [354]:
model = CNN1d(num_classes=dataset.number_of_class()).to(device)
learning_rate = 0.001
epochs = 14
batch_size = 4
test_batch_size = 4
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, shuffle=True, pin_memory=True)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.7)
for epoch in range(1, epochs + 1):
    train(100, model, device, train_dataloader, optimizer, loss_fn, epoch)
    test(model, device, test_dataloader, loss_fn)
    scheduler.step()


Test set: Average loss: 0.3388, Accuracy: 192/357 (54%)


Test set: Average loss: 0.2184, Accuracy: 239/357 (67%)


Test set: Average loss: 0.2024, Accuracy: 259/357 (73%)


Test set: Average loss: 0.1931, Accuracy: 271/357 (76%)


Test set: Average loss: 0.2123, Accuracy: 271/357 (76%)


Test set: Average loss: 0.2100, Accuracy: 284/357 (80%)


Test set: Average loss: 0.1995, Accuracy: 282/357 (79%)


Test set: Average loss: 0.2166, Accuracy: 282/357 (79%)


Test set: Average loss: 0.2150, Accuracy: 285/357 (80%)


Test set: Average loss: 0.2188, Accuracy: 280/357 (78%)


Test set: Average loss: 0.2235, Accuracy: 280/357 (78%)


Test set: Average loss: 0.2240, Accuracy: 281/357 (79%)


Test set: Average loss: 0.2459, Accuracy: 284/357 (80%)


Test set: Average loss: 0.2275, Accuracy: 284/357 (80%)

