**1. Read dataset**

In [1]:
from nltk.corpus import reuters
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
import re
from nltk.corpus import stopwords
from collections import Counter

import gensim
import warnings
from sklearn import manifold
from sklearn import metrics

from gensim.models import Word2Vec
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
import string

import faiss

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader

import sys 
sys.path.append("..") 
sys.path.append("../..") 

import pcl.loader
import pcl.builder2

from copy import deepcopy
from sklearn.model_selection import train_test_split

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
seed = 1

def setup_seed(seed=seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
f = open('output.txt', 'a')
f.write('seed: ' + str(seed) + '\n')
f.close()

In [5]:
normal_documents = reuters.fileids(categories=['earn', 'acq'])
abnormal_document = reuters.fileids(categories=['interest', 'wheat', 'dlr', 'gnp', 'crude'])

train_normal_docs = list(filter(lambda doc: doc.startswith("train"), normal_documents));
train_abnormal_docs = list(filter(lambda doc: doc.startswith("train"), abnormal_document));

test_normal_docs = list(filter(lambda doc: doc.startswith("test"), normal_documents));
test_abnormal_docs = list(filter(lambda doc: doc.startswith("test"), abnormal_document));

print('Number of train documents:', len(train_normal_docs), len(train_abnormal_docs))
print('Number of test documents:', len(test_normal_docs), len(test_abnormal_docs))

Number of train documents: 4511 1160
Number of test documents: 1803 465


In [6]:
cachedStopWords = stopwords.words("english")
 
def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text));
    words = [word for word in words if word not in cachedStopWords]
    tokens =(list(map(lambda token: SnowballStemmer("english", ignore_stopwords=True).stem(token), words)));
    p = re.compile('[a-zA-Z]+');
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token)>=min_length, tokens));
    return filtered_tokens

In [7]:
setup_seed()

def sequence_list(docs):
    corups = []
    doc_index = []
    for doc in docs:
        document_words = reuters.words(doc)
        corups.append(tokenize(' '.join(document_words)))
        doc_index.append(doc)
    return corups, doc_index

train_normal_corups, train_normal_doc_index = sequence_list(train_normal_docs)
train_abnormal_corups, train_abnormal_doc_index = sequence_list(train_abnormal_docs)

test_normal_corups, test_normal_doc_index = sequence_list(test_normal_docs)
test_abnormal_corups, test_abnormal_doc_index = sequence_list(test_abnormal_docs)

train_corups = train_normal_corups + train_abnormal_corups
test_corups = test_normal_corups + test_abnormal_corups

train_corups_index = train_normal_doc_index + train_abnormal_doc_index
test_corups_index = test_normal_doc_index + test_abnormal_doc_index

train_labels = [0]*len(train_normal_corups) + [1]*len(train_abnormal_corups)
test_labels = [0]*len(test_normal_corups) + [1]*len(test_abnormal_corups)

train_dataset = pd.DataFrame({'EventId': train_corups, 'Label':train_labels, 'Doc': train_corups_index})
test_dataset = pd.DataFrame({'EventId': test_corups, 'Label':test_labels, 'Doc': test_corups_index})

train_normal_ds = train_dataset[train_dataset['Label']==0].sample(n=1100, replace=False)
train_abnormal_ds = train_dataset[train_dataset['Label']==1].sample(n=220, replace=False)

val_normal_ds = train_normal_ds.sample(n=100, replace=False)
val_abnormal_ds = train_abnormal_ds.sample(n=20, replace=False)
val_ds = pd.concat([val_normal_ds, val_abnormal_ds])

train_normal_ds = train_normal_ds.drop(val_normal_ds.index)
train_abnormal_ds = train_abnormal_ds.drop(val_abnormal_ds.index)
train_ds = pd.concat([train_normal_ds, train_abnormal_ds])

test_normal_ds = test_dataset[test_dataset['Label']==0].sample(n=1000, replace=False)
test_abnormal_ds = test_dataset[test_dataset['Label']==1].sample(n=200, replace=False)
test_ds =pd.concat([test_normal_ds, test_abnormal_ds])

**2. Preprocessing**

In [8]:
word2vec = Word2Vec(sentences=train_corups, vector_size=100, window=5, min_count=1, workers=4)

word2vec.wv.vectors = np.append(word2vec.wv.vectors, np.zeros((1, 100), dtype=np.float32), axis=0)
word2vec.wv.vectors = np.append(word2vec.wv.vectors, np.random.randn(1, 100).astype(np.float32), axis=0)

In [9]:
def encode_sequence(sequence, logkey2index):
    return [logkey2index.get(logkey, len(word2vec.wv.key_to_index)+1) for logkey in sequence]

train_ds.loc[:, 'Encoded'] = train_ds.loc[:, 'EventId'].apply(lambda x: encode_sequence(x, word2vec.wv.key_to_index))
val_ds.loc[:, 'Encoded'] = val_ds.loc[:, 'EventId'].apply(lambda x: encode_sequence(x, word2vec.wv.key_to_index))
test_ds.loc[:, 'Encoded'] = test_ds.loc[:, 'EventId'].apply(lambda x: encode_sequence(x, word2vec.wv.key_to_index))

In [10]:
setup_seed()

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    def compose_pairs(dataset):
        number_seq = dataset.shape[0]
        
        data_aug_list = []
        
        for i in range(number_seq):
            sequence = dataset.iloc[i, 3]
            a, b = np.random.choice(len(sequence), 2)
            
            seq_aug1 = deepcopy(sequence)
            seq_aug1.pop(a)
            seq_aug2 = deepcopy(sequence)
            seq_aug2.pop(b)
            data_aug_list.append([seq_aug1, seq_aug2])

        dataset['Pair0'] = pd.Series([x[0] for x in data_aug_list]).values
        dataset['Pair1'] = pd.Series([x[1] for x in data_aug_list]).values
        
        return dataset

train_ds2 = compose_pairs(train_ds)
val_ds2 = compose_pairs(val_ds)
test_ds2 = compose_pairs(test_ds)

train_ds2['Index'] = range(train_ds2.shape[0])
val_ds2['Index'] = range(val_ds2.shape[0])
test_ds2['Index'] = range(test_ds2.shape[0])

In [11]:
batch_size_train = 20
batch_size_val = 20
batch_size_test = 20

In [12]:
def collate_fn(data_list):
    data_list.sort(key=lambda x: len(x[0]), reverse=True)
    sequence = [torch.tensor(x[0]) for x in data_list]
    label = [x[1] for x in data_list]
    sequence_length = [len(x[0]) for x in data_list]
    index = [x[2] for x in data_list]
    
    seq_aug1 = [torch.tensor(x[3]) for x in data_list]
    seq_aug2 = [torch.tensor(x[4]) for x in data_list]
    
    doc = [x[5] for x in data_list]

    padded_sequence = pad_sequence(sequence, batch_first=True, padding_value=len(word2vec.wv.key_to_index))
    padded_seq_aug1 = pad_sequence(seq_aug1, batch_first=True, padding_value=len(word2vec.wv.key_to_index))
    padded_seq_aug2 = pad_sequence(seq_aug2, batch_first=True, padding_value=len(word2vec.wv.key_to_index))
    return index, padded_sequence, label, sequence_length, padded_seq_aug1, padded_seq_aug2, doc

In [13]:
train_data_list = train_ds[['Encoded', 'Label', 'Index', 'Pair0', 'Pair1', 'Doc']].values.tolist()
val_data_list = val_ds[['Encoded', 'Label', 'Index', 'Pair0', 'Pair1', 'Doc']].values.tolist()
test_data_list = test_ds[['Encoded', 'Label', 'Index', 'Pair0', 'Pair1', 'Doc']].values.tolist()

train_loader = DataLoader(train_data_list, batch_size=batch_size_train, collate_fn=collate_fn, drop_last=True)
val_loader = DataLoader(val_data_list, batch_size=batch_size_train, collate_fn=collate_fn, drop_last=True)
test_loader = DataLoader(test_data_list, batch_size=batch_size_test, collate_fn=collate_fn, drop_last=True)

**3. Model**

In [14]:
vocab_size = len(word2vec.wv.key_to_index)
embedding_dim = 100
hidden_dim = 256
num_layers = 2
epochs = 100
num_cluster_n = ['2']
num_cluster_a = ['7']

f = open('output.txt', 'a')
f.write('Abnormal clusters: ' + num_cluster_a[0] + '\n')
f.close()

num_cluster = [str(int(num_cluster_n[i]) + int(num_cluster_a[i])) for i in range(len(num_cluster_n))]

In [15]:
class SequenceEncoder(nn.Module):
    def __init__(self, vocab_size=vocab_size, embedding_dim=embedding_dim, 
                 hidden_dim=hidden_dim, num_layers=num_layers, num_classes=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding.from_pretrained(torch.from_numpy(word2vec.wv.vectors))
        self.embeddings.requires_grad = True
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True,
                            bias=True)

    def forward(self, seq):
        seq_length = [len(x)-x.count(len(word2vec.wv.key_to_index)) for x in seq.tolist()]
        embedded = self.embeddings(seq)
        packed_embedded = pack_padded_sequence(embedded, seq_length, batch_first=True,enforce_sorted=False)
        packed_out, (hidden, cell) = self.lstm(packed_embedded)
        out, len_list = pad_packed_sequence(packed_out, batch_first=True) 
        mean_hidden = torch.sum(out, dim=1) / len_list.view(-1, 1).to(device)
        return nn.functional.normalize(mean_hidden, dim=1)

In [16]:
setup_seed()

model = pcl.builder2.MoCo(SequenceEncoder,hidden_dim, 200, 0.999, 0.05).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), 0.001, weight_decay=1e-6)

**4. Training**

In [17]:
def compute_features(train_ds, model):
    model.eval()
    seq_label = [len(x)-x.count(len(word2vec.wv.key_to_index)) for x in train_ds.Encoded.tolist()]
    encoded_sequences = [torch.tensor(x) for x in train_ds.Encoded.tolist()]
    padded_sequence = pad_sequence(encoded_sequences, batch_first=True, 
                                   padding_value=len(word2vec.wv.key_to_index)).to(device)
    features = model(padded_sequence, is_eval=True, sequence_label=seq_label)   
    return features.cpu().numpy()

In [18]:
def run_kmeans(x, num_cluster, temperature):
    """
    Args:
        x: data to be clustered
    """

    results = {'im2cluster': [], 'centroids': [], 'density': []}

    # intialize faiss clustering parameters
    d = x.shape[1]
    k = int(num_cluster[0])
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 10
    clus.nredo = 5
    clus.seed = seed
    clus.max_points_per_centroid = 1000
    clus.min_points_per_centroid = 1

    res = faiss.StandardGpuResources()
    cfg = faiss.GpuIndexFlatConfig()
    cfg.useFloat16 = False
    cfg.device = 0
    index = faiss.GpuIndexFlatL2(res, d, cfg)

    clus.train(x, index)

    D, I = index.search(x, 3)  # for each sample, find cluster distance and assignments
    im2cluster = [int(n[0]) for n in I]
    # get cluster centroids
    centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)

    # sample-to-centroid distances for each cluster 
    Dcluster = [[] for c in range(k)]
    for im, i in enumerate(im2cluster):
        Dcluster[i].append(D[im][0])

    # concentration estimation (phi)        
    density = np.zeros(k)
    for i, dist in enumerate(Dcluster):
        if len(dist) > 1:
            d = (np.asarray(dist) ** 0.5).mean() / np.log(len(dist) + 10)
            density[i] = d

            # if cluster only has one point, use the max to estimate its concentration
    dmax = density.max()
    for i, dist in enumerate(Dcluster):
        if len(dist) <= 1:
            density[i] = dmax

    density = density.clip(np.percentile(density, 10),
                           np.percentile(density, 90))  # clamp extreme values for stability
    density = temperature * density / density.mean()  # scale the mean to temperature

    # convert to cuda Tensors for broadcast
    centroids = torch.Tensor(centroids).cuda()
    centroids = nn.functional.normalize(centroids, p=2, dim=1)

    im2cluster = torch.LongTensor(im2cluster).cuda()
    density = torch.Tensor(density).cuda()

    results['centroids'] = centroids
    results['density'] = density
    results['im2cluster'] = im2cluster

    return results

In [19]:
def train_model(train_loader, model, criterion, optimizer, epoch, cluster_result=None):
    model.train()
    
    acc_epoch = []
    loss_epoch = []
    
    for index, sequence, label, _, sequence0, sequence1,_ in train_loader:
        pair0 = sequence0.to(device)
        pair1 = sequence1.to(device)
        label = torch.tensor(label).to(device)

        output, target, output_proto, target_proto = model(im_q=pair0, im_k=pair1, cluster_result=cluster_result, 
                                                           index = index, sequence_label=label)
        info_loss = criterion(output, target)
        
        # ProtoNCE loss
        if output_proto is not None:
            loss_proto = 0
            for proto_out,proto_target in zip(output_proto, target_proto):
                loss_proto += criterion(proto_out, proto_target)

            # average loss across all sets of prototypes
            loss_proto /= len(num_cluster)

        loss = loss_proto + info_loss 
#         print(loss_proto.item(), info_loss.item())

        # losses.update(loss.item(), images[0].size(0))
        loss_epoch.append(loss.item())

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    return np.mean(loss_epoch)

In [20]:
warnings.filterwarnings('ignore')

setup_seed()

for i in range(epochs):    
    train_normal = train_ds2[train_ds['Label']==0]
    train_abnormal = train_ds2[train_ds['Label']==1]

    features_n = compute_features(train_normal, model)
    cluster_result_n = run_kmeans(features_n, num_cluster_n, 0.05)

    features_a = compute_features(train_abnormal, model)
    cluster_result_a = run_kmeans(features_a, num_cluster_a, 0.05)

    cluster_result_a['im2cluster'] = cluster_result_a['im2cluster'] + int(num_cluster_n[0])

    keys = ['im2cluster', 'centroids', 'density']
    cluster_result = {'im2cluster': [[], [], []], 'centroids': [[], [], []], 'density': [[], [], []]}

    for key in keys:
        cluster_result[key] = torch.cat((cluster_result_n[key], cluster_result_a[key]), 0)
    
    epoch_loss = train_model(train_loader, model, criterion, optimizer, i, cluster_result)
    
    print(f'Epoch {i}: {epoch_loss}')

Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 0 (0.00 s, search 0.00 s): objective=239.931 imbalance=1.210 nsplit=0         Iteration 1 (0.00 s, search 0.00 s): objective=73.0285 imbalance=1.006 nsplit=0         Iteration 2 (0.00 s, search 0.00 s): objective=60.8291 imbalance=1.000 nsplit=0         Iteration 3 (0.00 s, search 0.00 s): objective=59.7762 imbalance=1.002 nsplit=0         Iteration 4 (0.00 s, search 0.00 s): objective=59.5715 imbalance=1.004 nsplit=0         Iteration 5 (0.00 s, search 0.00 s): objective=59.5592 imbalance=1.004 nsplit=0         Iteration 6 (0.00 s, search 0.00 s): objective=59.5585 imbalance=1.004 nsplit=0         Iteration 7 (0.00 s, search 0.00 s): objective=59.5577 imbalance=1.004 nsplit=0         Iteration 8 (0.00 s, search 0.00 s): objective=59.5577 imbalance=1.004 nsplit=0         Iteration 9 (0.01 s, search 0.00 s): objective=59.5577 imbalance=1.004 nspl

Epoch 0: 5.422589755058288
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=74.6269 imbalance=1.008 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=74.6269 imbalance=1.008 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=74.6269 imbalance=1.008 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=74.6269 imbalance=1.008 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=74.6269 imbalance=1.008 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=7.30522 imbalance=1.373 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteratio

Epoch 6: 3.713954782485962
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=257.196 imbalance=1.009 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=257.196 imbalance=1.009 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=257.196 imbalance=1.009 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=257.196 imbalance=1.009 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=257.196 imbalance=1.009 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=27.9108 imbalance=1.234 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteratio

Epoch 12: 2.8719386875629427
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=403.064 imbalance=1.009 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=403.064 imbalance=1.009 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=403.064 imbalance=1.009 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=403.064 imbalance=1.009 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=403.064 imbalance=1.009 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=46.1995 imbalance=1.182 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iterat

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=60.0726 imbalance=1.226 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=57.9855 imbalance=1.059 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=60.1171 imbalance=1.268 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=58.3947 imbalance=1.078 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=56.6851 imbalance=1.070 nsplit=0       
Objective improved: keep new clusters
Epoch 18: 2.4467937469482424
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=518.623 imbalance=1.009 nsplit=0

Epoch 23: 2.29729184905688
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=596.774 imbalance=1.009 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=596.774 imbalance=1.009 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.02 s, search 0.01 s): objective=596.774 imbalance=1.009 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=596.774 imbalance=1.009 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=596.774 imbalance=1.009 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=75.6057 imbalance=1.212 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteratio

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=90.3382 imbalance=1.281 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=89.8415 imbalance=1.069 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=89.6328 imbalance=1.120 nsplit=0       
Objective improved: keep new clusters
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=88.8468 imbalance=1.103 nsplit=0       
Objective improved: keep new clusters
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=86.3799 imbalance=1.083 nsplit=0       
Objective improved: keep new clusters
Epoch 29: 2.2100987911224363
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Ite

Epoch 34: 2.1900687634944918
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=708.64 imbalance=1.009 nsplit=0        
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=708.64 imbalance=1.009 nsplit=0        
Outer iteration 2 / 5
  Iteration 9 (0.02 s, search 0.01 s): objective=708.64 imbalance=1.009 nsplit=0        
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=708.64 imbalance=1.009 nsplit=0        
Outer iteration 4 / 5
  Iteration 9 (0.03 s, search 0.02 s): objective=708.64 imbalance=1.009 nsplit=0        
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=106.263 imbalance=1.316 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iterat

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=114.805 imbalance=1.355 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=116.263 imbalance=1.061 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=110.232 imbalance=1.047 nsplit=0       
Objective improved: keep new clusters
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=113.889 imbalance=1.100 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=108.147 imbalance=1.078 nsplit=0       
Objective improved: keep new clusters
Epoch 40: 2.2019068618615467
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=736.783 imbalance=1.009 nsplit=0

Epoch 45: 2.160936411221822
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=753.662 imbalance=1.009 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=753.662 imbalance=1.009 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.02 s, search 0.01 s): objective=753.662 imbalance=1.009 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=753.662 imbalance=1.009 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=753.662 imbalance=1.009 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=122.802 imbalance=1.323 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iterati

Epoch 51: 2.1554723302523295
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=766.13 imbalance=1.009 nsplit=0        
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=766.13 imbalance=1.009 nsplit=0        
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=766.13 imbalance=1.009 nsplit=0        
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=766.13 imbalance=1.009 nsplit=0        
Outer iteration 4 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=766.13 imbalance=1.009 nsplit=0        
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=130.967 imbalance=1.413 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iterat

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=134.397 imbalance=1.402 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=134.156 imbalance=1.089 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=128.357 imbalance=1.073 nsplit=0       
Objective improved: keep new clusters
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=131.705 imbalance=1.274 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=124.908 imbalance=1.078 nsplit=0       
Objective improved: keep new clusters
Epoch 57: 2.1537693361441295
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): obje

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=128.261 imbalance=1.080 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=139.228 imbalance=1.100 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=138.338 imbalance=1.095 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=135.6 imbalance=1.292 nsplit=0         
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=128.244 imbalance=1.078 nsplit=0       
Objective improved: keep new clusters
Epoch 63: 2.1528708696365357
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=776.919 imbalance=1.009 nsplit=0       
Objective improved: keep new c

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=136.715 imbalance=1.101 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=141.775 imbalance=1.067 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=142.615 imbalance=1.110 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=138.384 imbalance=1.252 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=130.558 imbalance=1.078 nsplit=0       
Objective improved: keep new clusters
Epoch 69: 2.1593217591444653
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=778.703 imbalance=1.009 nsplit=0       
Objective improved: keep new c

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=138.669 imbalance=1.150 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=131.066 imbalance=1.092 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=136.033 imbalance=1.098 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=135.635 imbalance=1.089 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=129.612 imbalance=1.078 nsplit=0       
Objective improved: keep new clusters
Epoch 75: 2.167874652147293
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=775.189 imbalance=1.009 nsplit=0 

Epoch 80: 2.159335283438365
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=776.248 imbalance=1.009 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=776.248 imbalance=1.009 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.02 s, search 0.01 s): objective=776.248 imbalance=1.009 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=776.248 imbalance=1.009 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.03 s, search 0.02 s): objective=776.248 imbalance=1.009 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=139.066 imbalance=1.268 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iterati

Epoch 86: 2.1540133714675904
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=776.971 imbalance=1.009 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=776.971 imbalance=1.009 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.02 s, search 0.01 s): objective=776.971 imbalance=1.009 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=776.971 imbalance=1.009 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.03 s, search 0.02 s): objective=776.971 imbalance=1.009 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=137.433 imbalance=1.259 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iterat

Epoch 92: 2.1531743745009106
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=776.557 imbalance=1.009 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=776.557 imbalance=1.009 nsplit=0       
Outer iteration 2 / 5
  Iteration 9 (0.02 s, search 0.01 s): objective=776.557 imbalance=1.009 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.02 s, search 0.02 s): objective=776.557 imbalance=1.009 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.03 s, search 0.02 s): objective=776.557 imbalance=1.009 nsplit=0       
Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=140.131 imbalance=1.197 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iterat

Clustering 200 points in 256D to 7 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=139.553 imbalance=1.299 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 9 (0.00 s, search 0.00 s): objective=135.714 imbalance=1.080 nsplit=0       
Objective improved: keep new clusters
Outer iteration 2 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=137.609 imbalance=1.156 nsplit=0       
Outer iteration 3 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=141.347 imbalance=1.294 nsplit=0       
Outer iteration 4 / 5
  Iteration 9 (0.01 s, search 0.01 s): objective=131.881 imbalance=1.080 nsplit=0       
Objective improved: keep new clusters
Epoch 98: 2.1526958882808684
Clustering 1000 points in 256D to 2 clusters, redo 5 times, 10 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 9 (0.01 s, search 0.00 s): objective=775.869 imbalance=1.009 nsplit=0

In [21]:
# model.load_state_dict(best_val_model)
model.eval()

faiss_search = faiss.IndexFlatL2(cluster_result['centroids'].size(1))
faiss_search.add(cluster_result['centroids'].detach().cpu().numpy())

true_label = []
pred_label = []
doc_index_list = []
doc_type_pred_label = []

for index, sequence, sequence_label, length, _, _, doc in test_loader:
    doc_index_list += doc
    sequence = sequence.to(device)
    
    true_label += sequence_label
    sq = model.encoder_k(sequence).detach().cpu().numpy()
    D, I = faiss_search.search(sq, 1)

    doc_type_pred_label += [i[0] for i in I]
    
    pred_label += [1 if pred>int(num_cluster_n[0])-1 else 0 for pred in I]

In [22]:
print(metrics.classification_report(true_label, pred_label, digits=4))
print(metrics.confusion_matrix(true_label, pred_label))

fpr, tpr, thresholds = metrics.roc_curve(true_label, pred_label, pos_label=1)
print(metrics.auc(fpr, tpr))

f = open('output.txt', 'a')
f.write('Anomaly detection on testing sequences:'+'\n')
f.write(str(metrics.classification_report(true_label, pred_label, digits=4))+'\n')
f.write(str(metrics.confusion_matrix(true_label, pred_label))+'\n')
f.write(str(metrics.auc(fpr, tpr))+'\n')
f.close()

              precision    recall  f1-score   support

           0     0.9879    0.9810    0.9844      1000
           1     0.9082    0.9400    0.9238       200

    accuracy                         0.9742      1200
   macro avg     0.9481    0.9605    0.9541      1200
weighted avg     0.9746    0.9742    0.9743      1200

[[981  19]
 [ 12 188]]
0.9604999999999999


In [23]:
test_earn_docs = list(filter(lambda doc: doc.startswith("test"), reuters.fileids(categories=['earn'])));
test_acq_docs = list(filter(lambda doc: doc.startswith("test"), reuters.fileids(categories=['acq'])));


test_interest_docs = list(filter(lambda doc: doc.startswith("test"), reuters.fileids(categories=['interest'])));
test_wheat_docs = list(filter(lambda doc: doc.startswith("test"), reuters.fileids(categories=['wheat'])));
test_dlr_docs = list(filter(lambda doc: doc.startswith("test"), reuters.fileids(categories=['dlr'])));
test_gnp_docs = list(filter(lambda doc: doc.startswith("test"), reuters.fileids(categories=['gnp'])));
test_crude_docs = list(filter(lambda doc: doc.startswith("test"), reuters.fileids(categories=['crude'])));

In [24]:
doc_type_true_label = []

for each in doc_index_list:
    if each in test_earn_docs:
        doc_type_true_label.append(0)
    
    elif each in test_acq_docs:
        doc_type_true_label.append(1)
    
    elif each in test_interest_docs:
        doc_type_true_label.append(2)
        
    elif each in test_wheat_docs:
        doc_type_true_label.append(3)
        
    elif each in test_dlr_docs:
        doc_type_true_label.append(4)

    elif each in test_gnp_docs:
        doc_type_true_label.append(5)

    elif each in test_crude_docs:
        doc_type_true_label.append(6)


In [25]:
print(metrics.rand_score(doc_type_true_label, doc_type_pred_label))
print(metrics.mutual_info_score(doc_type_true_label, doc_type_pred_label))

f = open('output.txt', 'a')
f.write('Clustering results:'+'\n')
f.write('rand_score: ' + str(metrics.rand_score(doc_type_true_label, doc_type_pred_label))+'\n')
f.write('mutual_info_score: ' + str(metrics.mutual_info_score(doc_type_true_label, doc_type_pred_label))+'\n')
f.write('-'*50 + '\n')
f.close()

0.871814011676397
0.8548086026489393
