# Load Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, Markdown
plt.style.use('ggplot')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.chdir('/content/drive/MyDrive/dacon_anomaly')

In [4]:
def load_data():
    train = pd.read_csv('./data/train.csv')
    valid = pd.read_csv('./data/val.csv')
    test = pd.read_csv('./data/test.csv')

    return train, valid, test

train, valid, test = load_data()

In [5]:
test

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,AAAA0x1,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.783274,-0.994983
1,AAAA0x2,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,-0.269825,-0.994983
2,AAAA0x5,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0.670579,-0.994960
3,AAAA0x7,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.464960,...,-0.167716,-0.270710,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,-0.237686,-0.994937
4,AAAA0xc,0.384978,0.616109,-0.874300,-0.094019,2.924584,3.317027,0.470455,0.538247,-0.558895,...,0.049924,0.238422,0.009130,0.996710,-0.767315,-0.492208,0.042472,-0.054337,-0.167819,-0.994866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142498,0x4587f,0.219529,0.881246,-0.635891,0.960928,-0.152971,-1.014307,0.427126,0.121340,-0.285670,...,0.099936,0.337120,0.251791,0.057688,-1.508368,0.144023,0.181205,0.215243,0.028645,1.034904
142499,0x45880,-1.775135,-0.004235,1.189786,0.331096,1.196063,5.519980,-1.518185,2.080825,1.159498,...,0.103302,0.654850,-0.348929,0.745323,0.704545,-0.127579,0.454379,0.130308,0.810312,1.034916
142500,0x45884,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,0.038986,1.034963
142501,0x45885,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,0.641096,1.034975


In [6]:
def preprocess(train, valid, test):
    train.drop(['ID'], inplace = True, axis = 1)
    valid.drop(['ID'], inplace = True, axis = 1)
    test.drop(['ID'], inplace = True, axis = 1)

    X_train = train.values
    X_valid = valid.drop(['Class'], axis = 1).values
    y_valid = valid['Class'].values
    X_test = test.values

    return X_train, X_valid, y_valid, X_test

X_train, X_valid, y_valid, X_test = preprocess(train, valid, test)

# AutoEncoder with sub-sampling
- 학습과정에서 모델의 크기가 커지거나 epoch수가 늘어나면 f1 score가 감소함
    - 모델이 이상치까지 학습하는 것으로 볼 수 있음

- Isolation Forest 논문에서 제시한 sub-sampling 기법을 autoencoder에 적용
    - sub-sampling을 통해서 이상치 탐지에 있어 `masking problem`과 `swamping preblem`을 완화할 수 있다.

In [7]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import argparse
from tqdm import tqdm
import random
from itertools import product
from sklearn.metrics import f1_score, confusion_matrix

In [8]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed) 
    torch.backends.cudnn.deterministic = True 
    torch.backends.cudnn.benchmark = True

set_seed(123)

In [9]:
class AutoEncoder(nn.Module):

    def __init__(self, input_size, encoder_hidden_size, bottle_neck_size, decoder_hidden_size, dropout_p):
        self.input_size = input_size
        self.encoder_hidden_size = encoder_hidden_size
        self.bottle_neck_size = bottle_neck_size
        self.decoder_hidden_size = decoder_hidden_size
        self.dropout_p = dropout_p

        super(AutoEncoder, self).__init__()


        # ===== Encoder ===== #
        self.encoder = nn.ModuleList()
        # input layer
        self.encoder.append(nn.Sequential(
            nn.Linear(input_size, encoder_hidden_size[0]),
            nn.BatchNorm1d(encoder_hidden_size[0]),
            nn.ReLU(),
            nn.Dropout(dropout_p)
        ))
        # encoder hidden layers
        for idx in range(len(encoder_hidden_size) - 1):
            self.encoder.append(nn.Sequential(
                nn.Linear(encoder_hidden_size[idx], encoder_hidden_size[idx + 1]),
                nn.BatchNorm1d(encoder_hidden_size[idx + 1]),
                nn.ReLU(),
                nn.Dropout(dropout_p)
            ))
        # bottle neck layer
        self.encoder.append(nn.Sequential(
            nn.Linear(encoder_hidden_size[-1], bottle_neck_size),
            nn.BatchNorm1d(bottle_neck_size),
            nn.ReLU(),
            nn.Dropout(dropout_p)
        ))


        # ===== Decoder ===== #
        self.decoder = nn.ModuleList()
        # bottle neck
        self.decoder.append(nn.Sequential(
            nn.Linear(bottle_neck_size, decoder_hidden_size[0]),
            nn.BatchNorm1d(decoder_hidden_size[0]),
            nn.ReLU(),
            nn.Dropout(dropout_p)
        ))
        # decoder hidden layers
        for idx in range(len(decoder_hidden_size) - 1):
            self.decoder.append(nn.Sequential(
                nn.Linear(decoder_hidden_size[idx], decoder_hidden_size[idx + 1]),
                nn.BatchNorm1d(decoder_hidden_size[idx + 1]),
                nn.ReLU(),
                nn.Dropout(dropout_p)
            ))
        # output_layer
        self.decoder.append(nn.Linear(decoder_hidden_size[-1], input_size))


    def forward(self, x):
        # |x| = (batch_size, input_size)
        for layer in self.encoder:
            x = layer(x)
        
        latent = x
        # |latent| = (batch_size, bottle_neck_size)

        for layer in self.decoder:
            x = layer(x)
        # |x| = (batch_size, input_size)

        return x


class AutoEncoderWithSubSampling(nn.Module):

    def __init__(self, n_estimators, input_size, encoder_hidden_size, bottle_neck_size, decoder_hidden_size, dropout_p):
        self.n_estimators = n_estimators
        self.input_size = input_size
        self.encoder_hidden_size = encoder_hidden_size
        self.bottle_neck_size = bottle_neck_size
        self.decoder_hidden_size = decoder_hidden_size
        self.dropout_p = dropout_p

        super(AutoEncoderWithSubSampling, self).__init__()

        self.estimators = [
            AutoEncoder(
                input_size = input_size,
                encoder_hidden_size = encoder_hidden_size,
                bottle_neck_size = bottle_neck_size,
                decoder_hidden_size = decoder_hidden_size,
                dropout_p = dropout_p
            ) for _ in range(n_estimators)
        ]
    
    def forward(self):
        pass

    def train(self, train_loaders, valid_loader, y_valid, n_epochs, device):
        for idx in range(len(train_loaders)):
            display(Markdown('# Estimator {}/{}'.format((idx + 1), len(train_loaders))))

            sub_model = self.estimators[idx]
            sub_model.to(device)
            sub_loader = train_loaders[idx]

            optimizer = optim.Adam(sub_model.parameters())
            crit = nn.L1Loss()
            cos = nn.CosineSimilarity(dim = 1)


            best_f1 = -np.inf
            best_model = None
            for epoch in range(n_epochs):
                train_losses = []
                valid_losses = []
                preds = []
                # === train === #
                for batch in sub_loader:
                    batch = batch.float().to(device)

                    # initialize optimizer
                    optimizer.zero_grad()

                    # feed foward
                    x_hat = sub_model(batch)

                    # loss
                    loss = crit(batch, x_hat)

                    # backpropagation
                    loss.backward()

                    # gradient descent
                    optimizer.step()

                    train_losses.append(float(loss))

                # === valid === #                
                for batch in valid_loader:
                    batch = batch.float().to(device)

                    sub_model.eval()
                    with torch.no_grad():
                        # feed foward
                        x_hat = sub_model(batch)

                        # loss
                        loss = crit(batch, x_hat) 
                        valid_losses.append(float(loss))

                        # cosine similarity
                        sim = cos(batch, x_hat).detach().cpu().numpy()

                        pred = (sim < 0.95)
                        preds.extend(pred)

                train_loss = np.mean(train_losses)
                valid_loss = np.mean(valid_losses)
                f1 = f1_score(y_valid, preds, average = 'macro')

                if f1 > best_f1:
                    best_model = sub_model.state_dict()
                    best_f1 = f1

                if epoch % 20 == 0:
                    print(f'Epoch {epoch + 1} - Loss {np.round(train_loss, 5)}, Val_Loss {np.round(valid_loss, 5)}, f1_score {np.round(f1, 5)}')

    def evaluate(self, X, batch_size, device):
        ensemble = []

        cos = nn.CosineSimilarity(dim = 1)
        
        for estimator in self.estimators:
            estimator.to(device)

            sims = []

            estimator.eval()
            with torch.no_grad():
                for idx in range(0, len(X), batch_size):
                    batch = X[idx: idx + batch_size]
                    batch = torch.FloatTensor(batch).to(device)

                    x_hat = estimator(batch)

                    sim = cos(batch, x_hat).detach().cpu().numpy()

                    sims.extend(sim)
            
            ensemble.append(sims)
        
        ave_sims = np.mean(ensemble, axis = 0)

        pred = (ave_sims < 0.95)

        return pred

In [10]:
class AE_Dataset(Dataset):

    def __init__(self, x):
        self.x = x
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx]


def get_loader(train, valid, batch_size, shuffle = True):
    train_loader = DataLoader(
        AE_Dataset(train),
        batch_size = batch_size,
        shuffle = shuffle
    )
    valid_loader = DataLoader(
        AE_Dataset(valid),
        batch_size = batch_size,
        shuffle = False
    )    

    return train_loader, valid_loader

def sub_sampler(train, valid, batch_size, sampling_size, shuffle  = True):
    train_loaders = []

    # ===== sub-sampling ===== #
    # shuffle data
    if shuffle:
        indices = np.random.permutation(len(train))
        train = train[indices]
    
    # split data
    for idx in range(0, len(train), sampling_size):
        sampled_data = train[idx : idx + sampling_size]

        sampled_loader = DataLoader(
            AE_Dataset(sampled_data),
            batch_size = batch_size,
            shuffle = shuffle
        )
        train_loaders.append(sampled_loader)

    # ===== valid loader ===== #
    valid_loader = DataLoader(
        AE_Dataset(valid),
        batch_size = batch_size,
        shuffle = False
    )

    return train_loaders, valid_loader

In [27]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_loaders, valid_loader = sub_sampler(
    X_train, X_valid,
    batch_size = 512,
    sampling_size = 8192,
    shuffle = True
)

AE_model = AutoEncoderWithSubSampling(
    n_estimators = len(train_loaders),
    input_size = 30,
    encoder_hidden_size = [60],
    bottle_neck_size = 120,
    decoder_hidden_size = [60],
    dropout_p = .2
)

AE_model.train(
    train_loaders,
    valid_loader,
    y_valid,
    n_epochs = 200,
    device = DEVICE
)

# Estimator 1/14

Epoch 1 - Loss 0.6902, Val_Loss 0.62646, f1_score 0.00105
Epoch 21 - Loss 0.09138, Val_Loss 0.08844, f1_score 0.55009
Epoch 41 - Loss 0.05132, Val_Loss 0.0473, f1_score 0.86215
Epoch 61 - Loss 0.04484, Val_Loss 0.04774, f1_score 0.90312
Epoch 81 - Loss 0.04208, Val_Loss 0.04776, f1_score 0.88321
Epoch 101 - Loss 0.03998, Val_Loss 0.03165, f1_score 0.49973
Epoch 121 - Loss 0.03375, Val_Loss 0.03289, f1_score 0.49973
Epoch 141 - Loss 0.03486, Val_Loss 0.03603, f1_score 0.49973
Epoch 161 - Loss 0.02654, Val_Loss 0.02912, f1_score 0.49973
Epoch 181 - Loss 0.03063, Val_Loss 0.02972, f1_score 0.49973


# Estimator 2/14

Epoch 1 - Loss 0.68498, Val_Loss 0.63192, f1_score 0.00105
Epoch 21 - Loss 0.08774, Val_Loss 0.09007, f1_score 0.53671
Epoch 41 - Loss 0.0577, Val_Loss 0.05921, f1_score 0.57074
Epoch 61 - Loss 0.04284, Val_Loss 0.04012, f1_score 0.84226
Epoch 81 - Loss 0.0399, Val_Loss 0.04202, f1_score 0.87298
Epoch 101 - Loss 0.04176, Val_Loss 0.0463, f1_score 0.87865
Epoch 121 - Loss 0.03102, Val_Loss 0.03567, f1_score 0.88448
Epoch 141 - Loss 0.03628, Val_Loss 0.03004, f1_score 0.67049
Epoch 161 - Loss 0.0298, Val_Loss 0.02676, f1_score 0.6349
Epoch 181 - Loss 0.03331, Val_Loss 0.03568, f1_score 0.61087


# Estimator 3/14

Epoch 1 - Loss 0.68765, Val_Loss 0.62551, f1_score 0.00105
Epoch 21 - Loss 0.0913, Val_Loss 0.08822, f1_score 0.54569
Epoch 41 - Loss 0.06241, Val_Loss 0.05697, f1_score 0.77137
Epoch 61 - Loss 0.04983, Val_Loss 0.05473, f1_score 0.86749
Epoch 81 - Loss 0.03539, Val_Loss 0.0347, f1_score 0.90312
Epoch 101 - Loss 0.03649, Val_Loss 0.03626, f1_score 0.90312
Epoch 121 - Loss 0.0334, Val_Loss 0.03625, f1_score 0.90974
Epoch 141 - Loss 0.03256, Val_Loss 0.03102, f1_score 0.90974
Epoch 161 - Loss 0.03002, Val_Loss 0.03163, f1_score 0.91658
Epoch 181 - Loss 0.03169, Val_Loss 0.03105, f1_score 0.91658


# Estimator 4/14

Epoch 1 - Loss 0.69568, Val_Loss 0.63015, f1_score 0.00105
Epoch 21 - Loss 0.09184, Val_Loss 0.09052, f1_score 0.51702
Epoch 41 - Loss 0.05849, Val_Loss 0.06141, f1_score 0.69463
Epoch 61 - Loss 0.04275, Val_Loss 0.04267, f1_score 0.81224
Epoch 81 - Loss 0.04072, Val_Loss 0.0438, f1_score 0.86215
Epoch 101 - Loss 0.03919, Val_Loss 0.04703, f1_score 0.90312
Epoch 121 - Loss 0.04188, Val_Loss 0.05109, f1_score 0.90312
Epoch 141 - Loss 0.03404, Val_Loss 0.03945, f1_score 0.90312
Epoch 161 - Loss 0.03694, Val_Loss 0.03631, f1_score 0.90974
Epoch 181 - Loss 0.03372, Val_Loss 0.03346, f1_score 0.90974


# Estimator 5/14

Epoch 1 - Loss 0.68775, Val_Loss 0.62791, f1_score 0.00105
Epoch 21 - Loss 0.08493, Val_Loss 0.08226, f1_score 0.54048
Epoch 41 - Loss 0.05586, Val_Loss 0.05204, f1_score 0.56835
Epoch 61 - Loss 0.04553, Val_Loss 0.05363, f1_score 0.86215
Epoch 81 - Loss 0.03847, Val_Loss 0.04555, f1_score 0.87865
Epoch 101 - Loss 0.03851, Val_Loss 0.0445, f1_score 0.88448
Epoch 121 - Loss 0.03234, Val_Loss 0.03068, f1_score 0.90312
Epoch 141 - Loss 0.03095, Val_Loss 0.03406, f1_score 0.90974
Epoch 161 - Loss 0.03234, Val_Loss 0.03252, f1_score 0.88972
Epoch 181 - Loss 0.0326, Val_Loss 0.03864, f1_score 0.88585


# Estimator 6/14

Epoch 1 - Loss 0.67211, Val_Loss 0.61628, f1_score 0.00105
Epoch 21 - Loss 0.08609, Val_Loss 0.08229, f1_score 0.54792
Epoch 41 - Loss 0.04999, Val_Loss 0.05079, f1_score 0.78056
Epoch 61 - Loss 0.04761, Val_Loss 0.04678, f1_score 0.86215
Epoch 81 - Loss 0.04167, Val_Loss 0.04341, f1_score 0.90974
Epoch 101 - Loss 0.03793, Val_Loss 0.04256, f1_score 0.91658
Epoch 121 - Loss 0.04091, Val_Loss 0.04496, f1_score 0.8635
Epoch 141 - Loss 0.03298, Val_Loss 0.03505, f1_score 0.532
Epoch 161 - Loss 0.03351, Val_Loss 0.03556, f1_score 0.532
Epoch 181 - Loss 0.03939, Val_Loss 0.03775, f1_score 0.532


# Estimator 7/14

Epoch 1 - Loss 0.68707, Val_Loss 0.62758, f1_score 0.00105
Epoch 21 - Loss 0.08938, Val_Loss 0.08719, f1_score 0.53791
Epoch 41 - Loss 0.05246, Val_Loss 0.05656, f1_score 0.5999
Epoch 61 - Loss 0.03889, Val_Loss 0.03899, f1_score 0.85697
Epoch 81 - Loss 0.03617, Val_Loss 0.03601, f1_score 0.88448
Epoch 101 - Loss 0.035, Val_Loss 0.03895, f1_score 0.8905
Epoch 121 - Loss 0.03426, Val_Loss 0.0379, f1_score 0.90974
Epoch 141 - Loss 0.03152, Val_Loss 0.03486, f1_score 0.90974
Epoch 161 - Loss 0.03594, Val_Loss 0.03685, f1_score 0.90974
Epoch 181 - Loss 0.03396, Val_Loss 0.03269, f1_score 0.91658


# Estimator 8/14

Epoch 1 - Loss 0.69435, Val_Loss 0.62881, f1_score 0.00105
Epoch 21 - Loss 0.08307, Val_Loss 0.08427, f1_score 0.53942
Epoch 41 - Loss 0.05383, Val_Loss 0.05111, f1_score 0.85193
Epoch 61 - Loss 0.0454, Val_Loss 0.05634, f1_score 0.86749
Epoch 81 - Loss 0.04372, Val_Loss 0.048, f1_score 0.87298
Epoch 101 - Loss 0.03736, Val_Loss 0.03765, f1_score 0.87298
Epoch 121 - Loss 0.02897, Val_Loss 0.03428, f1_score 0.87298
Epoch 141 - Loss 0.03148, Val_Loss 0.03134, f1_score 0.86493
Epoch 161 - Loss 0.03244, Val_Loss 0.0292, f1_score 0.61877
Epoch 181 - Loss 0.0346, Val_Loss 0.0323, f1_score 0.60229


# Estimator 9/14

Epoch 1 - Loss 0.69, Val_Loss 0.62534, f1_score 0.00105
Epoch 21 - Loss 0.08542, Val_Loss 0.0861, f1_score 0.52353
Epoch 41 - Loss 0.05383, Val_Loss 0.05145, f1_score 0.83763
Epoch 61 - Loss 0.03945, Val_Loss 0.04208, f1_score 0.88448
Epoch 81 - Loss 0.03617, Val_Loss 0.03905, f1_score 0.90312
Epoch 101 - Loss 0.0355, Val_Loss 0.03884, f1_score 0.90312
Epoch 121 - Loss 0.03371, Val_Loss 0.03868, f1_score 0.90312
Epoch 141 - Loss 0.03493, Val_Loss 0.0427, f1_score 0.90312
Epoch 161 - Loss 0.03302, Val_Loss 0.03177, f1_score 0.90312
Epoch 181 - Loss 0.03191, Val_Loss 0.03999, f1_score 0.90312


# Estimator 10/14

Epoch 1 - Loss 0.68533, Val_Loss 0.62255, f1_score 0.00105
Epoch 21 - Loss 0.08702, Val_Loss 0.09035, f1_score 0.58758
Epoch 41 - Loss 0.05409, Val_Loss 0.04915, f1_score 0.79381
Epoch 61 - Loss 0.04517, Val_Loss 0.04168, f1_score 0.86749
Epoch 81 - Loss 0.04431, Val_Loss 0.03542, f1_score 0.88448
Epoch 101 - Loss 0.03618, Val_Loss 0.03944, f1_score 0.89671
Epoch 121 - Loss 0.0357, Val_Loss 0.03911, f1_score 0.90974
Epoch 141 - Loss 0.03421, Val_Loss 0.03363, f1_score 0.90974
Epoch 161 - Loss 0.03396, Val_Loss 0.03275, f1_score 0.90974
Epoch 181 - Loss 0.03862, Val_Loss 0.03259, f1_score 0.90974


# Estimator 11/14

Epoch 1 - Loss 0.68635, Val_Loss 0.62851, f1_score 0.00105
Epoch 21 - Loss 0.08865, Val_Loss 0.08958, f1_score 0.51727
Epoch 41 - Loss 0.05333, Val_Loss 0.05154, f1_score 0.79381
Epoch 61 - Loss 0.04195, Val_Loss 0.04447, f1_score 0.86749
Epoch 81 - Loss 0.03904, Val_Loss 0.03589, f1_score 0.87865
Epoch 101 - Loss 0.03966, Val_Loss 0.03253, f1_score 0.87865
Epoch 121 - Loss 0.03603, Val_Loss 0.03855, f1_score 0.87865
Epoch 141 - Loss 0.03621, Val_Loss 0.03833, f1_score 0.89671
Epoch 161 - Loss 0.03184, Val_Loss 0.03894, f1_score 0.90974
Epoch 181 - Loss 0.03125, Val_Loss 0.03363, f1_score 0.90974


# Estimator 12/14

Epoch 1 - Loss 0.6868, Val_Loss 0.62689, f1_score 0.00105
Epoch 21 - Loss 0.0863, Val_Loss 0.08627, f1_score 0.53259
Epoch 41 - Loss 0.05554, Val_Loss 0.0549, f1_score 0.58961
Epoch 61 - Loss 0.04685, Val_Loss 0.04639, f1_score 0.85193
Epoch 81 - Loss 0.03651, Val_Loss 0.03778, f1_score 0.88448
Epoch 101 - Loss 0.03918, Val_Loss 0.04146, f1_score 0.90312
Epoch 121 - Loss 0.03987, Val_Loss 0.04016, f1_score 0.90312
Epoch 141 - Loss 0.03459, Val_Loss 0.03379, f1_score 0.90312
Epoch 161 - Loss 0.03647, Val_Loss 0.04065, f1_score 0.90974
Epoch 181 - Loss 0.03417, Val_Loss 0.03471, f1_score 0.90974


# Estimator 13/14

Epoch 1 - Loss 0.69259, Val_Loss 0.6293, f1_score 0.00105
Epoch 21 - Loss 0.08332, Val_Loss 0.08059, f1_score 0.61373
Epoch 41 - Loss 0.04779, Val_Loss 0.0512, f1_score 0.82872
Epoch 61 - Loss 0.04196, Val_Loss 0.04609, f1_score 0.90312
Epoch 81 - Loss 0.03677, Val_Loss 0.03977, f1_score 0.91658
Epoch 101 - Loss 0.03526, Val_Loss 0.03664, f1_score 0.91658
Epoch 121 - Loss 0.03626, Val_Loss 0.03147, f1_score 0.91658
Epoch 141 - Loss 0.03498, Val_Loss 0.0434, f1_score 0.91658
Epoch 161 - Loss 0.03764, Val_Loss 0.03849, f1_score 0.88585
Epoch 181 - Loss 0.03246, Val_Loss 0.03474, f1_score 0.78243


# Estimator 14/14

Epoch 1 - Loss 0.68161, Val_Loss 0.62527, f1_score 0.00105
Epoch 21 - Loss 0.09597, Val_Loss 0.09797, f1_score 0.53403
Epoch 41 - Loss 0.05819, Val_Loss 0.06513, f1_score 0.63932
Epoch 61 - Loss 0.05052, Val_Loss 0.05117, f1_score 0.88448
Epoch 81 - Loss 0.0408, Val_Loss 0.04619, f1_score 0.90312
Epoch 101 - Loss 0.03885, Val_Loss 0.03828, f1_score 0.91658
Epoch 121 - Loss 0.03515, Val_Loss 0.03753, f1_score 0.91658
Epoch 141 - Loss 0.03355, Val_Loss 0.03813, f1_score 0.91658
Epoch 161 - Loss 0.03457, Val_Loss 0.03861, f1_score 0.91658
Epoch 181 - Loss 0.03237, Val_Loss 0.03704, f1_score 0.91658


# EE

In [23]:
from sklearn.covariance import EllipticEnvelope

def EE_search(cont, ks):

    best_f1 = -np.inf
    best_cont = None
    best_k = None

    for contaminant in cont:
        model = EllipticEnvelope(
            support_fraction = 0.994,
            contamination = contaminant,
            random_state = 42
        )
        model.fit(X_train)
        val_score = model.score_samples(X_valid)

        for k in ks:
            anomaly_indices = np.argsort(val_score)[:k]
            val_pred = np.zeros(len(X_valid))
            val_pred[anomaly_indices] = 1

            f1 = f1_score(y_valid, val_pred, average = 'macro')
            if f1 >= best_f1:
                print('* BEST updated into f1: {}, cont: {}, k: {}'.format(f1, contaminant, k))
                best_f1 = f1
                best_cont = contaminant
                best_k = k
        
    best_anomaly_ratio = best_k / len(X_valid)

    return best_cont, best_anomaly_ratio

In [24]:
conts = np.arange(0.0001, 0.003, 0.0005)
ks = np.arange(0, 50)

best_cont, best_anomaly_ratio = EE_search(conts, ks)

* BEST updated into f1: 0.4997363518121419, cont: 0.0001, k: 0
* BEST updated into f1: 0.5320032001215638, cont: 0.0001, k: 1
* BEST updated into f1: 0.562253919707516, cont: 0.0001, k: 2
* BEST updated into f1: 0.5906717950274928, cont: 0.0001, k: 3
* BEST updated into f1: 0.6174185476616381, cont: 0.0001, k: 4
* BEST updated into f1: 0.6426374167237955, cont: 0.0001, k: 5
* BEST updated into f1: 0.6664557258707168, cont: 0.0001, k: 6
* BEST updated into f1: 0.6889870340395065, cont: 0.0001, k: 7
* BEST updated into f1: 0.7103329465949443, cont: 0.0001, k: 8
* BEST updated into f1: 0.730584647838757, cont: 0.0001, k: 9
* BEST updated into f1: 0.7498242036425005, cont: 0.0001, k: 10
* BEST updated into f1: 0.7681256734512389, cont: 0.0001, k: 11
* BEST updated into f1: 0.7725514640071602, cont: 0.0001, k: 14
* BEST updated into f1: 0.7887218676684034, cont: 0.0001, k: 15
* BEST updated into f1: 0.8041895926750926, cont: 0.0001, k: 16
* BEST updated into f1: 0.8189994908759815, cont: 0.

In [29]:
EE_model = EllipticEnvelope(
    support_fraction = 0.994,
    contamination = best_cont,
    random_state = 42
)
EE_model.fit(X_train)

EllipticEnvelope(contamination=0.0026, random_state=42, support_fraction=0.994)

In [39]:
def Ensemble_AE_EE(X, AE_model, EE_model, best_anomaly_ratio):
    # AutoEncoder
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pred_ae = AE_model.evaluate(X, batch_size = 1024, device = DEVICE)

    # EllipticEnvelope
    k = (best_anomaly_ratio * len(X)).astype(np.int)
    score_ee = EE_model.score_samples(X)
    anomaly_idx = np.argsort(score_ee)[:k]
    pred_ee = np.zeros(len(X))
    pred_ee[anomaly_idx] = 1

    # ensemble
    ensemble = pred_ae + pred_ee
    anomaly_idx = np.where((ensemble == 2))[0]
    normal_idx = np.where((ensemble == 0))[0]
    iffy_idx = np.where((ensemble == 1))[0]

    ensemble_pred = np.zeros(len(X))
    ensemble_pred[anomaly_idx] = 1
    ensemble_pred[iffy_idx] = pred_ee[iffy_idx]

    return ensemble_pred

In [40]:
pred = Ensemble_AE_EE(
    X_test,
    AE_model,
    EE_model,
    best_anomaly_ratio
)

In [41]:
submission = pd.read_csv(
    './data/sample_submission.csv'
)

submission['Class'] = pred

In [42]:
submission

Unnamed: 0,ID,Class
0,AAAA0x1,0.0
1,AAAA0x2,0.0
2,AAAA0x5,0.0
3,AAAA0x7,0.0
4,AAAA0xc,0.0
...,...,...
142498,0x4587f,0.0
142499,0x45880,0.0
142500,0x45884,0.0
142501,0x45885,0.0


In [43]:
submission.to_csv('./AE_EE_submission.csv', index = False)