# Load Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, Markdown
plt.style.use('ggplot')
from tqdm import tqdm
from sklearn.metrics import f1_score

In [None]:
# Mount Google Drive into the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Switch the working directory to the project folder on the mounted drive;
# the relative './data/...' paths used later are resolved against it.
os.chdir('/content/drive/MyDrive/dacon_anomaly')

In [None]:
def load_data():
    """Read the train, validation and test CSV files from ``./data``.

    Paths are relative to the current working directory.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames, read in that order from
        ``train.csv``, ``val.csv`` and ``test.csv``.
    """
    csv_paths = ('./data/train.csv', './data/val.csv', './data/test.csv')
    train_df, valid_df, test_df = (pd.read_csv(path) for path in csv_paths)

    return train_df, valid_df, test_df

# Load the three raw DataFrames once at notebook scope.
train, valid, test = load_data()

In [None]:
# Show the test frame as the cell output.
test

In [None]:
def preprocess(train, valid, test):
    """Strip the ``ID`` column and convert the three frames to NumPy arrays.

    The input DataFrames are left untouched: columns are dropped on copies
    rather than in place, so the function can be called again on the same
    frames (for example when the notebook cell is re-run) without raising
    ``KeyError`` for an already-removed ``ID`` column.

    Args:
        train: Training DataFrame with an ``ID`` column; every other column
            is used as a feature.
        valid: Validation DataFrame with ``ID`` and ``Class`` columns;
            ``Class`` is the label, the remaining columns are features.
        test: Test DataFrame with an ``ID`` column; every other column is
            used as a feature.

    Returns:
        Tuple ``(X_train, X_valid, y_valid, X_test)`` of NumPy arrays.
    """
    X_train = train.drop(columns = ['ID']).values
    X_valid = valid.drop(columns = ['ID', 'Class']).values
    y_valid = valid['Class'].values
    X_test = test.drop(columns = ['ID']).values

    return X_train, X_valid, y_valid, X_test

# Build the NumPy feature / label arrays from the loaded frames.
X_train, X_valid, y_valid, X_test = preprocess(train, valid, test)

# Elliptic Envelope with Sub-Sampling

In [None]:
# Fraction of validation rows whose label equals 1; passed below to
# EE_with_subsampling, which uses it as the `contamination` setting.
anomaly_rate = (y_valid == 1).sum() / len(y_valid)
anomaly_rate

In [None]:
from sklearn.covariance import EllipticEnvelope

def EE_with_subsampling(X_train, sub_sample_size, anomaly_rate,
                        support_fraction = 0.994, random_state = 42):
    """Fit one EllipticEnvelope per consecutive chunk of the shuffled data.

    The rows of ``X_train`` are shuffled with NumPy's global random generator
    (so the chunk membership changes between calls unless the caller seeds
    ``np.random``), then cut into consecutive chunks of ``sub_sample_size``
    rows. The last chunk holds whatever rows remain and may be smaller.

    Args:
        X_train: 2-D array of training samples; it is indexed with a
            permutation, so the caller's array is not modified.
        sub_sample_size: Number of rows per chunk; must be positive.
        anomaly_rate: Forwarded to ``EllipticEnvelope`` as ``contamination``.
        support_fraction: Forwarded to ``EllipticEnvelope``; defaults to the
            previously hard-coded 0.994.
        random_state: Forwarded to ``EllipticEnvelope``; defaults to the
            previously hard-coded 42.

    Returns:
        List of fitted ``EllipticEnvelope`` models, one per chunk.

    Raises:
        ValueError: If ``sub_sample_size`` is not positive.
    """
    if sub_sample_size <= 0:
        raise ValueError('sub_sample_size must be a positive integer')

    # shuffle train data so every chunk is a random sub-sample
    shuffled = X_train[np.random.permutation(len(X_train))]

    models = []
    with tqdm(range(0, len(shuffled), sub_sample_size)) as pbar:
        for num, start in enumerate(pbar, start = 1):
            sub_sample = shuffled[start: start + sub_sample_size]

            EE = EllipticEnvelope(
                support_fraction = support_fraction,
                contamination = anomaly_rate,
                random_state = random_state
            )
            EE.fit(sub_sample)
            models.append(EE)

            pbar.set_postfix({'Training' : '{}th model'.format(num)})

    return models

In [None]:
def predict_EE(X, models, k):
    """Flag the ``k`` rows of ``X`` with the lowest ensemble score.

    Each model's ``score_samples(X)`` output is averaged per row across all
    models; the ``k`` rows with the smallest average are labelled 1 and every
    other row 0.

    Args:
        X: Samples to score; passed unchanged to every model.
        models: Non-empty sequence of fitted models exposing
            ``score_samples(X)``.
        k: Number of rows to flag. Converted with ``int()`` (a fractional
            part is truncated); values larger than ``len(X)`` flag every row.

    Returns:
        Float array of length ``len(X)`` holding 1.0 for flagged rows and
        0.0 elsewhere.

    Raises:
        ValueError: If ``models`` is empty or ``k`` is negative.
    """
    if len(models) == 0:
        raise ValueError('models must contain at least one fitted model')

    k = int(k)
    if k < 0:
        # A negative slice bound would silently flag all but the last |k|
        # rows of the ranking instead of failing.
        raise ValueError('k must be non-negative')

    scores = np.stack([model.score_samples(X) for model in models])
    ave_score = scores.mean(axis = 0)

    # ascending sort: the first k indices are the lowest-scoring rows
    anomaly_idx = np.argsort(ave_score)[:k]
    pred = np.zeros(len(X))
    pred[anomaly_idx] = 1

    return pred

In [None]:
def grid_search(X_train, X_valid, y_valid, models, ks, average = 'binary'):
    """Pick the anomaly count ``k`` with the best validation F1.

    For every candidate ``k`` the validation set is labelled with
    ``predict_EE`` and scored with ``f1_score``; the first ``k`` reaching the
    highest score wins.

    Args:
        X_train: Unused; kept so the positional signature stays unchanged.
        X_valid: Validation samples to score.
        y_valid: True validation labels.
        models: Fitted models, forwarded to ``predict_EE``.
        ks: Iterable of candidate anomaly counts.
        average: ``average`` argument for ``f1_score``. The default
            ``'binary'`` is what ``f1_score`` uses when none is given; pass
            ``'macro'`` to select ``k`` on a macro-averaged score.

    Returns:
        The best ``k`` divided by ``len(X_valid)``, i.e. the fraction of
        validation rows flagged as anomalous.

    Raises:
        ValueError: If ``ks`` yields no candidate.
    """
    best_f1 = -np.inf
    best_k = None
    for k in ks:
        pred = predict_EE(X_valid, models, k)
        f1 = f1_score(y_valid, pred, average = average)

        # strict '>' keeps the earliest k among ties
        if f1 > best_f1:
            best_f1 = f1
            best_k = k

    if best_k is None:
        raise ValueError('ks must contain at least one candidate')

    return best_k / len(X_valid)

In [None]:
# Fit the ensemble: one model per chunk of up to 8192 shuffled training rows.
models = EE_with_subsampling(
    X_train,
    8192,
    anomaly_rate
)

In [None]:
# Label the 32 validation rows with the lowest ensemble-average score as 1.
pred = predict_EE(
    X_valid,
    models,
    32
)

In [None]:
# Macro-averaged F1 of the fixed-k (k = 32) validation prediction.
f1_score(y_valid, pred, average = 'macro')

In [None]:
# Candidate numbers of validation rows to flag as anomalous.
ks = np.arange(0, 100)

# grid_search is declared as (X_train, X_valid, y_valid, models, ks), so the
# training matrix must be supplied as the first positional argument;
# omitting it raises TypeError for a missing argument.
best_ratio = grid_search(
    X_train,
    X_valid,
    y_valid,
    models,
    ks
)

In [None]:
# Show the selected anomaly fraction as the cell output.
best_ratio

In [None]:
# Scale the selected anomaly fraction to the size of the test set.
# Built-in int() replaces `.astype(np.int)`: the `np.int` alias was removed
# in NumPy 1.24, and int() also works when best_ratio is a plain float.
test_k = int(len(test) * best_ratio)

# Label the test_k lowest-scoring test rows as anomalies.
test_pred = predict_EE(
    X_test,
    models,
    test_k
)

In [None]:
# Show the 0/1 test predictions as the cell output.
test_pred

In [None]:
# Load the sample submission file; its 'Class' column is overwritten below.
submission = pd.read_csv('./data/sample_submission.csv')

In [None]:
# Show the sample submission as loaded.
submission

In [None]:
# predict_EE returns floats (0.0 / 1.0); store them as integer labels.
# Built-in `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
submission['Class'] = test_pred.astype(int)

In [None]:
# Show the submission after 'Class' has been filled in.
submission

In [None]:
# Write the submission to ./ee_sub.csv without the DataFrame index column.
submission.to_csv('./ee_sub.csv', index = False)