# 🧫 🧪 Эксперименты с классическими *unsupervised* методами обнаружения аномалий

### 🌐 Установка [pyod](https://github.com/yzhao062/pyod)

In [None]:
# !pip3 install pyod

## 💅 Предобработка данных

In [None]:
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.utils import tokenize
import nltk; nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer


def strip_html_tags(text):
    """Удаление html tags из текста."""
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def preprocess_text(text):
    text = strip_html_tags(text)  # удаление html tags
    text = strip_non_alphanum(text) # заменили все небуквенные символы на пробел
    text = strip_numeric(text) # удалили все цифры
    text = remove_stopwords(text) # удалили все стоп-слова
    # text = strip_short(text, minsize=2) # удалили короткие слова
    word_list = list(tokenize(text, deacc=True, to_lower=True)) # токенизация, deacc - избавляет от ударений
    word_list = [WordNetLemmatizer().lemmatize(word) for word in word_list] # лемматизация
    return ' '.join(word for word in word_list)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 🧬 Экперименты

### Isolation Forest, LOF, kNN, COPOD

In [None]:
# import numpy as np
# import pandas as pd

# from sklearn.datasets import fetch_20newsgroups
# from sklearn.utils import shuffle
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import roc_auc_score

# import pyod
# from pyod.models import iforest
# from pyod.models import lof
# from pyod.models import knn
# from pyod.models import copod

# c = 0.1  # отношение количества аномальных экземпляров к нормальным

# categories = ["comp.graphics",
#               'talk.politics.mideast',
#               "rec.sport.hockey",
#               "sci.med",
#               "sci.space",
#               'misc.forsale',
#               'soc.religion.christian',
#               'talk.politics.misc']

# experimant_cnt = 0
# all_experiments = len(categories) * (len(categories) -  1)
# auc_list_iforest = []
# auc_list_lof = []
# auc_list_knn = []
# auc_list_copod = []

# # Формирование словаря с категориями
# dataset = {}
# for cat in categories:
#     # Загрузка
#     dataset[cat] = fetch_20newsgroups(subset='all', categories=[cat],
#                             shuffle=True, random_state=123,
#                             remove=('headers', 'footers'), return_X_y=True)[0]
#     # Предобработка
#     dataset[cat] = [preprocess_text(text) for text in dataset[cat]]

# # Перебираем пары категорий
# for i in range(len(categories)):
#     for j in range(len(categories)):
#         if i == j:
#             continue

#         c1 = categories[i]
#         c2 = categories[j]

#         experimant_cnt += 1

#         # Формирование нормальной и аномальной выборок
#         normal_data = dataset[c1]
#         anomal_data = dataset[c2][:min(int(c * len(normal_data)) + 1, len(dataset[c2]))]
#         all_data = normal_data + anomal_data

#         # TF-IDF векторизация
#         vectorizer = TfidfVectorizer()
#         all_data_tf = vectorizer.fit_transform(all_data).toarray()

#         # Формирование выборок
#         x = all_data_tf
#         y = np.array([False] * len(normal_data) + [True] * len(anomal_data))
#         all_data, x, y = shuffle(all_data, x, y, random_state=123)

#         # Задаем модели

#         # Isolation Forest
#         iforest_clf = iforest.IForest(
#             contamination=0.1,
#             n_estimators=5000,
#             max_samples=1.0,
#             bootstrap=True,
#             random_state=123,
#             n_jobs=-1,
#         )

#         # LOF
#         lof_clf = lof.LOF(
#             contamination=0.1,
#             n_neighbors=5,
#             metric='canberra',
#             n_jobs=-1,
#         )

#         #kNN
#         knn_clf = knn.KNN(
#             contamination=0.1,
#             n_neighbors=3,
#             method='largest',
#             metric='canberra',
#             n_jobs=-1,
#         )

#         # COPOD
#         copod_clf = copod.COPOD(contamination=0.1)

#         # Считаем метрику ROC AUC
#         auc_iforest = iforest_clf.fit_predict_score(x, y)
#         auc_lof = lof_clf.fit_predict_score(x, y)
#         auc_knn = knn_clf.fit_predict_score(x, y)
#         auc_copod = copod_clf.fit_predict_score(x, y)

#         # Добавляем в списки
#         auc_list_iforest.append(auc_iforest)
#         auc_list_lof.append(auc_lof)
#         auc_list_knn.append(auc_knn)
#         auc_list_copod.append(auc_copod)

#         # Вывод результатов
#         print("-" * 50)
#         print("Эксперимент №{}/{}  с normal = {}, anomal = {}".format(
#             experimant_cnt, all_experiments, c1, c2))
#         print("auc_iforest = ", auc_iforest)
#         print("auc_lof = ", auc_lof)
#         print("auc_knn = ", auc_knn)
#         print("auc_copod = ", auc_copod)
#         print("-" * 50)


# auc_np = np.array(auc_list_iforest)
# print("*" * 50)
# print("IFOREST Медиана auc = {}".format(np.median(auc_np)))
# print("IFOREST Среднее auc = {}".format(np.mean(auc_np)))
# print("*" * 50)

# auc_np = np.array(auc_list_lof)
# print("*" * 50)
# print("LOF Медиана auc = {}".format(np.median(auc_np)))
# print("LOF Среднее auc = {}".format(np.mean(auc_np)))
# print("*" * 50)

# auc_np = np.array(auc_list_knn)
# print("*" * 50)
# print("kNN Медиана auc = {}".format(np.median(auc_np)))
# print("kNN Среднее auc = {}".format(np.mean(auc_np)))
# print("*" * 50)

# auc_np = np.array(auc_list_copod)
# print("*" * 50)
# print("COPOD Медиана auc = {}".format(np.median(auc_np)))
# print("COPOD Среднее auc = {}".format(np.mean(auc_np)))
# print("*" * 50)

### AE, VAE

In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score

import pyod
from pyod.models import auto_encoder
from pyod.models import vae

c = 0.1  # отношение количества аномальных экземпляров к нормальным

categories = ["comp.graphics",
              'talk.politics.mideast',
              "rec.sport.hockey",
              "sci.med",
              "sci.space",
              'misc.forsale',
              'soc.religion.christian',
              'talk.politics.misc']

experimant_cnt = 0
all_experiments = len(categories) * (len(categories) -  1)
auc_list_ae = []
auc_list_vae = []

# Формирование словаря с категориями
dataset = {}
for cat in categories:
    # Загрузка
    dataset[cat] = fetch_20newsgroups(subset='all', categories=[cat],
                            shuffle=True, random_state=123,
                            remove=('headers', 'footers'), return_X_y=True)[0]
    # Предобработка
    dataset[cat] = [preprocess_text(text) for text in dataset[cat]]

# Перебираем пары категорий
for i in range(len(categories)):
    for j in range(len(categories)):
        if i == j:
            continue

        c1 = categories[i]
        c2 = categories[j]

        experimant_cnt += 1

        # Формирование нормальной и аномальной выборок
        normal_data = dataset[c1]
        anomal_data = dataset[c2][:min(int(c * len(normal_data)) + 1, len(dataset[c2]))]
        all_data = normal_data + anomal_data

        # TF-IDF векторизация
        vectorizer = TfidfVectorizer()
        all_data_tf = vectorizer.fit_transform(all_data).toarray()

        # Формирование выборок
        x = all_data_tf
        y = np.array([False] * len(normal_data) + [True] * len(anomal_data))
        all_data, x, y = shuffle(all_data, x, y, random_state=123)

        # Задаем модели

        # AE
        ae_clf = auto_encoder.AutoEncoder(
            hidden_neurons=[256, 128, 64, 64, 128, 256],
            hidden_activation='relu',
            output_activation='sigmoid',
            optimizer='adam',
            epochs=35,
            batch_size=16,
            dropout_rate=0.3,
            l2_regularizer=0.1,
            validation_size=0.1,
            preprocessing=True,
            verbose=0,
            random_state=123,
            contamination=0.1
        )

        # VAE
        # vae_clf = vae.VAE(
        #     contamination=0.1,
        #     encoder_neurons=[256, 128, 64],
        #     decoder_neurons=[64, 128, 256],
        #     latent_dim=5,
        #     hidden_activation='relu',
        #     output_activation='sigmoid',
        #     optimizer='adam',
        #     epochs=16,
        #     batch_size=64,
        #     dropout_rate=0.3,
        #     l2_regularizer=0.1,
        #     validation_size=0.1,
        #     preprocessing=True,
        #     verbose=0,
        #     random_state=123,
        # )

        # Считаем метрику ROC AUC

        ae_clf.fit(x)
        y_predict_ae = ae_clf.decision_function(x)
        auc_ae = roc_auc_score(y, y_predict_ae)

        # vae_clf.fit(x)
        # y_predict_vae = vae_clf.decision_function(x)
        # auc_vae = roc_auc_score(y, y_predict_vae)

        # Добавляем в списки
        auc_list_ae.append(auc_ae)
        # auc_list_vae.append(auc_vae)

        # Вывод результатов
        print("-" * 50)
        print("Эксперимент №{}/{}  с normal = {}, anomal = {}".format(
            experimant_cnt, all_experiments, c1, c2))
        print("auc_ae = ", auc_ae)
        # print("auc_vae = ", auc_vae)
        print("-" * 50)


auc_np = np.array(auc_list_ae)
print("*" * 50)
print("AE Медиана auc = {}".format(np.median(auc_np)))
print("AE Среднее auc = {}".format(np.mean(auc_np)))
print("*" * 50)

# auc_np = np.array(auc_list_vae)
# print("*" * 50)
# print("VAE Медиана auc = {}".format(np.median(auc_np)))
# print("VAE Среднее auc = {}".format(np.mean(auc_np)))
# print("*" * 50)

--------------------------------------------------
Эксперимент №1/56  с normal = comp.graphics, anomal = talk.politics.mideast
auc_ae =  0.8618621138074962
--------------------------------------------------
--------------------------------------------------
Эксперимент №2/56  с normal = comp.graphics, anomal = rec.sport.hockey
auc_ae =  0.8150995238794387
--------------------------------------------------
--------------------------------------------------
Эксперимент №3/56  с normal = comp.graphics, anomal = sci.med
auc_ae =  0.8223304738133692
--------------------------------------------------
--------------------------------------------------
Эксперимент №4/56  с normal = comp.graphics, anomal = sci.space
auc_ae =  0.8206577595066804
--------------------------------------------------


ResourceExhaustedError: ignored

In [1]:
import multiprocessing
multiprocessing.cpu_count()

2