## Предобработка данных

In [None]:
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_numeric
from gensim.utils import tokenize
import nltk; nltk.download('wordnet')
import nltk; nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# NLTK English stop-word list held as a set; preprocess_text() filters
# tokens against it after gensim's own stop-word removal.
stop_words = set(stopwords.words('english'))


def strip_html_tags(text):
    """Return ``text`` with HTML tags removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    extracted text fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def get_wordnet_pos(word):
    """Map the POS tag of ``word`` to the constant ``lemmatize()`` accepts.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects one of the WordNet part-of-speech constants.

    Args:
        word: A single token.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``; ``wordnet.NOUN`` when the tag starts with any other
        letter.
    """
    # Imported locally: the file only imports this corpus under the alias
    # ``wn``, so the bare name ``wordnet`` used below was otherwise unbound
    # and every call raised NameError.
    from nltk.corpus import wordnet

    # NOTE(review): nltk.pos_tag presumably needs a tagger model that this
    # file never downloads -- confirm it is installed before using this.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def preprocess_text(text, token=False):
    """Clean a raw document and reduce it to lemmatized tokens.

    Args:
        text: Raw document text (may contain HTML markup).
        token: When true, return the list of tokens; otherwise return the
            tokens joined by single spaces.

    Returns:
        A list of lemmatized tokens if ``token`` is true, else one string.
    """
    text = text.lower()
    text = strip_html_tags(text)  # drop HTML tags
    text = strip_non_alphanum(text)  # non-alphanumeric characters -> spaces
    text = strip_numeric(text)  # drop digits
    text = remove_stopwords(text)  # drop gensim's stop words
    text = strip_short(text, minsize=3)  # drop words shorter than 3 chars

    # One lemmatizer for the whole document instead of one per token.
    # lemmatize() is called without a POS argument, as before.
    lemmatizer = WordNetLemmatizer()
    word_list = [
        lemmatizer.lemmatize(word)
        # deacc=True strips accent marks while tokenizing
        for word in tokenize(text, deacc=True)
        if word not in stop_words  # second pass with NLTK's stop words
    ]
    if token:
        return word_list
    return ' '.join(word_list)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Ранжирование

In [None]:
from gensim.matutils import jensen_shannon


def GetListPopularTopics(lda_model, bow_corpus):
    """Rank topic indices from the most popular to the least popular.

    A topic's popularity is the sum, over every document in ``bow_corpus``,
    of the weight ``lda_model.get_document_topics`` reports for that topic.

    Returns the list of topic indices in descending order of popularity.
    """
    topic_count = lda_model.num_topics
    popularity = [0] * topic_count

    for document in bow_corpus:
        # minimum_probability=0 -- presumably so no topic is filtered out
        per_doc = lda_model.get_document_topics(document,
                                                minimum_probability=0)
        for topic_id, weight in per_doc:
            popularity[topic_id] += weight

    # sorted() is stable, so equally popular topics stay in index order
    return sorted(range(topic_count), key=popularity.__getitem__,
                  reverse=True)


def GetListPrimTopics(lda_model, top_popular_topics):
    """Order topics by growing a spanning tree with Prim's algorithm.

    Topics are vertices of a complete graph whose edge weights are the
    Jensen-Shannon distances between the topics' term distributions.
    Starting from the most popular topic, the topic closest to any already
    selected topic is appended repeatedly.

    Args:
        lda_model: Topic model exposing ``num_topics``, ``id2word`` and
            ``get_topic_terms``.
        top_popular_topics: Topic indices ranked by popularity; only the
            first element is used, as the start vertex.

    Returns:
        Topic indices in the order they were attached to the tree, start
        vertex first.

    Raises:
        IndexError: If ``top_popular_topics`` is empty.
    """
    num_topics = lda_model.num_topics
    vocab_size = len(lda_model.id2word)
    # Request a weight for every vocabulary term; sorting orders the
    # (term, weight) pairs by their first element.
    topics = [sorted(lda_model.get_topic_terms(i, vocab_size))
              for i in range(num_topics)]

    # The distance is symmetric and self-edges are never consulted, so
    # compute each unordered pair once and store it under both orderings.
    edge_dict = {}
    for v1 in range(num_topics):
        for v2 in range(v1 + 1, num_topics):
            distance = jensen_shannon(topics[v1], topics[v2])
            edge_dict[(v1, v2)] = distance
            edge_dict[(v2, v1)] = distance

    start_vertex = top_popular_topics[0]  # most popular topic is the root

    rankingTopics = [start_vertex]
    selected = {start_vertex}
    unselected = {i for i in range(num_topics) if i != start_vertex}

    # Prim's algorithm: attach the cheapest edge leaving the selected set.
    while unselected:
        # min() keeps the first minimum it meets, like a strict "<" scan.
        _, new_vertex = min(
            ((v1, v2) for v1 in selected for v2 in unselected),
            key=edge_dict.__getitem__,
        )
        selected.add(new_vertex)
        unselected.discard(new_vertex)
        rankingTopics.append(new_vertex)
    return rankingTopics

## Контекст с fastText

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

# Fetch the NLTK 'wordnet_ic' package that the next statement reads from.
nltk.download('wordnet_ic')

# Information-content data loaded from 'ic-semcor.dat'; simWN() passes it as
# the ``ic`` argument of wn.lin_similarity.
semcor_ic = wordnet_ic.ic('ic-semcor.dat')


def GetListWordsTopic(lda_model, dictionary, topicid, topn=10):
    """Return the ``topn`` words that characterize topic ``topicid``.

    Args:
        lda_model: Topic model exposing ``get_topic_terms``.
        dictionary: Mapping from term id to the word itself.
        topicid: Index of the topic to describe.
        topn: Number of terms to request from the model.

    Returns:
        The words in the order ``get_topic_terms`` returned their ids.
    """
    top_terms = lda_model.get_topic_terms(topicid=topicid, topn=topn)
    # The first element of each returned entry is the term id.
    return [dictionary[term[0]] for term in top_terms]


def simWN(word1, word2):
    """Return a WordNet-based similarity score for two words.

    The score is the mean of the best ``wn.wup_similarity`` and the best
    ``wn.lin_similarity`` (with ``semcor_ic``) over the words' synsets. If
    the Lin computation raises, the Wu-Palmer score alone is returned.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        The combined similarity as a float.
    """
    sim1 = sim(word1, word2, wn.wup_similarity)
    sim2 = 0
    divider = 2
    try:
        sim2 = sim(word1, word2, wn.lin_similarity, ic=semcor_ic)
    except Exception:
        # Deliberate best-effort fallback to the Wu-Palmer score alone.
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit still propagate.
        # NOTE(review): presumably raised when a synset's part of speech has
        # no information-content entries -- confirm and narrow the type.
        divider = 1
    return (sim1 + sim2) / divider


def sim(word1, word2, fun, flag_print=False, ic=None):
    """Return the highest ``fun`` score over all synset pairs of two words.

    Args:
        word1: First word; its synsets come from ``wn.synsets``.
        word2: Second word; its synsets come from ``wn.synsets``.
        fun: Similarity callable, invoked as ``fun(s1, s2)`` when ``ic`` is
            None and as ``fun(s1, s2, ic)`` otherwise.
        flag_print: When true, print the resulting score.
        ic: Optional extra argument forwarded to ``fun``. When given, only
            synset pairs with the same part of speech are compared.

    Returns:
        The maximum score found, or 0 when no pair produced a score.
    """
    syns1 = wn.synsets(word1)
    syns2 = wn.synsets(word2)
    if syns1 is None or syns2 is None:
        return 0
    mx = 0
    for i in syns1:
        for j in syns2:
            if ic is None:
                # measure that takes only the two synsets
                tmp = fun(i, j)
            elif i.pos() == j.pos():
                # ic-based measure: only compare equal parts of speech
                tmp = fun(i, j, ic)
            else:
                continue
            if tmp is not None:
                mx = max(tmp, mx)
    if flag_print:
        print("Similarity between", word1, "and", word2, ":", mx)
    return mx


def GetTopInContext(rankingTopics, lda_model, dictionary,
                    num_words):
    """Re-rank the tail of a topic ranking by similarity to its head.

    The head of ``rankingTopics`` (about two thirds of the topics, plus one)
    is treated as normal; the remaining topics are candidate anomalies. Each
    candidate is scored by the sum of ``simWN`` over every pair of one of
    its ``num_words`` characteristic words and one characteristic word of a
    normal topic.

    Returns the candidate topic indices sorted by descending total score.
    """
    normal_share = 2 / 3
    split_at = int(lda_model.num_topics * normal_share) + 1
    normal_ids = rankingTopics[:split_at]
    candidate_ids = rankingTopics[split_at:]

    # num_words characteristic words for every topic, indexed by topic id
    words_by_topic = [
        GetListWordsTopic(lda_model, dictionary, topic_id, num_words)
        for topic_id in range(lda_model.num_topics)
    ]

    # similarity measure in use
    similarity = simWN

    scored = []
    for candidate in candidate_ids:
        total = 0
        for normal in normal_ids:
            # per-pair subtotal kept separate so the summation order (and
            # hence the float result) is unchanged
            pair_sum = 0
            for candidate_word in words_by_topic[candidate]:
                for normal_word in words_by_topic[normal]:
                    pair_sum += similarity(candidate_word, normal_word)
            total += pair_sum
        scored.append((candidate, total))

    scored.sort(key=lambda pair: pair[1], reverse=(similarity == simWN))
    return [entry[0] for entry in scored]


[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet_ic.zip.


In [None]:
import gensim.downloader as api

# Load the 'fasttext-wiki-news-subwords-300' model through gensim's downloader.
# NOTE(review): fasttext_model300 is not referenced anywhere else in the
# visible part of this file -- confirm it is still needed.
fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')



## Подсчет anomal_score

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def GetPredict(model, bow_corpus, anomal_topics):
    """Score how anomalous each document is.

    A document's raw score is the total weight the model assigns to the
    topics listed in ``anomal_topics``. Raw scores are then min-max scaled
    to the range [0, 1].

    Args:
        model: Topic model exposing ``get_document_topics``.
        bow_corpus: Iterable of bag-of-words documents.
        anomal_topics: Indices of the topics treated as anomalous.

    Returns:
        A numpy array with one scaled score per document. The array is empty
        for an empty corpus and all zeros when every document has the same
        raw score (previously a division by zero produced NaN).
    """
    anomal_set = set(anomal_topics)  # constant-time membership test
    y_predict = []
    for sample in bow_corpus:
        anomal_score = 0
        for idx, pr in model.get_document_topics(sample,
                                                 minimum_probability=0):
            if idx in anomal_set:
                anomal_score += pr
        y_predict.append(anomal_score)

    X = np.array(y_predict)
    if X.size == 0:
        # min()/max() raise on an empty array
        return X
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # all scores equal: nothing to rank, avoid 0/0
        return np.zeros(X.shape)
    return (X - lowest) / span  # scaled to [0, 1]

## Эксперименты

# LDA

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora
from gensim.models import LdaMulticore
from sklearn.metrics import roc_auc_score
from gensim.models import TfidfModel

c = 0.1  # ratio of the number of anomalous examples to normal ones

# Anomalous category of each experiment; paired position by position with
# normal_categories via zip() in the experiment loop.
anomal_categories = [
    'comp.os.ms-windows.misc', 
    'talk.politics.mideast',
    'soc.religion.christian',
    'talk.politics.mideast',
    'misc.forsale',
    'sci.med',
    'sci.space',
    'comp.sys.ibm.pc.hardware',
    'talk.politics.mideast',
    'comp.windows.x',
    'comp.sys.ibm.pc.hardware',
    'comp.windows.x',
    'comp.sys.ibm.pc.hardware',
    'sci.electronics',
    'soc.religion.christian',
    'soc.religion.christian',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.misc',
    'sci.crypt',
    'talk.politics.mideast',
    'talk.politics.mideast',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.space',
    'talk.politics.mideast',
    'sci.crypt',
    'talk.politics.mideast',
    'comp.windows.x',
    'rec.sport.baseball',
    'comp.windows.x',
    'rec.sport.hockey',
    'comp.sys.ibm.pc.hardware',
    'comp.windows.x',
    'sci.crypt',
]

# Normal category of each experiment (same length as anomal_categories).
normal_categories = [
                     'rec.autos',
                     'sci.electronics',
                     'rec.autos',
                     'sci.space',
                     'rec.motorcycles',
                     'talk.politics.mideast',
                     'talk.politics.mideast',
                     'talk.politics.misc',
                     'soc.religion.christian',
                     'talk.politics.misc',
                     'alt.atheism',
                     'alt.atheism',
                     'talk.politics.misc',
                     'soc.religion.christian',
                     'rec.sport.baseball',
                     'comp.graphics',
                     'soc.religion.christian',
                     'soc.religion.christian',
                     'sci.electronics',
                     'rec.autos',
                     'sci.crypt',
                     'sci.med',
                     'rec.motorcycles',
                     'sci.space',
                     'soc.religion.christian',
                     'comp.graphics',
                     'rec.sport.hockey',
                     'comp.sys.ibm.pc.hardware',
                     'rec.sport.baseball',
                     'talk.politics.mideast',
                     'rec.autos',
                     'sci.space',
                     'soc.religion.christian',
                     'rec.sport.hockey',
                     'rec.sport.baseball'
]
    


# Every category that is downloaded into `dataset` before the experiments.
categories = ['alt.atheism',
                'comp.graphics',
                'comp.os.ms-windows.misc',
                'comp.sys.ibm.pc.hardware',
                'comp.sys.mac.hardware',
                'comp.windows.x',
                'misc.forsale',
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey',
                'sci.crypt',
                'sci.electronics',
                'sci.med',
                'sci.space',
                'soc.religion.christian',
                'talk.politics.guns',
                'talk.politics.mideast',
                'talk.politics.misc',
                'talk.religion.misc']

# Running experiment counter and the total shown in the progress line.
experimant_cnt = 0
all_experiments = len(normal_categories) # len(categories) * (len(categories) - 1)
# ROC AUC of every experiment for the two ways of picking anomalous topics.
auc_list_content = []
auc_list_context = []

# Формирование словаря с категориями
dataset = {}
for cat in categories:
    dataset[cat] = fetch_20newsgroups(subset='all', categories=[cat],
                            shuffle=True, random_state=123,
                            remove=('headers', 'footers'), return_X_y=True)[0]

# Перебираем пары категорий
# for c1 in categories:
#     for c2 in categories:
for c1, c2 in zip(normal_categories, anomal_categories):
    if c1 != c2:
        experimant_cnt += 1
        # Формирование нормальной и аномальной выборок
        normal_data = dataset[c1]
        anomal_data = dataset[c2][:min(int(c * len(normal_data)) + 1, len(dataset[c2]))]
        y = np.array(
            [False] * len(normal_data) + [True] * len(anomal_data))
        normal_data = [preprocess_text(text, token=True) for text in normal_data]
        anomal_data = [preprocess_text(text, token=True) for text in anomal_data]
        all_data = normal_data + anomal_data

        # Формирование словаря
        dictionary = corpora.Dictionary(all_data)
        # Оставляем слова, встречающиеся >= no_below документах
        # Оставляем слова, встречающиеся <= чем в no_above документах
        # dictionary.filter_extremes(no_below=5, no_above=0.4,
                                    # keep_n=None)
        # Назачение новых индексов словам, минуя пропуски
        # dictionary.compactify()

        # Создание мешка слов
        bag_of_words = [dictionary.doc2bow(doc) for doc in all_data] # convert corpus to Bag of Words

        # model = TfidfModel(bag_of_words)  # fit model
        # vector = model[bag_of_words]   # apply model to the first corpus document

        # Тематическое моделирование
        lda_model = LdaMulticore(corpus=bag_of_words,
                                    num_topics=10,
                                    random_state=123,
                                    id2word=dictionary,
                                    passes=10,
                                    workers=2)
        # Ранжирование тем
        topics_popular = GetListPopularTopics(lda_model, bag_of_words)
        # topics_prim = GetListPrimTopics(lda_model, topics_popular)

        # Формирование аномальных тем
        anomal_topics_content = topics_popular[-3:]
        anomal_topics_context = GetTopInContext(topics_popular,
                                                lda_model,
                                                dictionary,
                                                50)[-3:]

        y_predict_content = GetPredict(lda_model, bag_of_words,
                                anomal_topics_content)
        y_predict_context = GetPredict(lda_model, bag_of_words,
                                anomal_topics_context)

        # y_predict_content /= y_predict_content.max()
        # y_predict_context /= y_predict_context.max()

        auc_content = roc_auc_score(y, y_predict_content)
        auc_context = roc_auc_score(y, y_predict_context)

        print("-" * 50)
        print("Эксперимент №{}/{}  с normal = {}, anomal = {}".format(
            experimant_cnt, all_experiments, c1, c2))
        print("Контент: roc_auc_score = {}".format(auc_content))
        print("Контекст: roc_auc_score = {}".format(auc_context))
        print("-" * 50)

        auc_list_content.append(auc_content)
        auc_list_context.append(auc_context)

auc_np_content = np.array(auc_list_content)
auc_np_context = np.array(auc_list_context)
print("*" * 50)
print("Медиана auc_content = {}".format(np.median(auc_np_content)))
print("Среднее auc_content = {}".format(np.mean(auc_np_content)))
print("LDA Медиана auc_context = {}".format(np.median(auc_np_context)))
print("LDA Среднее auc_context = {}".format(np.mean(auc_np_context)))
print("*" * 50)
print("LDA Эксперименты завершены!")

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


--------------------------------------------------
Эксперимент №1/35  с normal = rec.autos, anomal = comp.os.ms-windows.misc
Контент: roc_auc_score = 0.7590505050505051
Контекст: roc_auc_score = 0.75910101010101
--------------------------------------------------
--------------------------------------------------
Эксперимент №2/35  с normal = sci.electronics, anomal = talk.politics.mideast
Контент: roc_auc_score = 0.760203662642687
Контекст: roc_auc_score = 0.7601626016260162
--------------------------------------------------
--------------------------------------------------
Эксперимент №3/35  с normal = rec.autos, anomal = soc.religion.christian
Контент: roc_auc_score = 0.7602222222222221
Контекст: roc_auc_score = 0.7604040404040404
--------------------------------------------------
--------------------------------------------------
Эксперимент №4/35  с normal = sci.space, anomal = talk.politics.mideast
Контент: roc_auc_score = 0.7608301863621012
Контекст: roc_auc_score = 0.7609120587

## NMF-2

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.models.nmf import Nmf
from sklearn.metrics import roc_auc_score
from gensim.models import TfidfModel

c = 0.1  # ratio of the number of anomalous examples to normal ones

# Anomalous category of each experiment; paired position by position with
# normal_categories via zip() in the experiment loop.
anomal_categories = [
    'comp.os.ms-windows.misc', 
    'talk.politics.mideast',
    'soc.religion.christian',
    'talk.politics.mideast',
    'misc.forsale',
    'sci.med',
    'sci.space',
    'comp.sys.ibm.pc.hardware',
    'talk.politics.mideast',
    'comp.windows.x',
    'comp.sys.ibm.pc.hardware',
    'comp.windows.x',
    'comp.sys.ibm.pc.hardware',
    'sci.electronics',
    'soc.religion.christian',
    'soc.religion.christian',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.misc',
    'sci.crypt',
    'talk.politics.mideast',
    'talk.politics.mideast',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.space',
    'talk.politics.mideast',
    'sci.crypt',
    'talk.politics.mideast',
    'comp.windows.x',
    'rec.sport.baseball',
    'comp.windows.x',
    'rec.sport.hockey',
    'comp.sys.ibm.pc.hardware',
    'comp.windows.x',
    'sci.crypt',
]

# Normal category of each experiment (same length as anomal_categories).
normal_categories = [
                     'rec.autos',
                     'sci.electronics',
                     'rec.autos',
                     'sci.space',
                     'rec.motorcycles',
                     'talk.politics.mideast',
                     'talk.politics.mideast',
                     'talk.politics.misc',
                     'soc.religion.christian',
                     'talk.politics.misc',
                     'alt.atheism',
                     'alt.atheism',
                     'talk.politics.misc',
                     'soc.religion.christian',
                     'rec.sport.baseball',
                     'comp.graphics',
                     'soc.religion.christian',
                     'soc.religion.christian',
                     'sci.electronics',
                     'rec.autos',
                     'sci.crypt',
                     'sci.med',
                     'rec.motorcycles',
                     'sci.space',
                     'soc.religion.christian',
                     'comp.graphics',
                     'rec.sport.hockey',
                     'comp.sys.ibm.pc.hardware',
                     'rec.sport.baseball',
                     'talk.politics.mideast',
                     'rec.autos',
                     'sci.space',
                     'soc.religion.christian',
                     'rec.sport.hockey',
                     'rec.sport.baseball'
]
    


# Every category that is downloaded into `dataset` before the experiments.
categories = ['alt.atheism',
                'comp.graphics',
                'comp.os.ms-windows.misc',
                'comp.sys.ibm.pc.hardware',
                'comp.sys.mac.hardware',
                'comp.windows.x',
                'misc.forsale',
                'rec.autos',
                'rec.motorcycles',
                'rec.sport.baseball',
                'rec.sport.hockey',
                'sci.crypt',
                'sci.electronics',
                'sci.med',
                'sci.space',
                'soc.religion.christian',
                'talk.politics.guns',
                'talk.politics.mideast',
                'talk.politics.misc',
                'talk.religion.misc']

# Running experiment counter and the total shown in the progress line.
experimant_cnt = 0
all_experiments = len(anomal_categories) # len(categories) * (len(anomal_categories) - 1)
# ROC AUC of every experiment for the context-based topic selection.
auc_list_context = []

# Build a dict mapping each category name to its fetched documents
# (element [0] of the pair returned because of return_X_y=True)
dataset = {}
for cat in categories:
    dataset[cat] = fetch_20newsgroups(subset='all', categories=[cat],
                            shuffle=True, random_state=123,
                            remove=('headers', 'footers'), return_X_y=True)[0]

# Iterate over (normal, anomalous) category pairs
# for c1 in categories:
#     for c2 in categories:
for c1, c2 in zip(normal_categories, anomal_categories):
    if c1 != c2:
        experimant_cnt += 1
        # Build the normal and the anomalous samples; the anomalous one is
        # capped at int(c * len(normal_data)) + 1 documents
        normal_data = dataset[c1]
        anomal_data = dataset[c2][:min(int(c * len(normal_data)) + 1, len(dataset[c2]))]
        # Ground-truth labels: False for normal documents, True for anomalous
        y = np.array(
            [False] * len(normal_data) + [True] * len(anomal_data))
        normal_data = [preprocess_text(text, token=True) for text in normal_data]
        anomal_data = [preprocess_text(text, token=True) for text in anomal_data]
        all_data = normal_data + anomal_data

        # Build the dictionary
        dictionary = corpora.Dictionary(all_data)
        # Keep words that occur in >= no_below documents
        # Keep words that occur in <= no_above documents
        # dictionary.filter_extremes(no_below=5, no_above=0.4,
                                    # keep_n=None)
        # Assign new ids to the words, skipping the gaps
        # dictionary.compactify()

        # Build the bag-of-words corpus
        bag_of_words = [dictionary.doc2bow(doc) for doc in all_data] # convert corpus to Bag of Words

        # model = TfidfModel(bag_of_words)  # fit model
        # vector = model[bag_of_words]   # apply model to the first corpus document

        # Topic modelling (an Nmf model, despite the variable name lda_model)
        lda_model = Nmf(corpus=bag_of_words,
                            num_topics=10,
                            random_state=123,
                            id2word=dictionary,
                            passes=10,
                            minimum_probability=0.0,
                            normalize=False,
                            )
        # Rank the topics
        topics_popular = GetListPopularTopics(lda_model, bag_of_words)
        # topics_prim = GetListPrimTopics(lda_model, topics_popular)

        # Pick the anomalous topics: the last three entries of each ranking
        # (anomal_topics_content is computed but its scoring is commented out)
        anomal_topics_content = topics_popular[-3:]
        anomal_topics_context = GetTopInContext(topics_popular,
                                                lda_model,
                                                dictionary,
                                                50)[-3:]

        # y_predict_content = GetPredict(lda_model, bag_of_words,
        #                         anomal_topics_content)
        y_predict_context = GetPredict(lda_model, bag_of_words,
                                anomal_topics_context)

        # y_predict_content /= y_predict_content.max()
        # y_predict_context /= y_predict_context.max()

        # auc_content = roc_auc_score(y, y_predict_content)
        auc_context = roc_auc_score(y, y_predict_context)

        print("-" * 50)
        print("Эксперимент №{}/{}  с normal = {}, anomal = {}".format(
            experimant_cnt, all_experiments, c1, c2))
        # print("Контент: roc_auc_score = {}".format(auc_content))
        print("Контекст: roc_auc_score = {}".format(auc_context))
        print("-" * 50)

        # auc_list_content.append(auc_content)
        auc_list_context.append(auc_context)

# Summary over all experiments: median and mean ROC AUC
# auc_np_content = np.array(auc_list_content)
auc_np_context = np.array(auc_list_context)
print("*" * 50)
# print("Медиана auc_content = {}".format(np.median(auc_np_content)))
# print("Среднее auc_content = {}".format(np.mean(auc_np_content)))
print("NMF Медиана auc_context = {}".format(np.median(auc_np_context)))
print("NMF Среднее auc_context = {}".format(np.mean(auc_np_context)))
print("*" * 50)
print("NMF Эксперименты завершены!")

In [None]:
# for NMF
# !pip install "gensim==3.8.1"

Collecting gensim==3.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/44/93/c6011037f24e3106d13f3be55297bf84ece2bf15b278cc4776339dc52db5/gensim-3.8.1-cp37-cp37m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 1.6MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.1
