# Библиотека gensim для тематического моделирования
## Данная модель основана на байесовских методов машинного обучения

In [20]:
from gensim import corpora, models
import os
import pandas as pd
import re
import pymorphy2

In [52]:
os.chdir('C://Users/DeryabinNS/Articles')
morph = pymorphy2.MorphAnalyzer()

## Preparing frequency matrix

In [141]:
with open("articles.txt", "r") as file:
    text = file.read()

In [142]:
df = [re.split('[^a-z]', sentence.lower().replace('\n', '')) for sentence in text.split()]
words = [word for sen in df for word in sen if word != '']

In [143]:
unique_words = []
for i in words:
    unique_words.append(morph.parse(i)[0].normal_form)
words = unique_words
unique_words = list(set(unique_words))

In [147]:
with open("vocab.txt", "w") as f:
    for k in unique_words:
        f.write(k+'\n')

In [145]:
def get_matrix(name_file):
    with open(name_file, "r") as file:
        text = file.read()
    df = [re.split('[^a-z]', sentence.lower().replace('\n', '')) for sentence in text.split()]
    words = [word for sen in df for word in sen if word != '']
    list_words = []
    found = {}
    for i in words:
        list_words.append(morph.parse(i)[0].normal_form)
    for i in list_words:
        if i in unique_words:
            found.setdefault(unique_words.index(i), 0)
            found[unique_words.index(i)] += 1
    for i in name_file:
        if i.isdigit():
            m = i
    with open("docword.txt", "a") as f:
        for k, v in found.items():
            f.write(str(m) + ' ' + str(k) + ' ' + str(v) + '\n')

In [146]:
list_articles = ['article1.txt', 'article2.txt', 'article3.txt', 'article4.txt', 'article5.txt']
for i in list_articles:
    get_matrix(i)

## Modeling

In [154]:
# Импортируем данные в формте UCI Bag of Words
data = corpora.UciCorpus("docword.txt", "vocab.txt")
dictionary = data.create_dictionary()

In [162]:
# обучение модель
ldamodel = models.ldamodel.LdaModel(data, id2word=dictionary, num_topics=6, passes=20, alpha=1.25, eta=1.25)

In [157]:
# Сохранение модели
ldamodel.save("ldamodel_xkcd")

In [158]:
# Загрузка модели
ldamodel = models.ldamodel.LdaModel.load("ldamodel_xkcd")

In [163]:
# выводим топы слов
for t, top_words in ldamodel.print_topics(num_topics=10, num_words=10):
    print("Topic", t, ":", top_words)

Topic 0 : 0.001*"b'finally'" + 0.001*"b'intricate'" + 0.001*"b'released'" + 0.001*"b'attorney'" + 0.001*"b'justice'" + 0.001*"b'uncertain'" + 0.001*"b'wales'" + 0.001*"b'dramatic'" + 0.001*"b'excelled'" + 0.001*"b'assertion'"
Topic 1 : 0.001*"b'finally'" + 0.001*"b'attorney'" + 0.001*"b'intricate'" + 0.001*"b'justice'" + 0.001*"b'released'" + 0.001*"b'uncertain'" + 0.001*"b'assertion'" + 0.001*"b'something'" + 0.001*"b'books'" + 0.001*"b'credits'"
Topic 2 : 0.026*"b'finally'" + 0.015*"b'uncertain'" + 0.013*"b'released'" + 0.010*"b'attorney'" + 0.008*"b'intricate'" + 0.008*"b'justice'" + 0.006*"b'signed'" + 0.006*"b'start'" + 0.005*"b'books'" + 0.005*"b'dramatic'"
Topic 3 : 0.019*"b'finally'" + 0.009*"b'attorney'" + 0.009*"b'justice'" + 0.009*"b'released'" + 0.008*"b'intricate'" + 0.007*"b'uncertain'" + 0.006*"b'house'" + 0.006*"b'euro'" + 0.005*"b'excelled'" + 0.005*"b'broader'"
Topic 4 : 0.001*"b'finally'" + 0.001*"b'uncertain'" + 0.001*"b'attorney'" + 0.001*"b'intricate'" + 0.001*"b'

In [164]:
# Вычисляем логарифм перплексии и немного преобразуем, чтобы привести к общепринятому виду
perplexity = ldamodel.log_perplexity(list(data))
print(2**(-perplexity))

86.91525133771681


In [165]:
# Получение распределения тем для конкретного документа
doc = list(data)
for i in range(len(doc)):
    d = doc[i]
    print(ldamodel.get_document_topics(d))

[(2, 0.9832463)]
[(2, 0.9574172), (5, 0.011107269)]
[(0, 0.010218977), (1, 0.010218729), (2, 0.015738789), (3, 0.93840563), (4, 0.010218189), (5, 0.015199698)]
[(5, 0.99055004)]
[(3, 0.96978873)]
