In [1]:
!pip install natasha pyLDAvis

Collecting natasha
  Downloading natasha-1.5.0-py3-none-any.whl (34.4 MB)
     ---------------------------------------- 34.4/34.4 MB 2.6 MB/s eta 0:00:00
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 1.5 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting razdel>=0.5.0
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Collecting slovnet>=0.6.0
  Downloading slovnet-0.6.0-py3-none-any.whl (46 kB)
     -------------------------------------- 46.7/46.7 kB 291.4 kB/s eta 0:00:00
Collecting ipymarkup>=0.8.0
  D

In [None]:
import re
import nltk
import pyLDAvis
import gensim
import numpy as np
import pandas as pd
import pyLDAvis.gensim_models as gensimvis
import warnings
import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from tqdm.notebook import tqdm
from multiprocessing import Pool
from gensim.models import *
from gensim import corpora

# Silence DeprecationWarning noise for the rest of the notebook.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Fetch the NLTK stop-word corpus used when building the stop-word list below.
nltk.download('stopwords')

In [None]:
# Read the questions dataset; the first CSV column is used as the index.
df = pd.read_csv(filepath_or_buffer='/content/rospotrebnadzor.csv', index_col=0)

# Show five random rows as a quick sanity check of the loaded frame.
df.sample(n=5)

In [None]:
# Morphological analyzer shared by the lemmatization step.
morph = MorphAnalyzer()

# Standard Russian stop words from NLTK, extended with a few extra words
# that should also be dropped from the questions.
stopwords_list = stopwords.words('russian')
stopwords_list.extend(['это', 'здравствуйте', 'добрый', 'день', 'год'])

In [None]:
# Raw string: '\w' in a plain string literal is an invalid escape sequence.
words_regex = re.compile(r'\w+')

def find_words(text, regex = words_regex):
    """Split ``text`` into lowercase alphabetic tokens of at least 3 characters.

    Args:
        text: Raw text to tokenize. Anything that is not a ``str`` (for
            example ``None`` or a ``NaN`` float from a missing table cell)
            is treated as an empty document.
        regex: Compiled pattern whose ``findall`` yields candidate tokens.

    Returns:
        A list of tokens, in order of appearance. Tokens containing any
        non-letter character (digits, underscores) or shorter than three
        characters are dropped.
    """
    if not isinstance(text, str):
        # Missing values have no `.lower()`; return an empty token list
        # instead of raising AttributeError.
        return []
    tokens = regex.findall(text.lower())
    return [token for token in tokens if token.isalpha() and len(token) >= 3]

def lemmatize(words, lemmer = morph, stopwords = stopwords_list):
    """Lemmatize ``words`` and drop stop words and non-alphabetic lemmas.

    Args:
        words: Iterable of word strings.
        lemmer: Analyzer whose ``parse(word)`` returns a sequence of parses;
            the ``normal_form`` of the first parse is used as the lemma.
        stopwords: Collection of lemmas to exclude from the result.

    Returns:
        A list of lemmas, in input order, that are not in ``stopwords`` and
        consist only of letters.
    """
    # Build a set once per call so each membership test is a hash lookup
    # rather than a scan of the whole stop-word list.
    stopword_set = set(stopwords)
    lemmas = (lemmer.parse(word)[0].normal_form for word in words)
    return [lemma for lemma in lemmas
            if lemma not in stopword_set and lemma.isalpha()]

def preprocess(text):
    """Tokenize ``text`` with ``find_words`` and return its filtered lemmas.

    The tokens are passed through ``lemmatize`` using that function's
    default analyzer and stop-word list.
    """
    tokens = find_words(text)
    return lemmatize(tokens)

In [None]:
# Run the tokenize + lemmatize pipeline over every question and keep the
# resulting token lists in a new column next to the raw text.
df['preprocessed_questions'] = df['questions'].map(preprocess)

# Preview the first rows, including the new column.
df.head()

In [None]:
# Build the token <-> id mapping from the preprocessed questions.
dictionary = corpora.Dictionary(documents=df['preprocessed_questions'])

# Prune the vocabulary: drop tokens seen in fewer than 2 documents or in more
# than 85% of them; keep_n=None puts no cap on the remaining vocabulary size.
dictionary.filter_extremes(
    no_below=2,
    no_above=0.85,
    keep_n=None,
)

# Persist the filtered dictionary to disk.
dictionary.save('rpn.dict')

In [None]:
# Convert every preprocessed question into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, df['preprocessed_questions']))

# Persist the bag-of-words corpus via MmCorpus (the file is named 'rpn.model'
# but holds the serialized corpus, not a trained model).
corpora.MmCorpus.serialize('rpn.model', corpus)

In [None]:
%time 
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=15, 
                        chunksize=100, update_every=1, passes=2)

In [None]:
# `log_perplexity` returns the per-word likelihood bound, not the perplexity
# itself; gensim defines the perplexity estimate as 2 ** (-bound).
print('Персплексия: ', np.exp2(-lda.log_perplexity(corpus)))

In [None]:
# Score the current model with the 'c_v' coherence measure, computed over
# the preprocessed token lists.
coherence_model_lda = CoherenceModel(
    model=lda,
    texts=df['preprocessed_questions'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

print('Средняя когерентность: ', coherence_lda)

In [None]:
# Sweep over candidate topic counts and record the 'c_v' coherence of each
# fitted model, to help pick the number of topics.
topics_list = [5, 10, 15, 20, 25, 30, 35, 40]
coherences = []

for num in topics_list:
    # Use a dedicated name for the throwaway models so the sweep does not
    # overwrite the `lda` model trained earlier; a fixed random_state keeps
    # the curve reproducible between runs.
    candidate_lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=num,
                                      chunksize=100, update_every=1, passes=2,
                                      random_state=42)
    candidate_coherence = CoherenceModel(model=candidate_lda,
                                         texts=df['preprocessed_questions'],
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherences.append(candidate_coherence.get_coherence())

# Coherence as a function of the number of topics.
plt.plot(topics_list, coherences)
plt.xlabel("Число тем")
plt.ylabel("Средняя когерентность")
plt.show()

In [None]:
# Final model with 10 topics. The fixed random_state makes the topics listed
# below (and the visualization built from this model) reproducible.
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=10,
                        chunksize=100, update_every=1, passes=2,
                        random_state=42)

# Show the top 10 words of every topic; the topic count is read from the
# model so it cannot drift from the value used for training.
lda.show_topics(num_topics=lda.num_topics, num_words=10, formatted=True)

In [None]:
%time
vis_data = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis_data)