In [None]:
!pip install corus
!pip install razdel
!pip install pymorphy2

In [None]:
!pip install bigartm

In [None]:
!pip install pyLDAvis==3.4.1

In [5]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [31]:
import re
import nltk
import pymorphy2
import numpy as np
from corus import load_lenta
from nltk.corpus import stopwords
from razdel import sentenize
from tqdm import tqdm_notebook
from pprint import pprint

import gensim
from gensim import corpora
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

import artm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook

random_state = 9

### Импорт данных

In [None]:
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz

In [None]:
path = 'lenta-ru-news.csv.gz'
records = load_lenta(path)
next(records)

In [None]:
data = [next(records).text for i in range(1000)]
data[5]

### Предобработка

#### Нормализация и удаление стоп слов

In [None]:
morph = pymorphy2.MorphAnalyzer()
nltk.download('stopwords')

def normalization(data, stopwords = stopwords.words('russian')):
  text = re.sub('ё', 'е', data.lower())
  text = re.sub(r'([.,!?])', r' \1', text)
  text = re.sub(r'[^а-яА-Я\s]+', '', text)
  text = text.strip()
  text = [w for w in text.split() if w not in stopwords]
  text = [w for w in text if len(w) >= 2]

  return ' '.join(text)

#### Токенизация, Лемматизация

In [None]:
text = data[5]
text = [_.text for _ in list(sentenize(text))]
text

In [13]:
def lem_tok(data, morph = morph):
  result = [morph.parse(x)[0].normal_form for x in data.split()]

  return result


#### Окончательная обработка


In [14]:
def get_result(data, morph = morph, stopwords = stopwords.words('russian')):
  result = normalization(data)
  result = lem_tok(result)
  return result

In [15]:
data_processed = [get_result(elem) for elem in data]

In [35]:
data_processed, x_test = train_test_split(data_processed, random_state=random_state)

### LDA

In [16]:
bigram = gensim.models.Phrases(data_processed, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_processed], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [17]:
def get_ngrams(texts_out):
    texts_out = [bigram_mod[doc] for doc in texts_out]
    texts_out = [trigram_mod[bigram_mod[doc]] for doc in texts_out]
    return texts_out

In [18]:
data_ready = get_ngrams(data_processed)

In [19]:
for item in data_ready:
  for token in item:
    if len(item) < 4 and '_' in token:
      print(item)

In [20]:
id2word = corpora.Dictionary(data_ready)

In [21]:
corpus = [id2word.doc2bow(text) for text in data_ready]

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=random_state,
    update_every=1, 
    chunksize=30,
    passes=250, 
    alpha='symmetric', 
    iterations=500, 
    per_word_topics=True,
)

pprint(lda_model.print_topics())

In [23]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

In [None]:
print('Perplexity: ', lda_model.log_perplexity(corpus))

#### Черновик визуализации метрик на темах

Тут проблема с List Index, которую я так и не смог решить. top = lda_model.get_topics()

print(len(top))

In [107]:
def get_coherence_mean(model, corpus, n_top_words = 9):
  coherence_model = CoherenceModel(
      model=model,
      corpus=corpus,
      coherence = 'u_mass'
  )
  coherence = coherence_model.get_coherence()
  return coherence

def compute_coherence_values(min_topic = 4,
                             max_topic = 12,
                             step = 3):
  coherence_values =[]
  model_list = []

  for num_topics in tqdm_notebook(range(min_topic, max_topic, step)):
    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
        update_every=1, 
        chunksize=10, 
        passes=100, 
        alpha='symmetric', 
        iterations=100,
        per_word_topics=True
    )
    model_list.append(model)
    coherence_values.append(get_coherence_mean(model, x_test))

  return model_list, coherence_values




In [None]:
model_list, coherence_values = compute_coherence_values(min_topic=min_topic,
                                                        max_topic=max_topic,
                                                        step = step)

In [None]:
x = range(min_topic, max_topic, step)

plt.figure(figsize =(15, 7))

plt.plot(x, coherence_values)
plt.xlabel('Num_topics')
plt.ylabel('Coherence Score')

In [53]:
for n_topics, coherence in zip(x, coherence_values):
  print(f'Num topics = {n_topics}, Coherence = {coherence:.3f}')

Num topics = 1, Coherence = 0.538
Num topics = 2, Coherence = 0.511
Num topics = 3, Coherence = 0.540
Num topics = 4, Coherence = 0.543
Num topics = 5, Coherence = 0.562
Num topics = 6, Coherence = 0.559
Num topics = 7, Coherence = 0.556
Num topics = 8, Coherence = 0.549


### BigARTM

In [None]:
data_processed = [' '.join(elem) for elem in data_processed]

In [None]:
cv = CountVectorizer(max_features=1000, max_df=0.8, min_df=2, ngram_range=(1, 2))
n_wd = np.array(cv.fit_transform(data_processed).todense()).T

In [None]:
n_wd.shape

In [None]:
token_list = [i for i in cv.vocabulary_.keys()]

In [None]:
bv = artm.BatchVectorizer(data_format='bow_n_wd', n_wd=n_wd, vocabulary=token_list)

In [None]:
def fit_model_simple(seed=random_state):
  model = artm.ARTM(num_topics=40,
                    dictionary=bv.dictionary,
                    cache_theta=True,
                    seed=seed,
                    regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta',tau=-0.15),
                                  artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5)])

  model.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=bv.dictionary))


  model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
  model.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=100))

  model.fit_offline(bv, num_collection_passes=30)
  return model

In [None]:
model = fit_model_simple()
model.score_tracker["perplexity_score"].last_value

In [None]:
topics_to_check =  [10, 15, 20]

In [None]:
def run_hyperparams_search():

  space={
       'num_topics': hp.choice('num_topics', topics_to_check),
        'phi_tau': hp.uniform('SparsePhi', -1, 1),
       'theta_tau': hp.uniform('SparseTheta', -1, 1),
       'decorrelation_tau': hp.uniform('DecorrelatorPhi', 1e+2, 1e+5),
  }

  trials = Trials()

  best_hyperparams = fmin(
      fn=objective,
      space=space,
      algo=tpe.suggest,
      max_evals=20,
      trials=trials,
  )
  return best_hyperparams

In [None]:
def objective(space):
  model = artm.ARTM(num_topics=space['num_topics'], dictionary=bv.dictionary, cache_theta=True)

  model.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=bv.dictionary))
  model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
  model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
  model.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=10))

  model.regularizers.add(
      artm.SmoothSparsePhiRegularizer(
        name='SparsePhi',
        tau=space['phi_tau']
    ),
  ) # Сглаживание распределений терминов в темах. Используется для выделения фоновых тем, собирающих общую лексику языка или общую лексику данной коллекции.
  model.regularizers.add(
      artm.SmoothSparseThetaRegularizer(
          name='SparseTheta',
          tau=space['theta_tau']
      ),
  ) # Сглаживание распределений тем в документах. Используется для выделения фоновых слов в каждом документах.
  model.regularizers.add(
      artm.DecorrelatorPhiRegularizer(
        name='DecorrelatorPhi',
        tau=space['decorrelation_tau']
      ),
  ) # Декоррелирование распределений терминов в темах. Используется для повышения различности лексических ядер предметных тем.

  model.fit_offline(bv, num_collection_passes=30)

  perplexity_score = model.score_tracker["perplexity_score"].last_value
  print("Perplexity:", perplexity_score)
  return {'loss': perplexity_score, 'status': STATUS_OK }

In [None]:

best = run_hyperparams_search()

In [None]:
best

In [None]:
best = {'DecorrelatorPhi': 54323.627155284725,
 'SparsePhi': -0.41285940422957523,
 'SparseTheta': 0.03285979999746358,
 'num_topics': 2}

In [None]:
def fit_model(seed=random_state):
  model = artm.ARTM(num_topics=topics_to_check[best['num_topics']], dictionary=bv.dictionary, cache_theta=True, seed=seed)
  model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                        dictionary=bv.dictionary))

  model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score'))
  model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
  model.scores.add(artm.TopTokensScore(name='top_tokens_score', num_tokens=10))

  model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=best['SparsePhi'])) # сглаживание/разреживание матрицы Phi
  model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=best['SparseTheta'])) # сглаживание/разреживание матрицы Theta
  model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=best['DecorrelatorPhi'])) # сделать темы более разнообразными

  model.fit_offline(bv, num_collection_passes=30)
  return model

In [None]:
model = fit_model()
model.score_tracker["perplexity_score"].last_value

338.640380859375

In [None]:
def prepare_vis_data():
    phi = model.get_phi()
    theta = model.get_theta().to_numpy().T
    theta = theta / theta.sum(axis=1, keepdims=1)
    data = {'topic_term_dists': phi.to_numpy().T,
            'doc_topic_dists': theta,
            'doc_lengths': n_wd.sum(axis=0).tolist(),
            'vocab': phi.T.columns,
            'term_frequency': n_wd.sum(axis=1).tolist()}
    return data

In [None]:
model_data = prepare_vis_data()
model_vis = pyLDAvis.prepare(**model_data)

In [None]:
pyLDAvis.enable_notebook()
model_vis