# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

### Задание № 1 (4 балла)

Попробуйте матричные разложения с 5 классификаторами - SGDClassifier, KNeighborsClassifier, MultinomialNB, RandomForest, ExtraTreesClassifier (про него подробнее почитайте в документации, он похож на RF). Используйте и NMF и SVD. Сравните результаты на кросс-валидации и выберите лучшее сочетание.

В итоге у вас должно получиться, как минимум 10 моделей (два разложения на каждый классификатор). Используйте 1 и те же параметры кросс-валидации. Параметры векторизации, параметры K в матричных разложениях, параметры классификаторов могут быть разными между экспериментами.

Можете взять поменьше данных, если все будет обучаться слишком долго (не ставьте параметр K слишком большим в NMF, иначе точно будет слишком долго)

In [17]:
import gensim
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
import pyLDAvis.gensim_models
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
morph = MorphAnalyzer()
warnings.filterwarnings("ignore")

In [12]:
data = pd.read_csv('avito_category_classification.csv')

In [None]:
data

Unnamed: 0,category_name,description
0,Автомобили,"отличное состояние,обслужиание в салоне"
1,Детская одежда и обувь,В отличном состоянии. Фирма KIKO. Очень теплый...
2,Предложение услуг,"Изготовление ограждений, перил,качелей, турник..."
3,Автомобили,Автомобиль в отличном техническом состоянии. О...
4,Бытовая техника,"Продается газовая плита ""Гефест"" (Белоруссия) ..."
...,...,...
9893,Товары для детей и игрушки,Чтобы посмотреть весь ассортимент нашего магаз...
9894,Детская одежда и обувь,"Весна,осень.74-80.вопросы можно в вайбер,двухс..."
9895,"Одежда, обувь, аксессуары","Кимоно Green Hill. Состояние отличное, рост ..."
9896,Детская одежда и обувь,Б/у кроссовки на девочку. Носили только в спор...


In [18]:
# добавим лемматизацию
def normalize(text):
    normalized_text = [word.text.strip(punctuation) for word \
                                                            in razdel_tokenize(text)]
    normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20 ]
    normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
    return ' '.join(normalized_text)

In [19]:
data['description_norm'] = data['description'].apply(normalize)

In [20]:
vectorizer = CountVectorizer(min_df=5, max_df=0.5)

In [21]:
def eval_table(X, y, pipeline, N=6):
    # зафиксируем порядок классов
    labels = list(set(y))
    
    # метрики отдельных фолдов будет хранить в табличке
    fold_metrics = pd.DataFrame(index=labels)
    # дополнительно также соберем таблицу ошибок
    errors = np.zeros((len(labels), len(labels)))
    
    # создаем стратегию кросс-валидации
    # shuffle=True (перемешивание) - часто критично важно указать
    # т.к. данные могут быть упорядочены и модель на этом обучится
    kfold = StratifiedKFold(n_splits=N, shuffle=True, )
    
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        # fit-predict как и раньше, но сразу пайплайном
        pipeline.fit(X[train_index], y[train_index])
        preds = pipeline.predict(X[test_index])
        
        # записываем метрику и индекс фолда
        fold_metrics[f'precision_{i}'] = precision_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'recall_{i}'] = recall_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'f1_{i}'] = f1_score(y[test_index], preds, labels=labels, average=None)
        errors += confusion_matrix(y[test_index], preds, labels=labels, normalize='true')
    
    # таблица для усредненных значений
    # тут мы берем колонки со значениями и усредняем их
    # часто также все метрики сразу суммируют и в конце просто делят на количество фолдов
    # но мы тут помимо среднего также хотим посмотреть на стандартное отклонение
    # чтобы понять как сильно варьируются оценки моделей
    result = pd.DataFrame(index=labels)
    result['precision'] = fold_metrics[[f'precision_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['precision_std'] = fold_metrics[[f'precision_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['recall'] = fold_metrics[[f'recall_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['recall_std'] = fold_metrics[[f'recall_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['f1'] = fold_metrics[[f'f1_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['f1_std'] = fold_metrics[[f'f1_{i}' for i in range(N)]].std(axis=1).round(2)
    
    # добавим одну колонку со средним по всем классам
    result.loc['mean'] = result.mean().round(2)
    # проценты ошибок просто усредняем
    errors /= N
    
    return result, errors

### SGDClassifier

In [22]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('scaler', StandardScaler()),
    ('clf', SGDClassifier())
])

In [23]:
metrics_nmf_sgd, errors_nmf_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_sgd


Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.78,0.03,0.77,0.07,0.78,0.05
Квартиры,0.95,0.02,0.93,0.02,0.94,0.0
Товары для детей и игрушки,0.58,0.06,0.51,0.04,0.54,0.04
Детская одежда и обувь,0.51,0.02,0.59,0.08,0.55,0.03
Бытовая техника,0.3,0.13,0.12,0.08,0.16,0.06
Мебель и интерьер,0.39,0.14,0.26,0.04,0.3,0.03
"Одежда, обувь, аксессуары",0.57,0.03,0.61,0.06,0.59,0.02
Ремонт и строительство,0.41,0.15,0.31,0.11,0.35,0.12
Предложение услуг,0.6,0.17,0.59,0.07,0.58,0.07
Телефоны,0.71,0.09,0.66,0.08,0.68,0.06


In [24]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('scaler', StandardScaler()),
    ('clf', SGDClassifier())
])

In [25]:
metrics_svd_sgd, errors_svd_sgd = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_sgd

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.87,0.03,0.77,0.03,0.82,0.02
Квартиры,0.97,0.01,0.82,0.05,0.89,0.03
Товары для детей и игрушки,0.64,0.05,0.63,0.03,0.64,0.03
Детская одежда и обувь,0.68,0.04,0.79,0.02,0.73,0.02
Бытовая техника,0.57,0.09,0.39,0.07,0.46,0.08
Мебель и интерьер,0.68,0.02,0.59,0.06,0.63,0.04
"Одежда, обувь, аксессуары",0.68,0.02,0.76,0.03,0.72,0.01
Ремонт и строительство,0.54,0.05,0.49,0.04,0.52,0.04
Предложение услуг,0.78,0.02,0.65,0.06,0.71,0.03
Телефоны,0.81,0.05,0.68,0.06,0.74,0.05


### KNeighborsClassifier

In [26]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('clf', KNeighborsClassifier(n_neighbors=3))
])

metrics_nmf_knn, errors_nmf_knn = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_knn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.45,0.06,0.69,0.05,0.55,0.06
Квартиры,0.89,0.04,0.83,0.04,0.86,0.03
Товары для детей и игрушки,0.59,0.07,0.29,0.03,0.39,0.03
Детская одежда и обувь,0.45,0.02,0.58,0.04,0.5,0.03
Бытовая техника,0.16,0.02,0.26,0.03,0.2,0.02
Мебель и интерьер,0.23,0.04,0.22,0.05,0.22,0.04
"Одежда, обувь, аксессуары",0.53,0.01,0.52,0.01,0.52,0.01
Ремонт и строительство,0.44,0.06,0.19,0.03,0.26,0.04
Предложение услуг,0.6,0.05,0.5,0.06,0.54,0.05
Телефоны,0.69,0.09,0.37,0.08,0.48,0.08


In [27]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', KNeighborsClassifier(n_neighbors=3))
])

metrics_svd_knn, errors_svd_knn = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_knn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.47,0.06,0.71,0.07,0.56,0.04
Квартиры,0.88,0.04,0.77,0.04,0.82,0.03
Товары для детей и игрушки,0.61,0.05,0.29,0.02,0.39,0.02
Детская одежда и обувь,0.48,0.01,0.67,0.02,0.56,0.01
Бытовая техника,0.19,0.04,0.34,0.04,0.25,0.04
Мебель и интерьер,0.36,0.05,0.29,0.04,0.32,0.03
"Одежда, обувь, аксессуары",0.54,0.04,0.53,0.03,0.54,0.03
Ремонт и строительство,0.43,0.08,0.2,0.03,0.27,0.04
Предложение услуг,0.73,0.05,0.48,0.06,0.58,0.05
Телефоны,0.78,0.06,0.35,0.06,0.48,0.07


### RandomForest

In [28]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=1))
])

metrics_nmf_rf, errors_nmf_rf = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_rf

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.0,0.0,0.0,0.0,0.0,0.0
Квартиры,0.91,0.02,0.67,0.12,0.77,0.08
Товары для детей и игрушки,0.0,0.0,0.0,0.0,0.0,0.0
Детская одежда и обувь,0.46,0.07,0.55,0.07,0.5,0.04
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0
Мебель и интерьер,0.0,0.0,0.0,0.0,0.0,0.0
"Одежда, обувь, аксессуары",0.26,0.03,0.67,0.05,0.38,0.03
Ремонт и строительство,0.0,0.0,0.0,0.0,0.0,0.0
Предложение услуг,0.0,0.0,0.0,0.0,0.0,0.0
Телефоны,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=1))
])

metrics_svd_knn, errors_svd_knn = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_knn

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.0,0.0,0.0,0.0,0.0,0.0
Квартиры,0.0,0.0,0.0,0.0,0.0,0.0
Товары для детей и игрушки,0.0,0.0,0.0,0.0,0.0,0.0
Детская одежда и обувь,0.3,0.05,0.45,0.09,0.36,0.06
Бытовая техника,0.0,0.0,0.0,0.0,0.0,0.0
Мебель и интерьер,0.0,0.0,0.0,0.0,0.0,0.0
"Одежда, обувь, аксессуары",0.33,0.01,0.83,0.05,0.47,0.02
Ремонт и строительство,0.0,0.0,0.0,0.0,0.0,0.0
Предложение услуг,0.0,0.0,0.0,0.0,0.0,0.0
Телефоны,0.0,0.0,0.0,0.0,0.0,0.0


### ExtraTreesClassifier

In [30]:
pipeline_nmf = Pipeline([
    ('bow', vectorizer),
    ('decomposition', NMF(50)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0))
])

metrics_nmf_extra_trees, errors_nmf_extra_trees = eval_table(data['description_norm'], data['category_name'], pipeline_nmf)
metrics_nmf_extra_trees

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.74,0.06,0.85,0.05,0.79,0.05
Квартиры,0.92,0.01,0.96,0.01,0.94,0.01
Товары для детей и игрушки,0.68,0.03,0.52,0.04,0.59,0.03
Детская одежда и обувь,0.61,0.01,0.71,0.02,0.65,0.01
Бытовая техника,0.55,0.06,0.2,0.04,0.29,0.05
Мебель и интерьер,0.59,0.05,0.39,0.04,0.47,0.04
"Одежда, обувь, аксессуары",0.62,0.01,0.7,0.02,0.66,0.01
Ремонт и строительство,0.57,0.08,0.35,0.03,0.43,0.04
Предложение услуг,0.69,0.03,0.75,0.04,0.72,0.03
Телефоны,0.81,0.02,0.71,0.08,0.75,0.05


In [31]:
pipeline_svd = Pipeline([
    ('bow', vectorizer),
    ('svd', TruncatedSVD(500)),
    ('clf', ExtraTreesClassifier(n_estimators=100, random_state=0))
])

metrics_svd_extra_trees, errors_svd_extra_trees = eval_table(data['description_norm'], data['category_name'], pipeline_svd)
metrics_svd_extra_trees

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
Автомобили,0.81,0.04,0.46,0.05,0.59,0.05
Квартиры,0.73,0.04,0.82,0.03,0.77,0.03
Товары для детей и игрушки,0.67,0.05,0.18,0.02,0.28,0.03
Детская одежда и обувь,0.46,0.01,0.74,0.02,0.57,0.01
Бытовая техника,0.61,0.16,0.13,0.03,0.21,0.05
Мебель и интерьер,0.78,0.05,0.16,0.03,0.26,0.04
"Одежда, обувь, аксессуары",0.47,0.01,0.73,0.02,0.57,0.01
Ремонт и строительство,0.66,0.1,0.1,0.02,0.17,0.03
Предложение услуг,0.87,0.08,0.27,0.05,0.41,0.06
Телефоны,0.88,0.04,0.37,0.04,0.52,0.05


**Лучший результат показала комбинация SVD и SGDClassifier: mean f1 score 0.68**

# Задание № 2 (6 баллов)

В Gensim тоже можно добавить нграммы и tfidf. Постройте 1 модель без них (как в семинаре) и еще 3 модели (1 с нграммами, 1 с tfidf и 1 с нграммами и с tfidf). Сранивте качество с помощью метрик (перплексия, когерентность) и на глаз. Определите лучшую модель. Для каждой модели выберите 1 самую красивую на ваш взгляд тему.

Используйте данные википедии из семинара. Можете взять поменьше данных, если все обучается долго.

Важное требование - получившиеся модели не должны быть совсем плохими. Если хороших тем не получается, попробуйте настроить гиперпараметры, отфильтровать словарь по-другому. 

In [39]:
texts = open('wiki_data.txt').read().splitlines()[:1000]
texts = ([normalize(text) for text in texts])

### Model 1 (No TFIDF, no ngramms)

In [40]:
dictionary1 = gensim.corpora.Dictionary((text.split() for text in texts))

dictionary1.filter_extremes(no_above=0.1, no_below=10)
dictionary1.compactify()

In [41]:
corpus = [dictionary1.doc2bow(text.split()) for text in texts]

In [42]:
lda = gensim.models.LdaMulticore(corpus, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary1, 
                                 passes=10) 

In [43]:
lda.print_topics()

[(99,
  '0.035*"г" + 0.015*"метод" + 0.014*"университет" + 0.011*"профессор" + 0.010*"учёный" + 0.010*"научный" + 0.010*"институт" + 0.009*"элемент" + 0.009*"определение" + 0.008*"соединение"'),
 (98,
  '0.027*"ссср" + 0.023*"московский" + 0.018*"университет" + 0.017*"москва" + 0.016*"александр" + 0.016*"академия" + 0.015*"наука" + 0.014*"орден" + 0.013*"институт" + 0.012*"профессор"'),
 (97,
  '0.030*"пара" + 0.016*"трава" + 0.014*"член" + 0.014*"племя" + 0.012*"уже" + 0.012*"студент" + 0.010*"михаил" + 0.010*"век" + 0.010*"иногда" + 0.009*"ныне"'),
 (96,
  '0.040*"монастырь" + 0.025*"собор" + 0.023*"корпус" + 0.016*"западный" + 0.015*"библиотека" + 0.015*"корабль" + 0.015*"набор" + 0.013*"ii" + 0.013*"версия" + 0.012*"система"'),
 (93,
  '0.082*"пространство" + 0.081*"значение" + 0.044*"если" + 0.043*"набор" + 0.041*"образ" + 0.038*"есть" + 0.032*"точка" + 0.028*"индекс" + 0.021*"помощь" + 0.021*"обозначать"'),
 (95,
  '0.035*"тысяча" + 0.033*"орудие" + 0.025*"южный" + 0.016*"данные"

### Model 2 (Ngramms)

Нграммы добавляются вот так (перед созданием словаря)

In [44]:
texts = [text.split() for text in texts]
ph = gensim.models.Phrases(texts, scoring='npmi', threshold=0.4) # threshold можно подбирать
p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = p[texts] 



# ! не забудьте, что далее вам нужно будет использовать ngrammed_texts

In [45]:
dictionary2 = gensim.corpora.Dictionary((ngrammed_texts))

dictionary2.filter_extremes(no_above=0.1, no_below=10)
dictionary2.compactify()

corpus2 = [dictionary2.doc2bow(text) for text in ngrammed_texts]

In [46]:
lda_ngramms = gensim.models.LdaMulticore(corpus2, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary2, 
                                 passes=10) 

lda_ngramms.print_topics()

[(99,
  '0.045*"община" + 0.044*"канада" + 0.039*"река" + 0.023*"гора" + 0.020*"поселение" + 0.019*"км²" + 0.017*"граница" + 0.016*"площадь" + 0.016*"провинция" + 0.016*"индеец"'),
 (98,
  '0.028*"монастырь" + 0.027*"здание" + 0.024*"собор" + 0.020*"музей" + 0.014*"художник" + 0.011*"озеро" + 0.010*"башня" + 0.010*"построить" + 0.009*"святой" + 0.007*"фон"'),
 (97,
  '0.057*"посёлок" + 0.030*"плод" + 0.026*"на_расстояние" + 0.025*"харьковский_область" + 0.017*"население" + 0.017*"м_ж" + 0.016*"село" + 0.016*"по_перепись" + 0.015*"2001_год" + 0.013*"км"'),
 (96,
  '0.074*"станция" + 0.046*"хутор" + 0.036*"км" + 0.025*"сельский_поселение" + 0.025*"дубовский_район" + 0.024*"население" + 0.020*"комплекс" + 0.019*"север" + 0.017*"округ" + 0.015*"согласно"'),
 (95,
  '0.055*"департамент" + 0.045*"аргентина" + 0.030*"административный_центр" + 0.029*"км²" + 0.028*"состав_провинция" + 0.027*"национальный_институт" + 0.027*"статистика" + 0.027*"муниципалитет" + 0.027*"департамент_расположить" + 

### Model 3 (Ngramms + TFIDF)

In [47]:
tfidf = gensim.models.TfidfModel(corpus, id2word=dictionary2, )
corpus3 = tfidf[corpus2]

In [48]:
lda_ngramms_tfidf = gensim.models.LdaMulticore(corpus3, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary2, 
                                 passes=10) 

lda_ngramms_tfidf.print_topics()

[(99,
  '0.000*"собрание" + 0.000*"король" + 0.000*"депутат" + 0.000*"париж" + 0.000*"политический" + 0.000*"член" + 0.000*"принять" + 0.000*"революция" + 0.000*"клуб" + 0.000*"франция"'),
 (98,
  '0.001*"переход" + 0.001*"точка" + 0.001*"уровень" + 0.001*"механизм" + 0.001*"энергия" + 0.001*"свет" + 0.000*"температура" + 0.000*"помощь" + 0.000*"процесс" + 0.000*"вероятность"'),
 (97,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),
 (95,
  '0.000*"дом" + 0.000*"священник" + 0.000*"английский" + 0.000*"жизнь" + 0.000*"1897_год" + 0.000*"оказываться" + 0.000*"так_как" + 0.000*"написать" + 0.000*"1896" + 0.000*"желать"'),
 (96,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),

### Model 4 (TFIDF)

In [49]:
tfidf = gensim.models.TfidfModel(corpus, id2word=dictionary1, )
corpus4 = tfidf[corpus]

In [50]:
lda_tfidf = gensim.models.LdaMulticore(corpus4, 
                                 100,
                                 alpha='asymmetric',
                                 id2word=dictionary1, 
                                 passes=10) 

lda_tfidf.print_topics()

[(99,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (98,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (97,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (95,
  '0.001*"футбольный" + 0.001*"клуб" + 0.001*"команда" + 0.001*"сезон" + 0.000*"матч" + 0.000*"групповой" + 0.000*"чемпионат" + 0.000*"этап" + 0.000*"национальный" + 0.000*"первенство"'),
 (96,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (94,
  '0.000*"конечн

### Подсчет метрик

In [51]:
# lda perplexity

np.exp2(-lda.log_perplexity(corpus[:1000]))

220.77176427964417

In [52]:
# lda coherence

topics = []
for topic_id, topic in lda.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary1, coherence='c_v')

coherence_model_lda.get_coherence()

0.480947412078592

In [53]:
# lda_ngramms perplexity

np.exp2(-lda_ngramms.log_perplexity(corpus[:1000]))

900.1441368375896

In [54]:
# lda_ngramms coherence (почему-то возвращается nan)

topics = []
for topic_id, topic in lda_ngramms.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary2, coherence='c_v')

coherence_model_lda.get_coherence()


nan

In [55]:
# lda_ngramms_tfidf perplexity

np.exp2(-lda_ngramms_tfidf.log_perplexity(corpus[:1000]))

51464.38508137635

In [56]:
# lda_ngramms_tfidf coherence (и тут снова nan)

topics = []
for topic_id, topic in lda_ngramms_tfidf.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary2, coherence='u_mass')



In [57]:
# lda_tfidf perplexity

np.exp2(-lda_tfidf.log_perplexity(corpus[:1000]))

463.0180261442919

In [58]:
# lda_tfidf coherence

topics = []
for topic_id, topic in lda_tfidf.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=texts, 
                                                   dictionary=dictionary1, coherence='c_v')

coherence_model_lda.get_coherence()

0.4293581021609177

**Выходит, наименьшая перплексия из всех 4-х моделей у самой первой без TFIDF и без нграммов (но скорее всего, здесь что-то не так). Когерентность (из тех, которе удалось посчитать) лучшая у модели 4 с TFIDF, хотя разница небольшая.**